How to Extract Text from PDF or Image Files Easily with JavaScript code

Looking for a simple way to pull text from PDF or image files? This tool uses pdf.js for PDFs and Tesseract.js for images, making it easy to grab the text with just a few clicks. Whether it’s a document or a picture, you can quickly extract what you need and display it right on the page.

Jan 28, 2025 - 23:29
Jan 29, 2025 - 00:37
 0  21
How to Extract Text from PDF or Image Files Easily with JavaScript code
How to Extract Text from PDF or Image Files Easily with JavaScript code

Libraries Used:

  • pdf.js: This is a library used to load and extract text from PDF files. It's pulled in using a CDN.
  • Tesseract.js: This is an OCR library for extracting text from image files. It's also loaded via a CDN.

HTML Structure:

  • There's a file input element that lets the user select a file.
  • The <div> with the ID "output" is where the extracted text gets displayed.

Text Extraction Logic:

  • When a PDF is selected, the extractTextFromPDF() function is called. It reads the file as a binary array using FileReader and then extracts the text from each page using pdf.js.
  • If an image is selected, extractTextFromImage() kicks in, and Tesseract.js does the OCR magic to grab the text from the image.

Event Handling:

  • Once a file is picked, the handleFile() function runs. It checks the file type and calls the right extraction function based on whether it's a PDF or an image.

How to Use:

  • Just open the page in your browser.
  • Select a PDF or image file via the file input.
  • The extracted text will appear in the "output" div.

Below is an example that converts PDF and image files to text:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF and Image Text Extraction</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/tesseract.js@2.1.4/dist/tesseract.min.js"></script>
    <style>
        body {
            font-family: Arial, sans-serif;
            padding: 20px;
            background-color: #f4f4f9;
        }
        #output {
            margin-top: 20px;
            padding: 10px;
            border: 1px solid #ccc;
            background-color: #fff;
            white-space: pre-wrap;
            word-wrap: break-word;
            max-height: 300px;
            overflow-y: auto;
        }
        #fileInput {
            margin-bottom: 20px;
        }
    </style>
</head>
<body>

    <h1>Extract Text from PDF or Image</h1>
    <input type="file" id="fileInput" />
    <div id="output"></div>

    <script>
        /**
         * Extract the text of every page of a PDF file using pdf.js.
         *
         * The file is read into memory with FileReader and handed to
         * pdfjsLib.getDocument. For each page, the `str` of every text item is
         * joined with a single space, and each page's text is terminated by a
         * newline. Pages always appear in document order.
         *
         * @param {Blob} file - The PDF to read (e.g. a File from a file input).
         * @returns {Promise<string>} Resolves with the concatenated page texts.
         *     Rejects with the FileReader error when the file cannot be read, or
         *     with whatever pdf.js rejects with when the document or one of its
         *     pages cannot be loaded.
         */
        function extractTextFromPDF(file) {
            // FileReader is callback-based, so adapt it to a promise here.
            const fileBytes = new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onload = () => resolve(new Uint8Array(reader.result));
                // Reject with the underlying error instead of the event object so
                // callers that read `.message` get a meaningful description.
                reader.onerror = () => reject(reader.error || new Error('Failed to read the file.'));
                reader.readAsArrayBuffer(file);
            });

            return fileBytes
                .then(pdfData => pdfjsLib.getDocument({ data: pdfData }).promise)
                .then(pdfDoc => {
                    const pagePromises = [];

                    // pdf.js page numbers are 1-based.
                    for (let pageNum = 1; pageNum <= pdfDoc.numPages; pageNum++) {
                        pagePromises.push(
                            pdfDoc.getPage(pageNum)
                                .then(page => page.getTextContent())
                                .then(content => content.items.map(item => item.str).join(' '))
                        );
                    }

                    // Promise.all preserves the order of its input, so the pages
                    // stay in document order even when they finish out of order.
                    return Promise.all(pagePromises);
                })
                .then(pageTexts => pageTexts.map(pageText => pageText + '\n').join(''));
        }

        /**
         * Run OCR on an image with Tesseract.js and return the recognized text.
         *
         * Recognition uses the 'eng' language and logs every progress message
         * reported by Tesseract to the console.
         *
         * @param {*} file - The image to recognize; passed to Tesseract.recognize as-is.
         * @returns {Promise<string>} Resolves with `data.text` from the recognition
         *     result; rejects with whatever Tesseract.recognize throws or rejects with.
         */
        function extractTextFromImage(file) {
            const ocrOptions = {
                logger: (progress) => console.log(progress)
            };

            // The executor turns a synchronous throw from Tesseract.recognize into a
            // rejection, so callers only ever deal with the returned promise.
            return new Promise((resolve, reject) => {
                Tesseract.recognize(file, 'eng', ocrOptions)
                    .then((result) => {
                        resolve(result.data.text);
                    })
                    .catch(reject);
            });
        }

        /**
         * Extract text from the selected file and show it in the "output" element.
         *
         * Files whose MIME type is 'application/pdf' go to extractTextFromPDF, and
         * files whose MIME type starts with 'image/' go to extractTextFromImage.
         * Any other type results in an "unsupported" message. Extraction failures
         * are reported in the output element instead of being thrown.
         *
         * @param {File} file - The file chosen by the user; only `type` is read here.
         * @returns {Promise<void>} Settles once the output element has been updated.
         *     The returned promise never rejects because of an extraction error.
         */
        function handleFile(file) {
            const output = document.getElementById('output');
            const fileType = file.type;

            let kind;
            let extract;
            if (fileType === 'application/pdf') {
                kind = 'PDF';
                extract = extractTextFromPDF;
            } else if (fileType.startsWith('image/')) {
                kind = 'image';
                extract = extractTextFromImage;
            } else {
                output.textContent = 'Unsupported file type. Please select a PDF or image file.';
                return Promise.resolve();
            }

            return extract(file).then(text => {
                output.textContent = text;
            }).catch(error => {
                // Rejection values are not guaranteed to be Error objects (they may be
                // strings or events), so fall back to their string form rather than
                // printing "undefined".
                const reason = (error && error.message) || String(error);
                output.textContent = `Error extracting text from ${kind}: ${reason}`;
            });
        }

        // When the user picks a file, pass the first selected file to handleFile.
        // Nothing happens if the selection is empty (e.g. the dialog was cancelled).
        document.querySelector('#fileInput').addEventListener('change', (event) => {
            const selectedFile = event.target.files[0];
            if (!selectedFile) {
                return;
            }
            handleFile(selectedFile);
        });
    </script>

</body>
</html>

What's Your Reaction?

like

dislike

love

funny

angry

sad

wow