how to

How to Extract Text from PDF or Image Files Easily with JavaScript code

Looking for a simple way to pull text from PDF or image files? This tool uses pdf.js for PDFs and Tesseract.js for images, making it easy to grab the text with just a few clicks. Whether it’s a document or a picture, you can quickly extract what you need and display it right on the page.

Jan 28, 2025 - 23:29

Jan 29, 2025 - 00:37

0 59

How to Extract Text from PDF or Image Files Easily with JavaScript code

Libraries Used:

pdf.js: This is a library used to load and extract text from PDF files. It's pulled in using a CDN.
Tesseract.js: This is an OCR library for extracting text from image files. It's also loaded via a CDN.

HTML Structure:

There's a file input element that lets the user select a file.
The div element with the ID "output" is where the extracted text gets displayed.

Text Extraction Logic:

When a PDF is selected, the extractTextFromPDF() function is called. It reads the file as a binary array using FileReader and then extracts the text from each page using pdf.js.
If an image is selected, extractTextFromImage() kicks in, and Tesseract.js does the OCR magic to grab the text from the image.

Event Handling:

Once a file is picked, the handleFile() function runs. It checks the file type and calls the right extraction function based on whether it's a PDF or an image.

How to Use:

Just open the page in your browser.
Select a PDF or image file via the file input.
The extracted text will appear in the "output" div.

Below is an example that converts PDF and image files to text:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF and Image Text Extraction</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/tesseract.js@2.1.4/dist/tesseract.min.js"></script>
    <style>
        body {
            font-family: Arial, sans-serif;
            padding: 20px;
            background-color: #f4f4f9;
        }
        #output {
            margin-top: 20px;
            padding: 10px;
            border: 1px solid #ccc;
            background-color: #fff;
            white-space: pre-wrap;
            word-wrap: break-word;
            max-height: 300px;
            overflow-y: auto;
        }
        #fileInput {
            margin-bottom: 20px;
        }
    </style>
</head>
<body>

    <h1>Extract Text from PDF or Image</h1>
    <input type="file" id="fileInput" />
    <div id="output"></div>

    <script>
        /**
         * Extract the text of every page of a PDF file using pdf.js.
         *
         * The file is read into memory with FileReader and parsed with
         * pdfjsLib.getDocument(). The text items of each page are joined with
         * single spaces, each page's text is followed by a newline, and pages
         * always appear in document order regardless of which page finishes
         * loading first.
         *
         * @param {Blob} file - The PDF to read (for example a File picked in a file input).
         * @returns {Promise<string>} Resolves with the concatenated page text.
         *     Rejects if the file cannot be read or if pdf.js fails to load the
         *     document or one of its pages.
         */
        function extractTextFromPDF(file) {
            // FileReader is callback-based, so only the read step is adapted to a promise.
            const readAsBuffer = new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onload = () => resolve(reader.result);
                // Reject with the underlying error instead of the ProgressEvent so
                // that callers can rely on `error.message`.
                reader.onerror = () => reject(reader.error || new Error('Failed to read file'));
                reader.readAsArrayBuffer(file);
            });

            return readAsBuffer
                .then(buffer => pdfjsLib.getDocument(new Uint8Array(buffer)).promise)
                .then(pdfDoc => {
                    const pageTextPromises = [];

                    // pdf.js page numbers are 1-based.
                    for (let pageNum = 1; pageNum <= pdfDoc.numPages; pageNum++) {
                        pageTextPromises.push(
                            pdfDoc.getPage(pageNum)
                                .then(page => page.getTextContent())
                                .then(content => content.items.map(item => item.str).join(' '))
                        );
                    }

                    // Promise.all keeps results in input order, so the pages stay in
                    // document order even when they resolve out of order.
                    return Promise.all(pageTextPromises);
                })
                .then(pageTexts => pageTexts.map(pageText => pageText + '\n').join(''));
        }

        /**
         * Run OCR on an image with Tesseract.js and return the recognized text.
         *
         * Recognition always uses the 'eng' language, and every progress message
         * reported by Tesseract.js is written to the console.
         *
         * @param {*} file - The image to recognize; passed to Tesseract.recognize() unchanged.
         * @returns {Promise<string>} Resolves with the `data.text` of the recognition
         *     result. Rejects with whatever Tesseract.recognize() rejects with.
         */
        function extractTextFromImage(file) {
            const ocrOptions = {
                logger: (progress) => console.log(progress)
            };

            return new Promise((resolve, reject) => {
                const recognition = Tesseract.recognize(file, 'eng', ocrOptions);

                recognition
                    .then((result) => {
                        const { text } = result.data;
                        resolve(text);
                    })
                    .catch(reject);
            });
        }

        /**
         * Route a selected file to the matching extractor and display the result.
         *
         * PDFs go to extractTextFromPDF(), files whose MIME type starts with
         * "image/" go to extractTextFromImage(), and anything else produces an
         * "unsupported" message. The extracted text, or an error message if the
         * extraction rejects, is written to the element with the ID "output".
         *
         * @param {File} file - The file chosen by the user; its `type` and `name` are inspected.
         * @returns {void} Nothing; the result is shown asynchronously in the page.
         */
        function handleFile(file) {
            const output = document.getElementById('output');
            const fileType = file.type || '';
            const fileName = String(file.name || '').toLowerCase();

            // A rejection value is not guaranteed to be an Error (a plain string is
            // possible), so fall back to its string form when `.message` is missing
            // instead of displaying "undefined".
            const describeError = error => (error && error.message) || String(error);

            // The browser may report an empty MIME type when it cannot determine one;
            // in that case trust a ".pdf" file name rather than rejecting the file.
            const isPdf = fileType === 'application/pdf' ||
                (fileType === '' && fileName.endsWith('.pdf'));

            if (isPdf) {
                extractTextFromPDF(file).then(text => {
                    output.textContent = text;
                }).catch(error => {
                    output.textContent = 'Error extracting text from PDF: ' + describeError(error);
                });
            } else if (fileType.startsWith('image/')) {
                extractTextFromImage(file).then(text => {
                    output.textContent = text;
                }).catch(error => {
                    output.textContent = 'Error extracting text from image: ' + describeError(error);
                });
            } else {
                output.textContent = 'Unsupported file type. Please select a PDF or image file.';
            }
        }

        // When the user picks a file, pass the first selected file to handleFile().
        // Nothing happens if the selection is empty.
        document.querySelector('#fileInput').addEventListener('change', (event) => {
            const selectedFile = event.target.files[0];
            if (!selectedFile) {
                return;
            }
            handleFile(selectedFile);
        });
    </script>

</body>
</html>