How to Extract Text from PDF or Image Files Easily with JavaScript code
Looking for a simple way to pull text from PDF or image files? This tool uses pdf.js for PDFs and Tesseract.js for images, making it easy to grab the text with just a few clicks. Whether it’s a document or a picture, you can quickly extract what you need and display it right on the page.

Libraries Used:
- pdf.js: This is a library used to load and extract text from PDF files. It's pulled in using a CDN.
- Tesseract.js: This is an OCR library for extracting text from image files. It's also loaded via a CDN.
HTML Structure:
- There's a file input element that lets the user select a file.
- The `<div>` with the ID "output" is where the extracted text gets displayed.
Text Extraction Logic:
- When a PDF is selected, the `extractTextFromPDF()` function is called. It reads the file as a binary array using FileReader and then extracts the text from each page using pdf.js.
- If an image is selected, `extractTextFromImage()` kicks in, and Tesseract.js does the OCR magic to grab the text from the image.
Event Handling:
- Once a file is picked, the `handleFile()` function runs. It checks the file type and calls the right extraction function based on whether it's a PDF or an image.
How to Use:
- Just open the page in your browser.
- Select a PDF or image file via the file input.
- The extracted text will appear in the "output" div.
Below is an example page that converts PDF and image files to text:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF and Image Text Extraction</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@2.1.4/dist/tesseract.min.js"></script>
<style>
body {
font-family: Arial, sans-serif;
padding: 20px;
background-color: #f4f4f9;
}
#output {
margin-top: 20px;
padding: 10px;
border: 1px solid #ccc;
background-color: #fff;
white-space: pre-wrap;
word-wrap: break-word;
max-height: 300px;
overflow-y: auto;
}
#fileInput {
margin-bottom: 20px;
}
</style>
</head>
<body>
<h1>Extract Text from PDF or Image</h1>
<input type="file" id="fileInput" />
<div id="output"></div>
<script>
/**
 * Read a File/Blob into memory with FileReader.
 *
 * @param {Blob} file - The file to read.
 * @returns {Promise<ArrayBuffer>} Resolves with the file's bytes; rejects with
 *   the reader's error (not the bare ProgressEvent) so callers get a `.message`.
 */
function readFileAsArrayBuffer(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () =>
      reject(reader.error || new Error('Failed to read the selected file.'));
    reader.readAsArrayBuffer(file);
  });
}

/**
 * Extract the text of every page of a PDF file using pdf.js.
 *
 * The text items of each page are joined with single spaces and every page is
 * terminated by a newline. Pages are always emitted in document order, even
 * though they are fetched concurrently.
 *
 * @param {Blob} file - The PDF file selected by the user.
 * @returns {Promise<string>} Resolves with the concatenated page text; rejects
 *   if the file cannot be read or pdf.js fails to parse it.
 */
function extractTextFromPDF(file) {
  return readFileAsArrayBuffer(file)
    .then(buffer => pdfjsLib.getDocument({ data: new Uint8Array(buffer) }).promise)
    .then(pdfDoc => {
      const pageTextPromises = [];
      // pdf.js page numbers are 1-based.
      for (let pageNum = 1; pageNum <= pdfDoc.numPages; pageNum++) {
        pageTextPromises.push(
          pdfDoc
            .getPage(pageNum)
            .then(page => page.getTextContent())
            .then(content => content.items.map(item => item.str).join(' '))
        );
      }
      // Promise.all keeps results in request order, so the output no longer
      // depends on which page happens to finish first.
      return Promise.all(pageTextPromises);
    })
    .then(pageTexts => pageTexts.map(pageText => pageText + '\n').join(''));
}
/**
 * Run OCR on an image with Tesseract.js and return the recognized text.
 *
 * Recognition uses the English ('eng') language data, and every progress
 * message reported by Tesseract.js is written to the console.
 *
 * @param {Blob} file - The image file selected by the user.
 * @returns {Promise<string>} Resolves with the recognized text; rejects with
 *   whatever Tesseract.js rejects with.
 */
function extractTextFromImage(file) {
  const language = 'eng';
  const options = {
    logger(progress) {
      console.log(progress);
    },
  };

  return new Promise(function (resolve, reject) {
    const recognition = Tesseract.recognize(file, language, options);
    recognition
      .then(function (result) {
        resolve(result.data.text);
      })
      .catch(reject);
  });
}
/**
 * Turn a rejection value into readable text.
 *
 * Not every rejection is an Error: FileReader failures surface as events and
 * Tesseract.js can reject with plain strings, so `.message` may be missing.
 *
 * @param {*} error - The value a promise was rejected with.
 * @returns {string} The error's message when it has one, otherwise its string form.
 */
function describeExtractionError(error) {
  if (error && error.message) {
    return error.message;
  }
  return String(error);
}

/**
 * Extract text from the given file and show the outcome in the "output" element.
 *
 * PDFs (MIME type 'application/pdf') go through extractTextFromPDF and images
 * ('image/*') through extractTextFromImage; any other type produces an
 * "unsupported" message. Errors are displayed rather than thrown.
 *
 * @param {File} file - The file selected by the user.
 * @returns {Promise<void>} Settles once the output element has been updated.
 */
function handleFile(file) {
  const output = document.getElementById('output');
  const fileType = file.type;

  let extractText;
  let label;
  if (fileType === 'application/pdf') {
    extractText = extractTextFromPDF;
    label = 'PDF';
  } else if (fileType.startsWith('image/')) {
    extractText = extractTextFromImage;
    label = 'image';
  } else {
    output.textContent = 'Unsupported file type. Please select a PDF or image file.';
    return Promise.resolve();
  }

  // OCR and large PDFs can take several seconds; replace any previous result
  // so stale text is not mistaken for the output of the new file.
  output.textContent = 'Extracting text, please wait...';

  return extractText(file)
    .then(text => {
      output.textContent = text;
    })
    .catch(error => {
      output.textContent = `Error extracting text from ${label}: ${describeExtractionError(error)}`;
    });
}
// Wire up the file picker: whenever the selection changes, hand the first
// chosen file (if there is one) to handleFile.
document.getElementById('fileInput').addEventListener('change', ({ target }) => {
  const selectedFile = target.files[0];
  if (!selectedFile) {
    return;
  }
  handleFile(selectedFile);
});
</script>
</body>
</html>
What's Your Reaction?






