-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_to_ocr.py
More file actions
28 lines (22 loc) · 1.22 KB
/
pdf_to_ocr.py
File metadata and controls
28 lines (22 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import logging
import tempfile
import subprocess
from pathlib import Path
# This script takes a directory of PDFs as input. For each PDF it converts the file to TIFF and then generates OCR output from that TIFF.
# Number of trailing characters dropped from each PDF's stem to form the
# base name of its outputs.
# NOTE(review): Path.stem already removes the ".pdf" extension, so this trims
# four *further* characters -- presumably a fixed suffix in the input file
# names. Confirm against real inputs; set to 0 to keep the full stem.
STEM_TRIM = 4

# Exit status reported when a tool cannot be started at all. This mirrors what
# a shell reports for "command not found".
COMMAND_NOT_RUNNABLE = 127


def output_base(pdf_file):
    """Return the base name used for the TIFF and OCR outputs of *pdf_file*.

    The base is the file's stem with the last ``STEM_TRIM`` characters
    removed. If that would leave an empty string (stem too short), the full
    stem is returned instead so outputs never end up with an empty name.
    """
    stem = Path(pdf_file).stem
    trimmed = stem[:-STEM_TRIM] if STEM_TRIM else stem
    return trimmed or stem


def convert_command(pdf_file, tiff_file):
    """Return the ImageMagick ``convert`` argument list for PDF -> TIFF.

    Arguments are passed as a list (no shell), so paths containing spaces or
    shell metacharacters are handed to the tool unchanged.
    """
    return [
        "convert",
        "-density", "300",
        str(pdf_file),
        "-depth", "8",
        "-strip",
        "-background", "white",
        "-alpha", "off",
        str(tiff_file),
    ]


def tesseract_command(tiff_file, ocr_base):
    """Return the ``tesseract`` argument list for TIFF -> OCR output.

    *ocr_base* is the output path given to tesseract as its second argument.
    """
    return ["tesseract", str(tiff_file), str(ocr_base)]


def run_quietly(command):
    """Run *command* with stdout/stderr discarded and return its exit status.

    If the program cannot be started (for example it is not installed), the
    failure is logged and ``COMMAND_NOT_RUNNABLE`` is returned instead of
    raising, so one missing tool does not abort the whole batch.
    """
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError as err:
        logging.error("Could not run %s: %s", command[0], err)
        return COMMAND_NOT_RUNNABLE
    return completed.returncode


def process_pdf(pdf_file, tmp_dir, outdir):
    """Convert one PDF to a temporary TIFF and OCR it into *outdir*.

    Returns True when both steps exit with status 0, False otherwise.
    Failures are logged. The intermediate TIFF is removed in every case.
    """
    base = output_base(pdf_file)
    tiff_file = Path(tmp_dir) / f"{base}.tiff"
    try:
        if run_quietly(convert_command(pdf_file, tiff_file)) != 0:
            logging.error("Failed to convert PDF for %s", pdf_file)
            return False
        ocr_base = Path(outdir) / f"{base}_OCR"
        if run_quietly(tesseract_command(tiff_file, ocr_base)) != 0:
            logging.error("Failed to OCR TIFF for %s", pdf_file)
            return False
        return True
    finally:
        # Remove the TIFF even when a step failed, so a partial file from one
        # PDF cannot be picked up under the same base name by a later one.
        try:
            tiff_file.unlink()
        except FileNotFoundError:
            pass


def main():
    """Prompt for input/output directories and OCR every PDF in the input."""
    logging.basicConfig(filename='pdf_to_ocr.log', level=logging.DEBUG)
    indir = Path(input("Input directory containing PDFs: "))
    outdir = Path(input("Output directory for OCR: "))

    # Validate the input before creating anything on disk.
    if not indir.is_dir():
        logging.error("Input directory does not exist: %s", indir)
        raise SystemExit(f"Input directory does not exist: {indir}")
    outdir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp_dir_name:
        for pdf_file in sorted(indir.iterdir()):
            # Only regular *.pdf files are processed; subdirectories and
            # other files are skipped rather than handed to convert.
            if not pdf_file.is_file() or pdf_file.suffix.lower() != ".pdf":
                logging.debug("Skipping non-PDF entry %s", pdf_file)
                continue
            process_pdf(pdf_file, tmp_dir_name, outdir)


if __name__ == "__main__":
    main()