#------------------------------------------------------------------------------
# Find PDF files that contain no extractable text and run OCRmyPDF on them so
# that their text becomes selectable.
#
# The approach is deliberately elementary: for every PDF with at least
# MIN_PAGES pages, the pages between one quarter and one third of the document
# are sampled; if none of them yields any text, the file is treated as a scan.
# Such a file is recorded in the list file, copied into the current working
# directory and passed to OCRmyPDF, which writes "<name>_ocr.pdf" next to it.
#
# Encrypted files are skipped. The user needs to manually specify the root
# folder where all PDF files are stored (see `root` below).
# Limitations: uses English (roman scripts), does not check language in PDF text
#------------------------------------------------------------------------------
import os
import re
import shutil
import subprocess
import sys

import fitz

# Raw string: "\B" is not a valid escape sequence in a normal string literal.
root = r"F:\Books-World-Hist" + "\\" + "71_Russia-Socialism"
#root = os.path.join(r"F:\Books-World-Hist", "70_World-Archaeology")

# Arbitrary threshold: PDFs with fewer pages than this are not examined.
MIN_PAGES = 30

# Store the files with no searchable text into a *.txt file
out_f = "toOcrList.txt"

# The context manager guarantees the list file is flushed and closed even if
# the scan aborts part-way through.  UTF-8 keeps non-ASCII file names from
# raising an encoding error on platforms with a narrow default code page.
with open(out_f, "w", encoding="utf-8") as f:
    for path, subdirs, files in os.walk(root):
        for name in files:
            s = os.path.join(path, name)
            print(s)

            # splitext removes exactly the extension; str.strip(".pdf") would
            # instead strip any of the characters '.', 'p', 'd', 'f' from both
            # ends of the name and mangle names such as "pdf_draft.pdf".
            stem, ext = os.path.splitext(name)
            if ext.lower() != ".pdf":
                continue

            f1 = os.path.join(os.getcwd(), name)  # working copy handed to OCR
            f2 = stem + "_ocr.pdf"                # OCR output, relative to cwd

            pf = fitz.open(s)
            try:
                n = pf.page_count
                if pf.is_encrypted or n < MIN_PAGES:
                    continue
                # Longest text found on any sampled page; 0 means that no
                # sampled page produced extractable text.
                ptxt = max(
                    (len(pf.load_page(i).get_text())
                     for i in range(n // 4, n // 3)),
                    default=0,
                )
            finally:
                # Always release the document, and do so before OCR starts.
                pf.close()

            if ptxt >= 1:
                continue

            # Existing list format is kept as-is: the path with backslashes
            # replaced by spaces, followed by a space and a newline.
            f.write(s.replace("\\", " ") + " " + "\n")

            if not os.path.exists(f1):
                shutil.copy(s, f1)
            print(f1 + "---" + f2)

            # "-j" and its value are passed as two separate argv entries.
            status = subprocess.call(
                ["ocrmypdf", "--skip-text", "--output-type", "pdf",
                 "-j", "3", f1, f2]
            )
            if status != 0:
                print(f"ocrmypdf exited with status {status} for {f1}")