#-----This Python code uses a very elementary approach to detect whether a-----
# PDF file contains text and, if it does not, uses OCRmyPDF to make the text
# selectable. There is an arbitrary criterion that the PDF must have at least
# 30 pages. Encrypted files are skipped. The user needs to manually specify
# the root folder where all PDF files are stored.
# Limitations: uses English (roman scripts), does not check language in PDF text
#------------------------------------------------------------------------------
import sys, subprocess, shutil
import fitz
import os, re

# Raw strings keep the backslashes of the Windows path literal; a bare "\B" is
# an invalid escape sequence and triggers a warning on recent Python versions.
root = r"F:\Books-World-Hist" + "\\" + "71_Russia-Socialism"
#root = os.path.join(r"F:\Books-World-Hist", "70_World-Archaeology")
# Store the files with no searchable text into a *.txt file
out_f = "toOcrList.txt"
# Arbitrary threshold: PDFs with fewer pages than this are not examined.
MIN_PAGES = 30


def ocr_output_name(name):
    """Return the OCR output file name for *name*.

    The extension is removed with ``os.path.splitext`` and ``"_ocr.pdf"`` is
    appended. ``str.strip(".pdf")`` must not be used here: it strips any of
    the characters ``.``, ``p``, ``d``, ``f`` from both ends of the name
    (``"paper.pdf"`` would become ``"aper"``).
    """
    return os.path.splitext(name)[0] + "_ocr.pdf"


def has_text_layer(doc):
    """Return True if any sampled page of *doc* yields extractable text.

    Only the pages from index ``n // 4`` up to (not including) ``n // 3`` are
    inspected, where ``n`` is the page count of the open PyMuPDF document.
    """
    n = doc.page_count
    for i in range(n // 4, n // 3):
        # Any non-empty string counts as text; stop at the first hit.
        if doc.load_page(i).get_text():
            return True
    return False


def needs_ocr(pdf_path):
    """Return True if the PDF at *pdf_path* should be sent to OCR.

    That is the case when the document is not encrypted, has at least
    ``MIN_PAGES`` pages, and none of the sampled pages contains text.
    The document is always closed again, even if inspecting it fails.
    """
    doc = fitz.open(pdf_path)
    try:
        if doc.is_encrypted or doc.page_count < MIN_PAGES:
            return False
        return not has_text_layer(doc)
    finally:
        doc.close()


def run_ocr(src, dst):
    """Run ``ocrmypdf`` on *src*, writing to *dst*; return its exit status."""
    return subprocess.call([
        "ocrmypdf", "--skip-text", "--output-type", "pdf",
        # Option and value are separate argv entries.
        "-j", "3",
        src, dst,
    ])


def main():
    """Walk ``root``, list every PDF lacking a text layer and OCR it.

    Every visited path is printed. For each PDF that needs OCR, its path
    (with backslashes replaced by spaces) is appended to ``out_f``, the file
    is copied into the current working directory unless a file of that name
    already exists there, and ``ocrmypdf`` is run on the copy. The OCR result
    is written to the current working directory as ``<stem>_ocr.pdf``.
    """
    # The context manager guarantees the list file is flushed and closed.
    with open(out_f, "w") as f:
        for path, _subdirs, files in os.walk(root):
            for name in files:
                s = os.path.join(path, name)
                print(s)
                # Check the extension first so non-PDF files are never opened.
                if os.path.splitext(name)[-1].lower() != ".pdf":
                    continue
                if not needs_ocr(s):
                    continue

                f.write(s.replace("\\", " ") + " \n")

                f1 = os.path.join(os.getcwd(), name)
                f2 = ocr_output_name(name)
                if not os.path.exists(f1):
                    shutil.copy(s, f1)
                    print(f1 + "---" + f2)
                status = run_ocr(f1, f2)
                if status != 0:
                    print(f"ocrmypdf exited with status {status} for {f1}")


if __name__ == "__main__":
    main()