"""Convert pages of a PDF into a black-and-white PDF.

The script uses PyMuPDF to render the selected pages of a PDF file to PNG
images, uses OpenCV to read each image as grayscale and threshold it to pure
black and white, and finally merges the thresholded images back into a single
PDF named ``PDF-Merged.pdf`` in the current directory.

The threshold is the pixel intensity (0-255) above which a pixel becomes
white.  It has to be tuned per document because scan/colour quality varies.

Syntax:
    py <script>.py input.pdf threshold firstPg lastPg

    firstPg, lastPg  page indices as understood by PyMuPDF (first page is 0)
    lastPg < 0       convert only the single page ``firstPg``
    lastPg = 0       convert from ``firstPg`` up to the last page of the file
"""
import fitz
import cv2
import glob
import sys
import os
import subprocess
from PIL import Image

# Scratch locations and output name (all relative to the working directory).
OUTPUT_DIR = "PDF2PNG"
MERGED_PDF = "PDF-Merged.pdf"
TEMP_PDF = "PDF-Tempor.pdf"
BW_IMAGE = "BW.png"

# Width in pixels every page image is scaled to before it is embedded.
TARGET_WIDTH = 842
# Resolution used both for rendering pages and for embedding the images.
RENDER_DPI = 100


def _call(obj, new_name, old_name, *args, **kwargs):
    """Call a PyMuPDF method by its snake_case name, else its legacy alias.

    PyMuPDF renamed its camelCase methods (``loadPage`` -> ``load_page`` ...).
    Trying the current name first and falling back to the old one keeps the
    script working with either generation of the library.
    """
    method = getattr(obj, new_name, None)
    if method is None:
        method = getattr(obj, old_name)
    return method(*args, **kwargs)


def _usage_and_exit(prog):
    """Print the command-line usage and terminate with exit status 1."""
    print("\n Error! Usage: py {} input.pdf threshold firstPg lastPg".format(prog))
    print("\n Usage: lastPg = integer < 0 implies single page conversion \n")
    sys.exit(1)


def _parse_args(argv):
    """Return ``(pdf_name, threshold, first, last)`` parsed from *argv*.

    Exits with the usage message when arguments are missing or when one of
    the three numeric arguments is not an integer.
    """
    if len(argv) <= 4:
        _usage_and_exit(argv[0] if argv else "script.py")
    try:
        return str(argv[1]), int(argv[2]), int(argv[3]), int(argv[4])
    except ValueError:
        _usage_and_exit(argv[0])


def _page_indices(first, last, page_count):
    """Return the list of page indices selected by *first* and *last*.

    ``last < 0`` selects the single page *first*; ``last == 0`` means "up to
    the end of the document".  Range bounds are clipped to the document, so
    the result is empty when nothing is selected.
    """
    if last < 0:
        # Single page conversion.
        return [first] if first < page_count else []
    if last == 0:
        last = page_count
    return list(range(max(first, 0), min(last, page_count - 1) + 1))


def _render_pages(doc, targets):
    """Render each ``(page_index, png_path)`` pair of *targets* to a PNG."""
    # 72 dpi is the PDF default, so this matrix renders at RENDER_DPI.
    zoom = fitz.Matrix(RENDER_DPI / 72, RENDER_DPI / 72)
    for index, png_path in targets:
        page = _call(doc, "load_page", "loadPage", index)
        pixmap = _call(page, "get_pixmap", "getPixmap", matrix=zoom)
        _call(pixmap, "save", "writePNG", png_path)


def _merge_as_bw(png_paths, threshold):
    """Threshold every PNG in *png_paths* and merge them into MERGED_PDF.

    The pages are appended in the order given.  The two scratch files are
    removed even when a step fails.
    """
    merged = fitz.open()
    try:
        for png_path in png_paths:
            gray = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
            if gray is None:
                # cv2.imread signals failure by returning None, not raising.
                print("\n Warning! Could not read {}, page skipped".format(png_path))
                continue

            # Pixels brighter than the threshold become white (255), all
            # others black (0).
            _, bw = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
            cv2.imwrite(BW_IMAGE, bw)

            with Image.open(BW_IMAGE) as bw_image:
                # Scale to the target width, keeping this page's own aspect
                # ratio.  resize() returns a new image, so keep the result.
                width, height = bw_image.size
                new_height = max(1, int(height * TARGET_WIDTH / float(width)))
                resized = bw_image.resize(
                    (TARGET_WIDTH, new_height), Image.Resampling.LANCZOS
                )
            resized.save(TEMP_PDF, "PDF", resolution=float(RENDER_DPI))

            part = fitz.open(TEMP_PDF)
            try:
                _call(merged, "insert_pdf", "insertPDF", part)
            finally:
                part.close()

        if len(merged) == 0:
            print("\n Error! No page could be converted, nothing written \n")
            sys.exit(1)
        merged.save(MERGED_PDF)
    finally:
        merged.close()
        for scratch in (TEMP_PDF, BW_IMAGE):
            if os.path.exists(scratch):
                os.remove(scratch)


def _cleanup(png_paths):
    """Delete the PNGs created by this run and the scratch folder if empty."""
    for png_path in png_paths:
        if os.path.exists(png_path):
            os.remove(png_path)
    # Only remove the folder when nothing else lives in it; files that this
    # run did not create are left untouched.
    if os.path.isdir(OUTPUT_DIR) and not os.listdir(OUTPUT_DIR):
        os.rmdir(OUTPUT_DIR)


def main(argv):
    """Run the conversion described in the module docstring for *argv*."""
    pdf_name, threshold, first, last = _parse_args(argv)

    # Only the base name without extension is used, as a prefix for the
    # intermediate PNG files.
    stem = os.path.splitext(os.path.basename(pdf_name))[0]

    png_paths = []
    try:
        doc = fitz.open(pdf_name)
        try:
            indices = _page_indices(first, last, len(doc))
            if not indices:
                print("\n Error! No pages selected: the file has {} page(s) \n"
                      .format(len(doc)))
                sys.exit(1)

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            targets = [
                (index, os.path.join(OUTPUT_DIR, "{}{:04}.png".format(stem, index)))
                for index in indices
            ]
            # Remember the paths before rendering so that a failure half-way
            # through still cleans up the pages already written.
            png_paths = [png_path for _, png_path in targets]
            _render_pages(doc, targets)
        finally:
            doc.close()

        _merge_as_bw(png_paths, threshold)
    finally:
        _cleanup(png_paths)


if __name__ == "__main__":
    main(sys.argv)