#------------------------------------------------------------------------------
# Turns every connected group of dark pixels larger than a size threshold
# white. It uses CCL (Connected Component Labeling) to find those regions.
#
# Usage: run from the directory holding the input images. For every "*.jpg"
# file a cleaned "<name>-CL.png" is written next to it.
#------------------------------------------------------------------------------
import glob
import os
import sys

import cv2
import numpy as np

black = 0
white = 255
extn = ".jpg"
threshold = 127

# Size (number of pixels) above which a connected component is erased. Note
# that text may be connected, and a smaller number will wipe out text from the
# image. This value needs to be worked out by trial and error for each type of
# image. If there is any shadow behind the text, the likelihood of the text
# getting erased is high. CCL cannot be used to remove salt-and-pepper noise
# from text, otherwise dots, commas, (semi)colons... would be removed as well.
arsz = 5000


def _output_path(path, tag):
    """Return the output file name for *path*: its stem + *tag* + ".png".

    ``os.path.splitext`` is used on purpose: ``str.strip(".jpg")`` removes any
    of the characters '.', 'j', 'p', 'g' from BOTH ends of the name (so
    "pig.jpg" would become "i") instead of removing the extension.
    """
    stem, _ext = os.path.splitext(path)
    return stem + tag + ".png"


def _binarize(gray, level=threshold):
    """Return a strictly two-valued (black/white) uint8 copy of *gray*.

    Pixels brighter than *level* become white, all others (including those
    exactly equal to *level*) become black. This matches
    ``cv2.threshold(gray, level, 255, cv2.THRESH_BINARY)``.
    """
    return np.where(gray > level, white, black).astype(np.uint8)


def _erase_large_components(img_bw, min_area=arsz):
    """Paint every dark component bigger than *min_area* pixels white.

    Args:
        img_bw: binary uint8 image, dark (0) foreground on white (255).
        min_area: components with strictly more pixels than this are erased.

    Returns:
        (cleaned, n_labels, n_erased) where *cleaned* is a new image,
        *n_labels* is the label count reported by OpenCV (it includes the
        background label 0) and *n_erased* is the number of erased components.
    """
    # connectedComponentsWithStats treats non-zero pixels as foreground, so
    # invert the image to make the dark marks the foreground.
    img_wb = cv2.bitwise_not(img_bw)

    # 8 = connectivity (4 or 8), CV_32S = type of the output label image.
    # 'labels' has the size of the input image; each element holds the label
    # of that pixel. Label 0 is the background, components are 1..n_labels-1.
    n_labels, labels, stats, _centroids = cv2.connectedComponentsWithStats(
        img_wb, 8, cv2.CV_32S
    )

    # Per-label flag: True when the component is above the area threshold.
    is_large = stats[:, cv2.CC_STAT_AREA] > min_area
    is_large[0] = False  # never treat the background label as a component

    # Look the flag up for every pixel through its label and whiten the hits.
    cleaned = img_bw.copy()
    cleaned[is_large[labels]] = white
    return cleaned, n_labels, int(is_large.sum())


def main():
    """Clean every "*.jpg" in the current directory and report progress."""
    for counter, f in enumerate(sorted(glob.glob("*" + extn)), start=1):
        # Step-1: open the input image in grayscale mode.
        img_gray = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        if img_gray is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print("Image ", f"{counter:03d}", " could not be read: ", f,
                  file=sys.stderr)
            continue

        # Step-2: change pixels above/below the threshold to white/black.
        img_bw = _binarize(img_gray)

        # Step-3: label the dark components and erase the oversized ones.
        img_cl, n_labels, n_erased = _erase_large_components(img_bw)
        print("Image ", f"{counter:03d}", " has ", n_labels, " components")

        # Write the result once per image, even when nothing was erased, so
        # that every input has a corresponding output file.
        cv2.imwrite(_output_path(f, "-CL"), img_cl)

        print("Image ", f"{counter:03d}", " has ", n_erased,
              " components > area threshold")
        print("Image ", f"{counter:03d}", " processed")


if __name__ == "__main__":
    main()