#------------------------------------------------------------------------------
#Converts every connected component larger than a threshold size (in pixels)
#to white. It uses CCL (Connected Component Labeling) to find those components.
#------------------------------------------------------------------------------
import sys, os, glob
import numpy as np
import cv2

black = 0
white = 255
extn = ".jpg"
threshold = 127

#Largest size (number of pixels) a connected component may have and still be
#kept; anything bigger is painted white. Note that text may be connected, so a
#small value will wipe out text from the image. This value needs to be worked
#out by trial-and-error for each type of image. If there is any shadow behind
#the text, the likelihood of text getting erased is high.
#CCL cannot be used to remove salt-and-pepper noise from text, because dots,
#commas, (semi)colons... would be removed along with the noise.
arsz = 5000


def output_stem(path):
    """Return *path* with its file extension removed.

    str.strip(".jpg") must not be used for this: it strips any of the
    characters '.', 'j', 'p', 'g' from BOTH ends of the name, so "page.jpg"
    would become "age".
    """
    return os.path.splitext(path)[0]


def binarize(gray, level=threshold):
    """Return a uint8 copy of *gray* holding only `black` and `white`.

    Pixels strictly above *level* become `white`, every other pixel
    (including those exactly equal to *level*) becomes `black`. The input
    array is not modified.
    """
    gray = np.asarray(gray)
    return np.where(gray > level, white, black).astype(np.uint8)


def whiten_large_components(img_bw, labels, areas, max_area=arsz):
    """Paint every component whose area exceeds *max_area* white.

    img_bw   : binarized image (2-D, values 0/255); it is not modified.
    labels   : matrix of the same shape, each element holding the label of
               the component that pixel belongs to (0 = background).
    areas    : 1-D sequence, areas[i] = number of pixels carrying label i.
    max_area : largest area a component may have and still be kept.

    Returns (cleaned_image, number_of_components_whitened).
    """
    too_big = np.asarray(areas) > max_area
    #Label 0 is the background and is never a removable component. Slice
    #assignment keeps this safe even for an empty 'areas'.
    too_big[:1] = False
    cleaned = np.array(img_bw, dtype=np.uint8)
    #too_big[labels] looks up each pixel's label in one pass, instead of
    #building a separate full-size mask for every component.
    cleaned[too_big[np.asarray(labels)]] = white
    return cleaned, int(np.count_nonzero(too_big))


counter = 1
for f in sorted(glob.glob("*" + extn)):
    stem = output_stem(f)
    outFile0 = stem + "-GR.png"
    outFile1 = stem + "-BW.png"
    outFile2 = stem + "-WB.png"
    outFile3 = stem + "-CL.png"

    #Step-1: Open input image in grayscale mode and get its pixels
    imgGray = cv2.imread(f, 0)
    if imgGray is None:
        #cv2.imread signals an unreadable/corrupt file by returning None
        print("Image ", f"{counter:03d}", " could not be read: ", f,
              file=sys.stderr)
        counter = counter + 1
        continue
    #cv2.imwrite(outFile0, imgGray)

    #Step-2: Change pixels above threshold to white (255), the rest to black.
    #This is a fixed-level threshold, i.e. the same result as
    #ret, imgBW = cv2.threshold(imgGray, threshold, 255, cv2.THRESH_BINARY)
    #(it is NOT Otsu, which would choose the level automatically).
    imgBW = binarize(imgGray)
    #cv2.imwrite(outFile1, imgBW)

    #Invert black (foreground) and white (background) of image, because CCL
    #treats non-zero pixels as foreground
    imgWB = cv2.bitwise_not(imgBW)
    #cv2.imwrite(outFile2, imgWB)

    #Apply the component analysis function, CV_32S is output image label type
    #4 or 8 is connectivity type
    analysis = cv2.connectedComponentsWithStats(imgWB, 8, cv2.CV_32S)
    #nLabels is total number of labels where 0 represents the background label
    #A label is assigned to each pixel based on its location and neighbours.
    #If a pixel is black (value = 0), it is skipped as default label is '0'.
    #Thus, each connected region shall be labeled 1, 2, 3... nLabels-1.
    #labels is matrix of size of input image, each element has value=its label
    #values holds one row of statistics per label; column CC_STAT_AREA is the
    #number of pixels in that component.
    (nLabels, labels, values, centroid) = analysis

    #Note: the count printed here includes the background label 0
    print("Image ", f"{counter:03d}", " has ", nLabels, " components")

    #Step-3: Erase (paint white) every component larger than arsz pixels
    imgBW, j = whiten_large_components(
        imgBW, labels, values[:, cv2.CC_STAT_AREA]
    )

    #Write the result exactly once per image, so every input gets a -CL.png
    #even when no component exceeded the area threshold.
    if not cv2.imwrite(outFile3, imgBW):
        print("Image ", f"{counter:03d}", " could not be written: ", outFile3,
              file=sys.stderr)
    print("Image ",f"{counter:03d}"," has ",j," components > area threshold")
    print("Image ",f"{counter:03d}", " processed")
    counter = counter + 1