#---------This Python code uses BeautifulSoup to extract images and other links
# in a .html file along with their file size. The idea is to find out the files
# contributing maximum to the size of web page or missing image files. This has
# been tested in Linux and some update might be required for Windows.

# The code does not loop into '..' or upper folders relative to the root folder
# specified by user. 'Root' folder here is folder name inside the folder where
# this Python code is stored.
#------------------------------------------------------------------------------
import sys, os
from bs4 import BeautifulSoup as bsp
   
extn = 'html'    #Not needed though specified to check .php and .aspx files
root = 'Home'    #Relative to the folder where this code is stored

# Specify file size in [kB] to check how many image files exceeds this theshold
fs = 500 

# File which shall be used to write relative path of images and the file sizes
f_imgs = 'imageTags.txt'

# File which shall be used to write relative path of images and missing images
f_miss = 'imageMissing.txt'

# --------------No further input is needed from the user-----------------------
f_i = open(f_imgs, "w")
f_m = open(f_miss, "w")

#Convert file size into easy to read format: bytes to kB, MB, GB, TB
def convert_bytes(num):  
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num = num / 1024.0

folder = os.getcwd() + '/' + root  #Change '/' to '\\' for Windows OS

i = 0   #Counter for missing or mis-spelt or wrong extension...
j = 0   #Counter for files greater than 500 kB in size
p = 0   #Counter for image tag referring to upper (..) folder

for path, subdirs, files in sorted(os.walk(folder)):
	for name in files:
		# Get the absolute path of file
		s = os.path.join(path, name)
		
		# Get the extension with dot .
		ext1 = os.path.splitext(s)[-1].lower()
		
		# Get extension of the file without dot .
		ext2 = ext1.partition('.')[2]
		#fn = os.path.join(os.getcwd(),name)
		#name, extension = os.path.splitext(s)
		
		if ext2.lower() == extn:
			print("\nNow processing page: ", s)
			stg = "---- ----- ------- ----- ------ ------ ------ -----"
			print(stg)
			
			stg = "\nNow processing page: " + s + "\n"
			f_i.write(stg)
			
			with open(s) as fp:
				soup = bsp(fp, "html.parser")
				images = soup.find_all("img")
				imgUrls = [img['src'] for img in images]
				for url in imgUrls:
					#Check if src doesn't point to upper level folder
					if url[:2] != '..':
						fx = root + '/' + url
						
						if os.path.exists(fx):
							#Print relative path of image and size
							b = os.path.getsize(fx)
							b = convert_bytes(b)
							k = float(b.partition(' ')[0])
							if k > 500:
								j = j + 1
							print(url.strip(), ',', str(b))
							stg = url.strip()+ ',' + str(b) + '\n'
							f_i.write(stg)
						else:
						#Print missing file in the folder either: 
						#file may be missing or wrong Extension such
						#as jpeg instead of JPG or mismatch of cases
							stg = "--File as per IMG tag SRC in"
							stg = stg + " HTML does not exist"
							print('-=-=-' + url + stg)
							stg = '-=-=-' + url + stg
							f_m.write(stg)
							i = i + 1
					else:
						p = p + 1
			
stg = " = Total number of images which were either missing in destination"
stg = stg + "\n folder or file name / extension mismatch instances found.\n"
print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
print(i, stg)

stg = str(i) + stg
if i < 1:
	f_m.write(stg)

print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
stg = "\n"+str(p) + " = Total number of image tags referring to .. folders. \n"
f_m.write(stg)
print(stg)

stg = " = Total number of images having size greater than"
print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
print(j, stg, fs, ' [kB]')	

f_i.close
f_m.close