#---------This Python code uses BeautifulSoup to extract images and other links # in a .html file along with their file size. The idea is to find out the files # contributing maximum to the size of web page or missing image files. This has # been tested in Linux and some update might be required for Windows. # The code does not loop into '..' or upper folders relative to the root folder # specified by user. 'Root' folder here is folder name inside the folder where # this Python code is stored. #------------------------------------------------------------------------------ import sys, os from bs4 import BeautifulSoup as bsp extn = 'html' #Not needed though specified to check .php and .aspx files root = 'Home' #Relative to the folder where this code is stored # Specify file size in [kB] to check how many image files exceeds this theshold fs = 500 # File which shall be used to write relative path of images and the file sizes f_imgs = 'imageTags.txt' # File which shall be used to write relative path of images and missing images f_miss = 'imageMissing.txt' # --------------No further input is needed from the user----------------------- f_i = open(f_imgs, "w") f_m = open(f_miss, "w") #Convert file size into easy to read format: bytes to kB, MB, GB, TB def convert_bytes(num): for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: if num < 1024.0: return "%3.1f %s" % (num, x) num = num / 1024.0 folder = os.getcwd() + '/' + root #Change '/' to '\\' for Windows OS i = 0 #Counter for missing or mis-spelt or wrong extension... j = 0 #Counter for files greater than 500 kB in size p = 0 #Counter for image tag referring to upper (..) folder for path, subdirs, files in sorted(os.walk(folder)): for name in files: # Get the absolute path of file s = os.path.join(path, name) # Get the extension with dot . ext1 = os.path.splitext(s)[-1].lower() # Get extension of the file without dot . ext2 = ext1.partition('.')[2] #fn = os.path.join(os.getcwd(),name) #name, extension = os.path.splitext(s) if ext2.lower() == extn: print("\nNow processing page: ", s) stg = "---- ----- ------- ----- ------ ------ ------ -----" print(stg) stg = "\nNow processing page: " + s + "\n" f_i.write(stg) with open(s) as fp: soup = bsp(fp, "html.parser") images = soup.find_all("img") imgUrls = [img['src'] for img in images] for url in imgUrls: #Check if src doesn't point to upper level folder if url[:2] != '..': fx = root + '/' + url if os.path.exists(fx): #Print relative path of image and size b = os.path.getsize(fx) b = convert_bytes(b) k = float(b.partition(' ')[0]) if k > 500: j = j + 1 print(url.strip(), ',', str(b)) stg = url.strip()+ ',' + str(b) + '\n' f_i.write(stg) else: #Print missing file in the folder either: #file may be missing or wrong Extension such #as jpeg instead of JPG or mismatch of cases stg = "--File as per IMG tag SRC in" stg = stg + " HTML does not exist" print('-=-=-' + url + stg) stg = '-=-=-' + url + stg f_m.write(stg) i = i + 1 else: p = p + 1 stg = " = Total number of images which were either missing in destination" stg = stg + "\n folder or file name / extension mismatch instances found.\n" print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---") print(i, stg) stg = str(i) + stg if i < 1: f_m.write(stg) print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---") stg = "\n"+str(p) + " = Total number of image tags referring to .. folders. \n" f_m.write(stg) print(stg) stg = " = Total number of images having size greater than" print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---") print(j, stg, fs, ' [kB]') f_i.close f_m.close