"""Audit the HREF targets of every HTML page below a site folder.

The script walks the folder named by ``root`` (looked up inside the current
working directory), parses each page whose extension matches ``extn`` with
BeautifulSoup and checks every ``<a href=...>`` that points at a local file.

* Targets that exist are listed in the file named by ``f_imgs``.
* Targets that do not exist (file missing, wrong extension, or - on a
  case-sensitive file system - wrong letter case) are listed in the file
  named by ``f_miss``, followed by summary counts.

Links are resolved relative to the folder of the page that contains them, so
pages in sub-folders and links climbing through ``..`` are checked against
the location a browser would request.

Not checked:

* links carrying a scheme or host (``http:``, ``https:``, ``mailto:``,
  ``tel:``, ``//host/...``) - they do not name a local file;
* links that consist only of a fragment or query (``href="#"``,
  ``href="#top"``) and empty links.

Folder structure and recommended location of this Python script::

    root
     |
     |--htmlFileLinks.py  ---- This script
     |--index.html
     |--Home/             ---- Root folder defined below
         |--Images/
         |--Links/
         |--Policies/
         |--FAQ.html
         *--ContactUs.html
"""
import os
import re
import sys
from urllib.parse import unquote, urlsplit

try:
    from bs4 import BeautifulSoup as bsp
except ImportError:
    # Reported with a readable message in main() instead of a bare traceback.
    bsp = None

extn = 'html'   # Extension of the pages to parse; use e.g. 'php' or 'aspx'
root = 'Home'   # Folder to scan, relative to the current working directory

# File size threshold in [kB] for reporting large HREF targets.
# Not yet implemented in this code.
fs = 500

# Report listing the HREF targets that were found on disk
f_imgs = 'hrefLinkTags.txt'
# Report listing the HREF targets that are missing, plus the summary counts
f_miss = 'hrefMissingFiles.txt'

# --------------No further input is needed from the user-----------------------


def convert_bytes(num):
    """Return *num* bytes as a readable string in bytes, KB, MB, GB or TB.

    Values of 1024 TB and above are still expressed in TB; the original
    implementation fell off the end of its loop and returned None for them.
    """
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num = num / 1024.0
    return "%3.1f %s" % (num, 'TB')


def link_target(href):
    """Return the local file path named by *href*, or None if there is none.

    None is returned for a missing or empty href, for links with a scheme or
    host, and for links made only of a fragment and/or query. Otherwise the
    fragment and query are dropped and percent-escapes are decoded, so
    ``"My%20File.html#top"`` yields ``"My File.html"``.
    """
    if not href:
        return None
    try:
        parts = urlsplit(href.strip())
    except ValueError:
        # urlsplit rejects some malformed hosts; nothing local to verify.
        return None
    if parts.scheme or parts.netloc:
        return None
    return unquote(parts.path) or None


def resolve_link(page_dir, site_dir, target):
    """Return the normalised file-system path that *target* refers to.

    A target starting with '/' is resolved against *site_dir*; any other
    target is resolved against *page_dir*, the folder of the linking page.
    """
    if target.startswith('/'):
        # NOTE(review): assumes the working directory is the web root, as in
        # the folder layout shown in the module docstring - confirm.
        return os.path.normpath(os.path.join(site_dir, target.lstrip('/')))
    return os.path.normpath(os.path.join(page_dir, target))


def scan_site(start_dir, site_dir, found_log, missing_log):
    """Check the links of every matching page below *start_dir*.

    Existing targets are written to *found_log* and missing ones to
    *missing_log* (both open text files); the same lines are echoed to the
    console. Paths are reported relative to *site_dir*.

    Returns a tuple ``(missing, parent_refs)``: the number of targets not
    found on disk and the number of checked links that start with '..'.
    """
    missing = 0
    parent_refs = 0
    wanted_ext = '.' + extn.lower()

    for path, _subdirs, files in sorted(os.walk(start_dir)):
        for name in sorted(files):
            if os.path.splitext(name)[1].lower() != wanted_ext:
                continue

            page = os.path.join(path, name)
            print("\nNow processing page: ", page)
            print("---- ----- ------- ----- ------ ------ ------ -----")
            heading = "\nNow processing page: " + page + "\n"
            found_log.write(heading)
            missing_log.write(heading)

            # Binary mode: BeautifulSoup detects the page encoding itself,
            # so the platform default encoding cannot break the read.
            with open(page, 'rb') as fp:
                soup = bsp(fp, "html.parser")

            # href=True skips anchors that have no href attribute at all.
            for anchor in soup.find_all('a', href=True):
                target = link_target(anchor['href'])
                if target is None:
                    continue
                if target.startswith('..'):
                    parent_refs += 1

                resolved = resolve_link(path, site_dir, target)
                shown = os.path.relpath(resolved, site_dir)
                if os.path.exists(resolved):
                    print(shown)
                    found_log.write(shown + '\n')
                else:
                    # File may be missing, have the wrong extension (htm
                    # instead of html) or differ in letter case.
                    note = ('-=-=- ' + shown
                            + " -File as per tag HREF does not exist")
                    print(note)
                    missing_log.write(note + '\n')
                    missing += 1

    return missing, parent_refs


def main():
    """Scan the configured folder and write both report files."""
    if bsp is None:
        sys.exit("BeautifulSoup is required: install the 'beautifulsoup4' "
                 "package.")

    site_dir = os.getcwd()
    start_dir = os.path.join(site_dir, root)
    if not os.path.isdir(start_dir):
        sys.exit("Folder to scan not found: " + start_dir)

    # The with-block guarantees both reports are flushed and closed; the
    # original referenced f.close without calling it.
    with open(f_imgs, "w", encoding="utf-8") as found_log, \
            open(f_miss, "w", encoding="utf-8") as missing_log:
        missing, parent_refs = scan_site(start_dir, site_dir,
                                         found_log, missing_log)

        stg = " = Total number of links which were either missing in destination"
        stg = stg + "\n folder or file name / extension mismatch instances found.\n"
        print("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
        print(missing, stg)
        # Always recorded: the original wrote this line only when the count
        # was zero, hiding the total exactly when it mattered.
        missing_log.write(str(missing) + stg)

        print("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
        stg = "\n" + str(parent_refs) + " = Total number of tags referring to .. folders. \n"
        missing_log.write(stg)
        print(stg)


if __name__ == "__main__":
    main()