#---------This Python code uses BeautifulSoup to extract links from the HREF
# attributes in a .html file. The idea is to find the missing links, i.e. the
# files an HREF attribute refers to but which do not exist, or the files that
# contribute the most to the size of the web page.
#
# The code also loops into the '..' or (parent) upper folders relative to root
# folder specified by user. 'Root' folder here is folder name inside the folder
# where this Python code is stored.
#
# This code does not print empty links such as HREF="#"; it skips them, but
# they can be recorded by simply adding an IF block. There is already a check
# being performed using REGEX to search for links containing a # value.
#
# This code does not check whether an HREF attribute links to an email
# address or a phone number.
#------------------------------------------------------------------------------
import sys, os, re
from bs4 import BeautifulSoup as bsp
# ------------------------------ User settings --------------------------------
# Extension (without the dot) of the pages to scan; change it to check .php
# or .aspx files instead.
extn = 'html'
# Folder to scan, relative to the folder where this code is stored.
root = 'Home'
# File size threshold in [kB] for counting HREF files that exceed it.
# Not yet implemented in this code.
fs = 500
# Report that receives the path of every HREF target found on disk.
f_imgs = 'hrefLinkTags.txt'
# Report that receives every HREF target that could not be found on disk.
f_miss = 'hrefMissingFiles.txt'
# --------------No further input is needed from the user-----------------------
# Both reports are truncated and opened for writing before the scan starts.
f_i, f_m = open(f_imgs, "w"), open(f_miss, "w")
#Convert file size into easy to read format: bytes to kB, MB, GB, TB
def convert_bytes(num):
    """Return *num* bytes as a human readable string such as '1.5 KB'.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached. Values of 1024 TB or more are reported in TB
    instead of falling off the end of the loop and returning None.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num = num / 1024.0
    # Largest known unit: report whatever is left, however big it is.
    return "%3.1f %s" % (num, units[-1])
# Walk every page below the root folder, check each <a href="..."> target
# against the file system and write the found / missing targets to the two
# report files opened above (f_i and f_m).

# Pattern for links containing '#', e.g. HREF="#" or "page.html#top".
# Compiled once here instead of once per anchor tag.
hash_re = re.compile('[#]')
# Absolute path of the tree to scan, joined with the platform's separator.
folder = os.path.join(os.getcwd(), root)
i = 0 #Counter for missing or misspelt or wrong extension...
j = 0 #Counter for files greater than 500 kB in size (not implemented yet)
p = 0 #Counter for anchor tags referring to upper (..) folder


def _log_target(fx, report_name):
    """Record one HREF target; return True when the file does not exist.

    fx is the path that is tested and printed. report_name is the path text
    written to the missing-files report when fx does not exist.
    """
    if os.path.exists(fx):
        #Print relative path of href
        print(fx)
        f_i.write(fx + '\n')
        return False
    #File may be missing, have the wrong extension (such as htm instead of
    #html) or differ in upper / lower case.
    stg = " -File as per tag HREF" + " does not exist"
    print('-=-=- ' + fx + stg)
    f_m.write('-=-=- ' + report_name + stg + '\n')
    return True


# The report files are used as context managers so that both are flushed and
# closed when the scan ends, even if it stops with an error.
with f_i, f_m:
    for path, subdirs, files in sorted(os.walk(folder)):
        for name in files:
            # Get the absolute path of file
            s = os.path.join(path, name)
            # Lower-case extension of the file without the dot
            ext = os.path.splitext(s)[-1].lower().partition('.')[2]
            if ext != extn:
                continue
            print("\nNow processing page: ", s)
            print("---- ----- ------- ----- ------ ------ ------ -----")
            stg = "\nNow processing page: " + s + "\n"
            f_i.write(stg)
            f_m.write(stg)
            # Binary mode: BeautifulSoup detects the page encoding itself, so
            # a page that is not in the locale's default encoding does not
            # abort the scan with a UnicodeDecodeError.
            with open(s, "rb") as fp:
                soup = bsp(fp, "html.parser")
            for link in soup.find_all('a'):
                href = link.get('href')
                if href is None:
                    # Anchor without HREF (e.g. <a name="top">): nothing to
                    # check, and it must not be counted as a missing file.
                    continue
                lnk = str(href)
                if lnk[0:2] != '..':
                    # Skip external (http...) links and links containing '#'.
                    if lnk.startswith('http') or hash_re.search(lnk) is not None:
                        continue
                    # NOTE(review): the target is resolved against the root
                    # folder, not against the folder of the page being read;
                    # confirm this is intended for pages in sub-folders.
                    fx = root + '/' + lnk
                    if _log_target(fx, fx):
                        i = i + 1
                else:
                    # Link into the parent folder: drop the leading '../' and
                    # resolve the rest against the folder holding this script.
                    fx = os.getcwd() + '/' + lnk[3:]
                    if _log_target(fx, lnk[3:]):
                        i = i + 1
                    p = p + 1
    stg = " = Total number of links which were either missing in destination"
    stg = stg + "\n folder or file name / extension mismatch instances found.\n"
    print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
    print(i, stg)
    # The total is written for every run; it used to reach the report only
    # when no missing link was found (i < 1).
    f_m.write(str(i) + stg)
    print ("\n----= ----= ----= ----= ----= ----= ----= ----= ---")
    stg = "\n"+str(p) + " = Total number of tags referring to .. folders. \n"
    f_m.write(stg)
    print(stg)
'''
Folder structure and recommended location of this Python script
root
|
|--htmlFileLinks.py ---- This script
|--index.html
|--Home/ ---- Root folder defined above
|--Images/
|--Links/
|--Policies/
|--FAQ.html
*--ConstactUs.html
'''