HOW TO: Compare the content of two sets of HTML files with Python

Posted by Daniel on September 5th at 9:46pm

The following code will help you compare the content of HTML files that have the same name in two different folders, using Python and BeautifulSoup.

import os
from bs4 import BeautifulSoup
def getfiles(path1, path2):
    """Return the names of the HTML files found in the root of both folders.

    Only the top level of each folder is inspected; sub-folders are ignored.
    A folder that does not exist contributes no files, so the result is
    simply empty instead of the function failing.

    Args:
        path1: First folder to inspect.
        path2: Second folder to inspect.

    Returns:
        Sorted list of file names ending in '.html' that exist in both
        folders.
    """
    # os.walk yields the root folder first, so its first triple is all that
    # is needed. The default triple covers the case where os.walk yields
    # nothing at all (e.g. the folder is missing).
    files1 = next(os.walk(path1), (path1, [], []))[2]
    files2 = next(os.walk(path2), (path2, [], []))[2]

    # Keep only the names present in both folders
    interfiles = set(files1).intersection(files2)

    # Keep only HTML files; sorted so the order is the same on every run
    mylist = sorted(fi for fi in interfiles if fi.endswith('.html'))

    # A single pre-formatted string prints identically on Python 2 and 3
    print('\nI will check: %d files in total... HOLD ON!\n' % len(mylist))
    return mylist


def _last_item(items):
    """Return the final element of *items*, or None when it is empty."""
    last = None
    for last in items:
        pass
    return last


def _load_soup(folder, filename):
    """Parse folder/filename into a BeautifulSoup tree and close the file."""
    # Binary mode lets BeautifulSoup detect the document encoding itself and
    # only needs read permission on the file.
    with open(os.path.join(folder, filename), 'rb') as handle:
        # Naming the parser keeps results independent of which optional
        # parsers happen to be installed.
        return BeautifulSoup(handle.read(), 'html.parser')


def checkcontent(path1, path2):
    """Compare same-named HTML files of two folders and print a report.

    For every HTML file present in both folders (as returned by getfiles)
    two things are compared:

    * Title: the last child of the first <h3> in the path1 file against the
      last child of the first <h1> in the path2 file. A file where either
      heading is missing is reported under 'Files With No Title'.
      NOTE(review): the two folders presumably use different heading levels
      for their titles - confirm this asymmetry is intended.
    * Content: the last element matching '.bodytext .description' in each
      file. A file with no matching element on either side counts as
      equal; a match on only one side counts as different.

    Args:
        path1: Folder holding the first version of the files.
        path2: Folder holding the second version of the files.

    Returns:
        None. The report is written to standard output.
    """
    # Get files from both paths
    mylist = getfiles(path1, path2)

    difcontent = 0
    titles = []
    notitles = []

    print('=' * 50)
    print('Files With Different Content')
    print('=' * 50)

    for filename in mylist:
        soup1 = _load_soup(path1, filename)
        soup2 = _load_soup(path2, filename)

        # Title lives in <h3> on the path1 side and in <h1> on the path2 side
        header1 = soup1.h3
        header2 = soup2.h1

        if header1 is None or header2 is None:
            notitles.append(filename)
        elif not (_last_item(header1) == _last_item(header2)):
            titles.append(filename)

        # Values are computed fresh for every file so that a file without a
        # match can never be compared against a previous file's content.
        body1 = _last_item(soup1.select('.bodytext .description'))
        body2 = _last_item(soup2.select('.bodytext .description'))

        if not (body1 == body2):
            print(filename)
            difcontent += 1

    # Print results
    print('\n')
    print('=' * 50)
    print('Files With No Title')
    print('=' * 50)

    for name in notitles:
        print(name)

    print('\n')
    print('=' * 50)
    print('Files With Different Titles')
    print('=' * 50)

    for name in titles:
        print(name)

    # Pre-formatted strings print identically on Python 2 and 3
    print("\nI've found %d files with different content" % difcontent)
    print("I've found %d different titles" % len(titles))

def main():
    """Run the comparison on the two folders configured below."""
    #Replace both placeholders with the folders to compare
    folders = ("PATH_TO_FOLDER1", "PATH_TO_FOLDER2")
    checkcontent(*folders)


#Run the comparison only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()