Batch Check HTML Size with Python and Perl

Xah Lee, 2005-10

The following script takes a directory and prints the size of every HTML file in it, counting the sizes of each file's inline images.

This script is useful for making sure that HTML files stay under a certain size. That matters because web visitors with slow connections may wait a long time for HTML files with lots of inline images to load.

# -*- coding: utf-8 -*-
# Python


# Wed Oct  5 15:50:31 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org

import re, os.path, sys

# Root directory whose HTML files are measured.
inpath = '/Users/t/web/'

# Get rid of trailing slashes, but always keep at least one character so that
# a path consisting only of slashes (the filesystem root) does not become
# empty and trigger an IndexError.
while len(inpath) > 1 and inpath.endswith('/'):
    inpath = inpath[:-1]

# Stop right away (exit status 1) when the starting directory is missing.
# print(...) with a single argument behaves the same on Python 2 and 3.
if not os.path.exists(inpath):
    print("dir " + inpath + " doesn't exist!")
    sys.exit(1)

##################################################
# subroutines


def getInlineImg(file_full_path):
    '''getInlineImg(file_full_path) returns a list of the sources of inline images.

    The file is read as UTF-8 and every double-quoted src="..." attribute
    value is collected in order of appearance. For example, it may return
    ['xx.jpg','../image.png']. Empty values (src="") are skipped.

    Raises IOError/OSError when the file cannot be read and
    UnicodeDecodeError when its content is not valid UTF-8.
    '''
    # Read bytes and decode explicitly: this works on both Python 2 and 3,
    # and the with-block closes the file even if reading or decoding fails.
    with open(file_full_path, 'rb') as html_file:
        text = html_file.read().decode('utf-8')
    # Match the attribute itself. Splitting on the bare text "src" and then
    # searching the remainder would also pick up unrelated attributes that
    # merely follow the letters "src" (e.g. the href after class="src-x").
    return re.findall(r'src\s*=\s*"([^"]+)"', text)


def linkFullPath(dir,locallink):
    '''linkFullPath(dir, locallink) returns a string that is the full path to the local link.

    dir is the directory containing the HTML file and locallink is the link
    as written in the HTML, relative to that directory. For example,
    linkFullPath('/Users/t/public_html/a/b', '../image/t.png') returns
    '/Users/t/public_html/a/image/t.png'. The returned result will not
    contain double slashes, './' or '../' segments. Names that merely start
    with dots (such as '..thumb.png') are left untouched.
    '''
    # Collapse slash runs first: normpath alone preserves a leading '//'.
    joined = re.sub(r'/{2,}', '/', dir + '/' + locallink)
    # normpath resolves '.' and '..' per path segment, unlike a plain regex
    # on '/name/..', which would also eat the prefix of '/name/..thumb.png'.
    return os.path.normpath(joined)

def listInlineImg(htmlfile):
    '''listInlineImg(html_file_full_path) returns a list of full paths, one for each inline image source found in the html file. Each source is resolved against the directory that contains the html file.'''
    base_dir = os.path.dirname(htmlfile)
    return [linkFullPath(base_dir, src) for src in getInlineImg(htmlfile)]


##################################################
# main

# Accumulates one [totalSize, htmlPath] entry per HTML file visited.
fileSizeList = []


def checkLink(dummy, dirPath, fileList):
    '''checkLink(dummy, dirPath, fileList) records the total size of each html file in one directory.

    dummy is ignored (it exists for the directory-walk callback signature).
    dirPath is the directory being visited and fileList the names inside it.
    For every regular file whose extension is exactly '.html', the file's own
    size plus the sizes of its inline images is appended to fileSizeList as
    [totalSize, path]. Returns None.
    '''
    for fileName in fileList:
        htmlPath = os.path.join(dirPath, fileName)
        if os.path.splitext(fileName)[1] != '.html' or not os.path.isfile(htmlPath):
            continue
        totalSize = os.path.getsize(htmlPath)
        for imgPath in listInlineImg(htmlPath):
            try:
                totalSize += os.path.getsize(imgPath)
            except OSError:
                # The source does not resolve to a readable local file
                # (missing image, remote URL, ...). Count it as 0 bytes, as
                # the Perl version does, instead of aborting the whole scan.
                continue
        fileSizeList.append([totalSize, htmlPath])


# os.walk visits directories in the same top-down order as the legacy
# os.path.walk, but is also available on Python 3 (where os.path.walk was
# removed). Only file names are handed over; checkLink ignores directories.
for dirPath, dirNames, fileNames in os.walk(inpath):
    checkLink('dummy', dirPath, fileNames)

# Largest pages first.
fileSizeList.sort(key=lambda entry: entry[0], reverse=True)

# print(...) with a single argument prints the same text on Python 2 and 3.
for entry in fileSizeList:
    print(entry)
print("done reporting.")

The following is a Perl version. The Python version above is a direct translation of this Perl version.

# perl


# Tue Oct  4 14:36:48 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org

use Data::Dumper;
use File::Find;
use File::Basename;

# Root directory whose HTML files are measured.
$inpath = '/Users/t/web/';

# Peel off trailing slashes one at a time; the (.+) group guarantees that at
# least one character always remains.
while ($inpath =~ m{^(.+)/$}) {
    $inpath = $1;
}

# Abort early when the starting directory is missing.
unless (-e $inpath) {
    die "dir $inpath doesn't exist! $!";
}

##################################################
# subroutines


# getInlineImg($file_full_path) returns an array of the sources of inline
# images: the value of every double-quoted src="..." attribute in the file,
# in order of appearance. For example, it may return ('xx.jpg','../image.png').
# The file is scanned line by line, so an attribute split across lines is not
# seen. Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_path) = @_;
    my @linx;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and no global handle is shared.
    open(my $fh, '<', $full_file_path)
        or die "error: can not open $full_file_path $!";

    # Reading into a lexical keeps the caller's $_ intact.
    while (my $line = <$fh>) {
        # Match the attribute itself. Splitting on the bare text "src" and
        # then matching anywhere in the remainder would also capture
        # unrelated attributes that merely follow the letters "src".
        push @linx, $line =~ m/src\s*=\s*"([^"]+)"/g;
    }
    close $fh;
    return @linx;
}


# linkFullPath($dir,$locallink) returns a string that is the full path to the
# local link. For example, linkFullPath('/Users/t/public_html/a/b',
# '../image/t.png') returns '/Users/t/public_html/a/image/t.png'. $dir may be
# given with or without a trailing slash. The returned result will not contain
# double slashes or resolvable 'name/..' pairs; names that merely start with
# two dots (such as '..thumb.png') are left untouched.
sub linkFullPath($$){
    my ($dir, $locallink) = @_;

    # Always put a separator between the two parts; any doubled slash this
    # creates is collapsed on the next line.
    my $result = $dir . '/' . $locallink;
    $result =~ s{/+}{/}g;

    # Remove one "/name/.." pair per pass. The lookahead makes sure ".." is a
    # whole path segment, not the prefix of a longer name.
    1 while $result =~ s{/[^/]+/\.\.(?=/|\z)}{};
    return $result;
}


# listInlineImg($html_file_full_path) returns an array of full paths, one for
# each inline image source found in the html file. Every source is resolved
# against the directory part of the html file's path.
sub listInlineImg($) {
  my ($htmlfile) = @_;

  # fileparse returns (name, directory, suffix); only the directory is used.
  my (undef, $dir) = fileparse($htmlfile, ('\.html'));

  my @result = map { linkFullPath($dir, $_) } getInlineImg($htmlfile);
  return @result;
}

##################################################
# main
# checkLink is the callback handed to File::Find's find(). For every visited
# entry whose name ends in ".html" and which passes the -T (text file) test,
# it pushes [total size, path] onto @fileSizeList, where total size is the
# file's own size plus the sizes of its inline images.
sub checkLink {
    my $path = $File::Find::name;
    return unless $path =~ m@\.html$@ && -T $path;

    # -s yields no number for an empty or missing file; count those as 0.
    # Lexical variables keep the running total out of the package namespace.
    my $totalSize = (-s $path) || 0;
    for my $imgPath (listInlineImg($path)) {
        $totalSize += (-s $imgPath) || 0;
    }
    push @fileSizeList, [$totalSize, $path];
}

# Walk the tree rooted at $inpath, calling checkLink for every entry found.
find({ wanted => \&checkLink }, $inpath);

# Order the collected [size, path] pairs by size, largest first.
@fileSizeList = sort { $b->[0] <=> $a->[0] } @fileSizeList;

# Dump the report, followed by the closing message.
print Dumper(\@fileSizeList), "done reporting.";

Note that some web browsers and web development tools can calculate the size of a web page. However, the advantage of the above scripts is that they can be applied to many files in batch.


Related essays:


Page created: 2005-10.
© 2005 by Xah Lee.
Xah Signet