#!/usr/bin/perl -w
# $Id: clean-up-html-code,v 1.1 2004/06/19 11:50:37 suter Exp $
#
# Copyright (c) 2001 Mark Suter All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Possible usage:
# find /var/www/www.example.com/ -iname \*.html -exec ~/bin/clean-up-html-code {} \;
use strict;
use HTML::Clean;
## Our single file to play with.  Refuse to run without one, rather than
## handing an undefined filename to HTML::Clean.
my $filename = $ARGV[0];
die "usage: $0 FILE\n" unless defined $filename && length $filename;

## Initialise our cleaner (direct method call, not indirect object syntax)
my $h = HTML::Clean->new($filename) or die "$0: can't initialise HTML::Clean: $!\n";

## Select all optimisations
$h->level(9);

## Before we start.  data() hands back a reference to the document text; the
## same reference is read again after strip(), so it is expected to reflect
## the cleaned text.  An empty input may leave the buffer undefined, so treat
## that as zero length instead of warning.
my $d = $h->data();
my $origlen = defined $$d ? length $$d : 0;

# Select what I want, comments from "perldoc HTML::Clean"
$h->strip({
    whitespace    => 1,    # Remove excess whitespace
    shortertags   => 0,    # <strong> -> <b>, etc..
    blink         => 0,    # No blink tags.
    contenttype   => 0,    # Remove default contenttype.
    comments      => 1,    # Remove excess comments.
    entities      => 0,    # &quot; -> ", etc.
    dequote       => 0,    # remove quotes from tag parameters where possible.
    defcolor      => 1,    # recode colors in shorter form. (#ffffff -> white, etc.)
    javascript    => 1,    # remove excess spaces and newlines in javascript code.
    htmldefaults  => 0,    # remove default values for some html tags
    lowercasetags => 1,    # translate all HTML tags to lowercase
});

## Was it worth the effort?  Guard the percentage against an empty original,
## which would otherwise die with "Illegal division by zero".
my $cleaned = defined $$d ? $$d : '';
my $newlen  = length $cleaned;
my $percent = $origlen ? (100 * abs($origlen - $newlen)) / $origlen : 0;
printf "%6d -> %6d %2d%% %s\n", $origlen, $newlen, $percent, $filename;

## Dump the cleaned code (this is a processed copy, so no backup needed).
## Three-argument open with a lexical handle: the filename is never parsed
## for mode characters or stripped of surrounding whitespace.
open my $out, '>', $filename or die "$0: can't overwrite '$filename': $!\n";
print {$out} $cleaned or die "$0: can't print to filehandle: $!\n";
close $out or die "$0: can't close filehandle: $!\n";