#!/usr/bin/perl -w
#by Torben Menke
#http://tokes.to.funpic.de
# cleans html-pages by removing lots of stuff, leaving a very simple page


use warnings;
use strict;
use Data::Dumper;

my $content;
my $fileold = $ARGV[0];
my $filenew = $fileold;
my ($sizeold, $sizenew);
$filenew =~ s/(\.[^\.]+)$/-clean$1/;

print "$fileold -> $filenew\n";

open (FILE, "< $fileold") or die $!;
$content = join '',<FILE>;
close FILE;

$sizeold = length ($content);

$content = htmlClean($content);
$sizenew = length ($content);

open (FILE, "> $filenew") or die $!;
print FILE $content;
close FILE;

print "Oldsize: $sizeold Bytes\nNewsize: $sizenew Bytes = ".(int(100*$sizenew/$sizeold))."%\n";

#------------SUBS-----------
#
sub htmlClean {

my $s = shift;
# no Comments
$s =~ s/<\!--.+?-->//gis;
# remove multiple spaces
$s =~ s/<[ \t]+/ /gs;

# 'TAG >' -> 'TAG>'
$s =~ s/\s+>/>/gis;
# remove these tags and all between
foreach my $tag qw(applet embed noembed object script) {
$s =~ s#<$tag\b[^>]*>.*?</$tag>##gis;
}
# not this tags
foreach my $tag qw(big div font link span style col dt dl) {
$s =~ s#</?$tag\b[^>]*>##gis;
}
# no options for this tags
foreach my $tag qw(html body p br hr h1 h2 h3 h4 h5 h6) {
$s =~ s/<($tag)\b[^>]+>/<$1>/gis;
}
# remove these options from all tags
foreach my $opt qw (bgcolor class onclick onmouseover onmouseout style x:str x:num height width) { # height width
#<tr style='height:12.75pt'>
$s =~ s#(<[^>]+)\s+$opt\s*=\s*('|").*?\2\s*([^>]*>)#$1 $3#gis;
#<td class=xl24>
$s =~ s#(<[^>]+)\s+$opt\s*=\s*[^\s>]+\s*([^>]*>)#$1 $2#gis;
#<td x:num>
$s =~ s#(<[^>]+)\s+$opt\s*([^>]*>)#$1 $2#gis;
}

# less meta
$s =~ s/<meta\b[^>]+name\b[^>]+>//gis;
# <B> Text </B> -> <B>Text</B>
foreach my $tag qw(b strong i u) {
$s =~ s/\s*(<$tag>)\s+/ $1/gis;
$s =~ s/\s+(<\/$tag>)\s*/$1 /gis;
}
# no <o:p></o:p>
$s =~ s#</?o:[^>]*>##gis;
# no <![...]>
$s =~ s#<!\[[^>]*\]>##gis;
# no whitespaces in or around headings
$s =~ s# (<h\d>)#$1#gis;
$s =~ s#(<h\d>) #$1#gis;
$s =~ s#(<h\d>)\s+#$1#gis;
$s =~ s#\s+(</h\d>)#$1#gis;



# no <br><p> or <br></p>
$s =~ s#<br>\s*(</?p>)#$1#gis;
# no <TAG></TAG>
# <tr></tr> !!!!
#$s =~ s#<([^>]+)>\s*</\1>##gis;
# <td > -> <td>
$s =~ s#\s+>#>#gs;
#
my @l = split "\n",$s;
# no empty lines
@l = grep {!m/^\s*$/} @l;
# remove multiple spaces
@l = map {$_ =~ s/\s\s+/ /g;$_} @l;
# remove spaces at start of line
@l = map {$_ =~ s/^\s+//g;$_} @l;
#
$s = join "\n",@l;
return $s;
}


Hope you found what you where looking for. Feel free to drop me a line
Torben