#!/usr/bin/perl # htmldebloater -- debloat HTML so it can be comfotably read even a # turn of the century black and white PDA # Copyright : http://www.fsf.org/copyleft/gpl.html # Created On : December 2004 # Last Modified On: Sat Feb 6 02:47:12 2010 # Update Count : 23 # Inspiration : Dan Jacobson -- http://jidanni.org/comp # Actual brains : Sam Watkins use warnings; use strict; use HTML::Entities; # Only tags in this list will get through. my $ok_tags = set( qw( html head body title base meta p br hr a img table th tr td b i u em strong center blockquote ul ol li dl dt dd h1 h2 h3 h4 h5 h6 pre div ) ); # Attributes in this list will get through no matter what tag they are in. # - I can't think of any yet. my $ok_attr = set( qw( ) ); # a list of allowed attributes for each tag my $ok_tag_attr = { td => set(qw(colspan rowspan align valign)), a => set(qw(href name)), img => set(qw(src width height alt title)), base => set(qw(href)), meta => set(qw(http-equiv content)), }; # a list of tags where we want to hide the content between and my $kill_containers = set( qw( script style select textarea ) ); # Tags in this list will be filtered based on a predicate over their attributes. # The "predicate" could also change the attribute names or values, delete attributes, etc. my $tag_filter = { meta => sub { my $attr = $_[0]; $attr->{'http-equiv'} && $attr->{content} } }; use strict; use warnings; use HTML::Parser; my $in_dead_container = 0; my $parser = HTML::Parser->new( text_h => [ \&pass_through, "text" ], declaration_h => [ \&pass_through, "text" ], start_h => [ \&start_tag, "self, tagname, attrseq, attr" ], end_h => [ \&end_tag, "self, tagname, text" ] # no comments will get through # default_h => [ sub { print shift }, 'text' ], ); $parser->parse_file(*STDIN); sub pass_through { print shift unless $in_dead_container; } sub start_tag { my ( $self, $tag, $attrseq, $attr ) = @_; if ( $ok_tags->{$tag} && !$in_dead_container ) { $attrseq = [ grep { $ok_attr->{$_} || $ok_tag_attr->{$tag}{$_} } @$attrseq ]; my $keep_tag = 1; if (my $pred = $tag_filter->{$tag}) { $keep_tag = $pred->($attr); } if ($keep_tag) { print format_start_tag( $tag, $attrseq, $attr ); } } if ( $kill_containers->{$tag} ) { ++$in_dead_container; } } sub end_tag { my ( $self, $tag, $text ) = @_; print $text if $ok_tags->{$tag} && !$in_dead_container; if ( $kill_containers->{$tag} ) { --$in_dead_container; } } sub set { return { map { $_, 1 } @_ }; } sub format_start_tag { my ( $tag, $attrseq, $attr ) = @_; my $out = "<$tag"; for (@$attrseq) { $out .= " $_"; if ( defined($attr->{$_}) ) { $out .= '="' . encode_entities( $attr->{$_} ) . '"'; } } $out .= '>'; return $out; }