#!/usr/bin/perl
# htmldebloater -- debloat HTML so it can be comfotably read even a
# turn of the century black and white PDA
# Copyright : http://www.fsf.org/copyleft/gpl.html
# Created On : December 2004
# Last Modified On: Sat Feb 6 02:47:12 2010
# Update Count : 23
# Inspiration : Dan Jacobson -- http://jidanni.org/comp
# Actual brains : Sam Watkins
use warnings;
use strict;
use HTML::Entities;
# Only tags in this list will get through.
my $ok_tags = set(
qw(
html head body
title base meta
p br hr
a img
table th tr td
b i u em strong
center
blockquote
ul ol li dl dt dd
h1 h2 h3 h4 h5 h6
pre
div
)
);
# Attributes in this list will get through no matter what tag they are in.
# - I can't think of any yet.
my $ok_attr = set(
qw(
)
);
# a list of allowed attributes for each tag
my $ok_tag_attr = {
td => set(qw(colspan rowspan align valign)),
a => set(qw(href name)),
img => set(qw(src width height alt title)),
base => set(qw(href)),
meta => set(qw(http-equiv content)),
};
# a list of tags where we want to hide the content between and
my $kill_containers = set(
qw(
script style
select textarea
)
);
# Tags in this list will be filtered based on a predicate over their attributes.
# The "predicate" could also change the attribute names or values, delete attributes, etc.
my $tag_filter = {
meta => sub { my $attr = $_[0]; $attr->{'http-equiv'} && $attr->{content} }
};
use strict;
use warnings;
use HTML::Parser;
my $in_dead_container = 0;
my $parser = HTML::Parser->new(
text_h => [ \&pass_through, "text" ],
declaration_h => [ \&pass_through, "text" ],
start_h => [ \&start_tag, "self, tagname, attrseq, attr" ],
end_h => [ \&end_tag, "self, tagname, text" ]
# no comments will get through
# default_h => [ sub { print shift }, 'text' ],
);
$parser->parse_file(*STDIN);
sub pass_through {
print shift unless $in_dead_container;
}
sub start_tag {
my ( $self, $tag, $attrseq, $attr ) = @_;
if ( $ok_tags->{$tag} && !$in_dead_container ) {
$attrseq =
[ grep { $ok_attr->{$_} || $ok_tag_attr->{$tag}{$_} } @$attrseq ];
my $keep_tag = 1;
if (my $pred = $tag_filter->{$tag}) {
$keep_tag = $pred->($attr);
}
if ($keep_tag) {
print format_start_tag( $tag, $attrseq, $attr );
}
}
if ( $kill_containers->{$tag} ) {
++$in_dead_container;
}
}
sub end_tag {
my ( $self, $tag, $text ) = @_;
print $text if $ok_tags->{$tag} && !$in_dead_container;
if ( $kill_containers->{$tag} ) {
--$in_dead_container;
}
}
sub set {
return { map { $_, 1 } @_ };
}
sub format_start_tag {
my ( $tag, $attrseq, $attr ) = @_;
my $out = "<$tag";
for (@$attrseq) {
$out .= " $_";
if ( defined($attr->{$_}) ) {
$out .= '="' . encode_entities( $attr->{$_} ) . '"';
}
}
$out .= '>';
return $out;
}