# ... snippet ...

# Tags to ignore: the element itself is removed, its children are kept.
# "html" is special-cased in clean_up_htmltree() (see there).
my @ignore_tags = qw(font big small body dir html);

# Tags to drop together with their content.
# Declared with `our` so it remains a package variable (which the previously
# undeclared assignment made it) while also compiling under `use strict`.
our @ignore_elements = qw(script style head);

##########################################################
# clean_up_htmltree($input)
#
# Parses the HTML text in $input with HTML::TreeBuilder, then
#   * replaces every element named in @ignore_tags by its own content,
#   * removes every element named in @ignore_elements with its content,
# and returns the remaining tree serialised via as_HTML().
#
# If "html" is listed in @ignore_tags, leading and trailing whitespace
# is trimmed from the returned string.
# NOTE(review): the <html> wrapper itself is not stripped by this code;
# the original carried an undecided "remove ...?" remark - confirm what
# callers expect.
##########################################################
sub clean_up_htmltree {
    my $input = shift;
    my $warn  = 0;
    my $html_listed;    # true when "html" appears in @ignore_tags

    use HTML::TreeBuilder;
    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);
    $tree->warn($warn);
    $tree->parse($input);
    # parse() does not finish the document; eof() must be called before
    # the tree is inspected or modified.
    $tree->eof;

    for my $name (@ignore_tags) {
        my $tag = lc $name;    # HTML::Element stores tag names in lower case
        if ($tag eq 'html') {
            # The root element has no parent to splice its content into,
            # so only remember that it was requested.
            $html_listed = 1;
            next;
        }
        while (my $element = $tree->look_down(_tag => $tag)) {
            $element->replace_with_content;
            $element->delete;    # free the now-empty, detached element
        }
    }

    for my $name (@ignore_elements) {
        my $tag = lc $name;
        while (my $element = $tree->look_down(_tag => $tag)) {
            # The root cannot be removed from itself; without this guard
            # look_down() would keep returning it forever.
            last if !$element->parent;
            $element->delete;    # detaches and frees the whole subtree
        }
    }

    # Arguments: entities to encode, indent string, optional end tags.
    my $output = $tree->as_HTML(undef, " ", {});
    $tree->delete;    # release the tree

    if ($html_listed) {
        # Anchor at the real string boundaries; ^ and $ under /m would
        # only look at the first line.
        $output =~ s/\A\s+//;
        $output =~ s/\s+\z//;
    }
    return $output;
}