use strict; use vars qw($raw @opened %ok_tag %unbal_tag); use HTML::Entities qw(encode_entities); %ok_tag = ( p => {}, br => {}, a => { accesskey => 1, charset => 1, coords => 1, href => 1, hreflang => 1, name => 1, tabindex => 1, target => 1, type => 1, }, font => { color => 1, face => 1, size => 1, }, h1 => {}, h2 => {}, h3 => {}, h4 => {}, h5 => {}, h6 => {}, ); %unbal_tag = map {($_, 1)} 'br', 'p'; # Takes an input string and returns it. It will leave alone # the tags in %ok_tags if they have only allowed attributes, # and will escape everything else. It will also insert # needed closing tags for tags not in %unbal_tag. I don't # have to do that, but I felt like it since regular # expressions cannot ever solve that problem. # # Oh, and this probably comes with bugs. That is what # you get for free though! :-) { my $raw; my @opened; sub scrub_input { $raw = shift; my $scrubbed = ''; @opened = (); # Grab a chunk of known OK data while ($raw =~ /([\s\w]*)/g) { $scrubbed .= $1; my $pos = pos($raw); # Search for a tag if ($raw =~ m=\G<(/?)([\w\d]+)=g) { my $is_close = $1; my $tag = lc($2); if (exists $ok_tag{$tag}) { if ($is_close) { # closing tag? $scrubbed .= _close_tag($tag); } else { $scrubbed .= _open_tag($tag); } } else { # This tag is not allowed pos($raw) = 0; } } # Escape if last /g match failed unless (pos($raw)) { if (length($raw) == $pos) { # EXIT HERE # return join '', $scrubbed, map "", reverse @opened; } else { my $char = substr($raw, $pos, 1); pos($raw) = $pos + 1; $scrubbed .= &encode_entities($char); } } } } sub _close_tag { my $tag = shift; # Check a couple of obvious conditions unless ($raw =~ /\G>/g) { # Oops! return ''; } if (exists $unbal_tag{$tag}) { return ""; # Not needed but...*shrug* } # OK then, time to figure out which need to be closed my @searched; while (@opened) { my $open_tag = pop(@opened); unshift @searched, $open_tag; if ($open_tag eq $tag) { # Close em! return join '', map "", reverse @searched; } } # Closing a tag that was not opened? I don't think so! @opened = @searched; pos($raw) = 0; return ''; } sub _open_tag { my $tag = shift; my $allowed = $ok_tag{$tag}; my $text = "<$tag"; while ($raw =~ /\G(?: # Attribute or close \s+([\w\d]+)=("[^"]*"|[^">\s]+) | \s*> )/gx ) { if ($1) { if ($allowed->{lc($1)}) { $text .= " $1=$2"; } else { # Show the bad tag pos($raw) = 0; return ''; } } else { push @opened, $tag; return "$text>"; } } # If I get here, was not well-formed pos($raw) = 0; return ''; } }