#!/usr/bin/perl
use strict;
use warnings;
use HTML::Entities;

# Interactive driver: read lines from STDIN and print the scrubbed form of
# each one.  A line consisting only of "q" or "quit" (any case) ends the
# program; a line that merely contains the letter q is scrubbed like any
# other.  The loop is skipped when this file is loaded from another program,
# so scrubInput() can be reused and tested without consuming STDIN.
unless (caller) {
    while ( my $data = <STDIN> ) {
        exit if $data =~ /\A\s*q(?:uit)?\s*\z/i;
        print scrubInput($data) . "\n";
    }
}

# scrubInput($text)
#
# Neutralises user-supplied text for inclusion in an HTML page:
#
#   1. Every potentially harmful character is converted to an HTML entity.
#   2. The angle brackets (and the end-tag slash) of explicitly allowed tags
#      are converted back, so those tags render as markup again.
#   3. Inside restored <a ...> tags the href attribute is made functional
#      again; otherwise the page would contain things like
#      <a href&#61;&quot;somelink.html&quot;>, which is not a valid anchor.
#
# Parameters: $text - the raw string to scrub.
# Returns:    the scrubbed string, or undef when $text is undef.
#
# All other attributes of attribute-capable tags (for example the color of a
# <font> tag) keep their encoded "=" and quotes and therefore stay inert.
#
# NOTE(security): the href value itself is not validated, so URL schemes such
# as "javascript:" pass through unchanged.  Add a scheme check before using
# this on hostile input.
sub scrubInput {
    my $data = shift;
    return $data unless defined $data;

    # Character-class body handed to encode_entities().  The hyphen is
    # escaped on purpose: a bare ">-_" is a range that also covers A-Z, which
    # would turn every uppercase letter into a numeric entity and stop tags
    # such as <BR> from matching the allow-list.  The backslash is listed
    # explicitly so that it is still encoded.
    my $unsafe_chars = '!~%^&*\\\\|"\'<>\-_+=?\/;:\[\]{}()\@\$\.';

    # Characters that must stay escaped inside a double-quoted attribute.
    my $attr_unsafe = q{<>&"'};

    # Allowed tags
    #
    # All tags should be a regex without the < or > characters.
    # Case is irrelevant.
    # Append an underscore to the tag if it can have attributes.
    # -- Examples --
    # <font color="red"> would be 'font_'
    # To represent the <h1> through <h6> tags, use 'h[1-6]'
    my @tags = ( 'br', 'p', 'font_', 'h[1-6]', 'a_' );

    # Ask HTML::Entities how it spells each delimiter instead of hard-coding
    # "&#47;" and friends: numeric entities are decimal in some releases of
    # the module and hexadecimal in others.
    my $entity_re = sub {
        my $char   = shift;
        my $entity = encode_entities( $char, $unsafe_chars );
        return qr/\Q$entity\E/;
    };
    my ( $lt, $gt, $slash, $eq ) = map { $entity_re->($_) } qw( < > / = );
    my $quote = join '|', map { $entity_re->($_) } q{"}, q{'};
    $quote = qr/$quote/;

    $data = encode_entities( $data, $unsafe_chars );

    # Attribute text: whitespace followed by anything up to the first encoded
    # ">" that closes the tag.
    my $attr_list = qr/\s+(?:(?!$gt).)+/s;

    # Substitute back the angle brackets that surround allowed tags.
    for my $spec (@tags) {
        my $name        = $spec;                    # copy: leave @tags intact
        my $takes_attrs = ( $name =~ s/_\z// );
        my $body        = $takes_attrs
            ? qr/(?:$name)(?:$attr_list)?/i
            : qr/(?:$name)/i;

        $data =~ s{ $lt ($slash)? ($body) $gt }
                  { defined $1 ? "</$2>" : "<$2>" }gex;
    }

    # Make the first href of every restored anchor usable again.  A raw ">"
    # can only come from a restored tag, so the [^>] classes keep the match,
    # and with it the decoding, inside that one tag.  The value is decoded,
    # re-escaped for attribute context and always emitted in double quotes,
    # so it cannot break out of the attribute.
    $data =~ s{
        ( <a \s (?: [^>]*? \s )? href \s* )   # group 1: tag start through href
        $eq \s*                               # the encoded equals sign
        (?:
            ($quote) ( [^>]*? ) \2            # groups 2, 3: quote and quoted value
          |
            ( [^\s>]+ )                       # group 4: unquoted value
        )
    }{
        my ( $head, $value ) = ( $1, defined $2 ? $3 : $4 );
        $head . '="'
            . encode_entities( decode_entities($value), $attr_unsafe )
            . '"';
    }gexi;

    return $data;
}