#!/usr/bin/perl -w
use strict;
use HTML::Entities;
# Interactive test driver: read lines from standard input and print the
# scrubbed form of each one.  Entering "q" or "quit" (any case) on a line by
# itself ends the program.
while (my $data = <STDIN>) {
    # The quit test is anchored to the whole line so that ordinary input which
    # merely contains the letter "q" (for example href="query.html") is
    # scrubbed instead of silently terminating the run.
    exit if $data =~ /\A\s*q(?:uit)?\s*\z/i;
    print scrubInput($data) . "\n";
}
sub scrubInput {
    # Scrub a string of user-supplied HTML.
    #
    # 1. Every potentially harmful character is converted to its HTML entity.
    # 2. The &lt; and &gt; around the allowed tags are converted back to < and >.
    # 3. Inside an allowed <a ...> tag the href attribute is decoded again.
    #    Otherwise users would get things like
    #        <a href&#61;&quot;somelink&#46;html&quot;>
    #    on a page, which obviously is not a valid anchor.
    #
    # Argument: the raw string.
    # Returns:  the scrubbed copy (an undefined argument is returned as is).
    #
    # NOTE(review): the href value is decoded without looking at its scheme,
    # so a "javascript:" URL passes through.  Validate the scheme separately
    # if the output is shown to other users.
    my $data = shift;
    return $data if !defined $data;

    # Body of the character class handed to encode_entities().  The hyphen is
    # escaped so it stands for a literal "-".  Left bare, ">-_" is a range
    # that also covers the upper-case letters A-Z; they would be turned into
    # numeric entities and tags such as <BR> or an HREF attribute could never
    # be matched by the patterns below.
    my $unsafe_chars = '!~%^&*\\|"\'<>\\-_+=?\/;:\[\]{}()\@\$\.';

    # Allowed tags
    #
    # All tags should be a regex without the < or > characters
    # Case is irrelevant
    # Append an underscore to the tag if it can have attributes
    # -- Examples --
    # <font color="red"> would be 'font_'
    # To represent the <h1> through <h6> tags, use 'h[1-6]'
    my @tags = (
        'br',
        'p',
        'font_',
        'h[1-6]',
        'a_',
    );

    $data = encode_entities($data, $unsafe_chars);

    # Substitute back the angle brackets that surround our allowed tags.
    foreach my $tag (@tags) {
        # Work on a copy so the entries of @tags are not rewritten through
        # the loop alias.
        #
        # A trailing underscore becomes an optional attribute section:
        # whitespace followed by anything up to, but not including, the
        # &gt; that closes the tag (hence the negative lookahead).  Raw
        # < and > are excluded so the section cannot swallow a tag that an
        # earlier pass has already restored.
        (my $pattern = $tag) =~ s/_\z/(?:\\s+(?:[^&<>]|&(?!gt;))+)?/;

        # &#47; is the encoded / of an end tag:  </a> arrives here as
        # &lt;&#47;a&gt;
        $data =~ s{&lt;(&\#47;)?($pattern)&gt;}{ defined $1 ? "</$2>" : "<$2>" }gei;
    }

    # Decode the href attribute of restored <a ...> tags, so that
    #     <a href="somelink.html">a somelink</a>
    # comes out as a working anchor.  Only the "=", the quotes and the value
    # of href are decoded; any other attribute stays entity-encoded.
    #
    # The comments inside the pattern deliberately avoid sigils, because
    # variables are interpolated even inside /x comments.
    $data =~ s{
        (                               # group 1: tag start through "href"
            <a \s+                      #   "<a" and whitespace
            (?: [^>]*? \s )?            #   any earlier attributes
            href \s*                    #   href plus optional whitespace
        )
        (                               # group 2: the part that is decoded
            &\#61; \s*                  #   encoded "=" plus optional whitespace
            (?:
                ( &quot; | &\#39; )     #   group 3: opening quote entity
                [^>]*?                  #   the quoted value
                \3                      #   the same quote entity again
              |
                (?! &quot; | &\#39; )   #   an unbalanced quote is left encoded
                [^\s>]+                 #   unquoted value, up to space or ">"
            )
        )
    }{ $1 . decode_entities("$2") }gexi;

    return $data;
}