through

#!/usr/bin/perl -w
use strict;

use HTML::Entities;

# Simple interactive driver: reads lines from STDIN and prints the scrubbed
# version of each one, followed by a newline.  A line consisting solely of
# "q" or "quit" (any case, surrounding whitespace ignored) ends the program.
while (my $data = <STDIN>) {
	# The match is anchored so that ordinary input which merely contains the
	# letter "q" (for example "quote") is scrubbed instead of silently
	# terminating the program.
	exit if $data =~ /\A\s*q(?:uit)?\s*\z/i;
	print scrubInput($data) . "\n";
}

sub scrubInput {

	# Sanitizes a piece of user-supplied text for inclusion in an HTML page.
	#
	#   1. Every potentially harmful character is replaced by its HTML entity.
	#   2. The &lt; and &gt; around allowed tags are converted back to < and >.
	#   3. Inside a restored <a ...> tag, the "=" and the value of the href
	#      attribute are decoded again.  Otherwise users would get things like
	#      <a href&#61;&quot;somelink&quot;> on the page, which is not a valid
	#      anchor.
	#
	# Takes the raw string as its only argument and returns the scrubbed copy
	# (undef is passed through unchanged).
	#
	# NOTE(security): the href value is decoded verbatim, so the URL scheme is
	# NOT checked here.  A link such as href="javascript:..." survives
	# scrubbing; filter schemes separately if that matters for the page.

	my $data = shift;
	return $data unless defined $data;

	# This string becomes the inside of a regex character class.  The hyphen
	# must be escaped: an unescaped ">-_" is a range that covers A-Z, which
	# turned every uppercase letter into a numeric entity and stopped
	# uppercase tags from ever being recognised.  The backslash is listed
	# explicitly (four backslashes here are two in the string, i.e. one
	# escaped backslash in the class).
	my $unsafe_chars = '!~%^&*\\\\|"\'<>\-_+=?\/;:\[\]{}()\@\$\.';

	# Allowed tags
	#
	# All tags should be a regex without the < or > characters
	# Case is irrelevant
	# Append an underscore to the tag if it can have attributes
	# -- Examples --
	# <font size=1>  would be 'font_'
	# To represent the <h1> through <h6> tags, use 'h[1-6]'

	my @tags = ('br',
				'p',
				'font_',
				'h[1-6]',
				'a_'
			   );

	$data = encode_entities($data, $unsafe_chars);

	# Pattern substituted for the trailing underscore of a tag that may carry
	# attributes: whitespace followed by anything that is neither the encoded
	# closing bracket (&gt;) nor a raw angle bracket left behind by an earlier
	# substitution.  The negative lookahead is needed because the final > of
	# the tag has been replaced by &gt; at this point.
	my $attributes = '(?:\s+(?:[^&<>]|&(?!gt;))+)?';

	# Substitute back the angle brackets that surround allowed tags.
	foreach my $tag (@tags) {
		# Work on a copy so the tag list itself is left untouched.
		(my $tag_re = $tag) =~ s/_\z/$attributes/;

		# &#47; is the encoded / of an end tag: </a> arrives here as
		# &lt;&#47;a&gt;
		$data =~ s{&lt;(&\#47;)?($tag_re)&gt;}{ defined $1 ? "</$2>" : "<$2>" }gei;
	}

	# Decode the href attribute of restored anchors.  Things to keep in mind
	# when touching the pattern below:
	#   * it is compiled with the x flag, so a literal hash sign has to be
	#     escaped or it starts a comment;
	#   * variables are interpolated even inside regex comments, so the
	#     comments must not mention capture variables;
	#   * every character class excludes the raw closing bracket, which keeps
	#     the match (and therefore the decoding) inside a single tag.
	my $quote = qr/&quot;|&\#39;/;    # an encoded double or single quote

	$data =~ s{
		(                              # group 1: the tag start through href
		  <a\s
		  (?: (?!href) [^>] )*         #   any attributes in front of href
		  href\s*
		)
		(                              # group 2: the part that is decoded
		  &\#61;\s*                    #   encoded equals sign, optional space
		  (?:
		      ($quote)                 #   group 3: encoded opening quote
		      (?: (?!\3) [^>] )*       #   value, up to the matching quote
		      \3                       #   encoded closing quote
		    |
		      (?: (?!$quote) [^\s>] )+ #   or an unquoted value without quotes
		  )
		)
		( [^>]* > )                    # group 4: the remainder of the tag
	}{ $1 . decode_entities($2) . $4 }gexi;

	return $data;
}