OK, this reimplementation is for
nate. I really did
start out thinking I was going to make the handler modal,
but when I got done it just didn't make sense. For
instance the hack to handle tables (which was really the
reason for wanting it in the first place) would have
resulted in a circular reference which is a memory leak.
What I did instead was added several hooks for pre and
post filters. And made the final routine return a
subroutine that processes markup. It would be possible to
actually use this in a modal way, just set the pos() to
the end of the string and then have the post hook set the
pos() to whatever you wanted it to be. I did most of the
work required, I just didn't think it made sense for the
problem at hand.
For people other than nate, this change fixes a few minor
bugs, can be used to handle the attribute "checked" (that
ability is not shown here), can be used to allow additional
validations of attribute values, and (very important) allows
you to refuse to let new table tags to be opened outside of
a table that you have opened.
First a module, MarkupHandler.pm. This does most of the
"reasoning".
package MarkupHandler;
use Carp;
use HTML::Entities qw(encode_entities);
use Exporter;
@ISA = 'Exporter';
@EXPORT_OK = qw(
ret_bal_tag_handlers ret_escape_code
ret_ok_attr_val ret_scrubber
);
use strict;
use vars qw(@open_tags);
# Gets the value of a tag attribute
sub get_attr_val {
my $t_ref = shift;
if (
$$t_ref =~ /
\G\s*=\s*(
[^\s>"'][^\s>]* | # Unquoted
"[^"]*" | # Double-quoted
'[^']*' # Single-quoted
)
/gx
) {
return $1;
}
else {
return ('', "no value found");
}
}
sub ret_bal_tag_handlers {
my @out = ();
while (@_) {
my $tag = shift;
my $name = $tag;
if (ref($tag)) {
if (exists $tag->{name}) {
$name = $tag->{name};
}
else {
confess("Tags must have name attributes");
}
}
else {
$tag = {name => $name};
}
my $attribs = shift;
if (@$attribs) {
push @out, "<$name", ret_tag_open($name, @$attribs);
}
else {
push @out, "<$name>", ret_const_tag_open($name);
}
# Rewrite handler?
if (exists $tag->{pre}) {
push @out, wrap_handler($tag->{pre}, pop(@out), '');
}
push @out, "</$name>", ret_tag_close($name);
if (exists $tag->{post}) {
push @out, wrap_handler('', pop(@out), $tag->{post});
}
}
return @out;
}
# Many tags have no attributes allowed. Handle
# them efficiently.
sub ret_const_tag_open {
my $tag = shift;
return sub {
push @open_tags, $tag;
return "<$tag>";
};
}
# Returns the basic "literal escape" that you see for
# code.
sub ret_escape_code {
my $end_pat = shift;
my $name = shift;
return (
$name,
sub {
my $t_ref = shift;
if ($$t_ref =~ m=\G(.*?)$end_pat=gs) {
return "<pre>" . encode_entities($1) . "</pre>";
}
else {
return show_err("Unmatched $name tag found");
}
}
);
}
# Generate an attribute handler based on an "ok value" test.
# Note that quotes on the attribute will exist in the
# value passed to the ok test.
sub ret_ok_attr_val {
my $ok_test = shift;
wrap_handler(
'', \&get_attr_val,
sub {
my $text = shift;
if ($ok_test->($text)) {
$text;
}
else {
return ('', "Illegal val '$text'");
}
}
);
}
# Pass a list of case/handler pairs, returns an anonymous
# sub that processes those pairs.
sub ret_scrubber {
my %handler = @_;
# Sanity check
foreach my $case (keys %handler) {
unless (UNIVERSAL::isa($handler{$case}, 'CODE')) {
carp("Case '$case' dropped - handlers must be functions");
delete $handler{$case};
}
}
$handler{pre} ||= sub {return '';};
$handler{post} ||= sub {
return join '', map "</$_>", reverse @open_tags;
};
# Sorted in reverse so that '<br' comes *before* '<b'...
my $re_str = join "|",
map {quotemeta} reverse sort keys %handler;
my $is_handled = qr/$re_str/i;
return sub {
my $t_ref = shift;
my $scrubbed;
local @open_tags;
while ($$t_ref =~ /\G([\w ]*)/g) {
$scrubbed .= $1;
my $pos = pos($$t_ref);
if ($$t_ref =~ /\G($is_handled)/g) {
my ($chunk, @err) = $handler{ lc($1) }->($t_ref);
if (@err) {
pos($$t_ref) = 0;
$scrubbed .= show_err(@err);
}
else {
$scrubbed .= $chunk;
# Obscure bug fix. You cannot match 2 zero
# length patterns in a row, this resets the
# flag so you can.
pos($$t_ref) = pos($$t_ref);
}
}
unless (pos($$t_ref)) {
if (length($$t_ref) == $pos) {
# EXIT HERE #
return $scrubbed . $handler{post}->($t_ref);
}
else {
my $char = substr($$t_ref, $pos, 1);
pos($$t_ref) = $pos + 1;
$scrubbed .= encode_entities($char);
}
}
}
confess("I have no idea how I got here!");
}
}
# Returns a sub that closes a tag
sub ret_tag_close {
my $tag = shift;
return sub {
my @searched;
while (@open_tags) {
my $open = pop(@open_tags);
push @searched, $open;
if ($open eq $tag) {
# Close em!
return join '', map "</$_>", @searched;
}
}
# No you cannot close a tag you didn't open!
@open_tags = reverse @searched;
pos(${$_[0]}) = 0;
return show_err("Unmatched close tag </$tag>");
};
}
# The general open tag
sub ret_tag_open {
my $tag = shift;
my %attr_test;
foreach (@_) {
if (ref($_)) {
foreach my $attrib (keys %$_) {
$attr_test{lc($attrib)} = $_->{$attrib};
}
}
else {
$attr_test{lc($_)} = \&get_attr_val;
}
}
return sub {
my $t_ref = shift;
my $text = "<$tag";
while ($$t_ref =~ /\G(?:\s+(\w+)|\s*>)/g) {
if (defined($1)) {
my $attrib = lc($1);
if (exists $attr_test{$attrib}) {
my ($chunk, @err) = $attr_test{$attrib}->($t_ref);
if (@err) {
return show_err(
"While processing '$attrib' in <$tag>:", @err
);
}
else {
$text .= " $attrib=$chunk";
}
}
else {
pos($$t_ref) = 0;
return show_err(
"Tag '$tag' cannot accept attribute '$attrib'"
);
}
}
else {
$text .= ">";
push @open_tags, $tag;
return $text;
}
}
return show_err("Unended <$tag> detected");
};
}
sub show_err {
if (wantarray()) {
return ('', @_);
}
else {
my $err = encode_entities(join ' ', grep length($_), @_);
return "<h2><font color=red>$err</font></h2> ";
}
}
sub wrap_handler {
my $pre = shift() || sub {''};
my $fn = shift();
my $post = shift() || sub {@_};
return sub {
my $t_ref = shift;
my ($text, @err) = $pre->($t_ref);
if (@err) {
return show_err($text, @err);
}
(my $chunk, @err) = $fn->($t_ref);
@err ?
show_err("$text$chunk", @err) :
$post->("$text$chunk");
};
}
1;
And then the test script that I used with it. (Some of
the functionality in this script could definitely be moved
to the handler.)
use strict;
use vars qw($table_depth);
$table_depth = 0;
use MarkupHandler qw(
ret_bal_tag_handlers ret_escape_code ret_ok_attr_val ret_scrubber
);
my $href_test = ret_ok_attr_val(
sub {
shift() =~ m-^'?"?(http://|ftp://|mailto:|#)-i;
}
);
my @bal_tags = (
a => [
qw(name target),
{'href' => $href_test}
],
font => [qw(color size)],
ol => ['type'],
p => ['align'],
{
name => 'table',
pre => sub {
if (5 > $table_depth++) {
return '';
}
else {
$table_depth--;
return ('', "Exceeded maximum table depth\n");
}
},
post => sub {
if (0 > --$table_depth) {
$table_depth = 0;
return ("", "Table depth should not be below 0");
}
@_;
},
}, [qw(bgcolor border cellpadding cellspacing width)],
in_table_tags(
colgroup => [ qw(align span valign width) ],
td => [qw(align bgcolor colspan height rowspan valign width)],
tr => [qw(align valign width)],
),
map {$_, []} qw(
b big blockquote br dd dl dt em h1 h2 h3 h4 h5 h6 hr
i li pre small strike strong sub sup tt u ul
),
);
my @handlers = (
ret_bal_tag_handlers(@bal_tags),
ret_escape_code(qr(\[/code\]), "[code]"),
);
my $scrubber = ret_scrubber(@handlers);
my $text = <<'EOT';
Hello world. <<<foo>;
[code] This is <code>...see the <a href=whatever> being escaped? [/co
+de]
<b>Hello world</ br></b>ub>
<a href="javascript://www.d<a href=http://yada>evil.com" name=happy>he
+llo world
<a href=http://yada>hello</a>
<tr><td>
<table><tr>
<table>
<table>
<table>
<table>
<table>
</table>
</table>
</table>
</table>
</table>
EOT
print $scrubber->(\$text);
sub in_table_tags {
my @out;
while (@_) {
my $name = shift;
push @out, {
name => $name,
pre => sub {
$table_depth ? '' : ('', "Cannot open <$name>, not in table");
},
}, shift();
}
return @out;
}
Also some of the other handlers that I had in this thread
are likely to be of interest.