##
# This covers the majority of rules
%bal_tag_attribs = (
# Tags with ok attributes
a => [qw(href name target)],
# I added this one, and there is a later as well
colgroup => [
qw(align span valign width)
],
font => ["color size"],
ol => ["type"],
# Balancing looks strange but align needs closing
p => ["align"],
table => [
qw(bgcolor border cellpadding cellspacing width)
],
td => [
qw(align bgcolor colspan height rowspan valign width)
],
tr => [qw(align valign width)],
# Ones without attributes
# I omitted the 'nbsp' tag, should that be 'nobr'?
map {$_, []} qw(
b big blockquote br dd dl dt em h1 h2 h3 h4 h5 h6 hr
i li pre small strike strong sub sup tt u ul
),
);
# Initialize default handlers
%handlers = (
# literal text with special meanings we allow
( map {my $tag = $_; ($tag, sub {return $tag})} qw(&
) ),
# Various html tags
&ret_tag_handlers(%bal_tag_attribs),
# And our escape mode.
'[code]' => ret_escape_code(qr(\[/code\]), "[code]"),
);
##
##
sub ret_const_tag_open {
my $tag = shift;
return sub {
push @open_tags, $tag;
return "<$tag>";
};
}
sub ret_escape_code {
my $end_pat = shift;
my $name = shift;
return sub {
my $t_ref = shift;
if ($$t_ref =~ m=\G(.*?)$end_pat=gs) {
return "" . encode_entities($1) . "
";
}
else {
return show_err("Unmatched $name tag found");
}
};
}
sub ret_tag_close {
my $tag = shift;
return sub {
my @searched;
while (@open_tags) {
my $open = pop(@open_tags);
push @searched, $open;
if ($open eq $tag) {
# Close em!
return join '', map "$_>", @searched;
}
}
# No you cannot close a tag you didn't open!
@open_tags = reverse @searched;
pos(${$_[0]}) = 0;
return show_err("Unmatched close tag $tag>");
};
}
sub ret_tag_open {
# The general case
my $tag = shift;
my %is_attrib;
++$is_attrib{$_} foreach @_;
return sub {
my $t_ref = shift;
my $text = "<$tag";
while (
$$t_ref =~ /\G(?:
\s+
([\w\d]+) # Value
\s*=\s*( # = attribute:
[^\s>"'][^\s>]* | # Unquoted
"[^"]*" | # Double-quoted
'[^']*' # Single-quoted
)
|
\s*> # End of tag
)/gx
) {
if ($1) { # Trying to match an attrib
if ($is_attrib{ lc($1) }) {
$text .= " $1=$2";
}
else {
pos($$t_ref) = 0;
return show_err("Tag '$tag' cannot accept attribute '$1'");
}
}
else {
# Ended text
push @open_tags, $tag;
return "$text>";
}
}
return show_err("Unended <$tag> detected");
};
}
sub ret_tag_handlers {
my %attribs = @_;
my @out = ();
foreach my $tag (keys %attribs) {
if (@{$attribs{$tag}}) {
push @out, "<$tag", ret_tag_open($tag, @{$attribs{$tag}});
}
else {
push @out, "<$tag>", ret_const_tag_open($tag);
}
push @out, "$tag>", ret_tag_close($tag);
}
return @out;
}
sub show_err {
my $err = join '', @_;
return "$err
";
}
##
##
=head1 B
my $scrubbed_text = scrub_input($raw_text, [$handlers]);
This takes a string and an optional ref to a hash of handlers,
and returns html-escaped output, except for the sections handled
by the handlers which do whatever they want.
If handlers are passed their names should be lower case and start
with a character matching [^\w\s\d] or else they will not be
matched properly. While parsing the string, when they can be
matched case insensitively, then the handler is called. It will be
passed a reference to $raw_text right after that matches the name
of the handler. (pos($raw_text) will point to the end of the
name.) It should return the text to be inserted into the output,
and set pos($raw_text) to where to continue parsing from, or 0 if
no text was handled.
Two special handlers that may be used are "pre" and "post" that
will be called before and after (respectively) the raw text is
processed. For consistency they also get a reference to the raw
text.
If no handler is passed, it will use \%handlers instead.
=cut
sub scrub_input {
my $raw = shift;
local @open_tags;
my $handler = shift || \%handlers;
$handler->{pre} ||= sub {return '';};
$handler->{post} ||= sub {
return join '', map "$_>", reverse @open_tags;
};
my $scrubbed = $handler->{pre}->(\$raw);
# This would be faster with the trie code from node 30896. But
# that is not the point of this example so I have not done that.
# Also note the next line is meant to force an NFA engine to
# match the longest alternative first
my $re_str = join "|",
map {quotemeta} reverse sort keys %$handler;
my $is_handled = qr/$re_str/i;
while ($raw =~ /\G([\w\d\s]*)/gi) {
$scrubbed .= $1;
my $pos = pos($raw);
if ($raw =~ /\G($is_handled)/g) {
$scrubbed .= $handler->{ lc($1) }->(\$raw);
}
unless (pos($raw)) {
if (length($raw) == $pos) {
# EXIT HERE #
return $scrubbed . $handler->{post}->(\$raw);
}
else {
my $char = substr($raw, $pos, 1);
pos($raw) = $pos + 1;
$scrubbed .= &encode_entities($char);
}
}
}
confess("I have no idea how I got here!");
}