package MarkupHandler; use Carp; use HTML::Entities qw(encode_entities); use Exporter; @ISA = 'Exporter'; @EXPORT_OK = qw( ret_bal_tag_handlers ret_escape_code ret_ok_attr_val ret_scrubber ); use strict; use vars qw(@open_tags); # Gets the value of a tag attribute sub get_attr_val { my $t_ref = shift; if ( $$t_ref =~ / \G\s*=\s*( [^\s>"'][^\s>]* | # Unquoted "[^"]*" | # Double-quoted '[^']*' # Single-quoted ) /gx ) { return $1; } else { return ('', "no value found"); } } sub ret_bal_tag_handlers { my @out = (); while (@_) { my $tag = shift; my $name = $tag; if (ref($tag)) { if (exists $tag->{name}) { $name = $tag->{name}; } else { confess("Tags must have name attributes"); } } else { $tag = {name => $name}; } my $attribs = shift; if (@$attribs) { push @out, "<$name", ret_tag_open($name, @$attribs); } else { push @out, "<$name>", ret_const_tag_open($name); } # Rewrite handler? if (exists $tag->{pre}) { push @out, wrap_handler($tag->{pre}, pop(@out), ''); } push @out, "", ret_tag_close($name); if (exists $tag->{post}) { push @out, wrap_handler('', pop(@out), $tag->{post}); } } return @out; } # Many tags have no attributes allowed. Handle # them efficiently. sub ret_const_tag_open { my $tag = shift; return sub { push @open_tags, $tag; return "<$tag>"; }; } # Returns the basic "literal escape" that you see for # code. sub ret_escape_code { my $end_pat = shift; my $name = shift; return ( $name, sub { my $t_ref = shift; if ($$t_ref =~ m=\G(.*?)$end_pat=gs) { return "
" . encode_entities($1) . "
"; } else { return show_err("Unmatched $name tag found"); } } ); } # Generate an attribute handler based on an "ok value" test. # Note that quotes on the attribute will exist in the # value passed to the ok test. sub ret_ok_attr_val { my $ok_test = shift; wrap_handler( '', \&get_attr_val, sub { my $text = shift; if ($ok_test->($text)) { $text; } else { return ('', "Illegal val '$text'"); } } ); } # Pass a list of case/handler pairs, returns an anonymous # sub that processes those pairs. sub ret_scrubber { my %handler = @_; # Sanity check foreach my $case (keys %handler) { unless (UNIVERSAL::isa($handler{$case}, 'CODE')) { carp("Case '$case' dropped - handlers must be functions"); delete $handler{$case}; } } $handler{pre} ||= sub {return '';}; $handler{post} ||= sub { return join '', map "", reverse @open_tags; }; # Sorted in reverse so that '($t_ref); if (@err) { pos($$t_ref) = 0; $scrubbed .= show_err(@err); } else { $scrubbed .= $chunk; # Obscure bug fix. You cannot match 2 zero # length patterns in a row, this resets the # flag so you can. pos($$t_ref) = pos($$t_ref); } } unless (pos($$t_ref)) { if (length($$t_ref) == $pos) { # EXIT HERE # return $scrubbed . $handler{post}->($t_ref); } else { my $char = substr($$t_ref, $pos, 1); pos($$t_ref) = $pos + 1; $scrubbed .= encode_entities($char); } } } confess("I have no idea how I got here!"); } } # Returns a sub that closes a tag sub ret_tag_close { my $tag = shift; return sub { my @searched; while (@open_tags) { my $open = pop(@open_tags); push @searched, $open; if ($open eq $tag) { # Close em! return join '', map "", @searched; } } # No you cannot close a tag you didn't open! @open_tags = reverse @searched; pos(${$_[0]}) = 0; return show_err("Unmatched close tag "); }; } # The general open tag sub ret_tag_open { my $tag = shift; my %attr_test; foreach (@_) { if (ref($_)) { foreach my $attrib (keys %$_) { $attr_test{lc($attrib)} = $_->{$attrib}; } } else { $attr_test{lc($_)} = \&get_attr_val; } } return sub { my $t_ref = shift; my $text = "<$tag"; while ($$t_ref =~ /\G(?:\s+(\w+)|\s*>)/g) { if (defined($1)) { my $attrib = lc($1); if (exists $attr_test{$attrib}) { my ($chunk, @err) = $attr_test{$attrib}->($t_ref); if (@err) { return show_err( "While processing '$attrib' in <$tag>:", @err ); } else { $text .= " $attrib=$chunk"; } } else { pos($$t_ref) = 0; return show_err( "Tag '$tag' cannot accept attribute '$attrib'" ); } } else { $text .= ">"; push @open_tags, $tag; return $text; } } return show_err("Unended <$tag> detected"); }; } sub show_err { if (wantarray()) { return ('', @_); } else { my $err = encode_entities(join ' ', grep length($_), @_); return "

$err

"; } } sub wrap_handler { my $pre = shift() || sub {''}; my $fn = shift(); my $post = shift() || sub {@_}; return sub { my $t_ref = shift; my ($text, @err) = $pre->($t_ref); if (@err) { return show_err($text, @err); } (my $chunk, @err) = $fn->($t_ref); @err ? show_err("$text$chunk", @err) : $post->("$text$chunk"); }; } 1; #### use strict; use vars qw($table_depth); $table_depth = 0; use MarkupHandler qw( ret_bal_tag_handlers ret_escape_code ret_ok_attr_val ret_scrubber ); my $href_test = ret_ok_attr_val( sub { shift() =~ m-^'?"?(http://|ftp://|mailto:|#)-i; } ); my @bal_tags = ( a => [ qw(name target), {'href' => $href_test} ], font => [qw(color size)], ol => ['type'], p => ['align'], { name => 'table', pre => sub { if (5 > $table_depth++) { return ''; } else { $table_depth--; return ('', "Exceeded maximum table depth\n"); } }, post => sub { if (0 > --$table_depth) { $table_depth = 0; return ("", "Table depth should not be below 0"); } @_; }, }, [qw(bgcolor border cellpadding cellspacing width)], in_table_tags( colgroup => [ qw(align span valign width) ], td => [qw(align bgcolor colspan height rowspan valign width)], tr => [qw(align valign width)], ), map {$_, []} qw( b big blockquote br dd dl dt em h1 h2 h3 h4 h5 h6 hr i li pre small strike strong sub sup tt u ul ), ); my @handlers = ( ret_bal_tag_handlers(@bal_tags), ret_escape_code(qr(\[/code\]), "[code]"), ); my $scrubber = ret_scrubber(@handlers); my $text = <<'EOT'; Hello world. <<; [code] This is ...see the being escaped? [/code] Hello worldub> hello world hello
EOT print $scrubber->(\$text); sub in_table_tags { my @out; while (@_) { my $name = shift; push @out, { name => $name, pre => sub { $table_depth ? '' : ('', "Cannot open <$name>, not in table"); }, }, shift(); } return @out; }