Actually, here's what I ended up with that one time. It transforms certain HTML documents to \latex{}. (I just wanted to print the documents out, but the page-break algorithms in web browsers are practically nonexistent.) I thought you might want to see a sample of the module in action.
# Walk the paragraphs and tables inside the ".article-bodycopy" element of
# $html and print a LaTeX rendering of each one to STDOUT.
#
# Relies on two names defined outside this snippet:
#   $html    - the HTML source to convert
#   filter() - called on a node (or caption node) to obtain the text that is
#              printed; presumably performs LaTeX escaping -- TODO confirm
#
# NOTE(review): this code calls ->type to get an element's name. Newer
# Mojo::DOM releases expose the element name as ->tag instead -- confirm
# against the installed Mojolicious version before relying on it.
my $dom  = Mojo::DOM->new($html);
my $body = $dom->at('.article-bodycopy')
    or die "No .article-bodycopy element found in input\n";

$body->find('p, table')->each(sub {
    my $node = shift;

    # Sub-headings carry class="SubHead"; most nodes have no class at all,
    # so default to '' to avoid comparing against undef.
    if (($node->{class} // '') eq 'SubHead') {
        print '\subsection{' . $node->text . "}";
        return;
    }

    # Tables wrap a figure: one image plus a ".Figure1" caption.
    if ($node->type eq "table") {
        my $img_node = $node->at('img');
        if (!defined $img_node) {
            print STDERR "TABLE WITHOUT IMAGE SKIPPED\n";
            return;
        }

        my $img = $img_node->{src};
        my $cap = filter($node->at('.Figure1'));

        # The LaTeX side is given PNG versions of the GIF images.
        $img =~ s/\.gif/\.png/;

        print join("\n",
            '\begin{Figure}',
            '\centering',
            '\includegraphics[width=0.65\linewidth,'
                . 'height=0.85\textheight,keepaspectratio]{' . $img . '}',
            '\captionof{figure}{' . $cap . '}',
            '\end{Figure}');
        return;
    }

    # Plain paragraph with no child elements: emit it directly.
    if ($node->children->size == 0) {
        print filter($node);
        return;
    }

    # Paragraph with sub-tags: rewrite the ones we understand in place,
    # report the rest on STDERR, then emit the (modified) paragraph.
    $node->children->each(sub {
        my $n   = shift;
        my $tag = $n->type;
        if ($tag eq 'b') {
            $n->replace('{\bf ' . $n->text . '}');
        }
        else {
            print STDERR "UNHANDLED MARKUP TYPE: " . $tag . "\n";
        }
    });
    print filter($node);
});
|