#!/usr/bin/perl
use strict;
use warnings;
use Benchmark qw(cmpthese timethese);
use App::scrape qw(scrape);
use HTML::Query qw(Query);
use HTML::Selector::XPath qw(selector_to_xpath);
use HTML::TreeBuilder qw();
use HTML::TreeBuilder::XPath;
use Mojo::DOM;
use SGMLExtract qw(sgml_find sgml_extract);
use Web::Query qw(wq);
use Web::Scraper qw(process scraper);
use Debug;
# Sample document shared by every extractor below; each m_* sub reads it
# through this file-level lexical.
# NOTE(review): as it appears here the fixture contains no markup, yet the
# extractors look for a <div> with class "myBody" -- confirm the tags were
# not lost from this copy of the script.
my $html = q{
--stuff--
--more stuff--
--still more stuff--
Stuff I do not want
--all the stuff I want, which might include div tags, too--
--yet more stuff--
};
# m_appse and m_sgmle cheat: they pick the div by its relative position, not by its class name.
# App::scrape candidate: call scrape() on the shared document for 'div' and
# return the first entry of the second result. The div is chosen by its
# position in the result list, not by matching the class name.
sub m_appse {
    my $selectors = ['div'];
    my $options   = { class => 'myBody' };
    return ( scrape($html, $selectors, $options) )[1]->[0];
}
# HTML::TreeBuilder::XPath + HTML::Selector::XPath candidate: translate the
# CSS selector 'div.myBody' to XPath, take the first matching node and return
# it serialised with as_HTML.
#
# The tree is explicitly deleted before returning. The module is loaded
# without weak references, so a tree that is merely dropped keeps its
# parent/child reference cycles alive; inside a benchmark loop that leaks one
# whole parse tree per iteration.
#
# Dies if no node matches (method call on an undefined value).
sub m_hselx {
    my $tree = HTML::TreeBuilder::XPath->new_from_content($html);
    my ($node) = $tree->findnodes(selector_to_xpath('div.myBody'));

    # Serialise first: the node is unusable once the tree has been deleted.
    my $markup = $node->as_HTML;
    $tree->delete;

    return $markup;
}
# HTML::Query candidate: build a query object from the shared document text,
# select 'div.myBody' and return the selection serialised with as_HTML.
sub m_htmlq {
    my $document = Query(text => $html);
    my $matches  = $document->query('div.myBody');
    return $matches->as_HTML;
}
# Mojo::DOM candidate: parse the shared document, locate the first element
# with class "myBody" and return its text.
#
# all_text is used rather than text: text only returns the text that sits
# directly inside the element and silently drops anything wrapped in a child
# element, which is why the recorded result for this candidate was missing a
# word that every other extractor returned.
#
# Dies if nothing matches (method call on an undefined value).
sub m_mojod {
    my $dom = Mojo::DOM->new->parse($html);
    return $dom->at('.myBody')->all_text;
}
# SGMLExtract candidate (sgml_extract): ask for 'div' with the options
# all => 1 and content => 1, then return the 'content' field of the second
# record. The div is chosen by position, not by its class name.
sub m_sgmle {
    my %options = (all => 1, content => 1);
    return sgml_extract(\$html, 'div', \%options)->[1]{content};
}
# SGMLExtract candidate (sgml_find): look up 'div' with class => 'myBody'
# passed as the third argument (presumably an attribute filter -- confirm
# against the module), then return the 'content' field of the first record.
sub m_sgmlf {
    my %attributes = (class => 'myBody');
    return sgml_find(\$html, 'div', \%attributes)->[0]{content};
}
# HTML::TreeBuilder candidate: parse the shared document, find the first
# <div> whose class attribute is 'myBody' with look_down, and return it
# serialised with as_HTML(q{}) (empty entity list, as in the original call).
#
# The tree is explicitly deleted before returning. The module is loaded
# without weak references, so a tree that is merely dropped keeps its
# parent/child reference cycles alive; inside a benchmark loop that leaks one
# whole parse tree per iteration.
#
# Dies if no element matches (method call on an undefined value).
sub m_treeb {
    my $tree = HTML::TreeBuilder->new_from_content($html);

    # Scalar context: look_down returns the first match or undef.
    my $div = $tree->look_down(_tag => 'div', class => 'myBody');

    # Serialise first: the node is unusable once the tree has been deleted.
    my $markup = $div->as_HTML(q{});
    $tree->delete;

    return $markup;
}
# Web::Query candidate: wrap the shared document, find 'div.myBody' and
# return the result of ->html on the selection.
sub m_webqy {
    my $document = wq($html);
    return $document->find('div.myBody')->html;
}
# Web::Scraper candidate: build a scraper that stores the TEXT of
# 'div.myBody' under 'key', run it on the shared document and return that
# value. The scraper is rebuilt on every call, exactly as before, so the
# construction cost stays inside the timed code.
sub m_websc {
    my $extractor = scraper { process "div.myBody", key => 'TEXT' };
    return ( $extractor->scrape($html) )[0]->{'key'};
}
# Sanity check before timing: run every extractor once and pass the results
# to debug so they can be compared by eye. The recorded output after __END__
# shows one `m_xxx() = "..."` line per call, in the order listed here.
# NOTE(review): debug is presumably exported by the Debug module loaded
# above, and appears to echo the call text -- keep this statement's layout
# as-is unless that is confirmed not to matter.
debug m_appse(), m_hselx(), m_htmlq(), m_mojod(), m_treeb(), m_sgmle(), m_sgmlf(), m_webqy(), m_websc();
# Benchmark run: time every extractor (a count of -1 asks Benchmark for at
# least one CPU second per candidate), then print the relative-rate table.
my %extractor_for = (
    appse => \&m_appse,
    hselx => \&m_hselx,
    htmlq => \&m_htmlq,
    mojod => \&m_mojod,
    sgmle => \&m_sgmle,
    sgmlf => \&m_sgmlf,
    treeb => \&m_treeb,
    webqy => \&m_webqy,
    websc => \&m_websc,
);
my $timings = timethese(-1, \%extractor_for);
cmpthese($timings);
__END__
debug: paul/bench.pl line 45
m_appse() = "--all the stuff I want, which might include div tags, too--";
m_hselx() = " --all the stuff I want, which might include div tags, too--
";
m_htmlq() = " --all the stuff I want, which might include div tags, too--
";
m_mojod() = "\n--all the stuff want, which might include div tags, too--\n";
m_treeb() = " --all the stuff I want, which might include div tags, too--
";
m_sgmle() = "\n--all the stuff I want, which might include div tags, too--\n";
m_sgmlf() = "\n--all the stuff I want, which might include div tags, too--\n";
m_webqy() = " --all the stuff I want, which might include div tags, too--
";
m_websc() = " --all the stuff I want, which might include div tags, too-- ";
Rate webqy hselx websc appse htmlq treeb mojod sgmlf sgmle
webqy 697/s -- -4% -5% -37% -47% -54% -72% -97% -98%
hselx 724/s 4% -- -1% -35% -44% -52% -71% -97% -97%
websc 731/s 5% 1% -- -34% -44% -51% -70% -97% -97%
appse 1110/s 59% 53% 52% -- -15% -26% -55% -95% -96%
htmlq 1305/s 87% 80% 78% 18% -- -13% -47% -94% -95%
treeb 1506/s 116% 108% 106% 36% 15% -- -39% -93% -95%
mojod 2465/s 254% 240% 237% 122% 89% 64% -- -89% -91%
sgmlf 22330/s 3103% 2983% 2953% 1912% 1611% 1383% 806% -- -22%
sgmle 28709/s 4018% 3864% 3825% 2486% 2100% 1807% 1065% 29% --