Here is a simple timing script to reproduce the issue.
I couldn't find any large tables on public websites, but I found one on Wikipedia with 162 rows that illustrates the problem. If you find one with 400+ rows, you'll see that obtaining the TDs of a single TR takes 3-4 seconds.
#!/usr/bin/env perl
# Timing harness: measures how long WWW::Mechanize::Chrome->xpath() takes to
# extract TR/TH/TD nodes from a large Wikipedia table, to demonstrate the
# per-row xpath slowdown.
use strict;
use warnings;
use feature qw(say);
no warnings qw(experimental);
use Log::Log4perl qw(:easy);
use WWW::Mechanize::Chrome;
use Time::HiRes qw( gettimeofday tv_interval );
use Data::Dumper;    # was missing: Dumper() is called at the end of the script

my $debug = 0;       # set to 1 for verbose per-cell output
my $t0;              # reused high-resolution start timestamp

Log::Log4perl->easy_init($ERROR);

my $mech = WWW::Mechanize::Chrome->new(
    headless  => 0,
    autodie   => 0,
    autoclose => 0,
);

$mech->get('https://meta.wikimedia.org/wiki/Wikipedia_article_depth');
sleep(2);    # crude wait for the page to finish rendering

# All tables on the page; index 3 is the large (162-row) table we time against.
my @nodes = $mech->xpath('//table');

$t0 = [gettimeofday];
my @rows = $mech->xpath('.//tr', node => $nodes[3]);
say 'xpath for TR took:' . tv_interval($t0);

my @cell_keys  = ();    # column names taken from the header row's TH cells
my @table_data = ();    # one hashref per data row, keyed by column name

# scalar @rows is the row count; $#rows (last index) under-reported it by one.
say 'Timing for ' . scalar(@rows) . ' rows.';

foreach my $row_index (0 .. $#rows) {
    my %row_data = ();
    if ($row_index == 0) {
        # Header row: collect the column names.
        $t0 = [gettimeofday];
        my @cells = $mech->xpath('.//th', node => $rows[$row_index]);
        say 'xpath for TH took:' . tv_interval($t0);
        foreach my $i (0 .. $#cells) {
            say "HEADER CELL: $i, VALUE:" . $cells[$i]->get_text() if $debug;
            push @cell_keys, $cells[$i]->get_text();
        }
        if ($debug) {
            say 'Column Names:';
            say $_ foreach @cell_keys;
        }
    }
    else {
        # Data row: map each TD's text to the matching column name.
        $t0 = [gettimeofday];
        my @cells = $mech->xpath('.//td', node => $rows[$row_index]);
        say 'xpath for TD took:' . tv_interval($t0);
        say "DATA ROW: $row_index" if $debug;
        foreach my $i (0 .. $#cells) {
            say "DATA CELL: $i, VALUE:" . $cells[$i]->get_text() if $debug;
            $row_data{ $cell_keys[$i] } = $cells[$i]->get_text();
        }
        push @table_data, \%row_data;
        if ($debug) {
            say 'Column Data:';
            say $row_data{$_} foreach @cell_keys;
        }
    }
}
say Dumper(@table_data) if $debug;
Here are the results:
|