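# A simple little module that lets you access the text inside (possibly
# nested) HTML tables through a multidimensional array. The HTML can come
# either from a variable (pass a reference to it) or from a file (pass the
# file name).
#
# Usage:
#   Table->parse_it(\$content);    # HTML held in a scalar, or
#   Table->parse_it($filename);    # HTML read from a file
#   my $table = Table->new;        # the parsed data
# then:
#   print $table->[$table_idx][$row][$col];
#
# Tables are numbered in the order their <table> tags are opened; table,
# row and column indices all start at 1.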
package Table;
use strict;
use HTML::Parser;
## PRIVATE
# File-scoped parser state shared by new(), parse_it() and the handlers.
# There is exactly one copy of each of these for the whole package.

# Collected cell text, addressed as $table->[table][row][column].
my $table = [];

# Stack of [$tb_idx, $row, $column] triples, one per enclosing <table>,
# so the position in an outer table can be restored after a nested one.
my @save;

# Number of <table> tags seen so far / index of the table being filled.
my ($tb_count, $tb_idx);

# Current row and column inside the table being filled.
my ($row, $column);

# Count of currently open <table> tags; text is recorded only while true.
my $table_status;
# Constructor.
#
# Blesses the package's single, shared result array into the invoking
# class and returns it. Every call returns the same underlying array, so
# all objects see whatever has been parsed so far.
sub new {
    my ($class) = @_;
    return bless($table, $class);
}
# Parse an HTML document and collect the text of every table cell.
#
# Arguments:
#   $src - either a reference to a scalar holding the HTML, or a plain
#          string naming a file to read.
#
# Returns the blessed result array (always a true value) on success, so
# that both
#     my $table = Table->parse_it($src);
# and the older
#     Table->parse_it($src) or die ...;
# work. Returns nothing (false) if the parser reports failure, e.g.
# when the file cannot be opened.
#
# The result array is shared by the whole package: tables from successive
# calls accumulate in it and keep counting up from the previous call.
sub parse_it {
    my $self = shift;
    my $src  = shift;

    # Forget any nesting state left over from a previous document that
    # ended with unclosed tables; otherwise text outside any table in
    # this document would be recorded against a stale position.
    @save         = ();
    $tb_idx       = undef;
    $row          = undef;
    $column       = undef;
    $table_status = 0;

    my $p = HTML::Parser->new(
        api_version => 3,
        handlers    => [
            start => [ \&_start, "tagname" ],
            end   => [ \&_end,   "tagname" ],
            text  => [ \&_text,  "dtext" ],
        ],
        marked_sections => 1,
    );

    if (ref($src)) {
        $p->parse($$src) or return;
        # parse() may hold back trailing text in case more input follows;
        # eof() tells the parser the document is complete so that text is
        # delivered to the handlers. (parse_file() does this itself.)
        $p->eof;
    }
    else {
        $p->parse_file($src) or return;
    }

    return bless $table, ref($self) || $self;
}
# Start-tag handler; receives the tag name from HTML::Parser.
#
#   <table>  save the position in the enclosing table (if any) on @save
#            and begin a new table under the next free table index.
#   <tr>     advance to the next row and restart column numbering.
#   <td>     advance to the next column.
#
# Rows and columns are incremented from 0 before use, so the first row
# and the first cell of every table have index 1. Other tags are ignored.
sub _start {
    my $tag = shift;

    if ($tag eq 'table') {
        push @save, [ $tb_idx, $row, $column ];
        $row = $column = 0;
        $tb_idx = ++$tb_count;
        ++$table_status;
    }
    elsif ($tag eq 'tr') {
        $row++;
        # HTML allows </tr> to be omitted, so the column must be reset
        # here as well as in the end handler; otherwise the cells of
        # every following row would continue the previous row's count.
        $column = 0;
    }
    elsif ($tag eq 'td') {
        $column++;
    }

    return;
}
# End-tag handler; receives the tag name from HTML::Parser.
#
#   </table>  restore the position in the enclosing table from @save and
#             decrease the open-table count.
#   </tr>     restart column numbering for the next row.
#
# Other tags are ignored.
sub _end {
    my $tag = shift;

    if ($tag eq 'table') {
        # A stray </table> with no matching <table> leaves @save empty.
        # Dereferencing the undef that pop would return is fatal under
        # "use strict" and would abort the whole parse, so ignore it.
        if (@save) {
            ($tb_idx, $row, $column) = @{ pop @save };
            --$table_status;
        }
    }
    elsif ($tag eq 'tr') {
        $column = 0;
    }

    return;
}
# Text handler; receives decoded text ("dtext") from HTML::Parser.
#
# While at least one table is open, appends the text to the cell at the
# current [table][row][column] position. Non-breaking spaces are removed
# first, and text that is empty or consists only of whitespace is not
# recorded.
sub _text {
    my $text = shift;

    # Text outside any table is not recorded.
    return unless $table_status;

    # Remove every non-breaking space, not just the first one, so that
    # padding such as "&nbsp;&nbsp;" does not count as cell content.
    $text =~ s/\xa0//g;

    # Require at least one non-whitespace character. This is a pattern
    # test rather than a truth test so that a cell containing just "0"
    # is kept.
    return unless $text =~ /\S/;

    $table->[$tb_idx][$row][$column] .= $text;
    return;
}
# A module file must end by evaluating to a true value for require/use.
# A bare "1;" does that without relying on "return" at file level, which
# is an error when the file is run directly rather than loaded.
1;