Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

Convert html table to text

by PodMaster (Abbot)
on Mar 21, 2004 at 17:30 UTC ( #338485=snippet: print w/ replies, xml ) Need Help??
Description: Turns
header1header2
tr 1, td 1 tr 1, td 2
tr 3, td 1
00
0
Yes, a Lone Table Cell
tr2 td2
into
.=------------------------------------------=.
| header1                      | header2     |
|-==========================================-|
|  tr 1, td 1                  |  tr 1, td 2 |
|=-----------------------------*------------=|
|  tr 3, td 1                  |  tr2 td2    |
| .=------------------------=. |             |
| | 00                      |  |             |
| |                         |  |             |
| |  Yes, a Lone Table Cell |  |             |
| '=------------------------=' |             |
'=------------------------------------------='


.=------------------------=.
| 00                      |
|                         |
|  Yes, a Lone Table Cell |
'=------------------------='
caveats: doesn't handle multiple rows with TH tags (just uses the last TH tags seen).
Text::ASCIITable doesn't like incomplete rows (if the table has 4 columns set but a row supplies only 2, the row doesn't get added), which I consider a bug (filed as such). FIXED in Text-ASCIITable-0.14.
Doesn't handle colspan/rowspan.
#!/usr/bin/perl
# Demo driver: parse a sample HTML document and print every <table> found in
# it as an ASCII-art table, each followed by a blank line.
use strict;
use warnings;
use HTML::TreeBuilder;
use Text::ASCIITable;
use List::Util qw(max);


# Sample input: a bordered two-column table with a header row, whose last
# row holds a borderless nested table in its first cell.
# (The nested-table markup is kept on one line; the posted copy had it split
# by a "+" line-wrap marker that ended up inside the </tr> end tag.)
my $html = q~
<table border=1>
<tr><th>header1</th><th>header2</th></tr>
<tr>
<Td> tr 1, td 1 </td>
<td> tr 1, td 2</td>
</tr>
<tr>
<td> tr 3, td 1 <table border=0><tr><td>00</td></tr><tr><td>0</td></tr><tr><td> Yes, a Lone Table Cell</td></tr></table>
</td>
<td> tr2 td2 </td>
</tr>
</table>~;

my $t = HTML::TreeBuilder->new();
$t->parse($html);
$t->eof;    # signal end of input so the tree is finalized

# find_by_tag_name also returns nested tables, so a nested table is drawn
# inside its parent cell and then once more on its own.
print DumpTable( $_ ), $/, $/
    for $t->find_by_tag_name('table') ;

# Release the parse tree now that all tables have been rendered.
$t->delete;

# DumpTable( $table )
#
# Render one HTML <table> element as an ASCII-art table.
#
#   $table - an element object whose tag is 'table' (the script passes the
#            nodes returned by find_by_tag_name('table')).
#
# Returns the string produced by Text::ASCIITable's draw().
# Dies with "<element> is not a table" for any other element.
#
# <th> cells provide the column headings; when several rows contain <th>
# cells, the headings of the last such row win.  Without any <th> the header
# row is hidden.  Rows are read from the table itself and from its
# <thead>/<tbody>/<tfoot> sections.  A table nested inside a cell is rendered
# recursively and embedded in that cell's text.  colspan/rowspan are ignored.
sub DumpTable {
    my $ht = shift;

    die "$ht is not a table" unless $ht->tag eq 'table';

    my $tt = Text::ASCIITable::->new;
    my @co;    # column headings, taken from <th> cells
    my @da;    # data rows: one array ref of cell strings per <tr>

    for my $ro ( _table_rows($ht) ) {
        my @heads;
        my @cells;

        for my $ce ( $ro->content_list ) {
            next unless ref $ce;    # plain text between cells has no tag

            my $tag = $ce->tag;
            if ( $tag eq 'td' ) {
                push @cells, _cell_text($ce);
            }
            elsif ( $tag eq 'th' ) {
                push @heads, $ce->as_text;
            }
        }

        # Keep only the most recent header row; appending every header row
        # would define more columns than the data rows have.
        @co = @heads if @heads;

        # Rows without any <td> (e.g. pure header rows) add no data row.
        push @da, \@cells if @cells;
    }

    if (@co) {
        $tt->setCols(@co);
    }
    else {
        # No headings: create as many blank-named columns as the widest row
        # has cells (at least one) and hide the header row and its rule.
        use List::Util qw(max);
        my $max = 1 + max( 0, map { $#$_ } @da );
        $tt->setCols( (' ') x $max );
        $tt->setOptions( hide_HeadRow  => 1 );
        $tt->setOptions( hide_HeadLine => 1 );
    }

    $tt->addRow($_) for @da;

    # A true border attribute (e.g. border=1, but not border=0) turns on
    # separator lines between rows.
    $tt->setOptions( 'drawRowLine', 1) if $ht->attr('border');
#    return $tt->draw();
    return $tt->draw(
        [ '.=', '=.', '-', '-' ],    # .=-----------=.
        [ '|',  '|',  '|' ],         # | info | info |
        [ '|-', '-|', '=', '=' ],    # |-===========-|
        [ '|',  '|',  '|' ],         # | info | info |
        [ "'=", "='", '-', '-' ],    # '=-----------='
        [ '|=', '=|', '-', '*' ]     # row separator
    );
}

# _table_rows( $table )
#
# Return the <tr> elements of a table in document order: rows that are direct
# children of the table, plus rows inside its <thead>, <tbody> and <tfoot>
# children.  Rows of nested tables are not included.
sub _table_rows {
    my $table = shift;

    my @rows;
    for my $kid ( $table->content_list ) {
        next unless ref $kid;    # skip text nodes

        my $tag = $kid->tag;
        if ( $tag eq 'tr' ) {
            push @rows, $kid;
        }
        elsif ( $tag eq 'thead' or $tag eq 'tbody' or $tag eq 'tfoot' ) {
            push @rows,
                grep { ref $_ and $_->tag eq 'tr' } $kid->content_list;
        }
    }

    return @rows;
}

# _cell_text( $node )
#
# Return the text of a cell (or of an element inside a cell).  Without a
# nested table this is simply as_text.  Otherwise the children are walked in
# order: text is copied, each nested table is replaced by its DumpTable
# rendering on lines of its own, and other elements are handled recursively
# so that a table wrapped in further markup is still rendered.
sub _cell_text {
    my $node = shift;

    return $node->as_text unless $node->look_down( '_tag', 'table' );

    my $string = '';
    for my $i ( $node->content_list ) {
        if ( not ref $i ) {
            $string .= $i;
        }
        elsif ( $i->tag eq 'table' ) {
            $string .= "\n";
            $string .= DumpTable($i);
            $string .= "\n";
        }
        else {
            $string .= _cell_text($i);
        }
    }

    return $string;
}
Comment on Convert html table to text
Download Code
Replies are listed 'Best First'.
Re: Convert html table to text
by Anonymous Monk on Aug 21, 2008 at 15:29 UTC
    Is it possible to not redraw the tables that are nested inside? Jeff
Re: Convert html table to text
by Anonymous Monk on Sep 18, 2008 at 13:47 UTC
    Is it possible to limit the overall width of the table, i.e. 120 characters wide?
      Not really, this is the best I could come up with
      #!/usr/bin/perl --
      # Variant of the table dumper that tries to keep the outermost table
      # within about 120 characters by capping the width of each column.
      use strict;
      use warnings;
      use HTML::TreeBuilder;
      use Text::ASCIITable;
      use List::Util qw(max);

      # Sample input: a bordered table with a header row and a borderless
      # table nested in the first cell of its last row.
      my $html = q~
      <table border=1>
      <tr><th>header1</th><th>header2</th></tr>
      <tr>
      <Td> tr 1, td 1 </td>
      <td> tr 1, td 2</td>
      </tr>
      <tr>
      <td> tr 3, td 1 <table border=0><tr><td>00</td></tr><tr><td>0</td></tr><tr><td> Yes, a Lone Table Cell</td></tr></table>
      </td>
      <td> tr2 td2 </td>
      </tr>
      </table>~;

      my $t = HTML::TreeBuilder->new();
      $t->parse($html);
      $t->eof;

      # Every table in the document is printed, nested ones included.
      print DumpTable( $_ ), $/, $/
          for $t->find_by_tag_name('table') ;

      # DumpTable( $table [, $depth] )
      #
      # Render a <table> element as an ASCII-art table and return the text.
      #   $table - element whose tag is 'table'; dies otherwise
      #   $depth - nesting level, defaults to 0; recursive calls for nested
      #            tables pass $depth + 1
      # Column widths are capped only at depth 0, i.e. for the table the
      # caller asked for, not for tables rendered inside its cells.
      sub DumpTable {
          my($ht, $depth) = (@_,0);
      #    warn "$ht depth $depth";

          die "$ht is not a table" unless $ht->tag eq 'table';

          my $tt = Text::ASCIITable::->new;
          my @co;         # column headings from <th> cells
          my @da;         # completed data rows
          my $da = [];    # row currently being collected

          for my $ro ( $ht->content_list ) {
              next unless ref $ro;    # text nodes have no tag

              if( $ro->tag eq 'tr' ) {

                  # A new row starts: store the previous one if it had cells.
                  push @da, $da if @$da;
                  $da = [];

                  for my $ce ( $ro->content_list ) {
                      next unless ref $ce;

                      if( $ce->tag eq 'td' ) {

                          if( $ce->look_down( '_tag', 'table' ) ) {
                              # Cell contains a table: build its text piece
                              # by piece, rendering child tables recursively.
                              my $string = '';

                              for my $i ( $ce->content_list ) {
                                  if( not ref $i ) {
                                      $string .= $i;
                                  }
                                  elsif( $i->tag eq 'table' ) {
                                      $string .= "\n";
                                      $string .= DumpTable($i,$depth+1);
                                      $string .= "\n";
                                  }
                                  else {
                                      $string .= $i->as_text;
                                  }
                              }

                              push @$da, $string;
                          }
                          else {
                              push @$da, $ce->as_text;
                          }
                      }
                      elsif( $ce->tag eq 'th' ) {
                          push @co, $ce->as_text;
                      }
                  }
              }
          }

          push @da, $da if @$da;

          unless(@co) {
              # No headings: blank-named columns, header row hidden.
              use List::Util qw(max);
              my $max = 1 + max( 0, map { $#$_ } @da );
              ( @co ) = (' ') x $max ;
              $tt->setOptions( hide_HeadRow  => 1 );
              $tt->setOptions( hide_HeadLine => 1 );
          }
      #    warn " co ", map {"{$_}"} @co;

          $tt->setCols( @co );

          if( $depth == 0 ){
              my $maxcolwidth = 120;
              $maxcolwidth -= length '.==.';     # outer frame; a rough guess
              $maxcolwidth -= 2 * scalar @co;    # '| ' in front of each column
              # Whole characters only: the plain quotient is usually a fraction.
              my $colwidth = int( $maxcolwidth / scalar @co );
              $tt->setColWidth( $_, $colwidth, 1 ) for @co;
          }

          $tt->addRow($_) for @da;

          $tt->setOptions( 'drawRowLine', 1) if $ht->attr('border');
      #    return $tt->draw();
          return $tt->draw(
              [ '.=', '=.', '-', '-' ],    # .=-----------=.
              [ '|',  '|',  '|' ],         # | info | info |
              [ '|-', '-|', '=', '=' ],    # |-===========-|
              [ '|',  '|',  '|' ],         # | info | info |
              [ "'=", "='", '-', '-' ],    # '=-----------='
              [ '|=', '=|', '-', '*' ]     # row separator
          );
      }
      __END__

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: snippet [id://338485]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others pondering the Monastery: (5)
As of 2016-05-29 21:38 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?