#!/usr/bin/perl -- use strict; use warnings; our $VERSION = 20120112; # 2012-01-12 use HTML::TreeBuilder; Main(@ARGV); exit(0); sub Main { if (@_) { PumpDump(@_); #~ PumpDump('', qw/look_down criteria/ ); #~ PumpDump('file', qw/look_down criteria/ ); } else { print "Usage: $0 file _tag div\n\n"; for my $ix ( 1 .. 3 ){ my $demo = "Demo$ix"; print "$demo\n"; __PACKAGE__->can($demo)->(); } print "Usage: $0 file _tag div\n\n"; } ## end else [ if (@_) ] } ## end sub Main sub Demo1 { my $html = <<'__HTML__';
key1 val1
key2 val2
key3 val3
key4 val4
key5 val5
key6 val6
key7 val7
key8 val8
key9 val9
key10 val10
key11 val11
__HTML__ PumpDump( $html, _tag => qr/table|strong/i ); } ## end sub Demo1 sub Demo2 { my $html = <<'__HTML__';
don't
"quote"
it's
nonsense
__HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo2 sub Demo3 { my $html = <<'__HTML__'; educa.ch
Adresse - Schulen in der SchweizDruckenSchliessen
 
Altes Schulhaus Ossingen
 
Guntibachstrasse 10
8475  Ossingen
 
sekretariat.psossingen@bluewin.ch
 
Tel:052 317 15 45
Fax:052 317 04 42
 
__HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo3 sub HTML::Element::addressx { return join( '/', '', # // ROOT reverse( # so it starts at the top map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ) ); } ## end sub HTML::Element::addressx sub HTML::Element::addressxx { my (@stuff) = ( map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $_->attr('id') ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxx sub HTML::Element::addressxX { my (@stuff) = ( map { my $e = $_; my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree my $t = $e->tag; my @left = $e->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $e->id ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( my @att = grep !/^id$/, $e->all_external_attr_names ) { $count = '[' . join( ' and ', map { sprintf q!@%s='%s'!, $_, xpath_attr_escape($e->attr($_)) } @att ) . ']'; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; my $stuff = @stuff; use List::MoreUtils qw[ before_incl ]; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxX sub PumpDump { my ( $html, @lookdown ) = @_; my $tree = HTML::TreeBuilder->new(); if ( $html =~ /parse($html); } else { if( $html =~ /\.xml$/ ){ $tree->implicit_tags(0); $tree->no_expand_entities(1); $tree->ignore_unknown(0); $tree->ignore_ignorable_whitespace(0); $tree->no_space_compacting(1); $tree->store_comments(1); $tree->store_pis(1); } $tree->parse_file($html); } $tree->eof; warn $tree->as_HTML, " " if $html =~ /\.xml$/; # because it just doesn't work for xml @lookdown = sub{1} unless @lookdown; # every tag for my $td ( $tree->look_down(@lookdown) ) { my $text = $td->as_trimmed_text; next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s print $td, "\t", $td->address, "\n"; print $text, "\n"; print $td->addressx, "\n"; print $td->addressxx, "\n"; print $td->addressxX, "\n"; print '-' x 66, "\n"; } ## end for my $td ( $tree->look_down...) $tree->delete; undef $tree; print '#' x 66, "\n\n"; } ## end sub PumpDump BEGIN { my %rep = qw{ " " ' ' } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } } __END__