Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number
 
PerlMonks  

Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html ($VERSION = 20120112 )

by Anonymous Monk
on Jan 12, 2012 at 13:00 UTC ( #947532=note: print w/ replies, xml ) Need Help??


in reply to htmltreexpather.pl - xpath helper, creates xpath search strings from html

Here is  $VERSION = 20120112 , incorporates OBO and properly escapes quotes/apostrophe in attributes

#!/usr/bin/perl -- use strict; use warnings; our $VERSION = 20120112; # 2012-01-12 use HTML::TreeBuilder; Main(@ARGV); exit(0); sub Main { if (@_) { PumpDump(@_); #~ PumpDump('<html>', qw/look_down criteria/ ); #~ PumpDump('file', qw/look_down criteria/ ); } else { print "Usage: $0 file _tag div\n\n"; for my $ix ( 1 .. 3 ){ my $demo = "Demo$ix"; print "$demo\n"; __PACKAGE__->can($demo)->(); } print "Usage: $0 file _tag div\n\n"; } ## end else [ if (@_) ] } ## end sub Main sub Demo1 { my $html = <<'__HTML__'; <html> <body> <div></div> <div id="wrapper"> <div></div> <div id="outer"> <div id="inner"> <div></div> <div id="center"> <div></div> <div id="main"> <div></div> <div> <table id="wrappedcontent"> <tbody class="shnitzel" bgcolor='red'> <tr> <td> <table> <tbody> <tr> <td><strong>key1</strong></td> <td>val1</td> </tr> <tr> <td><strong>key2</strong></td> <td>val2</td> </tr> <tr> <td><strong>key3</strong></td> <td>val3</td> </tr> <tr> <td><strong>key4</strong></td> <td>val4</td> </tr> <tr> <td><strong>key5</strong></td> <td>val5</td> </tr> <tr> <td><strong>key6</strong></td> <td>val6</td> </tr> <tr> <td><strong>key7</strong></td> <td>val7</td> </tr> <tr> <td><strong>key8</strong></td> <td>val8</td> </tr> <tr> <td><strong>key9</strong></td> <td>val9</td> </tr> <tr> <td><strong>key10</strong></td> <td>val10</td> </tr> <tr> <td><strong>key11</strong></td> <td>val11</td> </tr> </tbody> </table> </td> </tr> </tbody> </table> </div> </div> </div> </div> </div> </div> </body> </html> __HTML__ PumpDump( $html, _tag => qr/table|strong/i ); } ## end sub Demo1 sub Demo2 { my $html = <<'__HTML__'; <html> <body> <div id="yo"> <div q="don't"> don't <div></div> <div q='"quote"'> "quote" <div q="it's"> it's </div> <div q="it's"> nonsense </div> </div> </div> </div> </body> </html> __HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo2 sub Demo3 { my $html = <<'__HTML__'; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http:// +www.w3.org/TR/html4/loose.dtd"><html><head><meta name="generator" con +tent="DigiOnline GmbH - WebWeaver 3.4 CMS - http://www.webweaver.de"> +<title>educa.ch</title><meta http-equiv="Content-Type" content="text/ +html; charset=iso-8859-1"><link rel="stylesheet" href="101.htm"><scri +pt src="102.htm"></script><script language="JavaScript"><!-- var did='d79376'; var root=new Array('d200','d205','d73137','d1566','d79376','d'); var usefocus = 1; function check() { if ((self.focus) && (usefocus)) { self.focus(); } } // --></script></head><body bgcolor="#FFFFFF" leftmargin="0" topmargin +="0" marginwidth="0" marginheight="0" onload="check();"><table cellsp +acing="0" cellpadding="0" border="0" width="100%"><tr><td width="15" +class="popuphead"><img src="/0.gif" alt="" width="15" height="16"></t +d><td width="99%" class="popuphead">Adresse - Schulen in der Schweiz< +/td><td width="20" class="popuphead" valign="middle"><a href="#" titl +e="Print" onclick="window.print(); return false;"><img src="../pics/p +rint16x13.gif" alt="Drucken" width="16" height="13"></a></td><td widt +h="20" class="popuphead" valign="middle"><a href="#" title="close" on +click="window.close(); return false;"><img src="../pics/close21x13.gi +f" alt="Schliessen" width="21" height="13"></a></td></tr> <tr bgcolor="#B2B2B2"><td colspan="4"><img src="/0.gif" alt="" width=" +1" height="1"></td></tr></table><div class="leerzeile">&#160;</div><d +iv class="leerzeile"><img src="/0.gif" alt="" width="15"height="8">Al +tes Schulhaus Ossingen </div><div class="leerzeile">&#160;</div><d +iv><img src="/0.gif" alt="" width="15" height="8">Guntibachstrasse 10 +</div><div><img src="/0.gif" alt="" width="15" height="8"></div><div> +<img src="/0.gif" alt="" width="15" height="8">8475 &#160;Ossingen</d +iv><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" w +idth="15" height="8"><a href="" target="_blank"></a></div><div><img s +rc="/0.gif" alt="" width="15" height="8"><a href="mailto: sekretariat +.psossingen@bluewin.ch">sekretariat.psossingen@bluewin.ch</a></div><d +iv class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width= +"15" height="8">Tel:<img src="/0.gif" alt="" width="6" height="8">052 + 317 15 45 </div><div><img src="/0.gif" alt="" width="15" height="8"> +Fax:<img src="/0.gif" alt="" width="4" height="8">052 317 04 42 </div +><div>&#160;</div></body></html> __HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo3 sub HTML::Element::addressx { return join( '/', '', # // ROOT reverse( # so it starts at the top map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone er +ror, in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ) ); } ## end sub HTML::Element::addressx sub HTML::Element::addressxx { my (@stuff) = ( map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, + in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $_->attr('id') ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxx sub HTML::Element::addressxX { my (@stuff) = ( map { my $e = $_; my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, + in xpath, start counting at 1? xpather, xpath checker agree my $t = $e->tag; my @left = $e->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $e->id ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( my @att = grep !/^id$/, $e->all_external_attr_na +mes ) { $count = '[' . join( ' and ', map { sprintf q!@%s='%s'!, $_, xpath_attr_escape($ +e->attr($_)) } @att ) . ']'; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; my $stuff = @stuff; use List::MoreUtils qw[ before_incl ]; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxX sub PumpDump { my ( $html, @lookdown ) = @_; my $tree = HTML::TreeBuilder->new(); if ( $html =~ /</ ) { $tree->parse($html); } else { if( $html =~ /\.xml$/ ){ $tree->implicit_tags(0); $tree->no_expand_entities(1); $tree->ignore_unknown(0); $tree->ignore_ignorable_whitespace(0); $tree->no_space_compacting(1); $tree->store_comments(1); $tree->store_pis(1); } $tree->parse_file($html); } $tree->eof; warn $tree->as_HTML, " " if $html =~ /\.xml$/; # because it just does +n't work for xml @lookdown = sub{1} unless @lookdown; # every tag for my $td ( $tree->look_down(@lookdown) ) { my $text = $td->as_trimmed_text; next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s print $td, "\t", $td->address, "\n"; print $text, "\n"; print $td->addressx, "\n"; print $td->addressxx, "\n"; print $td->addressxX, "\n"; print '-' x 66, "\n"; } ## end for my $td ( $tree->look_down...) $tree->delete; undef $tree; print '#' x 66, "\n\n"; } ## end sub PumpDump BEGIN { my %rep = qw{ " &quot; ' &apos; } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } } __END__

Demo2 HTML::Element=HASH(0xb33a1c) 0.1.0 don't "quote" it's nonsense /html/body/div //div[@id='yo'] //div[@id='yo'] ------------------------------------------------------------------ HTML::Element=HASH(0xb33bdc) 0.1.0.0.2.2 nonsense /html/body/div/div/div[2]/div[2] //div[@id='yo']/div/div[2]/div[2] //div[@id='yo']/div[@q='don&apos;t']/div[@q='&quot;quote&quot;']/div[@ +q='it&apos;s'] ------------------------------------------------------------------ ##################################################################


Comment on Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html ($VERSION = 20120112 )
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://947532]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others browsing the Monastery: (7)
As of 2015-07-05 18:05 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (67 votes), past polls