Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl-Sensitive Sunglasses
 
PerlMonks  

Re^2: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML

by Anonymous Monk
on Mar 28, 2014 at 23:57 UTC ( [id://1080168]=note: print w/replies, xml ) Need Help??


in reply to Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
in thread htmltreexpather.pl - xpath helper, creates xpath search strings from html

This should teach you 80% of everything xpath you need to know :)

And here is updated version of star its now namespace aware and only uses local-name() ... and skips adding a bunch of xmlns attributes and its more indented look, example

# star /*[ local-name() = "sub-group-tree" and position() = 1 ] /*[ local-name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ local-name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ] /*[ local-name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ local-name() = "niagra" and contains(string(), " peels ") ]

The code

#!/usr/bin/perl -- use strict; use warnings; use XML::LibXML 1.70; ## for load_html/load_xml/location use Getopt::Long(); our $VERSION = 20140328170138; Main( @ARGV ); exit( 0 ); sub Usage {" Usage: ## xpatherize only terminal nodes (no descendents) ## xpatherize only terminal tags (no subtags) $0 xml_or_html_file_or_URL $0 xml_or_html_file_or_URL //tagname $0 http://example.com/?htm_signals_to_treat_it_as_html //a ## force load_xml or load_html $0 --xml xml_or_html_file_or_URL //tagname $0 --html xml_or_html_file_or_URL //tagname ## force xpatherize only terminal nodes (no descendents) ## force xpatherize only terminal tags (no subtags) $0 --terminal xml_or_html_file_or_URL //tagname ## force xpatherize all matching nodes (disable terminal) $0 --all xml_or_html_file_or_URL //tagname $0 --help \n"; } sub Main { @_ or die Usage(); my %opt; Getopt::Long::GetOptionsFromArray( \@_, \%opt, q{html|htm|ht!}, q{all|a!}, q{terminal|term|t!}, q{xml|xm|x!}, q{help|h!}, q{posy!}, # todo??? nah q{star!}, q{rats!}, q{raid!}, ); $opt{help} and return print Usage(); my( $url , $path ) = @_; my $load = $url=~/htm/i ? 'load_html' : 'load_xml'; $opt{html} and $load = 'load_html'; $opt{xml} and $load = 'load_xml'; my $terminal = 1; $path and $terminal = 0; $opt{terminal} and $terminal = 1; $opt{all} and $terminal = 0; $path or $path = '//*'; my $dom = XML::LibXML->new( qw/ recover 2 / )->$load( location => $url, ); for my $node( $dom->F( $path ) ){ next if $terminal and $node->F('.//*')->size; print #~ '# ', overload::StrVal($node), "\n", $node->nodePath,"\n", $node->fullxpath,"\n", "# \x22content\x22\n ",shorten( $node->textContent ), "\n\n------\n", ;;;;;;;;;;;; } } BEGIN { my %rep = qw{ " " ' ' } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } $::xpc = XML::LibXML::XPathContext->new( ); sub XML::LibXML::Node::F { my( $self, $xpath, $context ) = @_; $::xpc->findnodes( $xpath, $context || $self ); } } sub XML::LibXML::Node::POS { $_[0]->F('preceding-sibling::*[name()="'.$_[0]->getName().'"]' )-> +size+1; } sub shorten { my $longy = join '', @_; $longy =~ s/[\r\n\t]+/ /gs; my $ll = length($longy); $ll > 71 and substr( $longy, 69, $ll ) = '...'; $longy; } sub XML::LibXML::Node::fullxpath { my $node = shift; my $ret = ''; $ret .= "\n# posy\n".fullxpath_posy($node)."\n"; $ret .= "\n# star".fullxpath_star($node)."\n\n"; $ret .= "# rats\n".fullxpath_rats($node)."\n\n"; if( $ret =~ /\s\@id\s=\s"/ ){ ## something to trim? $ret .= "# raid\n".fullxpath_rats_raid($node)."\n"; $ret .= "# chop\n".fullxpath_rats_cutoff($node)."\n"; } $ret; } ## *[...]/*[...] always sub fullxpath_star { my $node = shift; my $ret = "\n/" . yatts( $node, !'doposition', !!'docontent' ); my $parent = $node->getParentNode; while ($parent and $parent->getParentNode()) { $ret = yatts( $parent , !!'doposition', !'docontent' ) . $ret; $ret = "\n/". $ret; $parent = $parent->getParentNode(); } $ret; } ## /every[1]/node[1]/position[1]/always[1] sub fullxpath_posy { my $node = shift; my $ret = ''; my $parent = $node; while ($parent and $parent->getParentNode()) { my $pos = $parent->POS(); $ret = '['.( $pos ).']' . $ret; $ret = '/'.$parent->getName () . $ret; $parent = $parent->getParentNode (); } $ret; } sub yatts { my( $node, $dopos , $docontent ) = @_; $dopos = !!$dopos; $docontent = !!$docontent; my $name = xpath_attr_escape( ''.$node->localName() ); my @ret = qq{local-name() = "$name"}; if( my $ns = $node->getNamespaceURI() ){ $ns = xpath_attr_escape( $ns ); push @ret , qq{namespace-uri() = "$ns"}; } if( $dopos ){ push @ret, 'position() = '.$node->POS; } for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~ /content|xmlns/; ## skip the noise my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } if( $docontent ){ my $content = xpath_attr_escape( shorten( $node->textContent ) + ); if( length $content ){ push @ret, qq{contains(string(), "$content")}; } } return join '', '*[ ', join( "\n and ", @ret ) , "\n ]\n"; } sub datts { my( $node ) = @_; my @ret = 'position() = '.$node->POS; for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return \@ret; } sub fullxpath_ratsy { my $node = shift; my @stuff ; my $parent = $node; while ($parent and $parent->getParentNode()) { my $atts = datts( $parent ) ; if( @$atts > 1 ){ ## more than position my $name = xpath_attr_escape( $parent->getName() ); push @stuff, join '', '*[ ', join( ' and ', qq{name() = "$name"}, @$atts , ), ' ]', ;;;;;;;; } else { push @stuff, $parent->getName() .'['. $parent->POS .']'; } $parent = $parent->getParentNode(); } return @stuff; } ## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" + ] sub fullxpath_rats { return join '/', '', map {"$_\n " } reverse &fullxpath_ratsy; } ## absolute with @id trumping other attrs sub fullxpath_rats_raid { return join '/', '', map {"$_\n " } reverse &fullxpath_rats_theid; } ## if @id remove all other attributes / id's are unique right? sub fullxpath_rats_theid { return map { m{ \sname\(\)\s=\s"([^"]+)" .+? \s(\@id\s=\s"[^"]+") }xi ? "$1\[$2]" : $_ } &fullxpath_ratsy; } ## relative from first @id , with @id trumping other attrs sub fullxpath_rats_cutoff { my @stuff = &fullxpath_rats_theid; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\@id\s=\s"/i } @stuff; return join '/', ( $stuff > @stuff ? '/' : '' ), map {"$_\n " } reverse @stuff; } __END__
  • Comment on Re^2: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
  • Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://1080168]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others taking refuge in the Monastery: (4)
As of 2025-06-23 06:00 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found

    Notices?
    erzuuliAnonymous Monks are no longer allowed to use Super Search, due to an excessive use of this resource by robots.