Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl Monk, Perl Meditation
 
PerlMonks  

Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML

by Anonymous Monk
on Jun 27, 2013 at 03:57 UTC ( #1040902=note: print w/replies, xml ) Need Help??


in reply to htmltreexpather.pl - xpath helper, creates xpath search strings from html

And here is the libxml variant

#!/usr/bin/perl --
#
# xpather.pl -- print several alternative XPath expressions for the nodes of
# an XML or HTML document, using XML::LibXML.
#
# For every node selected by the given XPath (default '//*') the script
# prints libxml's own nodePath, followed by the "posy", "star" and "rats"
# variants, plus "raid" and "chop" when an @id predicate was generated, and
# finally a shortened copy of the node's text content.

use strict;
use warnings;
use XML::LibXML 1.70;    ## for load_html/load_xml/location
use Getopt::Long ();
use List::MoreUtils qw[ before_incl ];

Main( @ARGV );
exit( 0 );

## Usage() -- returns the help text; the text ends in a newline so that
## die Usage() does not append a file/line suffix.
sub Usage {
    return "
Usage:
  ## xpatherize only terminal nodes (no descendents)
  ## xpatherize only terminal tags (no subtags)
    $0 xml_or_html_file_or_URL
    $0 xml_or_html_file_or_URL //tagname
    $0 http://example.com/?htm_signals_to_treat_it_as_html //a

  ## force load_xml or load_html
    $0 --xml xml_or_html_file_or_URL //tagname
    $0 --html xml_or_html_file_or_URL //tagname

  ## force xpatherize only terminal nodes (no descendents)
  ## force xpatherize only terminal tags (no subtags)
    $0 --terminal xml_or_html_file_or_URL //tagname

  ## force xpatherize all matching nodes (disable terminal)
    $0 --all xml_or_html_file_or_URL //tagname

    $0 --help
\n";
}

## Main( @argv ) -- parse the options, load the document and print the
## xpath variants for every selected node.
##   @argv : options, then the file name or URL, then an optional XPath.
## Dies with the usage text when called without arguments, when option
## parsing fails, or when no file/URL is left after the options.
sub Main {
    @_ or die Usage();

    # Work on a copy: GetOptionsFromArray removes the options it consumes.
    my @args = @_;
    my %opt;
    Getopt::Long::GetOptionsFromArray(
        \@args, \%opt,
        q{html|htm|ht!},
        q{all|a!},
        q{terminal|term|t!},
        q{xml|xm|x!},
        q{help|h!},
        # todo??? nah -- the four switches below are accepted but nothing
        # in this script reads them
        q{posy!},
        q{star!},
        q{rats!},
        q{raid!},
    ) or die Usage();

    if ( $opt{help} ) {
        print Usage();
        return;
    }

    my ( $url, $path ) = @args;
    defined $url and length $url
        or die Usage();

    # The loader is guessed from the location ("htm" anywhere in it selects
    # the HTML parser); --html / --xml override the guess, --xml winning
    # when both are given.
    my $load = $url =~ /htm/i ? 'load_html' : 'load_xml';
    $load = 'load_html' if $opt{html};
    $load = 'load_xml'  if $opt{xml};

    # Without an explicit path only nodes that have no element descendants
    # are reported; with a path every match is.  --terminal / --all override
    # that default, --all winning when both are given.
    my $terminal = $path ? 0 : 1;
    $terminal = 1 if $opt{terminal};
    $terminal = 0 if $opt{all};
    $path ||= '//*';

    my $dom = XML::LibXML->new( recover => 2 )->$load( location => $url );

    for my $node ( $dom->F( $path ) ) {
        next if $terminal and $node->F( './/*' )->size;
        print
            #~ '# ', overload::StrVal($node), "\n",
            $node->nodePath, "\n",
            $node->fullxpath, "\n",
            "# \x22content\x22\n ", shorten( $node->textContent ),
            "\n\n------\n";
    }
    return;
}

## BEGIN is required: Main() runs and the script exits before any ordinary
## run-time statement placed below it would execute, so the escape table
## and the shared XPath context have to be set up at compile time.
BEGIN {
    # NOTE(review): the page this script was recovered from showed each
    # quote mapped to itself, which makes the escaping a no-op; presumably
    # the HTML rendering decoded the entity names -- confirm against the
    # original posting.  XPath 1.0 string literals have no escape syntax of
    # their own, so an expression containing these entities only means what
    # it says when it is embedded in XML (e.g. an XSLT attribute).
    my %rep = (
        q{"} => '&quot;',
        q{'} => '&apos;',
    );

    ## xpath_attr_escape( $text ) -- returns a copy of $text with every
    ## single and double quote replaced through %rep.
    sub xpath_attr_escape {
        my ( $t ) = @_;
        $t =~ s/(['"])/$rep{$1}/g;
        return $t;
    }

    # One XPath context shared by every F() call.
    $::xpc = XML::LibXML::XPathContext->new();

    ## $node->F( $xpath [, $context] ) -- findnodes() through the shared
    ## context, evaluated relative to $context, or to $node when $context
    ## is false.  The call inherits the caller's context, which is why
    ## callers can write ->F(...)->size .
    sub XML::LibXML::Node::F {
        my ( $self, $xpath, $context ) = @_;
        return $::xpc->findnodes( $xpath, $context || $self );
    }
}

## $node->POS -- 1-based position of $node among its preceding siblings
## that carry the same name().
sub XML::LibXML::Node::POS {
    my ( $self ) = @_;
    my $same_named =
        $self->F( 'preceding-sibling::*[name()="' . $self->getName() . '"]' );
    return $same_named->size + 1;
}

## shorten( @text ) -- joins its arguments, turns each run of CR/LF/TAB
## into a single space, and when the result is longer than 71 characters
## keeps the first 69 and appends '...'.
sub shorten {
    my $longy = join '', @_;
    $longy =~ s/[\r\n\t]+/ /g;
    if ( length( $longy ) > 71 ) {
        $longy = substr( $longy, 0, 69 ) . '...';
    }
    return $longy;
}

## $node->fullxpath -- returns one string holding the "# posy", "# star"
## and "# rats" variants for $node, followed by "# raid" and "# chop" when
## the text built so far contains an @id predicate.
sub XML::LibXML::Node::fullxpath {
    my $node = shift;
    my $ret  = '';
    $ret .= "\n# posy\n" . fullxpath_posy( $node ) . "\n";
    $ret .= "\n# star" . fullxpath_star( $node ) . "\n\n";
    $ret .= "# rats\n" . fullxpath_rats( $node ) . "\n\n";
    if ( $ret =~ /\s\@id\s=\s"/ ) {    ## something to trim?
        $ret .= "# raid\n" . fullxpath_rats_raid( $node ) . "\n";
        $ret .= "# chop\n" . fullxpath_rats_cutoff( $node ) . "\n";
    }
    return $ret;
}

## *[...]/*[...] always
## One "\n/*[ ... ]" step for $node and for each ancestor that itself has
## a parent; the topmost step comes first in the returned string.
sub fullxpath_star {
    my $node = shift;
    #~ my $ret = "\n/" . yatts( $node );
    my $ret    = "\n/" . yatts( $node, 1 );    ## why was this
    my $parent = $node->getParentNode;
    while ( $parent and $parent->getParentNode() ) {
        $ret    = "\n/" . yatts( $parent, 1 ) . $ret;
        $parent = $parent->getParentNode();
    }
    return $ret;
}

## /every[1]/node[1]/position[1]/always[1]
## Walks from $node upwards while the current node still has a parent,
## prepending "/name[POS]" for each node visited.
sub fullxpath_posy {
    my $node   = shift;
    my $ret    = '';
    my $parent = $node;
    while ( $parent and $parent->getParentNode() ) {
        $ret    = '/' . $parent->getName() . '[' . $parent->POS() . ']' . $ret;
        $parent = $parent->getParentNode();
    }
    return $ret;
}

## yatts( $node, $dopos ) -- returns '*[ ... ]' testing name(), optionally
## position(), and every attribute whose name does not contain "content".
sub yatts {
    my ( $node, $dopos ) = @_;
    my $name = xpath_attr_escape( $node->getName() );
    my @ret  = qq{name() = "$name"};
    if ( $dopos ) {
        push @ret, 'position() = ' . $node->POS;
    }
    for my $att ( $node->attributes() ) {
        my $att_name = $att->getName;
        next if $att_name =~ /content/;
        my $value = xpath_attr_escape( $att->getValue );
        push @ret, qq{\@$att_name = "$value"};
    }
    return join '', '*[ ', join( ' and ', @ret ), ' ]';
}

## datts( $node ) -- returns an array ref holding the position() test
## followed by one test per attribute whose name does not contain
## "content".  A single element therefore means "no usable attributes".
sub datts {
    my ( $node ) = @_;
    my @ret = 'position() = ' . $node->POS;
    for my $att ( $node->attributes() ) {
        my $att_name = $att->getName;
        next if $att_name =~ /content/;
        my $value = xpath_attr_escape( $att->getValue );
        push @ret, qq{\@$att_name = "$value"};
    }
    return \@ret;
}

## fullxpath_ratsy( $node ) -- returns the list of steps for $node and its
## ancestors, innermost first: '*[ name() = ... and ... ]' for a node that
## has usable attributes, plain 'name[POS]' otherwise.
sub fullxpath_ratsy {
    my $node = shift;
    my @stuff;
    my $parent = $node;
    while ( $parent and $parent->getParentNode() ) {
        my $atts = datts( $parent );
        if ( @$atts > 1 ) {    ## more than position
            my $name = xpath_attr_escape( $parent->getName() );
            push @stuff,
                join '', '*[ ',
                join( ' and ', qq{name() = "$name"}, @$atts ),
                ' ]';
        }
        else {
            push @stuff, $parent->getName() . '[' . $parent->POS . ']';
        }
        $parent = $parent->getParentNode();
    }
    return @stuff;
}

## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" ]
sub fullxpath_rats {
    return join '/', '', map { $_ . "\n " } reverse fullxpath_ratsy( @_ );
}

## absolute with @id trumping other attrs
sub fullxpath_rats_raid {
    return join '/', '', map { $_ . "\n " } reverse fullxpath_rats_theid( @_ );
}

## if @id remove all other attributes / id's are unique right?
## Returns the fullxpath_ratsy steps with every step that carries an @id
## test collapsed to 'name[@id = "..."]'; other steps pass through as-is.
sub fullxpath_rats_theid {
    my @steps;
    for my $step ( fullxpath_ratsy( @_ ) ) {
        if (
            $step =~ m{
                \sname\(\)\s=\s"([^"]+)"
                .+?
                \s(\@id\s=\s"[^"]+")
            }xi
          )
        {
            push @steps, "$1\[$2]";
        }
        else {
            push @steps, $step;
        }
    }
    return @steps;
}

## relative from first @id , with @id trumping other attrs
## Keeps the steps from $node up to and including the nearest one with an
## @id test.  When that drops outer steps the result starts with '//',
## otherwise it stays absolute and starts with '/'.
sub fullxpath_rats_cutoff {
    my @steps      = fullxpath_rats_theid( @_ );
    my $full_count = @steps;
    @steps = before_incl { /\@id\s=\s"/i } @steps;
    my $prefix = $full_count > @steps ? '/' : '';
    return join '/', $prefix, map { $_ . "\n " } reverse @steps;
}

__END__

This xml

<?xml version="1.0" encoding="UTF-8"?> <sub-group-tree> <fake rocks="diamons"> <fake watch="ebolex" id="delicious"> <fake teeth="wood"> <niagra> peels </niagra> </fake> <fake ailment="vasomunchgestion"> <fake condition="Hungary" id="staxicemnt"> <fake disease="chroniclion"> <ip-address-ranges>192.168.0.1/24</ip-address-ranges> </fake> </fake> </fake> </fake> </fake> </sub-group-tree>

By default produces these paths to the same node

/sub-group-tree/fake/fake/fake[1]/niagra

# posy
/sub-group-tree[1]/fake[1]/fake[1]/fake[1]/niagra[1]

# star
/*[ name() = "sub-group-tree" and position() = 1 ]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
/*[ name() = "niagra" and position() = 1 ]

# rats
/sub-group-tree[1]
 /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
 /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
 /*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
 /niagra[1]

# raid
/sub-group-tree[1]
 /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
 /fake[@id = "delicious"]
 /*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
 /niagra[1]

# chop
//fake[@id = "delicious"]
 /*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
 /niagra[1]

# "content"
 peels

------
/sub-group-tree/fake/fake/fake[2]/fake/fake/ip-address-ranges

# posy
/sub-group-tree[1]/fake[1]/fake[1]/fake[2]/fake[1]/fake[1]/ip-address-ranges[1]

# star
/*[ name() = "sub-group-tree" and position() = 1 ]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
/*[ name() = "fake" and position() = 1 and @condition = "Hungary" and @id = "staxicemnt" ]
/*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
/*[ name() = "ip-address-ranges" and position() = 1 ]

# rats
/sub-group-tree[1]
 /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
 /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
 /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
 /*[ name() = "fake" and position() = 1 and @condition = "Hungary" and @id = "staxicemnt" ]
 /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
 /ip-address-ranges[1]

# raid
/sub-group-tree[1]
 /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
 /fake[@id = "delicious"]
 /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
 /fake[@id = "staxicemnt"]
 /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
 /ip-address-ranges[1]

# chop
//fake[@id = "staxicemnt"]
 /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
 /ip-address-ranges[1]

# "content"
 192.168.0.1/24

------
  • Comment on Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
  • Select or Download Code

Replies are listed 'Best First'.
Re^2: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
by Anonymous Monk on Mar 28, 2014 at 23:57 UTC

    This should teach you 80% of everything you need to know about XPath :)

    And here is an updated version of star: it's now namespace-aware and only uses local-name(), it skips adding a bunch of xmlns attributes, and its output has a more indented look. Example:

    # star /*[ local-name() = "sub-group-tree" and position() = 1 ] /*[ local-name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ local-name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ] /*[ local-name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ local-name() = "niagra" and contains(string(), " peels ") ]

    The code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://1040902]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others making s'mores by the fire in the courtyard of the Monastery: (5)
As of 2016-10-01 21:47 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    How many different varieties (color, size, etc) of socks do you have in your sock drawer?






    Results (8 votes). Check out past polls.