Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

Re^2: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML

by Anonymous Monk
on Mar 28, 2014 at 23:57 UTC ( [id://1080168]=note: print w/replies, xml ) Need Help??


in reply to Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
in thread htmltreexpather.pl - xpath helper, creates xpath search strings from html

This should teach you 80% of everything xpath you need to know :)

And here is updated version of star its now namespace aware and only uses local-name() ... and skips adding a bunch of xmlns attributes and its more indented look, example

# star /*[ local-name() = "sub-group-tree" and position() = 1 ] /*[ local-name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ local-name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ] /*[ local-name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ local-name() = "niagra" and contains(string(), " peels ") ]

The code

#!/usr/bin/perl -- use strict; use warnings; use XML::LibXML 1.70; ## for load_html/load_xml/location use Getopt::Long(); our $VERSION = 20140328170138; Main( @ARGV ); exit( 0 ); sub Usage {" Usage: ## xpatherize only terminal nodes (no descendents) ## xpatherize only terminal tags (no subtags) $0 xml_or_html_file_or_URL $0 xml_or_html_file_or_URL //tagname $0 http://example.com/?htm_signals_to_treat_it_as_html //a ## force load_xml or load_html $0 --xml xml_or_html_file_or_URL //tagname $0 --html xml_or_html_file_or_URL //tagname ## force xpatherize only terminal nodes (no descendents) ## force xpatherize only terminal tags (no subtags) $0 --terminal xml_or_html_file_or_URL //tagname ## force xpatherize all matching nodes (disable terminal) $0 --all xml_or_html_file_or_URL //tagname $0 --help \n"; } sub Main { @_ or die Usage(); my %opt; Getopt::Long::GetOptionsFromArray( \@_, \%opt, q{html|htm|ht!}, q{all|a!}, q{terminal|term|t!}, q{xml|xm|x!}, q{help|h!}, q{posy!}, # todo??? nah q{star!}, q{rats!}, q{raid!}, ); $opt{help} and return print Usage(); my( $url , $path ) = @_; my $load = $url=~/htm/i ? 'load_html' : 'load_xml'; $opt{html} and $load = 'load_html'; $opt{xml} and $load = 'load_xml'; my $terminal = 1; $path and $terminal = 0; $opt{terminal} and $terminal = 1; $opt{all} and $terminal = 0; $path or $path = '//*'; my $dom = XML::LibXML->new( qw/ recover 2 / )->$load( location => $url, ); for my $node( $dom->F( $path ) ){ next if $terminal and $node->F('.//*')->size; print #~ '# ', overload::StrVal($node), "\n", $node->nodePath,"\n", $node->fullxpath,"\n", "# \x22content\x22\n ",shorten( $node->textContent ), "\n\n------\n", ;;;;;;;;;;;; } } BEGIN { my %rep = qw{ " " ' ' } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } $::xpc = XML::LibXML::XPathContext->new( ); sub XML::LibXML::Node::F { my( $self, $xpath, $context ) = @_; $::xpc->findnodes( $xpath, $context || $self ); } } sub XML::LibXML::Node::POS { $_[0]->F('preceding-sibling::*[name()="'.$_[0]->getName().'"]' )-> +size+1; } sub shorten { my $longy = join '', @_; $longy =~ s/[\r\n\t]+/ /gs; my $ll = length($longy); $ll > 71 and substr( $longy, 69, $ll ) = '...'; $longy; } sub XML::LibXML::Node::fullxpath { my $node = shift; my $ret = ''; $ret .= "\n# posy\n".fullxpath_posy($node)."\n"; $ret .= "\n# star".fullxpath_star($node)."\n\n"; $ret .= "# rats\n".fullxpath_rats($node)."\n\n"; if( $ret =~ /\s\@id\s=\s"/ ){ ## something to trim? $ret .= "# raid\n".fullxpath_rats_raid($node)."\n"; $ret .= "# chop\n".fullxpath_rats_cutoff($node)."\n"; } $ret; } ## *[...]/*[...] always sub fullxpath_star { my $node = shift; my $ret = "\n/" . yatts( $node, !'doposition', !!'docontent' ); my $parent = $node->getParentNode; while ($parent and $parent->getParentNode()) { $ret = yatts( $parent , !!'doposition', !'docontent' ) . $ret; $ret = "\n/". $ret; $parent = $parent->getParentNode(); } $ret; } ## /every[1]/node[1]/position[1]/always[1] sub fullxpath_posy { my $node = shift; my $ret = ''; my $parent = $node; while ($parent and $parent->getParentNode()) { my $pos = $parent->POS(); $ret = '['.( $pos ).']' . $ret; $ret = '/'.$parent->getName () . $ret; $parent = $parent->getParentNode (); } $ret; } sub yatts { my( $node, $dopos , $docontent ) = @_; $dopos = !!$dopos; $docontent = !!$docontent; my $name = xpath_attr_escape( ''.$node->localName() ); my @ret = qq{local-name() = "$name"}; if( my $ns = $node->getNamespaceURI() ){ $ns = xpath_attr_escape( $ns ); push @ret , qq{namespace-uri() = "$ns"}; } if( $dopos ){ push @ret, 'position() = '.$node->POS; } for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~ /content|xmlns/; ## skip the noise my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } if( $docontent ){ my $content = xpath_attr_escape( shorten( $node->textContent ) + ); if( length $content ){ push @ret, qq{contains(string(), "$content")}; } } return join '', '*[ ', join( "\n and ", @ret ) , "\n ]\n"; } sub datts { my( $node ) = @_; my @ret = 'position() = '.$node->POS; for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return \@ret; } sub fullxpath_ratsy { my $node = shift; my @stuff ; my $parent = $node; while ($parent and $parent->getParentNode()) { my $atts = datts( $parent ) ; if( @$atts > 1 ){ ## more than position my $name = xpath_attr_escape( $parent->getName() ); push @stuff, join '', '*[ ', join( ' and ', qq{name() = "$name"}, @$atts , ), ' ]', ;;;;;;;; } else { push @stuff, $parent->getName() .'['. $parent->POS .']'; } $parent = $parent->getParentNode(); } return @stuff; } ## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" + ] sub fullxpath_rats { return join '/', '', map {"$_\n " } reverse &fullxpath_ratsy; } ## absolute with @id trumping other attrs sub fullxpath_rats_raid { return join '/', '', map {"$_\n " } reverse &fullxpath_rats_theid; } ## if @id remove all other attributes / id's are unique right? sub fullxpath_rats_theid { return map { m{ \sname\(\)\s=\s"([^"]+)" .+? \s(\@id\s=\s"[^"]+") }xi ? "$1\[$2]" : $_ } &fullxpath_ratsy; } ## relative from first @id , with @id trumping other attrs sub fullxpath_rats_cutoff { my @stuff = &fullxpath_rats_theid; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\@id\s=\s"/i } @stuff; return join '/', ( $stuff > @stuff ? '/' : '' ), map {"$_\n " } reverse @stuff; } __END__
  • Comment on Re^2: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
  • Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://1080168]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others making s'mores by the fire in the courtyard of the Monastery: (3)
As of 2024-04-19 01:35 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found