Beefy Boxes and Bandwidth Generously Provided by pair Networks
Clear questions and runnable code
get the best and fastest answer
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??

And here is the libxml variant

#!/usr/bin/perl -- use strict; use warnings; use XML::LibXML 1.70; ## for load_html/load_xml/location use Getopt::Long(); Main( @ARGV ); exit( 0 ); sub Usage {" Usage: ## xpatherize only terminal nodes (no descendents) ## xpatherize only terminal tags (no subtags) $0 xml_or_html_file_or_URL $0 xml_or_html_file_or_URL //tagname $0 http://example.com/?htm_signals_to_treat_it_as_html //a ## force load_xml or load_html $0 --xml xml_or_html_file_or_URL //tagname $0 --html xml_or_html_file_or_URL //tagname ## force xpatherize only terminal nodes (no descendents) ## force xpatherize only terminal tags (no subtags) $0 --terminal xml_or_html_file_or_URL //tagname ## force xpatherize all matching nodes (disable terminal) $0 --all xml_or_html_file_or_URL //tagname $0 --help \n"; } sub Main { @_ or die Usage(); my %opt; Getopt::Long::GetOptionsFromArray( \@_, \%opt, q{html|htm|ht!}, q{all|a!}, q{terminal|term|t!}, q{xml|xm|x!}, q{help|h!}, q{posy!}, # todo??? nah q{star!}, q{rats!}, q{raid!}, ); $opt{help} and return print Usage(); my( $url , $path ) = @_; my $load = $url=~/htm/i ? 'load_html' : 'load_xml'; $opt{html} and $load = 'load_html'; $opt{xml} and $load = 'load_xml'; my $terminal = 1; $path and $terminal = 0; $opt{terminal} and $terminal = 1; $opt{all} and $terminal = 0; $path or $path = '//*'; my $dom = XML::LibXML->new( qw/ recover 2 / )->$load( location => $url, ); for my $node( $dom->F( $path ) ){ next if $terminal and $node->F('.//*')->size; print #~ '# ', overload::StrVal($node), "\n", $node->nodePath,"\n", $node->fullxpath,"\n", "# \x22content\x22\n ",shorten( $node->textContent ), "\n\n------\n", ;;;;;;;;;;;; } } BEGIN { my %rep = qw{ " " ' ' } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } $::xpc = XML::LibXML::XPathContext->new( ); sub XML::LibXML::Node::F { my( $self, $xpath, $context ) = @_; $::xpc->findnodes( $xpath, $context || $self ); } } sub XML::LibXML::Node::POS { $_[0]->F('preceding-sibling::*[name()="'.$_[0]->getName().'"]' )-> +size+1; } sub shorten { my $longy = join '', @_; $longy =~ s/[\r\n\t]+/ /gs; my $ll = length($longy); $ll > 71 and substr( $longy, 69, $ll ) = '...'; $longy; } sub XML::LibXML::Node::fullxpath { my $node = shift; my $ret = ''; $ret .= "\n# posy\n".fullxpath_posy($node)."\n"; $ret .= "\n# star".fullxpath_star($node)."\n\n"; $ret .= "# rats\n".fullxpath_rats($node)."\n\n"; if( $ret =~ /\s\@id\s=\s"/ ){ ## something to trim? $ret .= "# raid\n".fullxpath_rats_raid($node)."\n"; $ret .= "# chop\n".fullxpath_rats_cutoff($node)."\n"; } $ret; } ## *[...]/*[...] always sub fullxpath_star { my $node = shift; #~ my $ret = "\n/" . yatts( $node ); my $ret = "\n/" . yatts( $node, !!1 ); ## why was this my $parent = $node->getParentNode; while ($parent and $parent->getParentNode()) { $ret = yatts( $parent, !!1 ) . $ret; $ret = "\n/". $ret; $parent = $parent->getParentNode(); } $ret; } ## /every[1]/node[1]/position[1]/always[1] sub fullxpath_posy { my $node = shift; my $ret = ''; my $parent = $node; while ($parent and $parent->getParentNode()) { my $pos = $parent->POS(); $ret = '['.( $pos ).']' . $ret; $ret = '/'.$parent->getName () . $ret; $parent = $parent->getParentNode (); } $ret; } sub yatts { my( $node, $dopos ) = @_; my $name = xpath_attr_escape( $node->getName() ); my @ret = qq{name() = "$name"}; if( $dopos ){ push @ret, 'position() = '.$node->POS; } for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return join '', '*[ ', join( ' and ', @ret ) , ' ]'; } sub datts { my( $node ) = @_; my @ret = 'position() = '.$node->POS; for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return \@ret; } sub fullxpath_ratsy { my $node = shift; my @stuff ; my $parent = $node; while ($parent and $parent->getParentNode()) { my $atts = datts( $parent ) ; if( @$atts > 1 ){ ## more than position my $name = xpath_attr_escape( $parent->getName() ); push @stuff, join '', '*[ ', join( ' and ', qq{name() = "$name"}, @$atts , ), ' ]', ;;;;;;;; } else { push @stuff, $parent->getName() .'['. $parent->POS .']'; } $parent = $parent->getParentNode(); } return @stuff; } ## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" + ] sub fullxpath_rats { return join '/', '', map {"$_\n " } reverse &fullxpath_ratsy; } ## absolute with @id trumping other attrs sub fullxpath_rats_raid { return join '/', '', map {"$_\n " } reverse &fullxpath_rats_theid; } ## if @id remove all other attributes / id's are unique right? sub fullxpath_rats_theid { return map { m{ \sname\(\)\s=\s"([^"]+)" .+? \s(\@id\s=\s"[^"]+") }xi ? "$1\[$2]" : $_ } &fullxpath_ratsy; } ## relative from first @id , with @id trumping other attrs sub fullxpath_rats_cutoff { my @stuff = &fullxpath_rats_theid; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\@id\s=\s"/i } @stuff; return join '/', ( $stuff > @stuff ? '/' : '' ), map {"$_\n " } reverse @stuff; } __END__

This xml

<?xml version="1.0" encoding="UTF-8"?> <sub-group-tree> <fake rocks="diamons"> <fake watch="ebolex" id="delicious"> <fake teeth="wood"> <niagra> peels </niagra> </fake> <fake ailment="vasomunchgestion"> <fake condition="Hungary" id="staxicemnt"> <fake disease="chroniclion"> <ip-address-ranges>192.168.0.1/24</ip-address-ranges> </fake> </fake> </fake> </fake> </fake> </sub-group-tree>

By default produces these paths to the same node

/sub-group-tree/fake/fake/fake[1]/niagra # posy /sub-group-tree[1]/fake[1]/fake[1]/fake[1]/niagra[1] # star /*[ name() = "sub-group-tree" and position() = 1 ] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = + "delicious" ] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ name() = "niagra" and position() = 1 ] # rats /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id += "delicious" ] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # raid /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /fake[@id = "delicious"] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # chop //fake[@id = "delicious"] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # "content" peels ------ /sub-group-tree/fake/fake/fake[2]/fake/fake/ip-address-ranges # posy /sub-group-tree[1]/fake[1]/fake[1]/fake[2]/fake[1]/fake[1]/ip-address- +ranges[1] # star /*[ name() = "sub-group-tree" and position() = 1 ] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = + "delicious" ] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestio +n" ] /*[ name() = "fake" and position() = 1 and @condition = "Hungary" and +@id = "staxicemnt" ] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /*[ name() = "ip-address-ranges" and position() = 1 ] # rats /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id += "delicious" ] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgesti +on" ] /*[ name() = "fake" and position() = 1 and @condition = "Hungary" and + @id = "staxicemnt" ] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # raid /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /fake[@id = "delicious"] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgesti +on" ] /fake[@id = "staxicemnt"] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # chop //fake[@id = "staxicemnt"] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # "content" 192.168.0.1/24 ------

In reply to Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML by Anonymous Monk
in thread htmltreexpather.pl - xpath helper, creates xpath search strings from html by Anonymous Monk

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others perusing the Monastery: (4)
    As of 2014-10-25 09:35 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      For retirement, I am banking on:










      Results (142 votes), past polls