Beefy Boxes and Bandwidth Generously Provided by pair Networks
laziness, impatience, and hubris
 
PerlMonks  

XPATH DOM traverse html/xml

by gsiglet (Acolyte)
on Dec 07, 2009 at 17:53 UTC ( #811576=perlquestion: print w/replies, xml ) Need Help??
gsiglet has asked for the wisdom of the Perl Monks concerning the following question:

Good afternoon. Could you please help with the following? How can I traverse an XML/HTML document and, for each terminal node, find the entire path from that node up to the root element? Thank you in advance.

Replies are listed 'Best First'.
Re: XPATH DOM traverse html/xml (build your own)
by ikegami (Pope) on Dec 07, 2009 at 19:02 UTC
    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # Print "<path>: <element name>" for $node, then do the same for each of
    # its element children, depth first.
    #
    #   $node - node whose name is printed and whose children are walked
    #   $path - path string printed for $node (treated as '' when undefined)
    #
    # Each child's path is the parent's path plus a positional step "*[N]",
    # where N counts element children only, starting at 1.
    sub visit {
        my ($node, $path) = @_;
        $path = '' unless defined $path;

        print "$path: " . $node->nodeName() . "\n";

        # Add a separator before the child step unless the path is empty or
        # already ends with one.
        if (length($path) && substr($path, -1) ne '/') {
            $path .= '/';
        }

        my $position = 0;
        for my $child ($node->childNodes()) {
            # Skip anything that is not an element (text, comments, ...).
            next if $child->nodeType() != XML_ELEMENT_NODE;

            $position++;
            visit($child, "${path}*[${position}]");
        }
    }

    # Read the document from standard input and walk it from the root element.
    my $doc = XML::LibXML->new()->parse_fh(*STDIN);
    visit($doc->documentElement(), '/*');
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*[1]: CabinClass /*/*[2]/*[1]/*[2]/*[1]/*[1]: AirRows /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[2]: AirSeat ... 
/*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]: AirRow /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber

    You can use other expressions for the path segments if you want. I kept it simple.

    And of course, you can do things other than printing the path and element name.

    Update: Fixed bugs.

Re: XPATH DOM traverse html/xml (->nodePath)
by ikegami (Pope) on Dec 07, 2009 at 19:16 UTC
    Turns out that XML::LibXML can do the work for you at the cost of flexibility:
    use strict;
    use warnings;

    use XML::LibXML qw( );

    # Read the document from standard input.
    my $doc  = XML::LibXML->new()->parse_fh(*STDIN);
    my $root = $doc->documentElement();

    # For every node matched by '//*', print the path reported by nodePath()
    # followed by the node's name.
    printf("%s: %s\n", $_->nodePath(), $_->nodeName())
        for $root->findnodes('//*');
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*: CabinClass /*/*[2]/*[1]/*[2]/*/*: AirRows /*/*[2]/*[1]/*[2]/*/*/*[1]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*/*/*[2]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[2]: AirSeat ... /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*/*/*[22]: AirRow /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName 
/*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber
Re: XPATH DOM traverse html/xml (fancy)
by ikegami (Pope) on Dec 07, 2009 at 20:20 UTC

    ok, I can't leave this alone :)

    To get

    /p1:root /p1:root/p1:foo[1] /p1:root/p1:foo[2] /p1:root/p1:bar[1] /p1:root/p1:bar[2] Prefix definitions: p1: http://www.example.org/tooty

    instead of

    /* /*/*[1] /*/*[2] /*/*[3] /*/*[4]

    requires a lot more work.

    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # The traversal context ($cx) is an array reference:
    #   [0]  number of prefixes allocated so far
    #   [1]  hash mapping namespace URI => generated prefix ("p1", "p2", ...)
    #   [2]  XML::LibXML::XPathContext on which each prefix is registered

    # Build a fresh, empty traversal context.
    sub _new_context {
        return [ 0, {}, XML::LibXML::XPathContext->new() ];
    }

    # Return the name to use for $node in a path step.
    #
    # Elements without a namespace keep their plain name.  Elements in a
    # namespace get "<generated prefix>:<local name>"; a prefix is allocated
    # and registered on the XPath context the first time a URI is seen.
    sub _qname {
        my ($cx, $node) = @_;

        my $uri = $node->namespaceURI();
        return $node->nodeName() if !defined($uri);

        my $prefix = $cx->[1]{$uri};
        if (!defined($prefix)) {
            $cx->[1]{$uri} = $prefix = 'p' . ++($cx->[0]);
            $cx->[2]->registerNs($prefix, $uri);
        }

        # Use the local name, not nodeName(): nodeName() includes the prefix
        # used in the document, which would yield "p1:x:foo" for <x:foo>.
        return $prefix . ':' . $node->localname();
    }

    # Print $path, then recurse into the element children of $node.
    #
    # Children are indexed per qualified name, so the Nth child with a given
    # name gets the step "name[N]".
    sub _visit {
        my ($cx, $path, $node) = @_;

        print("$path\n");

        # Add a separator before the child step unless the path is empty or
        # already ends with one.
        $path .= '/' if length($path) && $path !~ m{/\z};

        my %idxs;
        for my $child ($node->childNodes()) {
            next if $child->nodeType() != XML_ELEMENT_NODE;

            my $qname = _qname($cx, $child);
            my $i     = ++$idxs{$qname};
            _visit($cx, "$path$qname\[$i]", $child);
        }

        return;
    }

    # Print the prefix => URI table collected in $cx, in allocation order
    # (p1, p2, ...) so the output is the same on every run.
    sub _print_prefix_definitions {
        my ($cx) = @_;

        my %uri_for = reverse %{ $cx->[1] };

        print("\n");
        print("Prefix definitions:\n");
        for my $n (1 .. $cx->[0]) {
            my $prefix = "p$n";
            print("$prefix: $uri_for{$prefix}\n");
        }

        return;
    }

    # Print paths relative to $node (starting at "."), followed by the prefix
    # definitions used in those paths.
    sub visit_node {
        my ($node) = @_;

        my $cx = _new_context();
        _visit($cx, '.', $node);
        _print_prefix_definitions($cx);

        return;
    }

    # Print absolute paths for every element of $doc, starting at its document
    # element, followed by the prefix definitions used in those paths.
    sub visit_doc {
        my ($doc) = @_;

        my $node = $doc->documentElement();
        my $cx   = _new_context();
        _visit($cx, '/' . _qname($cx, $node), $node);
        _print_prefix_definitions($cx);

        return;
    }

    # Read the document from standard input and print its paths.
    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_fh(*STDIN);
    visit_doc($doc);

    The code doesn't use the XML::LibXML::XPathContext object it creates, but I figured you'd need it if you're building xpaths.

      Thank you very much! I will try your solution. Have a nice evening.
Re: XPATH DOM traverse html/xml
by mirod (Canon) on Dec 08, 2009 at 16:11 UTC

    I don't know which module you use, but with XML::Twig you can use the xpath method like this, for example:

    perl -MXML::Twig -E'my $t=XML::Twig->parse( "my.xml"); foreach my $e ($t->descendants( "#ELT")) { say $e->xpath; }'

      XML::Twig is one of the many things that keep me using Perl. (It's one of the modules my personal 'M.pm' module loads -- 'M.pm' so I can do perl -MM -lwe ....) So, thanks again, mirod.

      The first thing that came to mind was:

      perl -MXML::Twig -E 'XML::Twig->new( twig_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

      ...which isn't so different from what you wrote. But, the following has the benefit of not requiring the loading of the entire file (correct?):

      perl -MXML::Twig -E 'XML::Twig->new( start_tag_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

        Absolutely, the way you wrote it is more efficient. I just went for the easiest way I could think of, not knowing enough about the OP's exact constraints.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://811576]
Approved by Corion
Front-paged by Old_Gray_Bear
help
Chatterbox?
[Corion]: ... values to be used. For example, I think for headers, one would want to have various kinds of Content-Encoding headers, but for the get_parameters one would have various kinds of Bobby Tables
[choroba]: What about [metadoc:// Algorithm::Loops]?
[Corion]: choroba: Yeah, but handing off the request to Dancer,Plack, Mojolicious,LWP is easy once I have the data filled into some structure ;))
[choroba]: Algorithm::Loops
[Corion]: choroba: I'm using that to generate the permutations, but I don't know how the user can pass the intended values to my function in a sane way
[Corion]: I have a prototype that permutes the get_parameters, but the values used for the get parameters should be different from the values used for the headers and potentially for parts of the URL
[Corion]: But yes, in general, my approach will be "split the URL into another set of parameters, generate an array of allowed values for each parameter and then NestedLoops() over the set"
[choroba]: hmm... so you need something like bag from Test::Deep, but not for checking, but for generation
[Corion]: This has the dual use of easily requesting sequential URLs and also being suitable for testing
[Corion]: For testing, I want to skip all tests with the same value(s) once one test fails to cut down on the number of failing tests

How do I use this? | Other CB clients
Other Users?
Others wandering the Monastery: (9)
As of 2017-01-17 08:16 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Do you watch meteor showers?




    Results (152 votes). Check out past polls.