Beefy Boxes and Bandwidth Generously Provided by pair Networks
more useful options
 
PerlMonks  

XPATH DOM traverse html/xml

by gsiglet (Acolyte)
on Dec 07, 2009 at 17:53 UTC ( [id://811576]=perlquestion: print w/replies, xml ) Need Help??

gsiglet has asked for the wisdom of the Perl Monks concerning the following question:

Good afternoon. Could you please help with the following? How can I traverse an XML/HTML document and, for each terminal node, find the entire path from that node up to the root element? Thank you in advance.

Replies are listed 'Best First'.
Re: XPATH DOM traverse html/xml (build your own)
by ikegami (Patriarch) on Dec 07, 2009 at 19:02 UTC
    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # Print "<path>: <element name>" for $node, then do the same for each of
    # its element children, depth first.
    #
    #   $node - the XML::LibXML node to report on
    #   $path - the path string to print for $node (treated as '' when undef)
    #
    # Each child's path is $path followed by a positional step "*[N]", where
    # N counts element children only, starting at 1.
    sub visit {
        my ($node, $path) = @_;
        $path = '' unless defined($path);

        print($path . ': ' . $node->nodeName() . "\n");

        # Add a step separator unless the path is empty or already ends in one.
        if (length($path) && $path !~ m{/\z}) {
            $path .= '/';
        }

        my $position = 0;
        for my $child ($node->childNodes()) {
            # Skip anything that is not an element (text, comments, ...).
            next if $child->nodeType() != XML_ELEMENT_NODE;

            $position++;
            visit($child, $path . '*[' . $position . ']');
        }
    }

    # Read the document from standard input and walk it from the root element.
    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_fh(*STDIN);
    my $root   = $doc->documentElement();

    visit($root, '/*');
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*[1]: CabinClass /*/*[2]/*[1]/*[2]/*[1]/*[1]: AirRows /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[2]: AirSeat ... 
/*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]: AirRow /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber

    You can use other expressions for the path segments if you want. I kept it simple.

    And of course, you can do things other than printing the path and element name.

    Update: Fixed bugs.

Re: XPATH DOM traverse html/xml (->nodePath)
by ikegami (Patriarch) on Dec 07, 2009 at 19:16 UTC
    Turns out that XML::LibXML can do the work for you at the cost of flexibility:
    use strict;
    use warnings;

    use XML::LibXML qw( );

    # Read the document from standard input.
    my $doc  = XML::LibXML->new()->parse_fh(*STDIN);
    my $root = $doc->documentElement();

    # '//*' selects every element in the document; for each one, print the
    # path reported by XML::LibXML's nodePath() followed by the element name.
    foreach my $element ($root->findnodes('//*')) {
        printf("%s: %s\n", $element->nodePath(), $element->nodeName());
    }
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*: CabinClass /*/*[2]/*[1]/*[2]/*/*: AirRows /*/*[2]/*[1]/*[2]/*/*/*[1]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*/*/*[2]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[2]: AirSeat ... /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*/*/*[22]: AirRow /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName 
/*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber
Re: XPATH DOM traverse html/xml (fancy)
by ikegami (Patriarch) on Dec 07, 2009 at 20:20 UTC

    ok, I can't leave this alone :)

    To get

    /p1:root /p1:root/p1:foo[1] /p1:root/p1:foo[2] /p1:root/p1:bar[1] /p1:root/p1:bar[2] Prefix definitions: p1: http://www.example.org/tooty

    instead of

    /* /*/*[1] /*/*[2] /*/*[3] /*/*[4]

    requires a lot more work.

    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # The traversal context ($cx) passed between the subs below is an array ref:
    #   [0] number of prefixes allocated so far
    #   [1] hash mapping namespace URI => generated prefix ("p1", "p2", ...)
    #   [2] an XML::LibXML::XPathContext on which each generated prefix is
    #       registered, so the printed paths can later be evaluated with it

    # Create a fresh, empty traversal context.
    sub _new_context {
        return [ 0, {}, XML::LibXML::XPathContext->new() ];
    }

    # Return the name to use for $node in a path step.
    #
    # Elements without a namespace are returned by their plain node name.
    # Elements in a namespace are returned as "<generated prefix>:<local name>";
    # a new prefix is allocated and registered the first time a URI is seen.
    sub _qname {
        my ($cx, $node) = @_;

        my $uri = $node->namespaceURI();
        return $node->nodeName() if !defined($uri);

        my $prefix = $cx->[1]{$uri};
        if (!defined($prefix)) {
            $cx->[1]{$uri} = $prefix = 'p' . ++($cx->[0]);
            $cx->[2]->registerNs($prefix, $uri);
        }

        # Use the local name here: nodeName() includes the prefix used in the
        # document, which would yield an invalid step such as "p1:x:foo".
        return $prefix . ':' . $node->localname();
    }

    # Print $path for $node, then recurse into its element children.
    #
    # Each child's path is $path plus "<qname>[i]", where i is the 1-based
    # position of the child among its siblings that share the same qname.
    sub _visit {
        my ($cx, $path, $node) = @_;

        print("$path\n");

        # Add a step separator unless the path is empty or already ends in one.
        $path .= '/' if length($path) && $path !~ m{/\z};

        my @children = grep $_->nodeType() == XML_ELEMENT_NODE, $node->childNodes();

        my %idxs;    # qname => number of siblings with that qname seen so far
        for my $child (@children) {
            my $qname = _qname($cx, $child);
            my $i     = ++$idxs{$qname};
            _visit($cx, "$path$qname\[$i]", $child);
        }
    }

    # Print the prefix => namespace URI table collected in $cx, in the order
    # the prefixes were allocated (p1, p2, ...), so the output is repeatable.
    sub _print_prefixes {
        my ($cx) = @_;

        my %uri_for = reverse %{ $cx->[1] };    # generated prefix => URI

        print("\n");
        print("Prefix definitions:\n");
        for my $n (1 .. $cx->[0]) {
            my $prefix = "p$n";
            print("$prefix: $uri_for{$prefix}\n");
        }
    }

    # Print paths relative to $node (starting at '.') for $node and all of its
    # descendant elements, followed by the prefix definitions.
    sub visit_node {
        my ($node) = @_;

        my $cx   = _new_context();
        my $path = '.';

        _visit($cx, $path, $node);
        _print_prefixes($cx);
    }

    # Print absolute paths for every element of $doc, starting at its document
    # element, followed by the prefix definitions.
    sub visit_doc {
        my ($doc) = @_;

        my $node = $doc->documentElement();
        my $cx   = _new_context();
        my $path = '/' . _qname($cx, $node);

        _visit($cx, $path, $node);
        _print_prefixes($cx);
    }

    # Read the document from standard input and report on all of it.
    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_fh(*STDIN);

    visit_doc($doc);

    The code doesn't use the XML::LibXML::XPathContext object it creates, but I figured you'd need it if you're building xpaths.

      Thank you very much! I will try your solution. Have a nice evening.
Re: XPATH DOM traverse html/xml
by mirod (Canon) on Dec 08, 2009 at 16:11 UTC

    I don't know which module you use, but with XML::Twig you can use the xpath method like this, for example:

    # Parse my.xml with XML::Twig, then print the xpath of every descendant matched by "#ELT".
    perl -MXML::Twig -E'my $t=XML::Twig->parse( "my.xml"); foreach my $e ($t->descendants( "#ELT")) { say $e->xpath; }'

      XML::Twig is one of the many things that keep me using Perl. (It's one of the modules my personal 'M.pm' module loads -- 'M.pm' so I can do perl -MM -lwe ....) So, thanks again, mirod.

      The first thing that came to mind was:

      # Parse the file named by the first argument; the _all_ twig handler prints $_->xpath each time it is called.
      perl -MXML::Twig -E 'XML::Twig->new( twig_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

      ...which isn't so different from what you wrote. But, the following has the benefit of not requiring the loading of the entire file (correct?):

      # Same as the twig_handlers form, but registers the handler under start_tag_handlers
      # (presumably fired as each start tag is parsed -- confirm in the XML::Twig docs).
      perl -MXML::Twig -E 'XML::Twig->new( start_tag_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

        Absolutely, the way you wrote it is more efficient. I just went for the easiest way I could think of, not knowing enough about the OP's exact constraints.

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlquestion [id://811576]
Approved by Corion
Front-paged by Old_Gray_Bear
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others lurking in the Monastery: (3)
As of 2024-04-24 02:41 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found