http://www.perlmonks.org?node_id=811576

gsiglet has asked for the wisdom of the Perl Monks concerning the following question:

Good afternoon. Could you please help with the following? How can I traverse an XML/HTML document and, for each terminal node, find the entire path from that node up to the root element? Thank you in advance.

Replies are listed 'Best First'.
Re: XPATH DOM traverse html/xml (build your own)
by ikegami (Patriarch) on Dec 07, 2009 at 19:02 UTC
    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # Print "<path>: <element name>" for $node, then recurse into each of its
    # element children, addressing every child positionally as *[N].
    #
    #   $node - the XML::LibXML node to report on
    #   $path - path built so far for $node (treated as '' when undefined)
    sub visit {
        my ($node, $path) = @_;
        $path = '' unless defined $path;

        print("$path: ", $node->nodeName(), "\n");

        # Child steps are appended after a slash, unless the path is empty or
        # already ends with one.
        my $base = $path;
        if (length($base) && substr($base, -1) ne '/') {
            $base .= '/';
        }

        # Only element nodes count toward the position; other child node
        # types are skipped.
        my $position = 0;
        for my $child ($node->childNodes()) {
            next if $child->nodeType() != XML_ELEMENT_NODE;
            $position++;
            visit($child, "$base*[$position]");
        }
    }

    # Parse the document from STDIN and walk it starting at the root element.
    my $doc = XML::LibXML->new()->parse_fh(*STDIN);
    visit($doc->documentElement(), '/*');
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*[1]: CabinClass /*/*[2]/*[1]/*[2]/*[1]/*[1]: AirRows /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]: AirRow /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*[1]/*[1]/*[2]/*[1]/*[2]: AirSeat ... 
/*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]: AirRow /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*[1]/*[1]/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber

    You can use other expressions for the path segments if you want. I kept it simple.

    And of course, you can do things other than printing the path and element name.

    Update: Fixed bugs.

Re: XPATH DOM traverse html/xml (->nodePath)
by ikegami (Patriarch) on Dec 07, 2009 at 19:16 UTC
    Turns out that XML::LibXML can do the work for you at the cost of flexibility:
    use strict;
    use warnings;

    use XML::LibXML qw( );

    # Parse the document from STDIN, then for every element matched by '//*'
    # print the path reported by nodePath() followed by the element's name.
    my $doc = XML::LibXML->new()->parse_fh(*STDIN);

    foreach my $element ($doc->documentElement()->findnodes('//*')) {
        my $path = $element->nodePath();
        my $name = $element->nodeName();
        print("$path: $name\n");
    }
    /*: OTA_AirSeatMapRS /*/*[1]: Success /*/*[2]: SeatMapResponses /*/*[2]/*[1]: SeatMapResponse /*/*[2]/*[1]/*[1]: FlightSegmentInfo /*/*[2]/*[1]/*[1]/*[1]: DepartureAirport /*/*[2]/*[1]/*[1]/*[2]: ArrivalAirport /*/*[2]/*[1]/*[1]/*[3]: OperatingAirline /*/*[2]/*[1]/*[1]/*[4]: MarketingAirline /*/*[2]/*[1]/*[2]: SeatMapDetails /*/*[2]/*[1]/*[2]/*: CabinClass /*/*[2]/*[1]/*[2]/*/*: AirRows /*/*[2]/*[1]/*[2]/*/*/*[1]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[2]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[3]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[4]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[5]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[1]/*[6]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[1]/*[2]: AirRowCharacteristics /*/*[2]/*[1]/*[2]/*/*/*[2]: AirRow /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]: AirSeats /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[1]: AirSeat /*/*[2]/*[1]/*[2]/*/*/*[2]/*[1]/*[2]: AirSeat ... /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[21]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[2]/*/*/*[22]: AirRow /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]: AirSeats /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[1]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[2]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[3]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[4]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[5]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[1]/*[6]: AirSeat /*/*[2]/*[2]/*[2]/*/*/*[22]/*[2]: AirRowCharacteristics /*/*[2]/*[2]/*[3]: BookingReferenceID /*/*[2]/*[3]: AirTravelers /*/*[2]/*[3]/*[1]: AirTraveler /*/*[2]/*[3]/*[1]/*[1]: PersonName /*/*[2]/*[3]/*[1]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[1]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[1]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[1]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[1]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[2]: AirTraveler /*/*[2]/*[3]/*[2]/*[1]: PersonName /*/*[2]/*[3]/*[2]/*[1]/*[1]: GivenName 
/*/*[2]/*[3]/*[2]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[2]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[2]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[2]/*[2]: TravelerRefNumber /*/*[2]/*[3]/*[3]: AirTraveler /*/*[2]/*[3]/*[3]/*[1]: PersonName /*/*[2]/*[3]/*[3]/*[1]/*[1]: GivenName /*/*[2]/*[3]/*[3]/*[1]/*[2]: MiddleName /*/*[2]/*[3]/*[3]/*[1]/*[3]: Surname /*/*[2]/*[3]/*[3]/*[1]/*[4]: NameTitle /*/*[2]/*[3]/*[3]/*[2]: TravelerRefNumber
Re: XPATH DOM traverse html/xml (fancy)
by ikegami (Patriarch) on Dec 07, 2009 at 20:20 UTC

    ok, I can't leave this alone :)

    To get

    /p1:root /p1:root/p1:foo[1] /p1:root/p1:foo[2] /p1:root/p1:bar[1] /p1:root/p1:bar[2] Prefix definitions: p1: http://www.example.org/tooty

    instead of

    /* /*/*[1] /*/*[2] /*/*[3] /*/*[4]

    requires a lot more work.

    use strict;
    use warnings;

    use XML::LibXML qw( XML_ELEMENT_NODE );

    # Layout of the context array ($cx) shared by the helpers below:
    #   $cx->[0]  counter used to generate the next prefix (p1, p2, ...)
    #   $cx->[1]  hash mapping namespace URI => generated prefix
    #   $cx->[2]  XML::LibXML::XPathContext on which each prefix is registered
    sub _new_context {
        return [ 0, {}, XML::LibXML::XPathContext->new() ];
    }

    # Return the name to use for $node in a path step. Elements without a
    # namespace keep their plain name; namespaced elements get
    # "<generated prefix>:<local name>", allocating and registering a new
    # prefix the first time a namespace URI is seen.
    sub _qname {
        my ($cx, $node) = @_;

        my $uri = $node->namespaceURI();
        return $node->nodeName() if !defined($uri);

        my $prefix = $cx->[1]{$uri};
        if (!defined($prefix)) {
            $cx->[1]{$uri} = $prefix = 'p' . ++($cx->[0]);
            $cx->[2]->registerNs($prefix, $uri);
        }

        # nodeName() includes the prefix used in the document itself, so
        # <x:foo> would come out as "p1:x:foo". Use the local part only.
        return $prefix . ':' . $node->localname();
    }

    # Print $path, then recurse into the element children of $node. Each child
    # step is "<qname>[i]", where i counts preceding siblings with the same
    # qualified name (starting at 1).
    sub _visit {
        my ($cx, $path, $node) = @_;

        print("$path\n");

        $path .= '/' if length($path) && $path !~ m{/\z};

        my @children =
            grep { $_->nodeType() == XML_ELEMENT_NODE } $node->childNodes();

        my %idxs;
        for my $child (@children) {
            my $qname = _qname($cx, $child);
            my $i     = ++$idxs{$qname};
            _visit($cx, "$path$qname\[$i]", $child);
        }
    }

    # Print the generated prefixes and their namespace URIs, in the order the
    # prefixes were allocated so that the output is stable between runs.
    sub _print_prefixes {
        my ($cx) = @_;

        print("\n");
        print("Prefix definitions:\n");

        my %uri_for = reverse %{ $cx->[1] };
        for my $n (1 .. $cx->[0]) {
            my $prefix = "p$n";
            print("$prefix: $uri_for{$prefix}\n");
        }
    }

    # Print paths relative to $node (starting at '.') for $node and all of its
    # element descendants, followed by the prefix definitions.
    sub visit_node {
        my ($node) = @_;

        my $cx = _new_context();
        _visit($cx, '.', $node);
        _print_prefixes($cx);
    }

    # Print absolute paths for every element of $doc, starting at its document
    # element, followed by the prefix definitions.
    sub visit_doc {
        my ($doc) = @_;

        my $node = $doc->documentElement();
        my $cx   = _new_context();
        _visit($cx, '/' . _qname($cx, $node), $node);
        _print_prefixes($cx);
    }

    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_fh(*STDIN);
    visit_doc($doc);

    The code only registers the generated prefixes on the XML::LibXML::XPathContext object it creates and never evaluates anything with it, but I figured you'd need it if you're building xpaths.

      Thank you very much! I will try your solution. Have a nice evening.
Re: XPATH DOM traverse html/xml
by mirod (Canon) on Dec 08, 2009 at 16:11 UTC

    I don't know which module you use, but with XML::Twig you can use the xpath method like this, for example:

    # Parse my.xml with XML::Twig, then print the result of ->xpath for each
    # item returned by descendants("#ELT"). -E enables `say`.
    perl -MXML::Twig -E'my $t=XML::Twig->parse( "my.xml"); foreach my $e ($t->descendants( "#ELT")) { say $e->xpath; }'

      XML::Twig is one of the many things that keep me using Perl. (It's one of the modules my personal 'M.pm' module loads -- 'M.pm' so I can do perl -MM -lwe ....) So, thanks again, mirod.

      The first thing that came to mind was:

      # Print ->xpath from an `_all_` callback registered under twig_handlers.
      # The file to parse is the first command-line argument (shift).
      perl -MXML::Twig -E 'XML::Twig->new( twig_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

      ...which isn't so different from what you wrote. But, the following has the benefit of not requiring the loading of the entire file (correct?):

      # Print ->xpath from an `_all_` callback registered under
      # start_tag_handlers. The file to parse is the first command-line
      # argument (shift).
      perl -MXML::Twig -E 'XML::Twig->new( start_tag_handlers => { _all_ => sub { say $_->xpath } })->parsefile(shift)'

        Absolutely, the way you wrote it is more efficient. I just went for the easiest way I could think of, not knowing enough about the OP's exact constraints.