<?xml version="1.0" encoding="windows-1252"?>
<node id="975079" title="Re^4: Parsing HTML" created="2012-06-08 00:57:19" updated="2012-06-08 00:57:19">
<type id="11">
note</type>
<author id="961">
Anonymous Monk</author>
<data>
<field name="doctext">
&lt;p&gt; :D I might approach that like this (look ma, no slurping)
&lt;p&gt; $ [mod://lwp-download] http://www.costacrociere.it/it/lista_crociere/capitali_nord_europa-201206.html
&lt;br/&gt;Saving to 'capitali_nord_europa-201206.html'...
&lt;br/&gt;134 KB received in 1 seconds (134 KB/sec)
&lt;br/&gt;
&lt;p&gt;$ perl [id://947532|htmltreexpather.pl] &lt;i&gt;capitali_nord_europa-201206.html&lt;/i&gt;  [href://http://search.cpan.org/perldoc/HTML::Element#$h-%3Elook_down%28_...criteria..._%29|_tag p] | [mod://ack] Copenhagen -C3 | [man://head]
&lt;c&gt;//div[@id='ctl00_cph_PageContent_ucCLR_upCLC']/div[@class='info-cruise']/div[@class='sx']/p[@class='note']
------------------------------------------------------------------
HTML::Element=HASH(0xb91ba4)    0.1.0.8.1.0.1.1.1.0.0
Itinerario Danimarca, fiordi norvegesi, Germania Data partenza 17 giugno 2012 Nave Costa Fortuna N.ro giorni crociera
7 Porto di partenza Copenhagen Documenti di viaggio Passaporto o Carta d'identità valida per l'espatrio Possono essere
disponibili le seguenti tariffe
/html/body/form/div/div[2]/div/div[2]/div/div[2]/div/p
//div[@id='ctl00_cph_PageContent_ucCLR_rpL_ctl00_BoxDescItinaryDx_pnlInfoCruise']/p
//div[@id='ctl00_cph_PageContent_ucCLR_rpL_ctl00_BoxDescItinaryDx_pnlInfoCruise']/p[@class='itinerari-info']
--
//div[@id='ctl00_cph_PageContent_ucCLR_upCLC']/div[@class='info-cruise']/div[@class='sx']/p[@class='note']
------------------------------------------------------------------

&lt;/c&gt;
&lt;p&gt; Then plug stuff into [mod://Web::Scraper], it's like [mod://XML::Rules] &lt;c&gt;#!/usr/bin/perl --
use strict; use warnings;
use Data::Dump;
use URI;
use Web::Scraper;


## Scraper definition. Judging by the sample dump after __END__, each
## entry of the result's 'info' list holds a title, a price and a 'span' list.
my $soy = scraper {
## only visit elements carrying the CSS class "info-cruise";
## one nested result per match is collected into { info =&gt; \@info }
    process '.info-cruise' =&gt; 'info[]' =&gt; scraper {
        ## 'TEXT' stores the text of the matched node (compare title/price
        ## in the sample dump)
        process './/div[@class="sx"]/h3' =&gt; 'title'  =&gt; 'TEXT';
        process '.new-price'             =&gt; 'price'  =&gt; 'TEXT';
        ## every ".itinerari-info" match runs its own nested scraper, which is
        ## why the dump shows an outer 'span' list whose entries each contain
        ## an inner 'span' list
        process '.itinerari-info'        =&gt; 'span[]' =&gt; scraper {

            ## the next line is a disabled variant kept for reference; it would
            ## store 'RAW' per span -- presumably the raw markup, TODO confirm
#~             process '//span' =&gt; 'span[]' =&gt; 'RAW'; ## this
            ## match the bold labels and the bare text nodes of the span
            ## elements, turning each match into a small hash via the callback
            process '//span/b | //span/child::text()' =&gt; 'span[]' =&gt; sub {
                ## $_[0] is the matched node: HTML::Element nodes (the b tags)
                ## are reported under 'key', anything else (the text nodes)
                ## under 'val'
                my $ishtml   = $_[0]-&gt;isa('HTML::Element');
                my $keyOrVal = $ishtml ? 'key' : 'val';
                my %foo      = ( $keyOrVal =&gt; $_[0]-&gt;getValue );
                ## as_XML is only called on element nodes, so 'raw' is absent
                ## from the text-node entries
                $foo{raw} = $_[0]-&gt;as_XML if $ishtml;
                return \%foo;
            };
        };
    };
};

## NOTE Web::Scraper wants URI objects
## (here a file: URI naming the local copy of the page fetched with
## lwp-download, so nothing is downloaded by this script itself)
my $url = URI-&gt;new('file:capitali_nord_europa-201206.html');
## passed as second argument to scrape(); presumably the base URL used to
## resolve relative links found in the local copy -- TODO confirm
my $base='http://www.costacrociere.it';
my $ret = $soy-&gt;scrape( $url , $base );

## disabled: dump of the complete result
#~ dd $ret;
## dump only the first 'info' entry; its output is pasted after __END__
dd $ret-&gt;{info}-&gt;[0];

__END__
{
  price =&gt; "\x{20AC} 510,00",
  span  =&gt; [
             {
               span =&gt; [
                 { key =&gt; " Itinerario ", raw =&gt; "&lt;b&gt; Itinerario &lt;/b&gt;\n" },
                 { val =&gt; " Danimarca, fiordi norvegesi, Germania" },
                 { val =&gt; " " },
                 { key =&gt; "Data partenza", raw =&gt; "&lt;b&gt;Data partenza&lt;/b&gt;\n" },
                 { val =&gt; " 17\xA0giugno\xA02012 " },
                 { key =&gt; " Nave ", raw =&gt; "&lt;b&gt; Nave &lt;/b&gt;\n" },
                 { val =&gt; " Costa Fortuna" },
                 {
                   key =&gt; " N.ro giorni crociera \xA0 ",
                   raw =&gt; "&lt;b&gt; N.ro giorni crociera \xA0 &lt;/b&gt;\n",
                 },
                 { val =&gt; " 7" },
                 { key =&gt; " Porto di partenza ", raw =&gt; "&lt;b&gt; Porto di partenza &lt;/b&gt;\n" },
                 { val =&gt; " Copenhagen" },
                 {
                   key =&gt; " Documenti di viaggio ",
                   raw =&gt; "&lt;b&gt; &lt;a href=\"http://www.costacrociere.it/B2C/I/Before_you_go/documentation/travel.htm\" target=\"_blank\"&gt;Documenti di viaggio&lt;/a&gt; &lt;/b&gt;\n",
                 },
                 {
                   val =&gt; " Passaporto\xA0o\xA0Carta d'identit\xE0 valida per l'espatrio",
                 },
                 { val =&gt; " Possono essere disponibili le seguenti tariffe " },
               ],
             },
           ],
  title =&gt; "Le terre dei vichinghi",
}
&lt;/c&gt;
&lt;p&gt; I wouldn't be surprised if tobyink stops by with a [id://947540|Web::Magic example] :)</field>
<field name="root_node">
974906</field>
<field name="parent_node">
974928</field>
<field name="reputation">
2</field>
</data>
</node>
