Beefy Boxes and Bandwidth Generously Provided by pair Networks
Syntactic Confectionery Delight
 
PerlMonks  

Re: Parsing HTML files to recover data...

by wfsp (Abbot)
on Nov 22, 2006 at 12:15 UTC ( #585487=note: print w/ replies, xml ) Need Help??


in reply to Parsing HTML files to recover data...

This gives you an array of hashes. It uses the second blockquote to trigger the start of the next record.

#!/usr/bin/perl
# First attempt: walk the HTML held in the __DATA__ section token by token
# and collect the job fields (jobname, jobserial, em, offices, job_desc)
# into a hash, then dump the collected records with Data::Dumper.
#
# The parsing logic is kept exactly as originally posted; the NOTE comments
# mark behaviour that is easy to misread.
use strict;
use warnings;

use HTML::TokeParser::Simple;
use Data::Dumper;

my $p = HTML::TokeParser::Simple->new(*DATA)
    or die "couldn't parse DATA: $!\n";

# @records collects what is pushed at each blockquote;
# %record is the record currently being filled in.
my (@records, %record);

while (my $t = $p->get_token){
    if ($t->is_start_tag('span')){
        # NOTE: the end-tag argument here is 'span' (no leading slash), so
        # get_trimmed_text() gathers text up to the next <span> START tag,
        # not up to the closing </span>.
        if ($t->get_attr('class') and $t->get_attr('class') eq 'jobname'){
            $record{jobname} = $p->get_trimmed_text('span');
        }
        elsif ($t->get_attr('class') and $t->get_attr('class') eq 'jobserial'){
            $record{jobserial} = $p->get_trimmed_text('span');
        }
        elsif ($t->get_attr('name') and $t->get_attr('name') eq 'em'){
            # 'em' may occur several times per record, so it is a list.
            push @{$record{em}}, $p->get_trimmed_text('span');
        }
        elsif ($t->get_attr('name') and $t->get_attr('name') eq 'offices'){
            $record{offices} = $p->get_trimmed_text('span');
        }
    }
    if ($t->is_start_tag('blockquote')){
        # Skip this blockquote if the current record already has a description.
        next if exists $record{job_desc};
        $record{job_desc} = $p->get_trimmed_text('blockquote');
        #die Dumper \%record;
        # NOTE: this stores a reference to %record itself, not a copy, and
        # %record is emptied on the very next line.
        push @records, \%record;
        %record = ();
    }
}

print Dumper \@records;

# NOTE: the typographic quotes around some attribute values below are part
# of the sample data as posted and are intentionally left untouched.
__DATA__
<p><b><span class="jobname"> Accounting Assistant, Level 2 </span>
<span class="jobserial">(19203)</span>
<br />
Current members: <br />
<span name="em">Plow, Elliot</span>
<span name="em">Wang, Susan</span>
<br />
<span name=”offices”>Huston</span>
</p>
<blockquote>
Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.
</blockquote>
<blockquote><a href="#top">Go to the top of this page</a>.</blockquote>
<blockquote><a href=”companyHR.html”>Check for open positions now!</a></blockquote>
output:
$VAR1 = {
          'job_desc' => 'Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.',
          'em' => [
                    'Plow, Elliot',
                    'Wang, Susan'
                  ],
          'jobserial' => '(19203) Current members:',
          'jobname' => 'Accounting Assistant, Level 2'
        };

update: see my reply below.


Comment on Re: Parsing HTML files to recover data...
Select or Download Code
Re^2: Parsing HTML files to recover data...
by kaif (Friar) on Nov 22, 2006 at 12:18 UTC
    That's a questionable value for the 'jobserial' key. Looking at your code, I can't figure out why that could happen ...
Re^2: Parsing HTML files to recover data...
by wfsp (Abbot) on Nov 22, 2006 at 14:07 UTC
    As kaif points out, my script above did indeed produce questionable output, and I also couldn't figure out why.

    After a lot of head scratching and cursing, I noticed that the OP's data had a variety of quotes around the attribute values. I changed them to ordinary quotes and it now works OK.

    kaif++ for spotting the snag.

    #!/usr/bin/perl
    # Parse the HTML held in the __DATA__ section into an array of job records
    # and dump it with Data::Dumper.
    #
    # Each record is a hash with these keys (present when found in the HTML):
    #   jobname   - text of <span class="jobname">
    #   jobserial - text of <span class="jobserial">
    #   em        - array ref, one entry per <span name="em">
    #   offices   - text of <span name="offices">
    #   job_desc  - text of the first <blockquote> following those spans
    use strict;
    use warnings;

    use HTML::TokeParser::Simple;
    use Data::Dumper;

    my $p = HTML::TokeParser::Simple->new(*DATA)
        or die "couldn't parse DATA: $!\n";

    # @records holds the finished records; %record is the one being assembled.
    my (@records, %record);

    while (my $t = $p->get_token) {
        if ($t->is_start_tag('span')) {
            # Default to '' so the comparisons below never see undef.
            my $class = $t->get_attr('class') || '';
            my $name  = $t->get_attr('name')  || '';

            # '/span' makes get_trimmed_text() stop at the closing tag of the
            # span that was just opened.
            if ($class eq 'jobname') {
                $record{jobname} = $p->get_trimmed_text('/span');
            }
            elsif ($class eq 'jobserial') {
                $record{jobserial} = $p->get_trimmed_text('/span');
            }
            elsif ($name eq 'em') {
                # 'em' may occur several times per record, so it is a list.
                push @{ $record{em} }, $p->get_trimmed_text('/span');
            }
            elsif ($name eq 'offices') {
                $record{offices} = $p->get_trimmed_text('/span');
            }
        }

        if ($t->is_start_tag('blockquote')) {
            # Only a blockquote that follows collected fields is a job
            # description. Once a record has been stored %record is empty, so
            # the navigation blockquotes after it are skipped, and the next
            # record's spans re-arm this branch.
            next unless %record;

            # Stop at </blockquote> so nothing beyond the description is
            # swallowed.
            $record{job_desc} = $p->get_trimmed_text('/blockquote');

            # Store a copy: %record is reused for the next record.
            push @records, {%record};
            %record = ();
        }
    }

    print Dumper \@records;

    __DATA__
    <p><b>
    <span class="jobname">Accounting Assistant, Level 2</span>
    <span class="jobserial">(19203)</span>
    <br />Current members:<br />
    <span name="em">Plow, Elliot</span>
    <span name="em">Wang, Susan</span>
    <br />
    <span name="offices">Huston</span>
    </p>
    <blockquote>
    Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.
    </blockquote>
    <blockquote>
    <a href="#top">Go to the top of this page</a>.
    </blockquote>
    <blockquote>
    <a href="companyHR.html">Check for open positions now!</a>
    </blockquote>
    ---------- Capture Output ----------
    > "c:\perl\bin\perl.exe" _new.pl
    $VAR1 = [
              {
                'em' => [
                          'Plow, Elliot',
                          'Wang, Susan'
                        ],
                'job_desc' => 'Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.',
                'offices' => 'Huston',
                'jobserial' => '(19203)',
                'jobname' => 'Accounting Assistant, Level 2'
              }
            ];

    > Terminated with exit code 0.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://585487]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others romping around the Monastery: (4)
As of 2014-09-19 05:04 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    How do you remember the number of days in each month?











    Results (129 votes), past polls