#!/usr/bin/perl
# Parse the HTML that follows __DATA__ and collect a job record from it:
#   jobname   - text of <span class="jobname">
#   jobserial - text of <span class="jobserial">
#   em        - array ref with the text of every <span name="em">
#   offices   - text of <span name="offices">
#   job_desc  - text of the first <blockquote>
# The collected records are printed with Data::Dumper.
use strict;
use warnings;
use HTML::TokeParser::Simple;
use Data::Dumper;

my $p = HTML::TokeParser::Simple->new(*DATA)
    or die "couldn't parse DATA: $!\n";

my @records;             # finished records, one hash ref each
my %record;              # fields gathered for the record in progress
my $seen_description;    # true once the first <blockquote> has been consumed

while (my $t = $p->get_token) {
    if ($t->is_start_tag('span')) {
        # Fetch each attribute once; a missing attribute becomes '' so the
        # string comparisons below never see undef.
        my $class = $t->get_attr('class');
        my $name  = $t->get_attr('name');
        $class = '' unless defined $class;
        $name  = '' unless defined $name;

        if ($class eq 'jobname') {
            $record{jobname} = $p->get_trimmed_text('/span');
        }
        elsif ($class eq 'jobserial') {
            $record{jobserial} = $p->get_trimmed_text('/span');
        }
        elsif ($name eq 'em') {
            # There can be several member spans, so accumulate them.
            push @{ $record{em} }, $p->get_trimmed_text('/span');
        }
        elsif ($name eq 'offices') {
            $record{offices} = $p->get_trimmed_text('/span');
        }
    }

    if ($t->is_start_tag('blockquote')) {
        # Only the first <blockquote> closes a record; any later one is
        # ignored. NOTE(review): presumably the later blockquotes are page
        # boilerplate - confirm against the real HTML before relaxing this.
        next if $seen_description;

        # Stop at the closing tag. Passing a bare 'blockquote' here would
        # read on until the next *opening* <blockquote> or end of input.
        $record{job_desc} = $p->get_trimmed_text('/blockquote');

        push @records, {%record};    # store a copy, then start afresh
        %record = ();
        $seen_description = 1;
    }
}

print Dumper \@records;
__DATA__
Accounting Assistant, Level 2
(19203)
Current members:
Plow, Elliot
Wang, Susan
Huston
Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.
Go to the top of this page.
Check for open positions now!##
##
---------- Capture Output ----------
> "c:\perl\bin\perl.exe" _new.pl
$VAR1 = [
{
'em' => [
'Plow, Elliot',
'Wang, Susan'
],
'job_desc' => 'Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.',
'offices' => 'Huston',
'jobserial' => '(19203)',
'jobname' => 'Accounting Assistant, Level 2'
}
];
> Terminated with exit code 0.