Code for graq:
#!/usr/bin/perl
# Demonstrates how to turn feed data that carries UTF-8 octets -- either
# hidden behind HTML numeric entities ('teaser') or written as raw \x{..}
# escapes ('title') -- into proper Perl character strings before printing.
use strict;
use warnings;
use open ':locale';       # use the locale's ($LANG) encoding for STDIN/STDOUT/STDERR;
                          # see 'perldoc open' -- this is quite important for the
                          # prints at the end of the script
use HTML::Entities;       # decode_entities(): HTML entities -> characters
use Encode qw(decode);    # decode(): octets in a given encoding -> Perl characters

# NOTE(review): this listing was recovered from a line-wrapped, HTML-rendered
# copy.  The numeric entities in 'teaser' (&#226;&#128;&#152; and
# &#226;&#128;&#153;) were reconstructed so that they match the
# \x{e2}\x{80}\x{98} / \x{e2}\x{80}\x{99} octets used in 'title'; the '>' in
# front of the "Read the full article" link is kept as it was rendered.
# Confirm both against the original feed item.
my $data = {
    'href' =>
        'http://www.accountancyage.com/accountancyage/news/2159769/kpmg-sets-retail-think-tank',
    'teaser' => join(
        '',
        '<p><small>AccountancyAge.com, ',
        '<a href="http://www.accountancyage.com/">Accountancy Age</a>, ',
        'Thursday 6 July 2006 at 00:00:00</small></p>',
        '<p><i> Firm forms partnership with retail research group </i></p>',
        '<p>KPMG has launched the &#226;&#128;&#152;Retail Think Tank&#226;&#128;&#153; (RTT) ',
        'aimed at establishing &#226;&#128;&#152;the true health and status\' of the retail sector. ',
        'The Big Four firm has joined forces with retail research group...</p>',
        '<p><small>> <a href="http://www.accountancyage.com/accountancyage/news/2159769/kpmg-sets-retail-think-tank">',
        '<i>Read the full article</i></a></small></p>',
    ),
    'title' => "KPMG sets up retail \x{e2}\x{80}\x{98}think tank\x{e2}\x{80}\x{99}",
};

# The teaser is HTML, so the entities have to be resolved first (in void
# context decode_entities() modifies its argument in place); only then can
# the result be parsed as UTF-8.
my $html = $data->{teaser};
decode_entities($html);
$html = decode( 'utf8', $html );

# The title is 'raw' UTF-8 -- the \x{e2} sequences indicate this -- so it
# only needs to be parsed as UTF-8.
my $title = decode( 'utf8', $data->{title} );

print "** $title:\n";
print "$html\n";
This generates a nice hash keyed by region name, where each value is an array of hashes holding all the data for that region...
#!/usr/bin/perl
# Parses the saved page 'IDQ60606.shtml', groups its table rows by region,
# prints the collected structure and then the average 'rain' value of every
# region.
use warnings;
use strict;
use HTML::TreeBuilder;
use Data::Dumper;
use Scalar::Util qw(looks_like_number);

my $tree = HTML::TreeBuilder->new_from_file('IDQ60606.shtml');

# Names given, in order, to the cells of a full data row.
my @cellnames = qw(
    station time temperature dewpoint
    relhumidity deltat
    wind_dir speedkmh gustkmh speedknt gustknt
    pressure rain
);

# Data rows seen before any region heading are filed under this name
# instead of using an undefined value as hash key.
my $region = '(unknown region)';
my %data;    # region name => array ref of row hashes (cell name => text)

for my $row ( $tree->look_down( '_tag' => 'tr' ) ) {
    my @cells = $row->look_down( '_tag' => 'td' );
    print scalar @cells, "\n";

    if ( @cells == 1 ) {

        # A row with a single cell is a region heading; it applies to all
        # data rows that follow until the next heading.
        $region = $cells[0]->as_trimmed_text;
    }
    elsif ( @cells == @cellnames ) {

        # A row with exactly one cell per known column is a data row.
        my %row;
        @row{@cellnames} = map { $_->as_trimmed_text } @cells;
        push @{ $data{$region} }, \%row;
    }
}

# Only plain strings were copied out of the tree, so it can be released now.
$tree->delete;

print "$_\n" for keys %data;
print Dumper \%data;

for my $region_name ( keys %data ) {
    my @observations = @{ $data{$region_name} };
    next unless @observations;

    my $rain_total = 0;
    for my $observation (@observations) {
        my $rain = $observation->{rain};

        # '-' is counted as zero; so is any other non-numeric text (for
        # example an empty cell), which would otherwise be numified with
        # a warning.
        $rain_total += looks_like_number($rain) ? $rain : 0;
    }

    # Every row of the region counts towards the divisor, including the
    # rows that contributed zero.
    my $rain_average = $rain_total / @observations;
    print "$region_name: $rain_average\n";
}
#!/usr/bin/perl
# Prints the trimmed text of every <td class="rowlevel1"> cell found in the
# saved page 'IDQ60606.shtml', one cell per line.
use warnings;
use strict;
use HTML::TreeBuilder;

my $tree = HTML::TreeBuilder->new_from_file('IDQ60606.shtml');

for my $heading_cell ( $tree->look_down( '_tag' => 'td', 'class' => 'rowlevel1' ) ) {
    print $heading_cell->as_trimmed_text, "\n";
}
gives
PENINSULA
GULF COUNTRY
NORTHERN GOLDFIELDS and UPPER FLINDERS
NORTH TROPICAL COAST and TABLELANDS
HERBERT and LOWER BURDEKIN
CENTRAL COAST - WHITSUNDAYS
CAPRICORNIA
CENTRAL HIGHLANDS - COALFIELDS
CENTRAL WEST
NORTHWEST
CHANNEL COUNTRY
MARANOA and WARREGO
DARLING DOWNS and GRANITE BELT
WIDE BAY and BURNETT
SOUTHEAST COAST
CORAL SEA
Some of my snippets...
# All directories in our parent's path, as a hash that maps each directory's
# name to its relative path ("name" => "../name").  The "*" of glob does not
# match names that start with a dot, so hidden directories are not listed.
my %dirs = map {
    # Everything after the last "/" is the name; this pattern always matches,
    # so every path contributes exactly one key/value pair.
    my ($name) = m{([^/]*)\z};
    ( $name => $_ );
} grep { -d } glob '../*';
# De-crapper (for use after Word HTML idiocy)
# Reads all input (the files named on the command line, or STDIN), removes
# the markup handled by _strip_word_markup() and prints the result.

# Returns a copy of the given HTML in which
#   - <li>, <p> and <h2> start tags have lost their attributes,
#   - <o:...> and </o:...> tags are removed,
#   - everything from "<!" up to the next ">" is removed,
#   - <div ...> start tags that carry attributes are removed,
#   - <span> and </span> tags are removed.
# The substitutions run in this order on the whole text; /s lets a tag span
# several lines.
sub _strip_word_markup {
    my ($html) = @_;

    # \b keeps a tag name from matching as a mere prefix of a longer name:
    # without it <pre> would be rewritten to <p> and <link ...> to <li>.
    $html =~ s/<li\b.*?>/<li>/gs;
    $html =~ s/<p\b.*?>/<p>/gs;
    $html =~ s/<\/?o:.*?>//gs;
    $html =~ s/<!.*?>//gs;
    $html =~ s/<h2\b.*?>/<h2>/gs;
    $html =~ s/<div .*?>//gs;
    $html =~ s/<\/?span\b.*?>//gs;

    return $html;
}

my $file = join '', <>;
$file = _strip_word_markup($file);
print $file;
# Password generator
# Builds $pw from 8 to 11 characters, each picked at random from @chars.
# NOTE: rand() is not cryptographically secure, so do not rely on this for
# security-sensitive passwords.
my @chars = ('.','!','#','@','$','/',0..9,'A'..'Z','a'..'z');

# int() makes the truncation explicit: the length is a whole number (8..11)
# and every index is a valid whole-number position in @chars.
my $length = 8 + int rand 4;
my $pw = join '', map { $chars[ int rand @chars ] } 1 .. $length;
# Java namestyle to SQL namestyle regex
# Operates on $_: every run of capital letters that does not begin at the
# very start of the string gets a "_" put in front of it and is lower-cased,
# e.g. "fooBarBaz" becomes "foo_bar_baz".
# Consecutive capitals count as one run ("parseHTMLString" becomes
# "parse_htmlstring"), and a capital in the first position is left unchanged.
s/(?<!^)([A-Z]+)/_\L$1\E/g;
# Environment dumper
# Prints every environment variable as "NAME=>value", one per line, in the
# hash's own (unsorted) iteration order.
for my $name ( keys %ENV ) {
    printf "%s=>%s\n", $name, $ENV{$name};
}
|