I cleaned it up and tried a hashref instead of an arrayref; also, I shortened the sub name a wee bit. Try this:
#!/usr/bin/perl -w
use HTML::TreeBuilder;
use Storable qw(nstore retrieve);
use Data::Dumper::HTML qw(dumper_html);
# Extract the blog entries from the HTML body of a Google Alerts message.
#
# Arguments:
#   $html    - HTML source of the alert; parsed with
#              HTML::TreeBuilder->new_from_content.
#   $results - hash reference to accumulate into.  One hash reference per
#              blog entry (keys: link, title, source, sourceurl, excerpt)
#              is pushed onto the array at $results->{blog}.  May be undef,
#              in which case the hash is created when the first entry is
#              pushed.
#
# Returns $results (undef if none was supplied and no entry was found).
#
# The <p> elements are walked in document order.  A paragraph whose text
# matches "blogs alert" or "news alert" switches the current section, and
# "create another alert" ends it.  Only paragraphs inside the blog section
# produce entries; news paragraphs are skipped.
sub google_alert {
    use strict 'refs';
    my ( $html, $results ) = @_;

    my $tree       = HTML::TreeBuilder->new_from_content($html);
    my @paragraphs = $tree->look_down( '_tag', 'p' );

    # Start in "no section" so paragraphs ahead of the first heading are
    # ignored without an uninitialized-value warning.
    my $type = '';

  PARAGRAPH:
    foreach my $p (@paragraphs) {
        my $ptext = $p->as_text;
        $ptext = '' unless defined $ptext;

        if ( $ptext =~ /blogs alert/i ) {
            $type = 'blog';
            next PARAGRAPH;
        }
        elsif ( $ptext =~ /news alert/i ) {
            $type = 'news';
            next PARAGRAPH;
        }
        elsif ( $ptext =~ /create another alert/i ) {
            $type = '';
            next PARAGRAPH;
        }

        # News items are not collected.  Skip them rather than returning,
        # so blog entries gathered so far (or still to come) are kept.
        next PARAGRAPH unless $type eq 'blog';

        # look_down returns a list of elements, so it must be collected in
        # an array; a hash would be keyed by stringified references.
        my @anchors = $p->look_down( '_tag', 'a' );
        next PARAGRAPH unless @anchors;

        # NOTE(review): assumes the first anchor is the entry itself and the
        # second one, when present, is the source/blog link -- confirm
        # against a real alert message.
        my $entry_anchor  = $anchors[0];
        my $source_anchor = defined $anchors[1] ? $anchors[1] : $anchors[0];

        my $link = $entry_anchor->attr('href');
        $link =~ s/\s+$// if defined $link;

        # as_text already yields plain text, so no tag stripping is needed
        # (it would only eat literal "<...>" sequences in the title).
        my $title = $entry_anchor->as_text;
        $title = '' unless defined $title;
        $title =~ s/^\s+|\s+$//g;

        my $sourceurl = $source_anchor->attr('href');

        # The source label is whatever precedes the first " - " separator.
        my $source_text = $source_anchor->as_text;
        $source_text = '' unless defined $source_text;
        my ($source) = split / \- /, $source_text, 2;
        $source = '' unless defined $source;
        $source =~ s/^\s+|\s+$//g;

        # The excerpt is the <br>-separated segment after the title line;
        # when that segment is the grey (#666666) one, the excerpt is the
        # segment that follows it instead.
        my $snippet = $p->as_HTML;
        $snippet = '' unless defined $snippet;
        $snippet =~ s[<br ?/?>][<br>]gi;
        my @segments = split /<br>/i, $snippet;

        my $raw_excerpt =
          ( defined $segments[1] && $segments[1] =~ /color="\#666666"/i )
          ? $segments[2]
          : $segments[1];

        my $excerpt;
        if ( defined $raw_excerpt ) {
            ( $excerpt = $raw_excerpt ) =~ s/<.+?>//g;
        }

        push @{ $results->{'blog'} },
          {
            'link'      => $link,
            'title'     => $title,
            'source'    => $source,
            'sourceurl' => $sourceurl,
            'excerpt'   => $excerpt,
          };
    }

    # Only plain strings were copied out, so the parse tree can be released.
    $tree->delete;

    return $results;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (keep each line under 70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.