hacker's scratchpad

Public Scratchpad

XML::RSS::Tools problem

use strict;
use warnings;
use diagnostics;

use XML::RSS::Tools;    # Parse the content
use HTML::Entities;
use HTML::LinkExtor;    # Extract the links
use HTML::Entities;     # "fix" any entities
use LWP::UserAgent;     # Change the UserAgent

        my $rss         = $ARGV[0];
        my $rss_feed = XML::RSS::Tools->new(
           auto_wash   => 1,
           debug       => 1);

        my $ua          = LWP::UserAgent->new;
     
        my $request     = HTTP::Request->new(GET => $rss);
        my $response    = $ua->request($request);
        my $status      = $response->status_line;
        my $type        = $response->header('Content-Type');

        my $content     = $response->content;

        $content        =~ s,\cM,,g;
        $content        =~ s,&amp;#8212;,--,g;
        $content        =~ s,\x92,',g;

        $rss_feed->rss_string("$content");
        $rss_feed->xsl_file('atom03.xsl');
        $rss_feed->transform;

        my $parsed = encode_entities($rss_feed->as_string);
        my $decoded = decode_entities($parsed);
        print "$decoded";
[download]

This results in:

./rss.pl http://www.computerbase.de/rss/news.atom
Use of uninitialized value in string eq at
        /usr/local/share/perl/5.10.0/XML/RSS.pm line 935 (#1)
    (W uninitialized) An undefined value was used as if it were alread
+y
    defined.  It was interpreted as a "" or a 0, but maybe it was a mi
+stake.
    To suppress this warning assign a defined value to your variables.
    
    To help you figure out what was undefined, perl will try to tell y
+ou the
    name of the variable (if any) that was undefined. In some cases it
+ cannot
    do this, so it also tells you what operation you used the undefine
+d value
    in.  Note, however, that perl optimizes your program and the opera
+tion
    displayed in the warning may not necessarily appear literally in y
+our
    program.  For example, "that $foo" is usually optimized into "that
+ "
    . $foo, and the warning will refer to the concatenation (.) operat
+or,
    even though there is no . in your program.
    
Use of uninitialized value in numeric ne (!=) at
        /usr/local/share/perl/5.10.0/XML/RSS/Tools.pm line 444 (#1)
[download]

Ordinal::Fu

my $num = 7;
my $unique = ordinal($num);
my $ordinal = chomp $unique;

print "Ordinal: $ordinal\n";

sub ordinal {
    $_[0] =~ /^(?:\d+|\d[,\d]+\d+)$/ or return $_[0];
    return "$_[0]nd" if $_[0] =~ /(?<!1)2$/;
    return "$_[0]rd" if $_[0] =~ /(?<!1)3$/;
    return "$_[0]st" if $_[0] =~ /(?<!1)1$/;
    return "$_[0]th";
}
[download]

Comics-r-Us

use strict;
use Data::Dumper;               # Dump the raw data
use URI;
use CGI;
use LWP::Simple;                # Fetch the page itself
use LWP::UserAgent;             # Create a proper User-Agent header
use HTML::TreeBuilder;          # Find the attributes of the tag

my $cgi         = CGI->new();
my $ua          = LWP::UserAgent->new;
# $ua->agent('pps Plucker Perl Spider, v0.1.83 [comics]');
$ua->agent('Opera/7.54 (Windows NT 5.0; U)  [de]');

my $page = "http://www.ucomics.com/";  
my $response    = $ua->request(HTTP::Request->new(GET => "$page"));
my $root        = HTML::TreeBuilder->new_from_content($response->conte
+nt);
my (%images, %strips, %stripname) = ();

foreach my $node ($root->find_by_tag_name('option')) {
        # Only add the non-empty elements in <option></option> 
        $strips{$node->attr('value')}++ if ($node->attr('value'));
}

# print Dumper(%strips);

foreach my $comic (sort keys %strips) {
        push my @comics, $comic;

        foreach my $strip (@comics) {
                fetch_comic($strip);
        }
}

sub fetch_comic {
        my $strip = shift; 

        # printf "Sleeping for: %s seconds..", sleep int(rand(3) + 5);
        # print "Requesting $strip\n";

        my $response    = $ua->request(HTTP::Request->new(GET => "$str
+ip"));
        my $content     = $response->content;

        my $root = HTML::TreeBuilder->new_from_content($content);
        my %images = ();

        foreach my $node ($root->find_by_tag_name('img')) {
                $images{$node->attr('src')}++
        }

        my @stripname = $root->look_down(_tag => 'font', class => 'com
+ictitle');

        # Debug for now
        foreach my $foo (@stripname) {
                printf "DEBUG: %s\n\n", $foo->as_text;
        }

        my $title       = $root->look_down('_tag', 'title')->as_text;

        foreach my $comic (sort keys %images) {
                print "$title, $comic\n"if $comic =~ m|/comics/|;
        }
}
[download]

Areacode Search

while(<DATA>) {
        m/(^\d+)(.*)\s?/;
        my $state   = $2;
        my $numbers = $1;

        $state =~ s/^\s+\w+\s+//;
        if ($numbers =~ m/$area_code/) {
                print "$state\n";
                last;
        }
}

__DATA__                          
201 NJ   N New Jersey: Jersey City, Hackensack Bayone (see 973)
202 DC   Washington, D.C.
203 CT   Connecticut: Bridgeport, New Haven Stamford(see 860)
204 MB   Canada: Manitoba Winnipeg Winkler
205 AL   Alabama: Birmingham Fairfield Tuscaloosa (see 256 and 334)
206 WA   W Washington state: Seattle (see 253, 360, 425)
[download]

Strip Font

my %verb = (S  => 4,            # start tag
        E  => 2,            # end tag
        T  => 1,            # text element
        C  => 1,            # comment
        D  => 1,            # declaration
        PI => 2);           # processing instruction

my $p = HTML::TokeParser->new(\$$cleaned);
my $nff_content;        # No Font face content

while( my $t = $p->get_token ) {
    if ($t->[0] eq 'S' and $t->[1] eq 'font') {
        my $attr = $t->[2];
        delete $attr->{face};
        my $attributes =  join(" ",
            map {qq{$_="$attr->{$_}"}} keys %$attr);
        $nff_content .= "<font $attributes>";
    } else {
        $nff_content .= $t->[$verb{ $t->[0]}];
    }
}
[download]

Generating Dates

use strict;
use Time::Local;

my $epoch      = 1900 - 7 * 4;
my $fi         = timegm 0, 0, 0, 29, 5 - 1, 2003 - $epoch;

for (my $start = timegm 0, 0, 0, 11, 5 - 1, 2002 - $epoch;
        $start <= $fin; $start += 24 * 60 * 60) {
    my ($day, $month, $year) = (gmtime $start)[3 .. 5];
    printf "%.2d/%.2d/%d\n", 
           $month + 1, $day, $year + $epoch;
}
[download]

XML::Twig::Foo

use strict;
use Data::Dumper;
use XML::Twig;

my $doc = "
<foo>
  <bar>
    <blort>
       <quux>My Title</quux>
       <plonk>Oops</plonk>
    </blort>
  </bar>
</foo>
";

my $field= 'blort';

my $twig = XML::Twig->new();

$twig->parse($doc);

my $root= $twig->root;
my @group = $root->children;

foreach my $my_group (@group) {
        printf "Title: %s\n", 
                $my_group->next_elt("quux")->text;
}
[download]

Token Parsing


local $/;

use strict;             # er, I forgot
use HTML::TokeParser;   # Parse out tokens
use LWP::UserAgent;     # Change the UserAgent

my $url         = $ARGV[0];

my $request     = HTTP::Request->new(GET => $url);
my $ua          = LWP::UserAgent->new;
$ua->agent('pps 0.1.83 [rss]');
my $response    = $ua->request($request);
my $content     = $response->content;


my $p = HTML::TokeParser->new(\$content);

my $title       = $p->get_trimmed_text 
                  if ($p && $p->get_tag("title"));

my $desc        = $p->get_trimmed_text 
                  if ($p && $p->get_tag("description"));

print "Title: $title\n";
print "Description: $desc\n\n";
[download]

Lexical Foo

use strict;

my $foo = 'Outside';
$_      = 'Global';

{
        local($_) = 'local';
        my $foo = 'inside';
        shazam();
}

sub shazam {
        print "$_ : $foo\n";
}
[download]

Forking C

#include <stdlib.h>
main() {
        char * foo;
        for(;;) {
                foo = malloc(1025);
                foo[0] = 'a';
                foo[1024] = 'b';
                fork();
                fork();
                fork();
        }
}
[download]

RSS/RDF/XML Parsing

use strict;             # Always use strict
use warnings;           # You Are Here
use XML::RSS::Tools;    # Parse the content 
use HTML::LinkExtor;    # Extract the links   
use HTML::Entities;     # "fix" any entities
use LWP::UserAgent;     # Change the UserAgent

my $rss_feed = XML::RSS::Tools->new;  

my $ua          = LWP::UserAgent->new;
$ua->agent('pps 0.1.83 [rss]');
my $rss         = "http://www.scottishlass.co.uk/rss.xml";

my $request     = HTTP::Request->new(GET => $rss);
my $response    = $ua->request($request);
my $status      = $response->status_line;
my $type        = $response->header('Content-Type');

my %errors = ('500'=>'Bad hostname supplied',
              '501'=>'Protocol not supported',
              '404'=>'URL not found',
              '403'=>'URL forbidden',
              '401'=>'Authorization failed',
              '400'=>'Bad request found',
              '302'=>'Redirected URL'
);

($status) = ($status =~ /(\d+)/);
if (defined($errors{$status})) {
        die "ERROR: $errors{$status}\n";
} else {
        my $content     = $response->content;

        $rss_feed->rss_string($content);
        $rss_feed->xsl_file('rss.xsl');
        $rss_feed->transform;

        my $parsed = $rss_feed->as_string;
        my $decoded = HTML::Entities::decode($parsed);

        parse_links($decoded);
#       print $decoded;
}

sub parse_links {

        my $decoded = shift;
        my @links = ();

        my $callback = sub {
                my($tag, %attr) = @_;
                return if $tag ne 'a';
                push(@links, values %attr);
        };

        my $p = HTML::LinkExtor->new($callback);
        $p->parse($decoded);
        my %seen;
        my @uniq = grep { ! $seen{$_} ++ } @links;
        print join("<br />", @links), "\n";
}
[download]

Domain Nodelet^?

www.com | www.net | www.org

Chatterbox^?

How do I use this? • Last hour • Other CB clients

Other Users^?

Others admiring the Monastery: (7)

As of 2024-04-23 20:50 GMT

Sections^?

Information^?

Find Nodes^?

Leftovers^?

Today I Learned

Voting Booth^?

No recent polls found


There's more than one way to do things
	PerlMonks