http://www.perlmonks.org?node_id=524871

Looking at my Gaim plugin which notifies me when a question gets posted on perlmonks.com, I thought it might be helpful to funnel that info into an IRC channel instead.

If you connect to the IRC server on irc.freenode.net and run /join #pm2irc, you'll see the pm2irc bot sitting there. Every ten minutes, it gets the newest questions from perlmonks.com and if there's new ones, it posts their topic and URL to the IRC channel. If you run a graphical IRC client like Gaim, all you need is click on the link to open up a browser showing the question page. Handy for XP-hungry monks.

Here's the script that does it. I'll let it run for a while to collect feedback, let me know if you find it useful. Ideally, we wouldn't have to scrape the site but have a process on perlmonks funnelling the realtime data into an IRC channel, wouldn't that be great?

#!/usr/bin/perl -w ########################################### # pm2irc - Scrape perlmonks.com and post # new questions to #pm2irc ########################################### use strict; use Bot::BasicBot; use HTML::TreeBuilder; use URI::URL; use CGI qw(a); use Cache::FileCache; use HTTP::Request::Common; use Log::Log4perl qw(:easy); use POE; use POE::Component::Client::HTTP; our $CHANNEL = "#pm2irc"; our $USER = "pm2irc"; our $FETCH_INTERVAL = 600; our $FETCH_URL = "http://perlmonks.com/" . "?node=Newest%20Nodes"; Log::Log4perl->easy_init($INFO); our $cache = new Cache::FileCache({ namespace => "pm2irc", }); my $Bot = Bot::BasicBot->new( server => 'irc.freenode.net', channels => [$CHANNEL], nick => $USER, ); DEBUG "Setting up pm2irc POE components"; POE::Component::Client::HTTP->spawn( Alias => "ua", Timeout => 60, ); POE::Session->create( inline_states => { _start => sub { # Wait 20 secs before the first post $poe_kernel->delay('http_start', 20); }, http_start => sub { DEBUG "Fetching url $FETCH_URL"; $poe_kernel->post("ua", "request", "http_ready", GET $FETCH_URL); $poe_kernel->delay('http_start', $FETCH_INTERVAL); }, http_ready => sub { DEBUG "http_ready $FETCH_URL"; my $resp= $_[ARG1]->[0]; if($resp->is_success()) { pm_update($resp->content()); } else { ERROR "Can't fetch $FETCH_URL: ", $resp->message(); } }, } ); DEBUG "The dance begins ..."; $Bot->run(); ########################################### sub pm_update { ########################################### my($html_text) = @_; if(my @nws = latest_news($html_text)) { for(@nws) { INFO "Sending '$_' to channel"; $Bot->say(channel => $CHANNEL, body => "$_", ); } } } ########################################### sub latest_news { ########################################### my($html_string) = @_; my $start_url = URI::URL->new($FETCH_URL); my $max_node; my $saved = $cache->get("max-node"); $saved = 0 unless defined $saved; my @aimtext = (); for my $entry (@{qparse($html_string)}) { my($text, $url) = @$entry; my($node) = $url =~ /(\d+)$/; if($node > $saved) { INFO "New node $text ($url)"; push @aimtext, "$text $url"; } $max_node = $node if !defined $max_node or $max_node < $node; } $cache->set("max-node", $max_node) if $saved < $max_node; return @aimtext; } ########################################### sub qparse { ########################################### my($html_string) = @_; my $start_url = URI::URL->new($FETCH_URL); my @questions = (); my $parser = HTML::TreeBuilder->new(); my $tree = $parser->parse($html_string); my($questions) = $tree->look_down( "_tag", "a", "name", "toc-Questions"); if(! $questions) { ERROR "Couldn't find Questions section"; return undef; } my $node = $questions->parent(); while($node->tag() ne "table") { $node = $node->right(); } for my $tr ($node->look_down( "_tag", "tr")) { for my $a ($tr->look_down( "_tag", "a")) { my $href = $a->attr('href'); my $text = $a->as_text(); my $url = URI::URL->new($href, $start_url); push @questions, [$text, $url->abs()]; # Process only the question # node, not the author's node last; } } $tree->delete(); return \@questions; }