Table "rsssites"
+---------+------------------------+-----------------------------------------------------------+
| Column | Type | Modifiers |
+---------+------------------------+-----------------------------------------------------------+
| sid | integer | not null default nextval('public.rsssites_sid_seq'::text) |
| title | character varying(255) | not null |
| url | character varying(255) | not null |
| active | boolean | |
| baseurl | character varying(255) | |
+---------+------------------------+-----------------------------------------------------------+
Indexes: rsssites_pkey primary key btree (sid),
rsssites_sid_key unique btree (sid)
Table "rsscontent"
+--------+------------------------+-----------+
| Column | Type | Modifiers |
+--------+------------------------+-----------+
| cid | integer | |
| title | character varying(255) | not null |
| url | character varying(255) | not null |
+--------+------------------------+-----------+
Foreign Key constraints: rss_site FOREIGN KEY (cid) REFERENCES rsssites(sid) ON UPDATE CASCADE ON DELETE CASCADE
####
#!/usr/bin/perl
use strict;
use warnings;
#############################################################################
# Takes rss files from across the internet and sticks them into the database.
# Best run from cron
#############################################################################
# Single-instance guard: if a PID file from an earlier run exists, kill that
# process, remove the stale file, then record our own PID.
# The bare block keeps the helper lexicals out of the rest of the script.
{
    my $pid_file = '/var/run/retrieve_rss.pid';

    if ( -f $pid_file ) {
        # Read the old PID in-process instead of shelling out to cat/kill/rm.
        if ( open( my $old_fh, '<', $pid_file ) ) {
            my $old_pid = <$old_fh>;
            close($old_fh);

            # Only signal a well-formed, positive PID: an empty or corrupt
            # file must never turn into "kill 0" (our own process group).
            if ( defined $old_pid && $old_pid =~ /\A\s*(\d+)\s*\z/ ) {
                my $target = $1;
                kill 'KILL', $target if $target > 0;
            }
        }
        else {
            warn "Cannot read $pid_file: $!\n";
        }
        unlink $pid_file;
    }

    # Failing to write the PID file is reported but is not fatal, so the
    # fetch still runs (e.g. when started without write access to /var/run).
    if ( open( my $pid_fh, '>', $pid_file ) ) {
        print {$pid_fh} $$, "\n";
        close($pid_fh) or warn "Cannot close $pid_file: $!\n";
    }
    else {
        warn "Cannot write $pid_file: $!\n";
    }
}
# Debug flag: first command-line argument, defaulting to 0 (quiet).
my $DEBUG = defined $ARGV[0] ? $ARGV[0] : 0;
### initial setup
use LWP::Simple;
use XML::RSS;
use DBI;
# AutoCommit is off so that get_links() can treat each feed's delete and
# inserts as one transaction and commit or roll it back as a whole.
my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS',{AutoCommit => 0});
unless ($dbh) {
    # Do not leave a PID file behind for a process that is about to exit.
    unlink '/var/run/retrieve_rss.pid';
    die "Cannot connect to database: $DBI::errstr\n";
}
my $sth = $dbh->prepare('select sid,url from rsssites where active is true');
$sth->execute();
while ( my ($site_id, $site_url) = $sth->fetchrow_array() ) {
    # Block eval (not string eval): the call is compiled once, and a feed
    # that dies is reported without aborting the remaining feeds.
    my $ok = eval { get_links($site_id, $site_url); 1 };
    next if $ok;

    my $error = $@;
    print $error, "\n" if $error;

    # The feed may have died after its old rows were deleted; discard that
    # half-finished transaction so the next feed's commit cannot include it.
    $dbh->rollback;
}
### disconnect
$sth->finish();
$dbh->disconnect;
### done
unlink '/var/run/retrieve_rss.pid';
1;
### this is for rss sites
# get_links($id, $url)
#
# Fetches the RSS document at $url, parses it, and replaces the rsscontent
# rows whose cid is $id with the feed's current items. The delete and all
# inserts run in the caller's open transaction (file-level $dbh): they are
# committed together, or rolled back together on the first failure.
#
#   $id  - value stored in rsscontent.cid for every inserted item
#   $url - address of the RSS document to download
#
# Returns nothing. Gives up silently when the download or parse yields a
# false value; progress messages are printed only when $DEBUG is true.
sub get_links {
    my ($id, $url) = @_;

    ### DEBUG
    print "Getting links for $url\n" if $DEBUG;

    my $document = get($url) || return;

    # clean the string (this fixes some broken rss)
    # 1. normalise CRLF / bare CR line endings to LF
    $document =~ s/\015\012?/\012/g;
    # 2. escape every "&" that does not already start a named, decimal or
    #    hexadecimal entity, so the XML parser does not reject the document
    $document =~ s/&(?!(?:[a-zA-Z0-9]+|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;

    # parse a string
    my $rss = XML::RSS->new(Style => 'Debug') || return;
    $rss->parse($document) || return;

    # clear out the db, check for a failure, rollback, and move on...
    unless ( clear_db($id) ) {
        $dbh->rollback;
        return;
    }

    # Prepare the insert once; it is executed for every item below.
    my $insert = $dbh->prepare('insert into rsscontent (cid,title,url) values (?,?,?)');
    unless ($insert) {
        my $error = defined $dbh->errstr ? $dbh->errstr : 'unknown error';
        $dbh->rollback;
        print "Rolling back $url: $error\n" if $DEBUG;
        return;
    }

    foreach my $item ( @{ $rss->{'items'} } ) {
        my $title = $item->{'title'};
        my $link  = $item->{'link'};
        chomp($title) if defined $title;
        chomp($link)  if defined $link;

        ### remove unsightly site specific links
        next if defined $title && $title =~ /Customize this feed/i;

        ### stick it into the database
        # Test execute()'s own return value: the handle's errstr is reset by
        # later DBI method calls, so checking it afterwards can miss a failure.
        next if $insert->execute($id, $title, $link);

        # Capture the message before rollback() resets it.
        my $error = defined $insert->errstr ? $insert->errstr : 'unknown error';
        $dbh->rollback;
        if ($DEBUG) {
            my ($shown_title, $shown_link) = map { defined $_ ? $_ : '' } $title, $link;
            print "Rolling back line [$shown_title][$shown_link]: $error\n";
        }
        return;
    }

    # Every insert succeeded, so commit; if the commit itself fails, roll back.
    print "Committing for $url\n" if $DEBUG;
    unless ( $dbh->commit ) {
        my $error = defined $dbh->errstr ? $dbh->errstr : 'unknown error';
        print "Rolling back $url: $error\n" if $DEBUG;
        $dbh->rollback;
    }
    return;
}
# clear_db($sid)
#
# Deletes every rsscontent row whose cid equals $sid, using the file-level
# $dbh. The delete is not committed here; that is left to the caller.
#
#   $sid - site id whose cached items should be removed
#
# Returns 1 when the delete statement succeeded (even if it matched no
# rows), 0 when $sid is undefined or the statement failed.
sub clear_db {
    my ($sid) = @_;
    return 0 unless defined $sid;

    # Bind $sid as a placeholder instead of interpolating it into the SQL.
    # do() returns undef on error and a true value ("0E0" for zero rows)
    # on success, so its return value is the reliable success test.
    my $deleted = $dbh->do('delete from rsscontent where cid = ?', undef, $sid);

    unless ( defined $deleted ) {
        print "Failed to clear content for $sid: $DBI::errstr\n" if $DEBUG;
        return 0;
    }

    print "Successfully cleared content for $sid\n" if $DEBUG;
    return 1;
}
##
##
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use XML::RSS;
use CGI qw(:standard);
# CGI entry point: emits an RSS document built from the cached tables.
#   ?site=N   -> channel for rsssites row N, one item per rsscontent row
#   no param  -> index feed with one item per row in rsssites
my $cgi = CGI->new();
my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS');
unless ( $dbh ) {
    print $cgi->header(), $cgi->start_html('Oops'), $cgi->h1('We have a problem'), $cgi->end_html();
    exit;
}
my $rss = XML::RSS->new();

# Read the parameter once, in scalar context: in list context a request with
# repeated "site" values would inject extra arguments into a call.
my $site = $cgi->param('site');

if ( defined $site ) {
    # The parameter is untrusted input. Accept only a plain non-negative
    # integer and always hand it to the database as a bind value.
    my ($sid) = $site =~ /\A\s*(\d+)\s*\z/;

    my $site_data;
    if ( defined $sid ) {
        $site_data = $dbh->selectrow_hashref(
            'select title,baseurl from rsssites where sid = ?', undef, $sid );
    }
    # Unknown or malformed site: fall back to an empty channel with no items.
    $site_data ||= {};

    $rss->channel(
        title       => $site_data->{title},
        link        => $site_data->{baseurl},
        description => $site_data->{title},
    );

    if ( defined $sid ) {
        my $sth = $dbh->prepare('select title,url from rsscontent where cid = ?');
        $sth->execute($sid);
        while ( my ($title, $link) = $sth->fetchrow_array() ) {
            # NOTE(review): XML::RSS may also encode text on output; confirm
            # that titles are not being escaped twice.
            $title = $cgi->escapeHTML($title);
            $rss->add_item(
                title => $title,
                link  => $link,
            );
        }
    }
}
else { ### no site param
    $rss->channel(
        title       => 'RSS caching system',
        link        => 'http://www.localhost/cgi-bin/rss',
        description => 'RSS interface to cached news',
    );
    $rss->image(
        title => 'Localhost',
        url   => 'http://www.localhost/images/favicon.png',
        link  => 'http://www.localhost',
    );
    $rss->textinput(
        title       => 'Localhost search',
        description => 'Use the text input below to search Localhost',
        name        => 'search_term',
        link        => 'http://www.localhost/search',
    );

    # One item per known site, each linking back to this script with ?site=
    my $sth = $dbh->prepare('select sid,title from rsssites');
    $sth->execute();
    while ( my ($sid, $title) = $sth->fetchrow_array() ) {
        $rss->add_item(
            title => $title,
            link  => "http://www.localhost/cgi-bin/rss?site=$sid",
        );
    }
}

$dbh->disconnect();
print $cgi->header( -type => 'text/xml' );
print $rss->as_string;
1;