+---------+------------------------+-----------------------------------------------------------+
| Column  | Type                   | Modifiers                                                 |
+---------+------------------------+-----------------------------------------------------------+
| sid     | integer                | not null default nextval('public.rsssites_sid_seq'::text) |
| title   | character varying(255) | not null                                                  |
| url     | character varying(255) | not null                                                  |
| active  | boolean                |                                                           |
| baseurl | character varying(255) |                                                           |
+---------+------------------------+-----------------------------------------------------------+
Indexes: rsssites_pkey primary key btree (sid),
         rsssites_sid_key unique btree (sid)

+--------+------------------------+-----------+
| Column | Type                   | Modifiers |
+--------+------------------------+-----------+
| cid    | integer                |           |
| title  | character varying(255) | not null  |
| url    | character varying(255) | not null  |
+--------+------------------------+-----------+
Foreign Key constraints: rss_site FOREIGN KEY (cid) REFERENCES rsssites(sid) ON UPDATE CASCADE ON DELETE CASCADE

####
#!/usr/bin/perl
use strict;
use warnings;

#############################################################################
# Takes rss files from across the internet and sticks them into the database.
# Best run from cron.
#
# For every active row in rsssites the feed at its url is downloaded and
# parsed, and the rows in rsscontent belonging to that site are replaced
# with the feed's items inside a single transaction.
#
# The first command line argument, when given, is used as the debug flag:
# a true value prints progress messages on STDOUT.
#############################################################################

use LWP::Simple;
use XML::RSS;
use DBI;

my $PID_FILE = '/var/run/retrieve_rss.pid';

### only one copy at a time: kill the process named in a left-over PID file
if ( -f $PID_FILE ) {
    if ( open( my $old_fh, '<', $PID_FILE ) ) {
        my $line = <$old_fh>;
        close($old_fh);

        # only signal something that really looks like a PID, and never
        # ourselves or PID 0 (which would address our own process group)
        if ( defined $line && $line =~ /\A\s*(\d+)\s*\z/ ) {
            my $old_pid = $1;
            kill( 'KILL', $old_pid ) if $old_pid > 0 && $old_pid != $$;
        }
    }
    unlink($PID_FILE);
}

### record our own PID; failing to do so is reported but is not fatal
if ( open( my $pid_fh, '>', $PID_FILE ) ) {
    print {$pid_fh} $$, "\n";
    close($pid_fh) or warn "Cannot close $PID_FILE: $!\n";
}
else {
    warn "Cannot write $PID_FILE: $!\n";
}

my $DEBUG = defined $ARGV[0] ? $ARGV[0] : 0;

### initial setup
my $dbh = DBI->connect( 'DBI:Pg:dbname=DBNAME', 'DBUSER', 'DBPASS',
    { AutoCommit => 0 } );
unless ($dbh) {
    unlink($PID_FILE);
    die "Cannot connect to database: $DBI::errstr\n";
}

# Fetch the whole site list before doing any work: get_links() commits and
# rolls back on this same connection, which should not happen while a
# statement handle is still being iterated.
my $sites = $dbh->selectall_arrayref(
    'select sid,url from rsssites where active is true');

if ($sites) {
    foreach my $site ( @{$sites} ) {
        my ( $site_id, $site_url ) = @{$site};

        # one broken feed must not stop the remaining ones
        eval { get_links( $site_id, $site_url ) };
        if ($@) {
            print $@, "\n";

            # throw away whatever the failed feed left half done
            $dbh->rollback;
        }
    }
}
else {
    my $error = $dbh->errstr;
    warn 'Cannot read the site list: '
        . ( defined $error ? $error : 'unknown error' ) . "\n";
}

### disconnect
$dbh->disconnect;

### done
unlink $PID_FILE;
1;

### this is for rss sites
#
# get_links($id, $url)
#   $id  - rsssites.sid of the feed; stored as rsscontent.cid
#   $url - address the feed is downloaded from
#
# Downloads and parses the feed, then replaces the rsscontent rows of $id
# with the feed's items.  The delete and all inserts are committed together;
# on any database error the transaction is rolled back so the previously
# cached rows stay in place.  Returns nothing.  An exception thrown by the
# parser propagates to the caller.
sub get_links {
    my ( $id, $url ) = @_;

    ### DEBUG
    print "Getting links for $url\n" if $DEBUG;

    my $document = get($url);
    return unless defined $document && length $document;

    # clean the string (this fixes some broken rss):
    # normalise CR and CRLF line ends to LF ...
    $document =~ s/\015\012?/\012/g;

    # ... and escape every ampersand that does not already start a named,
    # decimal or hexadecimal character reference
    $document =~ s/&(?!(?:[a-zA-Z0-9]+|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;

    # parse a string
    my $rss = XML::RSS->new( Style => 'Debug' ) || return;
    $rss->parse($document) || return;

    # clear out the db, check for a failure, rollback, and move on...
    unless ( clear_db($id) ) {
        $dbh->rollback;
        return;
    }

    # prepared once per feed instead of once per item
    my $insert = $dbh->prepare(
        'insert into rsscontent (cid,title,url) values (?,?,?)');
    unless ($insert) {
        my $error = $dbh->errstr;
        $dbh->rollback;
        print "Rolling back $url: $error\n" if $DEBUG;
        return;
    }

    foreach my $item ( @{ $rss->{'items'} } ) {
        my $title = $item->{'title'};
        my $link  = $item->{'link'};

        # title and url are both "not null" in rsscontent; an item lacking
        # either is skipped instead of failing the whole feed
        unless ( defined $title && defined $link ) {
            print "Skipping item without title or link in $url\n" if $DEBUG;
            next;
        }
        chomp( $title, $link );

        ### remove unsightly site specific links
        next if $title =~ /Customize this feed/i;

        ### stick it into the database
        # The result of execute is tested directly: DBI resets the error
        # state at the start of most method calls, so looking at errstr
        # after some later call can miss the failure.
        unless ( $insert->execute( $id, $title, $link ) ) {
            my $error = $insert->errstr;
            $dbh->rollback;
            print "Rolling back line [$title][$link]: $error\n" if $DEBUG;
            return;
        }
    }

    # every insert succeeded: commit, and roll back if the commit itself fails
    print "Committing for $url\n" if $DEBUG;
    unless ( $dbh->commit ) {
        my $error = $dbh->errstr;
        print "Rolling back $url: $error\n" if $DEBUG;
        $dbh->rollback;
    }

    return;
}

# clear_db($sid)
#   $sid - rsssites.sid whose cached rows are to be removed
#
# Deletes every rsscontent row with cid = $sid.  Neither commits nor rolls
# back; that is left to the caller.  Returns 1 on success and 0 when $sid
# is undefined or the delete fails.
sub clear_db {
    my ($sid) = @_;

    return 0 unless defined $sid;

    # do() returns undef on error and a true value ("0E0") for zero rows
    my $rows = $dbh->do( 'delete from rsscontent where cid = ?', undef, $sid );
    if ( defined $rows ) {
        print "Successfully cleared content for $sid\n" if $DEBUG;
        return 1;
    }

    print "Failed to clear content for $sid: $DBI::errstr\n" if $DEBUG;
    return 0;
}

####
#!/usr/bin/perl
use strict;
use warnings;

#############################################################################
# CGI front end to the cached feeds.
#
# Without a "site" parameter the output is an RSS document listing every
# row of rsssites, each item linking back to this script with ?site=<sid>.
# With "site" set to a known sid the output is an RSS document built from
# the rsscontent rows of that site.
#############################################################################

use DBI;
use XML::RSS;
use CGI qw(:standard);

my $cgi = CGI->new();
my $dbh = DBI->connect( 'DBI:Pg:dbname=DBNAME', 'DBUSER', 'DBPASS' );
unless ($dbh) {
    print $cgi->header(),
        $cgi->start_html('Oops'),
        $cgi->h1('We have a problem'),
        $cgi->end_html();
    exit;
}

my $rss = XML::RSS->new();

# Read the parameter once, in scalar context: in list context a request
# with several site= values would yield several values.
my $site = $cgi->param('site');

if ( defined $site ) {

    # The value comes straight from the request, so it is accepted only
    # when it is all digits and is always handed to the database as a bind
    # value, never pasted into the SQL text.
    my $site_data;
    if ( $site =~ /\A\d+\z/ ) {
        $site_data = $dbh->selectrow_hashref(
            'select title,baseurl from rsssites where sid = ?',
            undef, $site );
    }

    unless ($site_data) {
        $dbh->disconnect();
        print $cgi->header( -status => '404 Not Found' ),
            $cgi->start_html('Oops'),
            $cgi->h1('Unknown site'),
            $cgi->end_html();
        exit;
    }

    $rss->channel(
        title       => $site_data->{title},
        link        => $site_data->{baseurl},
        description => $site_data->{title}
    );

    my $query = 'select title,url from rsscontent where cid = ?';
    my $sth   = $dbh->prepare($query);
    if ( $sth && $sth->execute($site) ) {
        while ( my ( $title, $link ) = $sth->fetchrow_array() ) {

            # NOTE(review): XML::RSS may also encode on output, which
            # would escape the title twice - confirm against the
            # installed version.
            $title = $cgi->escapeHTML($title);
            $rss->add_item(
                title => $title,
                link  => $link,
            );
        }
    }
}
else {
    ### no site param
    $rss->channel(
        title       => 'RSS caching system',
        link        => 'http://www.localhost/cgi-bin/rss',
        description => 'RSS interface to cached news',
    );
    $rss->image(
        title => 'Localhost',
        url   => 'http://www.localhost/images/favicon.png',
        link  => 'http://www.localhost',
    );
    $rss->textinput(
        title       => 'Localhost search',
        description => 'Use the text input below to search Localhost',
        name        => 'search_term',
        link        => 'http://www.localhost/search'
    );

    my $query = 'select sid,title from rsssites';
    my $sth   = $dbh->prepare($query);
    if ( $sth && $sth->execute() ) {
        while ( my ( $sid, $title ) = $sth->fetchrow_array() ) {
            $rss->add_item(
                title => $title,
                link  => "http://www.localhost/cgi-bin/rss?site=$sid"
            );
        }
    }
}

$dbh->disconnect();

print $cgi->header( -type => 'text/xml' );
print $rss->as_string;
1;