#!c:\perl\bin\perl
use LWP::UserAgent;
use XML::RSS;
#We're running this off of a Windows machine, connecting to a M$SQL se
+rver
# although any old SQL server would do (e.g. MySQL)
use Win32::ODBC;
$DSN = "TESTSERVER";
#Create a new UserAgent to pull the XML data down
$ua = new LWP::UserAgent;
$ua->agent("HeadlineAgent/0.1 ".$ua->agent);
#connect via ODBC to the SQL server
if(!($db = new Win32::ODBC($DSN))){
print "Error connecting to $DSN\n";
print "Error: " . Win32::ODBC::Error() . "\n";
exit;
}
# We'll be pulling in RSS files from various sources,
# their URL's are stored in the SQL database
my %sources;
if($db->Sql("SELECT * FROM ExternalNewsSources"))
{
print "SQL failed.\n";
print "Error: " . $db->Error() . "\n";
$db->Close();
exit;
}
while($db->FetchRow()){
my(%data) = $db->DataHash();
# ...process the data...
# Add to hash of hashes
$sources{$data{'ExternalNewsSourceID'}} = $data{'Source'};
}
#Create the RSS object to parse the RSS files retrieved...
my $rss = new XML::RSS;
($sec,$min,$hour,$mday,$mon,$year) = localtime(time);
# preformatted string compatible with SQLServer's timestamp field
$nowstring = sprintf("%02i/%02i/%i %02i:%02i:%02i",($mon+1),$mday,($ye
+ar+1900),$hour,$min,$sec);
#Walk through each of the XML sources
foreach $sourceid(keys %sources)
{
# fetch RSS file from the source's URL
my $request = new HTTP::Request GET => $sources{$sourceid};
my $result = $ua->request($request);
if($result->is_success)
{
# grok the RSS file retrieved
$rss->parse($result->content);
# Step through all the links in the RSS
for my $i (@{$rss->{items}})
{
# Check to see if we've already seen this link from this source
+before...
$db->Sql("SELECT * FROM ExternalNews WHERE SourceID=".$sou
+rceid." AND Link = '".$i->{'link'}."'");
if($db->FetchRow())
{
#skip it - it's here already...
}
#Sometimes the RSS mis-parses and give us an empty item
elsif(length($i->{'title'}) <= 0)
{
#skip it - it's empty...
}
else
{
#Plunk it into the database
$db->Sql("INSERT INTO ExternalNews (SourceID,PostDate,
+Title,Link,Description) VALUES ($sourceid,'$nowstring','".$i->{'title
+'}."','".$i->{'link'}."','".$i->{'description'}."')");
}
# Nuke the current values in the object, it appears that the XML lib r
+ecycles the variables without clearing them...
$i->{'title'} = '';
$i->{'link'} = '';
$i->{'description'} = '';
}
}
else
{
print "Doh! couldnt get ".$sources{$sourceid}.": $!\n";
}
}
#clean up
$db->Close();
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.