For the past week I have been using this crawler to get the co-author lists of DBLP authors. It worked fine, but now it has stopped working and I don't know where the problem is: it doesn't show any error, it just doesn't crawl anything (I end up with an empty results file). Maybe the problem is with the DBLP page itself. Here are the files I've used. Please help me get this script working again, because I need it to finish a project.
#!/usr/bin/perl
# This script scrapes co-author info for a specific person from DBLP.
# Revision 1 intended use is to supply author name on cmd line,
# with a list of co-authors provided as output, one per line.
# Revision 2 methodizes the crawler to provide multi-lvl crawling.
# Revision 3 encapsulates the crawler in a loop to run against a list
# of authors.
# Status output is written to STDOUT, errors (such as unfound authors)
# are written to STDERR, and coauthor info is written to the file
# specified below.
# Import Perl's WWW library for quick & easy web retrieval.
# utf8 allows unicode chars in this script; HTML::Entities provides the
# HTML unicode conversion methods.
use utf8;
use LWP::Simple;
use HTML::Entities;
use strict;
use warnings;

# Inits; $sleep indicates time to wait between each author crawl.
my %conflicts    = ();   # Co-authors of the author being crawled: DBLP key => display name (filled by Crawl).
my %index        = ();   # DBLP key => running number.
# NOTE(review): the script is reported to crawl nothing without raising an
# error. DBLP may have moved or changed its page layout since this address
# was written -- confirm that this URL and the pattern used in Crawl still
# match the live pages.
my $base_url     = 'http://www.informatik.uni-trier.de/~ley/db/indices/a-tree/';
my $sleep        = 0;
my $outfile      = 'conf.txt.';   # NOTE(review): the trailing dot looks unintended -- confirm.
my $index_number = 0;
my $index_dy     = 0;

# DBLP full names for some authors otherwise not found in catalog.
# The list itself was snipped from the posted script, and nothing below
# consults it; it is kept empty here so it can be filled in again.
my %fullnames = ();

# Read the input data: one reviewer name per line. Surrounding whitespace
# (including a CR from Windows line endings) is stripped and blank lines are
# skipped, so they cannot turn into bogus author keys.
my $filename = "author.txt";
open(my $input, '<', $filename) or die "Can't open file $filename: $!\n";
my @reviewers;
while (my $line = <$input>)
{
    $line =~ s/^\s+|\s+$//g;
    push @reviewers, $line if length $line;
}
close $input;
my $num_reviewers = scalar @reviewers;

# Make sure the output file exists... use > to start over, >> to continue.
open(my $touch, '>>', $outfile) or die "Can't open file $outfile: $!\n";
close $touch;

# Disable buffering on STDOUT so the progress log shows up in realtime.
# STDOUT is the currently selected handle, so setting $| affects it.
$| = 1;

# Raise this to skip reviewers already handled in an earlier run; it is
# compared against the 1-based position of the reviewer in the input file.
my $resume_from = 0;

# Loop over all reviewers, formatting name and calling the Crawler.
my $count = 0;
foreach my $orig_name (@reviewers)
{
    # Loop inits; clear the conflicts hash.
    $count++; $index_number++; $index_dy++;
    next if ($count < $resume_from);   # Skip to current guy (or gal).
    %conflicts = ();
    print 'Working on ', $orig_name, ', # ', $count, ' of ', $num_reviewers, '... ';

    # Format reviewer name to match DBLP specs: non-ASCII characters become
    # HTML entities, every character that is neither a word character nor
    # whitespace becomes '=', and the key is 'Last:First_Middle...'. The last
    # token is the surname; all preceding tokens are joined with '_', so
    # names with more than three parts keep every part.
    my $key_source = encode_entities($orig_name);
    $key_source =~ s/[^\w\s]/=/g;
    my @given     = split ' ', $key_source;
    my $surname   = pop @given;
    my $formatted = $surname . ':' . join('_', @given);
    $index{$formatted} = $index_dy;

    # Call the crawler method with formatted name, e.g. 'Fox:Edward_A='.
    Crawl($formatted);

    # Output the results, one line per co-author. Keys are sorted so that
    # repeated runs produce the lines in the same order.
    open(my $out, '>>', $outfile) or die "Can't open file $outfile: $!\n";
    foreach my $key (sort keys %conflicts)
    {
        print {$out} $index{$formatted}, '=', $orig_name, '=', $conflicts{$key}, ' ', "\n";
    }
    close $out or die "Can't write file $outfile: $!\n";

    # Second level: crawl every co-author found above.
    # NOTE(review): the co-authors collected by these calls are not written
    # anywhere; %conflicts is cleared again at the start of the next
    # reviewer. The calls only assign running numbers in %index. Confirm
    # what the second-level output is supposed to look like.
    my @first_level = sort keys %conflicts;
    %conflicts = ();
    foreach my $conflict (@first_level)
    {
        Crawl($conflict);
    }

    # Finished with this reviewer, wait $sleep seconds before starting next.
    print 'done.', "\n";
    sleep $sleep;
}
# Crawl($name)
#
# Fetches the DBLP author page for $name (a DBLP-formatted key such as
# 'Fox:Edward_A=') and records every co-author listed on it.
#
# Side effects on file-level state:
#   %conflicts     gains one entry per co-author: DBLP key => display name.
#   %index         every co-author key not seen before is given the next
#   $index_number  running number.
#
# Returns 0 once the page has been parsed, or an empty list when the page
# could not be retrieved (a warning goes to STDERR in that case).
# Dies when called without a name.
sub Crawl
{
    # Compose author name for retrieval.
    my ($name) = @_;
    die "Bad usage of method Crawl." unless defined $name && length $name;

    # Construct author URL: the page sits in a sub-directory named after the
    # lower-cased first character of the key.
    my $category = lc(substr($name, 0, 1));
    my $url      = $base_url . $category . '/' . $name . '.html';

    # LWP::Simple's get returns undef on failure. The check has to be its
    # own statement: "get(...) || warn ..." would store warn's true return
    # value in $page and the code below would go on to parse it. $! is not
    # reported because get does not set it.
    my $page = get($url);
    unless (defined $page)
    {
        warn "Couldn't get ${url}";
        return ();
    }

    # Find co-authors list at bottom and parse out all names & URLs.
    while ($page =~ m{<\/td>                 # Closes the preceding cell.
                      <td\sclass="coauthor"\salign="right"\sbgcolor="[^"]+">
                      <a\shref="([^"]+)">    # Matches relative link to coauthor page.
                      ([^>]+)<\/a>           # Matches co-author name.
                     }mgx)
    {
        # Copy the captures before any other pattern match can clobber them.
        my ($href, $coauth_name) = ($1, $2);

        # The co-author's DBLP key is the file name of the linked page
        # without its ".html" suffix. Taking the last path segment works
        # however many directory parts the relative link has.
        my $coauth = (split m{/}, $href)[-1];
        next unless defined $coauth && length $coauth;
        $coauth =~ s/\.html$//;

        # Save this co-author.
        $conflicts{$coauth} = decode_entities($coauth_name);
        $index{$coauth} = ++$index_number unless exists $index{$coauth};
    }
    return 0;
}
The author file, author.txt:
James F. Blakesley
James F. Blinn
James F. Blowey
James F. Bowring