package Spider::URL;
use strict;
sub new{
my ($class,$url,$target,$current_depth,$referer) = @_;
my $object = bless {}, $class;
$object->{url} = $url;
$object->{current_depth} = $current_depth;
$object->{target} = $target;
$object->{referer} = $referer;
return $object;
}
1;
Next is the object that inherits from LWP::Parallel::UserAgent:
package Spider::LWP;
use strict;
use Exporter();
use HTTP::Request;
use LWP::Parallel::UserAgent qw(:CALLBACK);
use Spider::URL;
use File::Path;
use Data::Dump qw(dump);
use vars qw (@ISA @EXPORT);
@ISA = qw(LWP::Parallel::UserAgent Exporter);
@EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK;
sub new {
my ($class,$depth,$path,$max_sockets) = @_;
my $object = bless(LWP::Parallel::UserAgent->new(),$class);
$object->{current} = {};
$object->{queue} = {};
$object->{finished} = {};
$object->{failed} = {};
$object->{depth} = $depth;
$object->{path} = $path;
$object->{completed} = 0;
$object->{max_sockets} = $max_sockets;
return $object;
}
sub on_return {
my ($self, $request, $response, $entry) = @_;
if ($response->is_success) {
$response->content($response->decoded_content);
$self->{finished}->{$request->url->as_string} = $self-
+>{current}->{$request->url->as_string};
delete $self->{current}->{$request->url->as_string};
$self->_process_return($response);
# print "\n\nWoa! Request to ",$request->url," returned
+code ", $response->code, ": ", $response->message, "\n";
} else {
on_failure($self, $request, $response, $entry);
}
$self->discard_entry($entry);
return;
}
sub _process_return{
my ($self,$response) = @_;
my $html = $response->header('content-type') =~ /text\/html/i;
my $key = $response->request->url->as_string;
my $path = _build_path($self->{path},$self->{finished}->{$key}
+->{target});
_write_page($path,$response->request->uri,$response->content,$
+html);
my $maxed = $self->{depth} - $self->{finished}->{$key}->{curre
+nt_depth};
if($html){
my $urls = _get_urls($response->content,$response->bas
+e,$maxed,);#\$ignore,\$exclude);#biggest waiting point
foreach my $url(@$urls){
if($url !~ /\w/){next;}
if(! defined $self->{current}->{$$url[0]} and
! defined $self->{queue}->{$$url[0]} and
! defined $self->{finished}->{$$url[0]}){
$self->{queue}->{$$url[0]} = Spider::U
+RL->new($$url[0],$self->{finished}->{$key}->{target},$self->{finished
+}->{$key}->{current_depth}+1);
#print "Putting $$url[0] into the queue\n";
}
}
}
my @keys = keys %{$self->{queue}};
while((keys %{$self->{current}}) < $self->{max_sockets} and $#
+keys > -1){
$self->{current}->{$keys[0]} = $self->{queue}->{$keys[
+0]};
#print "Putting $keys[0] into the current\n";
delete $self->{queue}->{$keys[0]};
my $req = HTTP::Request->new('GET', "$keys[0]");
$req->header('Accept-Encoding'=>'gzip,x-gzip,x-bzip2,d
+eflate,compress,base64,quoted-printable');
$self->register($req);
@keys = keys %{$self->{queue}};
}
if($#keys < 0){$self->{completed} = 1;}
}
Last is the main.pl which initializes and calls everything:
#!/usr/bin/perl
use lib ".";
use Spider::LWP; #Overloaded version of LWP
use HTTP::Request;
use Googly::Search;
use Getopt::Long;
use File::Slurp;
use Data::Dump qw(dump);
sub main{
my @list = $ARGV[0]; #starting dir of lists of participating URLS.
foreach(@list){
if($_ !~ /\w/){next;}
chomp;
$urls = GetListOfSta
rtingURLS($_);
my $path_addition = $_;
$path_addition =~ s/[\.\s+\\\/\:\*\"\?\<\>\|]+/_/g;
$path_addition =~ s/[\_\s*]$//;
foreach my $url(@$urls){
my $req = HTTP::Request->new('GET', $url);
$ua->{current}->{$req->uri->as_string} = Spide
+r::URL->new($req->uri->as_string,$path_addition,0);
$req->header('Accept-Encoding'=>'gzip,x-gzip,x
+-bzip2,deflate,compress,base64,quoted-printable');
$ua->register ($req);
}
}
$SIG{INT} = sub {
#print dump $ua;
print_queue($ua->{queue},"QUEUE");
print_queue($ua->{current},"CURRENT");
print_queue($ua->{failed},"FAILED");
print_queue($ua->{finished},"FINISHED");
exit;
};
$ua->wait(300); # block until we are all finished or until ever
+ything has stopped for 5 minutes
}
|