package Spider::LWP; use strict; use Exporter(); use HTTP::Request; use LWP::Parallel::UserAgent qw(:CALLBACK); use Spider::URL; use File::Path; use Data::Dump qw(dump); use vars qw (@ISA @EXPORT); @ISA = qw(LWP::Parallel::UserAgent Exporter); @EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK; sub new { my ($class,$depth,$path,$max_sockets) = @_; my $object = bless(LWP::Parallel::UserAgent->new(),$class); $object->{current} = {}; $object->{queue} = {}; $object->{finished} = {}; $object->{failed} = {}; $object->{depth} = $depth; $object->{path} = $path; $object->{completed} = 0; $object->{max_sockets} = $max_sockets; return $object; } sub on_return { my ($self, $request, $response, $entry) = @_; if ($response->is_success) { $response->content($response->decoded_content); $self->{finished}->{$request->url->as_string} = $self->{current}->{$request->url->as_string}; delete $self->{current}->{$request->url->as_string}; $self->_process_return($response); # print "\n\nWoa! Request to ",$request->url," returned code ", $response->code, ": ", $response->message, "\n"; } else { on_failure($self, $request, $response, $entry); } $self->discard_entry($entry); return; } sub _process_return{ my ($self,$response) = @_; my $html = $response->header('content-type') =~ /text\/html/i; my $key = $response->request->url->as_string; my $path = _build_path($self->{path},$self->{finished}->{$key}->{target}); _write_page($path,$response->request->uri,$response->content,$html); my $maxed = $self->{depth} - $self->{finished}->{$key}->{current_depth}; if($html){ my $urls = _get_urls($response->content,$response->base,$maxed,);#\$ignore,\$exclude);#biggest waiting point foreach my $url(@$urls){ if($url !~ /\w/){next;} if(! defined $self->{current}->{$$url[0]} and ! defined $self->{queue}->{$$url[0]} and ! defined $self->{finished}->{$$url[0]}){ $self->{queue}->{$$url[0]} = Spider::URL->new($$url[0],$self->{finished}->{$key}->{target},$self->{finished}->{$key}->{current_depth}+1); #print "Putting $$url[0] into the queue\n"; } } } my @keys = keys %{$self->{queue}}; while((keys %{$self->{current}}) < $self->{max_sockets} and $#keys > -1){ $self->{current}->{$keys[0]} = $self->{queue}->{$keys[0]}; #print "Putting $keys[0] into the current\n"; delete $self->{queue}->{$keys[0]}; my $req = HTTP::Request->new('GET', "$keys[0]"); $req->header('Accept-Encoding'=>'gzip,x-gzip,x-bzip2,deflate,compress,base64,quoted-printable'); $self->register($req); @keys = keys %{$self->{queue}}; } if($#keys < 0){$self->{completed} = 1;} }