http://www.perlmonks.org?node_id=575883


in reply to Re: Problems with 'discard_entry' in LWP::Parallel::Useragent
in thread LWP::Parallel::UserAgent discard_entry does not delete entry objects

Okay, here is the pertinent parts of my code:

First is a small container object

package Spider::URL; use strict; sub new{ my ($class,$url,$target,$current_depth,$referer) = @_; my $object = bless {}, $class; $object->{url} = $url; $object->{current_depth} = $current_depth; $object->{target} = $target; $object->{referer} = $referer; return $object; } 1;

Next is the object that inherits from LWP::Parallel::UserAgent:

package Spider::LWP; use strict; use Exporter(); use HTTP::Request; use LWP::Parallel::UserAgent qw(:CALLBACK); use Spider::URL; use File::Path; use Data::Dump qw(dump); use vars qw (@ISA @EXPORT); @ISA = qw(LWP::Parallel::UserAgent Exporter); @EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK; sub new { my ($class,$depth,$path,$max_sockets) = @_; my $object = bless(LWP::Parallel::UserAgent->new(),$class); $object->{current} = {}; $object->{queue} = {}; $object->{finished} = {}; $object->{failed} = {}; $object->{depth} = $depth; $object->{path} = $path; $object->{completed} = 0; $object->{max_sockets} = $max_sockets; return $object; } sub on_return { my ($self, $request, $response, $entry) = @_; if ($response->is_success) { $response->content($response->decoded_content); $self->{finished}->{$request->url->as_string} = $self- +>{current}->{$request->url->as_string}; delete $self->{current}->{$request->url->as_string}; $self->_process_return($response); # print "\n\nWoa! Request to ",$request->url," returned +code ", $response->code, ": ", $response->message, "\n"; } else { on_failure($self, $request, $response, $entry); } $self->discard_entry($entry); return; } sub _process_return{ my ($self,$response) = @_; my $html = $response->header('content-type') =~ /text\/html/i; my $key = $response->request->url->as_string; my $path = _build_path($self->{path},$self->{finished}->{$key} +->{target}); _write_page($path,$response->request->uri,$response->content,$ +html); my $maxed = $self->{depth} - $self->{finished}->{$key}->{curre +nt_depth}; if($html){ my $urls = _get_urls($response->content,$response->bas +e,$maxed,);#\$ignore,\$exclude);#biggest waiting point foreach my $url(@$urls){ if($url !~ /\w/){next;} if(! defined $self->{current}->{$$url[0]} and ! defined $self->{queue}->{$$url[0]} and ! defined $self->{finished}->{$$url[0]}){ $self->{queue}->{$$url[0]} = Spider::U +RL->new($$url[0],$self->{finished}->{$key}->{target},$self->{finished +}->{$key}->{current_depth}+1); #print "Putting $$url[0] into the queue\n"; } } } my @keys = keys %{$self->{queue}}; while((keys %{$self->{current}}) < $self->{max_sockets} and $# +keys > -1){ $self->{current}->{$keys[0]} = $self->{queue}->{$keys[ +0]}; #print "Putting $keys[0] into the current\n"; delete $self->{queue}->{$keys[0]}; my $req = HTTP::Request->new('GET', "$keys[0]"); $req->header('Accept-Encoding'=>'gzip,x-gzip,x-bzip2,d +eflate,compress,base64,quoted-printable'); $self->register($req); @keys = keys %{$self->{queue}}; } if($#keys < 0){$self->{completed} = 1;} }

Last is the main.pl which initializes and calls everything:

#!/usr/bin/perl use lib "."; use Spider::LWP; #Overloaded version of LWP use HTTP::Request; use Googly::Search; use Getopt::Long; use File::Slurp; use Data::Dump qw(dump); sub main{ my @list = $ARGV[0]; #starting dir of lists of participating URLS. foreach(@list){ if($_ !~ /\w/){next;} chomp; $urls = GetListOfSta rtingURLS($_); my $path_addition = $_; $path_addition =~ s/[\.\s+\\\/\:\*\"\?\<\>\|]+/_/g; $path_addition =~ s/[\_\s*]$//; foreach my $url(@$urls){ my $req = HTTP::Request->new('GET', $url); $ua->{current}->{$req->uri->as_string} = Spide +r::URL->new($req->uri->as_string,$path_addition,0); $req->header('Accept-Encoding'=>'gzip,x-gzip,x +-bzip2,deflate,compress,base64,quoted-printable'); $ua->register ($req); } } $SIG{INT} = sub { #print dump $ua; print_queue($ua->{queue},"QUEUE"); print_queue($ua->{current},"CURRENT"); print_queue($ua->{failed},"FAILED"); print_queue($ua->{finished},"FINISHED"); exit; }; $ua->wait(300); # block until we are all finished or until ever +ything has stopped for 5 minutes }