Beefy Boxes and Bandwidth Generously Provided by pair Networks
We don't bite newbies here... much
 
PerlMonks  

Re: Problems with 'discard_entry' in LWP::Parallel::Useragent

by shmem (Chancellor)
on Oct 02, 2006 at 10:03 UTC ( [id://575855]=note: print w/replies, xml ) Need Help??


in reply to LWP::Parallel::UserAgent discard_entry does not delete entry objects

Show us a bit more of your own code.

Ok, I first:

package PUA; use LWP::Parallel::UserAgent; @PUA::ISA = qw(LWP::Parallel::UserAgent); sub on_return { # print '@_ = (' . join(', ',map{"'$_'"} @_) .")\n"; my ($self,$request, $response, $entry) = @_; $self->discard_entry($entry); } 1;
#!/usr/bin/perl use strict; use Data::Dumper; $Data::Dumper::Indent = 1; use HTTP::Request; use PUA; my $ua = PUA->new(); for(<*.pl>) { my $request = HTTP::Request->new('GET',"file://$ENV{HOME}/perlmonk +s/$_"); if( my $res = $ua->register($request)) { print STDERR $res->error_as_HTML; } } print Dumper($ua);

Output:

$VAR1 = bless( { 'ordpend_connections' => [], 'entries_by_sockets' => {}, 'max_hosts' => 7, 'seen_request' => {}, 'handle_duplicates' => 0, 'requests_redirectable' => [ 'GET', 'HEAD' ], 'from' => undef, 'timeout' => 180, 'handle_response' => 1, 'parse_head' => 1, 'remember_failures' => 0, 'entries_by_requests' => {}, 'max_req' => 5, 'current_connections' => {}, 'max_redirect' => 7, 'nonblock' => 0, 'previous_requests' => {}, 'select_out' => bless( [ undef, 0 ], 'IO::Select' ), 'pending_connections' => {}, 'failed_connections' => {}, 'protocols_forbidden' => undef, 'no_proxy' => [], 'protocols_allowed' => undef, 'use_eval' => 1, 'agent' => 'libwww-perl/5.805', 'handle_in_order' => 0, 'def_headers' => undef, 'proxy' => {}, 'select_in' => bless( [ undef, 0 ], 'IO::Select' ), 'max_size' => undef }, 'PUA' );

Works like expected. No Request/Response object in $ua. If I comment out the $self->discard_entry($entry) line in PUA.pm, the $ua object is stuffed with the entries in the anonymous array keyed as ordpend_connections in the object's hash.

Your turn. I suspect usage errors.

--shmem

_($_=" "x(1<<5)."?\n".q·/)Oo.  G°\        /
                              /\_¯/(q    /
----------------------------  \__(m.====·.(_("always off the crowd"))."·
");sub _{s./.($e="'Itrs `mnsgdq Gdbj O`qkdq")=~y/"-y/#-z/;$e.e && print}

Replies are listed 'Best First'.
Re^2: Problems with 'discard_entry' in LWP::Parallel::Useragent
by perlmonkey2 (Beadle) on Oct 02, 2006 at 13:53 UTC
    Okay, here is the pertinent parts of my code:

    First is a small container object

    package Spider::URL; use strict; sub new{ my ($class,$url,$target,$current_depth,$referer) = @_; my $object = bless {}, $class; $object->{url} = $url; $object->{current_depth} = $current_depth; $object->{target} = $target; $object->{referer} = $referer; return $object; } 1;

    Next is the object that inherits from LWP::Parallel::UserAgent:

    package Spider::LWP; use strict; use Exporter(); use HTTP::Request; use LWP::Parallel::UserAgent qw(:CALLBACK); use Spider::URL; use File::Path; use Data::Dump qw(dump); use vars qw (@ISA @EXPORT); @ISA = qw(LWP::Parallel::UserAgent Exporter); @EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK; sub new { my ($class,$depth,$path,$max_sockets) = @_; my $object = bless(LWP::Parallel::UserAgent->new(),$class); $object->{current} = {}; $object->{queue} = {}; $object->{finished} = {}; $object->{failed} = {}; $object->{depth} = $depth; $object->{path} = $path; $object->{completed} = 0; $object->{max_sockets} = $max_sockets; return $object; } sub on_return { my ($self, $request, $response, $entry) = @_; if ($response->is_success) { $response->content($response->decoded_content); $self->{finished}->{$request->url->as_string} = $self- +>{current}->{$request->url->as_string}; delete $self->{current}->{$request->url->as_string}; $self->_process_return($response); # print "\n\nWoa! Request to ",$request->url," returned +code ", $response->code, ": ", $response->message, "\n"; } else { on_failure($self, $request, $response, $entry); } $self->discard_entry($entry); return; } sub _process_return{ my ($self,$response) = @_; my $html = $response->header('content-type') =~ /text\/html/i; my $key = $response->request->url->as_string; my $path = _build_path($self->{path},$self->{finished}->{$key} +->{target}); _write_page($path,$response->request->uri,$response->content,$ +html); my $maxed = $self->{depth} - $self->{finished}->{$key}->{curre +nt_depth}; if($html){ my $urls = _get_urls($response->content,$response->bas +e,$maxed,);#\$ignore,\$exclude);#biggest waiting point foreach my $url(@$urls){ if($url !~ /\w/){next;} if(! defined $self->{current}->{$$url[0]} and ! defined $self->{queue}->{$$url[0]} and ! defined $self->{finished}->{$$url[0]}){ $self->{queue}->{$$url[0]} = Spider::U +RL->new($$url[0],$self->{finished}->{$key}->{target},$self->{finished +}->{$key}->{current_depth}+1); #print "Putting $$url[0] into the queue\n"; } } } my @keys = keys %{$self->{queue}}; while((keys %{$self->{current}}) < $self->{max_sockets} and $# +keys > -1){ $self->{current}->{$keys[0]} = $self->{queue}->{$keys[ +0]}; #print "Putting $keys[0] into the current\n"; delete $self->{queue}->{$keys[0]}; my $req = HTTP::Request->new('GET', "$keys[0]"); $req->header('Accept-Encoding'=>'gzip,x-gzip,x-bzip2,d +eflate,compress,base64,quoted-printable'); $self->register($req); @keys = keys %{$self->{queue}}; } if($#keys < 0){$self->{completed} = 1;} }

    Last is the main.pl which initializes and calls everything:

    #!/usr/bin/perl use lib "."; use Spider::LWP; #Overloaded version of LWP use HTTP::Request; use Googly::Search; use Getopt::Long; use File::Slurp; use Data::Dump qw(dump); sub main{ my @list = $ARGV[0]; #starting dir of lists of participating URLS. foreach(@list){ if($_ !~ /\w/){next;} chomp; $urls = GetListOfSta rtingURLS($_); my $path_addition = $_; $path_addition =~ s/[\.\s+\\\/\:\*\"\?\<\>\|]+/_/g; $path_addition =~ s/[\_\s*]$//; foreach my $url(@$urls){ my $req = HTTP::Request->new('GET', $url); $ua->{current}->{$req->uri->as_string} = Spide +r::URL->new($req->uri->as_string,$path_addition,0); $req->header('Accept-Encoding'=>'gzip,x-gzip,x +-bzip2,deflate,compress,base64,quoted-printable'); $ua->register ($req); } } $SIG{INT} = sub { #print dump $ua; print_queue($ua->{queue},"QUEUE"); print_queue($ua->{current},"CURRENT"); print_queue($ua->{failed},"FAILED"); print_queue($ua->{finished},"FINISHED"); exit; }; $ua->wait(300); # block until we are all finished or until ever +ything has stopped for 5 minutes }

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://575855]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others about the Monastery: (2)
As of 2024-03-19 04:30 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found