Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Re: Re: Merlyns Web Link checker

by Anonymous Monk
on Feb 27, 2002 at 17:25 UTC ( [id://147978]=note: print w/replies, xml ) Need Help??


in reply to Re: Merlyns Web Link checker
in thread Merlyns Web Link checker

I now get the following error after running the script:
C:\Perl\bin>we3.pl verifying http://www.sun.com Cannot fetch http://www.sun.com (status 501 Protocol scheme '' is not +supported)
</CODE> Am I suppose to use '/perl/bin' for my path? Please advise what I need to do to make this work. Here is the script:
#!/usr/bin/perl use lib "/perl/bin"; use LWP::UserAgent; use HTML::Parser; use URI::URL; ## begin configure @CHECK = # list of initial starting points qw(http://osis.nima.mil); sub PARSE { # verify existance, parse for further URLs ## $_[0] is the absolute URL $_[0] =~ m!^<http://www>\.(teleport|stonehenge)\.com/~merlyn! and not $_[0] =~ /refindex/; } sub PING { # verify existence, but don't parse ## $_[0] is the absolute URL $_[0] =~ m!^(http):!; } ## end configure { package ParseLink; @ISA = qw(HTML::Parser); sub start { # called by parse my $this = shift; my ($tag, $attr) = @_; if ($tag eq "a") { $this->{links}{$attr->{href}}++; } elsif ($tag eq "img") { $this->{links}{$attr->{src}}++; } } sub get_links { my $this = shift; sort keys %{$this->{links}}; } } # end of ParseLink $ua = new LWP::UserAgent; $ua->agent("hverify/1.0"); $ua->env_proxy; $| = 1; MAINLOOP: while ($thisurl = shift @CHECK) { $thisurl =~ s/%7e/~/ig; # ugh :-) next if $did{$thisurl}++; if (PARSE $thisurl) { warn "fetching $thisurl\n"; $request = HTTP::Request('GET',$thisurl); $response = $ua->request($request); # fetch! unless ($response->is_success) { warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; next MAINLOOP; } next MAINLOOP unless $response->content_type =~ /text\/html/i; $base = $response->base; my $p = ParseLink->new; $p->parse($response->content); $p->parse(undef); for $link ($p->get_links) { $abs = url($link, $base)->abs; warn "... $link => $abs\n"; push(@CHECK, $abs); } next MAINLOOP; } if (PING $thisurl) { warn "verifying $thisurl\n"; for $method (qw(HEAD GET)) { $request = new HTTP::Request('GET',$thisurl); $response = $ua->request($request); # fetch! next MAINLOOP if $response->is_success; # ok } warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; next MAINLOOP; } warn "[skipping $thisurl]\n"; }

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://147978]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others having a coffee break in the Monastery: (4)
As of 2024-04-26 00:13 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found