I now get the following error after running the script:
C:\Perl\bin>we3.pl
verifying http://www.sun.com
Cannot fetch http://www.sun.com (status 501 Protocol scheme '' is not
+supported)
</CODE>
Am I suppose to use '/perl/bin' for my path?
Please advise what I need to do to make this work.
Here is the script:
#!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(http://osis.nima.mil);
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^<http://www>\.(teleport|stonehenge)\.com/~merlyn! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = new HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}