Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:
Please advise on how to get this web link checker script
from Merlyn to work. The output just says:
[skipping ~http://www.sun.com/]
[skipping <http://www.sun.com/>]
Here is Merlyns script:
!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(~http://www.sun.com/ <http://www.sun.com/>);
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^<http://www>\.(teleport|stonehenge)\.com/~merlyn! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = HTTP::Request ($method,$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}
•Re: Merlyns Web Link checker
by merlyn (Sage) on Feb 27, 2002 at 16:46 UTC
|
Unless you happen to be the webmaster for sun.com,
DO NOT RUN THIS ON SUN.COM OR ANY DOMAIN FOR WHICH YOU ARE NOT THE ADMINISTRATOR.
{sigh}
-- Randal L. Schwartz, Perl hacker | [reply] |
|
Please forgive me for asking, but why? (sorry if that's lame, but I really don't get your meaning from the context.)
I'll admit that I'm not an admin, but why wouldn't I want to make sure the links are good on a site I frequent? Wouldn't it save me some time to avoid the bad ones up front?
| [reply] |
|
- It's not your job to check the links of someone else's site.
- The link checker doesn't respect the Robot Rules, and will wander blindly
into robot-hostile territory.
You wouldn't believe the number of lamers that come check stonehenge.com's
site hitting thousands of robot-hostile URLs because the first few versions
of this program had a live URL pointing at my site. I finally put a very-early
block in my website to keep people from doing it. And it still happens!
And I no longer use live URLs in my examples. Some people are just Lacking Clue.
-- Randal L. Schwartz, Perl hacker
| [reply] |
Re: Merlyns Web Link checker
by dvergin (Monsignor) on Feb 27, 2002 at 16:55 UTC
|
It's not clear why you added the tilde and
the angle brackets to the values
assigned to @CHECK, but the script runs better
if you change
@CHECK = # list of initial starting points
qw(~http://www.sun.com/ <http://www.sun.com/>);
to:
@CHECK = # list of initial starting points
qw(http://www.sun.com/);
Also, you made a mistake in typing the line near the
bottom that you have as:
$request = HTTP::Request ($method,$thisurl);
It should read:
$request = new HTTP::Request ($method,$thisurl);
You will also need to adjust the values in
sub PARSE to properly screen for the
range of URLs you are interested in. The
script you have at present checks for
pages at www.sun.com that satisfy a regex in
sub PARSE that is screening for pages in
Merlyn's site. Since there are no pages that satisfy
both conditions, the script finishes up rather quickly.
For those interested in pursing this further, here is the
listing
and the
column
that discusses it.
Update: Hi, Merlyn!
| [reply] [d/l] [select] |
|
I still got same error message:
C:\Perl\bin>we3.pl
verifying http://www.domain/mysite.com
Cannot fetch http://www.domain/mysite.com (status 501 Protocol scheme
+'' is not
supported)
I have tried several different URL's that I have access to and still get
the same message when I run this script.
Any other suggestions???
I made changes as suggested:
#!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(http://www.domain/mysite.com);
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^<http://www>\.(domain)\.com/! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = new HTTP::Request($method,$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}
| [reply] [d/l] [select] |
|
Unrelated to your immediate question, but I really doubt that you need to specify use lib '/perl/bin'; unless you've done some funky stuff to your Perl config; as that is the base directory that Perl looks for by default...Guv
| [reply] [d/l] |
|
| [reply] |
Re: Merlyns Web Link checker
by jonjacobmoon (Pilgrim) on Feb 27, 2002 at 16:42 UTC
|
Actually, I have used this very code (modified somewhat) recently
and found it most useful (thanks, Merlyn).
Your problem comes from the declaration at the beginning. Why
did you start the URL with a tilde and then add it again enclosed
in angle brackets? Take those out and it should work.
Also, the line "$request = HTTP::Request('GET',$thisurl);" should
be "$request = new HTTP::Request('GET',$thisurl);"
I admit it, I am Paco. | [reply] |
|
I now get the following error after running the script:
C:\Perl\bin>we3.pl
verifying http://www.sun.com
Cannot fetch http://www.sun.com (status 501 Protocol scheme '' is not
+supported)
</CODE>
Am I suppose to use '/perl/bin' for my path?
Please advise what I need to do to make this work.
Here is the script:
#!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(http://osis.nima.mil);
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^<http://www>\.(teleport|stonehenge)\.com/~merlyn! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = new HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}
| [reply] [d/l] [select] |
|
|