#!/usr/bin/perl -w
use strict;
use HTML::TokeParser;
use LWP::UserAgent;
use Net::FTP;
use Getopt::Std;
my ( @urls, %emails, %seen, %seen_anchors, %opts );
my $root = pop @ARGV or &useage; # root of search
getopts( 'vrd:', \%opts );
my $level = 0;
my $depth = $opts{d} ? $opts{d} : 1;
my $verbose = $opts{v} ? 1 : 0;
my $res = $opts{r} ? 1 : 0;
my $follow = qr/(?:htm|html|cgi|pl|php|asp)/;
@{$urls[$level]} = ( $root );
my ($domain)= $urls[0][0] =~ m|http://([^/]+)|i;
my $rfc822 = init_rfc822();
my $ua = LWP::UserAgent->new;
my $proxy = 'http://proxy.hbt.tassie.net.au:8080';
$ua->proxy( 'http', $proxy );
# $ua->credentials( $netloc, $realm, $uname, $pass ); # uncomment if r
+equired
# get a DNS resolver object from Net::DNS and set nameservers
if ($res) {
$^W = 0; # silence warnings about deprecated syntax from Net::DNS
eval { require Net::DNS };
$^W = 1;
$res = $@ ? 0 : new Net::DNS::Resolver;
warn "Net::DNS error $@\n" if $@;
$res->nameservers( "203.57.213.18", "203.19.156.68" ) if $res;
}
for my $level_ref ( @urls ) { # we keep adding levels to @urls in the
+loop
$level++;
last if $level > $depth;
print "\nLevel $level\n" if $verbose;
for my $url ( @$level_ref ) {
$url .= '/' if $url =~ m|http://[^/]+$|i; # add trailing / i
+f forgotten
my ($root) = $url =~ m|(http://.*/)|i;
my $content = get( $url );
print " Checking links....\n" if $verbose;
my $parser = HTML::TokeParser->new( \$content );
my %anchors;
LINK:
while ( my $token = $parser->get_tag(qw( a img )) ) {
my $link = $token->[1]{href} || $token->[1]{src} || next;
+ # next unless hyerlink
my $type = $token->[0];
my $alt = $token->[1]{alt} || '';
my $text = $type eq 'a' ? $parser->get_trimmed_text('/a')
+: 'Image alt="'.$alt.'"';
if ( $link =~ m|^\s*mailto:(.*)$|i ) { # mail link
$emails{"$text<$1>"} .= defined $emails{"$text<$1>"} ?
+ ", $url" : $url;
next LINK;
}
elsif ( $link =~ m|\Q$domain\E| ) { # internal link
do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
$seen{$link} = &testHTTP( $link, $text, $url );
push @{$urls[$level]}, $link if $seen{$link} =~ /^OK.*
+$follow \(Link/;
}
elsif ( $link =~ m|^\s*http://|i ) { # external http li
+nk
do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
$seen{$link} = &testHTTP( $link, $text, $url );
}
elsif ( $link =~ m|^\s*ftp://|i ) { # external ftp lin
+k
do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
$seen{$link} = &testFTP( $link, $text, $url );
}
else { # relative link
$link =~ s|^\s*/||; # trim link
do{ $anchors{$url.$link}[0] = $text; next LINK } if $l
+ink =~ m|^#|;
my $rel_root = $root;
# move back up tree in response to ../
while ( $link =~ s|^\.\./|| ) {
$rel_root =~ s|[^/]+/$|| unless $rel_root =~ m|htt
+p://[^/]+/$|i # Until can't go any further!
}
my $rel_link = $rel_root.$link;
$rel_link =~ s|/\./|/|g;
do{ &seen( $rel_link, $url ); next LINK } if defined $
+seen{$rel_link};
$seen{$rel_link} = &testHTTP( $rel_link, $text, $url )
+;
push @{$urls[$level]}, $rel_link if $seen{$rel_link} =
+~ /^OK.*$follow \(Link/;
}
}
if (keys %anchors) {
print " Checking anchors...\n Page: $url\n" if $verbos
+e;
&check_anchors( $url, \%anchors, $content, $url );
}
}
}
my @dud_links = grep { not /^OK/ } values %seen;
# Schwartzian transform - due credit to RS :-)
sub page { pop =~ m/On page:'([^']+)'/; $1 }
@dud_links = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]}@
+dud_links;
print "\n\nBroken links $urls[0][0]\n";
@dud_links = ('No bad links!') unless @dud_links;
print " $_\n" for @dud_links;
my @dud_anchors = grep { not /^OK/ } values %seen_anchors;
@dud_anchors = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]
+}@dud_anchors;
print "\n\nBroken anchors $urls[0][0]\n";
@dud_anchors = ('No bad anchors!') unless @dud_anchors;
print " $_\n" for @dud_anchors;
print "\n\nMailto links\n";
for (keys %emails) {
my ( $email, $domain ) = $_ =~ m/<([^@]+@([^>]+))>/;
my $status = check_rfc822($email) ? 'OK RFC822 - ' : 'Failed RFC82
+2 - ';
my $dns = dns_query($domain);
$status .= ! defined $dns ? "No DNS check" : $dns ? "OK DNS" : "No
+ DNS listing";
print " $status email: $_ page(s): $emails{$_}\n";
}
sub check_rfc822 {
my $email = shift;
return 1 if $email =~ m/^$rfc822$/o;
return 0;
}
sub dns_query {
return undef unless $res;
my $domain = shift;
for my $dns ( 'MX', 'A' ) {
my $packet = $res->send($domain, $dns ) or warn $res->errorstr
+ing;
return 1 if $packet->header->ancount;
}
return 0;
}
sub seen {
my ( $link, $on_page ) = @_;
$seen{$link} .= ", $on_page";
}
sub get {
my $url = shift;
print " Getting $url...." if $verbose;;
my $request = HTTP::Request->new( 'GET', $url );
my $content = $ua->request( $request );
print "$$content{_msg} $$content{_rc}\n" if $verbose;;
return $$content{_content};
}
sub testHTTP {
my ( $url, $text, $on_page ) = @_;
my $request = HTTP::Request->new( 'HEAD', $url );
my $content = $ua->request( $request );
my $answer = "$$content{_msg} $$content{_rc} $url (Link text: $te
+xt ) (On page: $on_page )";
print " $answer\n" if $verbose;
# if $url also include an anchor load page and check anchor exists
if ($url =~ m|^(.*/[^/]+)(#[^/]+)$|) {
my %anchors;
$anchors{$url}[0] = $text;
my $old_verbose = $verbose;
$verbose = 0;
my $content = get( $url );
$verbose = $old_verbose;
check_anchors( $1, \%anchors, $content, $on_page );
}
return $answer;
}
sub testFTP {
my ( $url, $text, $on_page ) = @_;
my ( $host, $filepath ) = $url =~ m|ftp://([^/]+)(.*)$|;
my $answer;
if (my $ftp = Net::FTP->new( $host )) {
$ftp->login( 'anonymous', 'nobody@nowhere.com' );
my $size = $ftp->size( $filepath );
$ftp->quit;
my $status = defined $size ? "OK" : "Not OK File does not exis
+t";
$answer = "$status FTP $url (Link text: $text ) (On page: $on
+_page )";
}
else {
$answer = "$@ FTP $url (Link text: $text ) (On page: $on_page
+)";
}
print " $answer\n" if $verbose;
return $answer;
}
sub check_anchors {
my( $url, $anchor_ref, $content, $on_page ) = @_;
my $parser = HTML::TokeParser->new( \$content );
while ( my $token = $parser->get_tag('a') ) {
my $link = $token->[1]{name} || next; # next unless named
$link = "$url#$link";
$$anchor_ref{$link}[1] = 1 if defined $$anchor_ref{$link};
}
for (sort keys %$anchor_ref) {
$$anchor_ref{$_}[1] = defined $$anchor_ref{$_}[1] ? 1 : 0;
my $status = $$anchor_ref{$_}[1] ? "OK" : "Not OK";
my $answer = "$status ANCHOR $_ (Link text: $$anchor_ref{$_}[
+0] ) (On page: $on_page )";;
print " $answer\n" if $verbose;
$seen_anchors{$on_page.$_} = $answer;
}
}
sub init_rfc822 {
my $rfc_pat = '';
while (<DATA>) {
next unless $_;
chomp;
$rfc_pat .= $_
}
$rfc_pat = qr/$rfc_pat/;
return $rfc_pat;
}
sub useage {
print "
Useage:
linkcheck.pl -[v,d[depth]] <url>
-v sets verbose mode
-dn sets a search depth of 'n' levels where 'n' is a +ve integ
+er
<url> is the root url in which to start the search
defaults are quiet mode and depth level 1
";
exit;
}
=head1 NAME
linkcheck.pl v0.001
=head1 SYNOPSIS
linkcheck.pl -[v,r,d[depth]] <url>
-v sets verbose mode
-r resolve email address domains via DNS lookups
-d[n] sets a search depth of 'n' levels where 'n' is a +ve int
+eger
<url> is the root url in which to start the search
defaults are quiet mode, no DNS lookup, and depth level 1
linkcheck.pl -vrd1000 http://www.yoursite.com > logfile.txt
this will check your entire site (assuming a link depth <= 1000 :-
+)
and send the verbose output to the file logfile.txt
=head1 DESCRIPTION
This script extracts and checks links for validity. A HEAD request is
+made
for each link found. A valid link will return status OK in the header.
Script performs a recursive search, width first to a user defined dept
+h. The
default depth of 1 means just check the links on the root page. A
value of 2 means check all the links on the root and child pages, a le
+vel of
3 will check links on root, childen, children's children. A depth of 4
+ gets
to the children's children's children's links and so on...
Links that have been checked are not checked again if they occur elsew
+here.
The pages on which repeat links occur are appended to the original che
+ck data
so you can find all problem pages in the event of a broken link.
External http links (outside the root domain) are checked for validity
+ but are
not followed for obvious reasons. External ftp links are checked to en
+sure
they point at a valid (accessible) file.
Anchors within a page are checked on block as an efficiency hack. Link
+s that
point to anchors on pages other than the current working page are chec
+ked
individually.
Mailto links are harvested for (manual) checking. Their format is chec
+ked
against the RFC822 spec using pattern developed in program by Jeffery
+Friedl
in the ORA book "Mastering Regular Expressions". A check can also be m
+ade to
see the domain can be looked up via DNS to help eliminate domain name
+typos.
In verbose mode you see all the links that are checked at each depth.
+In
normal mode only the broken links, anchors and mailto harvest are outp
+ut.
There are a number of hardcoded options such as username, password, pr
+oxy,
DNS name servers. The $follow = qr// defines what sort of links to fol
+low.
Not all links need to be followed. You are unlikely to find more links
+ by
following the link to a .jpg image for instance. By default links that
+ end
in any of:
=over
=item * .htm
=item * .html
=item * .cgi
=item * .pl
=item * .php
=item * .asp
=back
are followed by default but you can add whatever you want.
=head1 AUTHOR
Dr James Freeman aka tachyon <lt>jfreeman@tassie.net.au<gt>
=head1 LICENSE
This package is free software; you can redistribute it and/or modify i
+t under
the terms of the "GNU General Public License".
=head1 DISCLAIMER
This script is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the "GNU General Public License" for more details.
=cut
# this data is the RFC822 pattern - leave it alone!
__DATA__
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\
+xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\
+x80-\
xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n
+\015"
]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8
+0-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\
+((?:[
^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\
+x80-\
xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)
+<>@,;
:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff]
+)*"))
*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-
+\xff\
n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015
+()]|\
\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[
+^(\04
0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037
+\x80-
\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]
+|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
+^\x80
-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
+f]|\(
(?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\
+\\[\]
\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?
+:[^\\
\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\0
+00-\0
37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x8
+0-\xf
f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\01
+2-\03
7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015
+()]|\
\[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(?
+:[\04
0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0
+15()]
|\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\
+\[^\x
80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04
+0)<>@
,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-
+\xff]
)|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?
+:[^\\
\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
+-\xff
])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\(
+(?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]
+\000-
\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\
+\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff
+\n\01
5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\)
+)*,(?
:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xf
+f\n\0
15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]
+|\\[^
\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\
+040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8
+0-\xf
f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\(
+(?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x
+80-\x
ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|
+\((?:
[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[
+\]\00
0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^
+\\\x8
0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*)
?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\
+]\000
-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\0
+40\t]
|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()
+]|\\[
^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8
+0-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,
+;:".\
\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])
+|"(?:
[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*@
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\
+xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80
+-\xff
]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\0
+15\[\
]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\
+x80-\
xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]
+|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
+^\x80
-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\04
+0)<>@
,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80
+-\xff
])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^
+\\\x8
0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*
|