Beefy Boxes and Bandwidth Generously Provided by pair Networks
P is for Practical
 
PerlMonks  

Link Checker

by tachyon (Chancellor)
on Oct 10, 2001 at 10:47 UTC ( [id://117944]=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info Dr James Freeman aka tachyon <jfreeman@tassie.net.au>
Description:

This script is a website link checking tool. It extracts and checks *all* links for validity including anchors, http, ftp, mailto and image links.

Script performs a recursive search, width first to a user defined depth. External links are checked for validity but are not followed for obvious reasons - we don't want to check the whole web.

Broken anchors, links are repoted along with the server error. All email addresses harvested are checked for RFC822 compliance and optionally against an MX or A DNS listing.

More details in pod

#!/usr/bin/perl -w

use strict;
use HTML::TokeParser;
use LWP::UserAgent;
use Net::FTP;
use Getopt::Std;

my ( @urls, %emails, %seen, %seen_anchors, %opts );
my $root    = pop @ARGV or &useage; # root of search
getopts( 'vrd:', \%opts );  
my $level   = 0;
my $depth   = $opts{d} ? $opts{d} : 1;
my $verbose = $opts{v} ? 1 : 0;
my $res     = $opts{r} ? 1 : 0;
my $follow  = qr/(?:htm|html|cgi|pl|php|asp)/;
@{$urls[$level]} = ( $root );           
my ($domain)= $urls[0][0] =~ m|http://([^/]+)|i;
my $rfc822  = init_rfc822();

my $ua      = LWP::UserAgent->new;
my $proxy   = 'http://proxy.hbt.tassie.net.au:8080';
$ua->proxy( 'http', $proxy );
# $ua->credentials( $netloc, $realm, $uname, $pass ); # uncomment if r
+equired
# get a DNS resolver object from Net::DNS and set nameservers
if ($res) {
    $^W = 0; # silence warnings about deprecated syntax from Net::DNS
    eval { require Net::DNS }; 
    $^W = 1;
    $res = $@ ? 0 : new Net::DNS::Resolver;
    warn "Net::DNS error $@\n" if $@;
    $res->nameservers( "203.57.213.18", "203.19.156.68" ) if $res;
}

for my $level_ref ( @urls ) { # we keep adding levels to @urls in the 
+loop
    $level++;
    last if $level > $depth;
    print "\nLevel $level\n" if $verbose;
    for my $url ( @$level_ref ) {    
        $url .= '/' if $url =~ m|http://[^/]+$|i;   # add trailing / i
+f forgotten
        my ($root) = $url =~ m|(http://.*/)|i;
        my $content = get( $url );   
        print "  Checking links....\n" if $verbose;
        my $parser = HTML::TokeParser->new( \$content );
        my %anchors;
    LINK: 
        while ( my $token = $parser->get_tag(qw( a img )) ) {
            my $link = $token->[1]{href} || $token->[1]{src} || next; 
+  # next unless hyerlink
            my $type = $token->[0];
            my $alt  = $token->[1]{alt} || '';
            my $text = $type eq 'a' ? $parser->get_trimmed_text('/a') 
+: 'Image alt="'.$alt.'"';
            if ( $link =~ m|^\s*mailto:(.*)$|i ) {    # mail link
                $emails{"$text<$1>"} .= defined $emails{"$text<$1>"} ?
+ ", $url" : $url;
              next LINK;
            }
            elsif ( $link =~ m|\Q$domain\E| ) {     # internal link
                do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
                $seen{$link} = &testHTTP( $link, $text, $url );
                push @{$urls[$level]}, $link if $seen{$link} =~ /^OK.*
+$follow \(Link/;
            }
            elsif ( $link =~ m|^\s*http://|i ) {    # external http li
+nk
                do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
                $seen{$link} = &testHTTP( $link, $text, $url );
            }
            elsif ( $link =~ m|^\s*ftp://|i ) {     # external ftp lin
+k
                do{ &seen( $link, $url ); next LINK } if defined $seen
+{$link};
                $seen{$link} = &testFTP( $link, $text, $url );
            } 
            else {                                  # relative link
                $link =~ s|^\s*/||;                 # trim link
                do{ $anchors{$url.$link}[0] = $text; next LINK } if $l
+ink =~ m|^#|;
                my $rel_root = $root;
                # move back up tree in response to ../
                while ( $link =~ s|^\.\./|| ) {
                    $rel_root =~ s|[^/]+/$|| unless $rel_root =~ m|htt
+p://[^/]+/$|i # Until can't go any further!
                }
                my $rel_link = $rel_root.$link;
                $rel_link =~ s|/\./|/|g;
                do{ &seen( $rel_link, $url ); next LINK } if defined $
+seen{$rel_link};
                $seen{$rel_link} = &testHTTP( $rel_link, $text, $url )
+;
                push @{$urls[$level]}, $rel_link if $seen{$rel_link} =
+~ /^OK.*$follow \(Link/;
            }
        }
        if (keys %anchors) {
            print "  Checking anchors...\n    Page: $url\n" if $verbos
+e;
            &check_anchors( $url, \%anchors, $content, $url );
        } 
    }
}

my @dud_links = grep { not /^OK/ } values %seen;
# Schwartzian transform - due credit to RS :-)
sub page { pop =~ m/On page:'([^']+)'/; $1 }
@dud_links = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]}@
+dud_links;
print "\n\nBroken links $urls[0][0]\n";
@dud_links = ('No bad links!') unless @dud_links;
print "  $_\n" for @dud_links;

my @dud_anchors = grep { not /^OK/ } values %seen_anchors;
@dud_anchors = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]
+}@dud_anchors;
print "\n\nBroken anchors $urls[0][0]\n";
@dud_anchors = ('No bad anchors!') unless @dud_anchors;
print "  $_\n" for @dud_anchors;

print "\n\nMailto links\n";
for (keys %emails) {
    my ( $email, $domain ) = $_ =~ m/<([^@]+@([^>]+))>/;
    my $status = check_rfc822($email) ? 'OK RFC822 - ' : 'Failed RFC82
+2 - ';
    my $dns = dns_query($domain);
    $status .= ! defined $dns ? "No DNS check" : $dns ? "OK DNS" : "No
+ DNS listing";    
    print "  $status  email: $_  page(s): $emails{$_}\n";
} 

sub check_rfc822 {
    my $email = shift;
  return 1 if $email =~ m/^$rfc822$/o;
  return 0;
}

sub dns_query {
  return undef unless $res;
    my $domain = shift;
    for my $dns ( 'MX', 'A' ) {
        my $packet = $res->send($domain, $dns ) or warn $res->errorstr
+ing;
      return 1 if $packet->header->ancount; 
    }
  return 0;             
}

sub seen {
    my ( $link, $on_page ) = @_;
    $seen{$link} .= ", $on_page";
}

sub get {
    my $url = shift;
    print "  Getting $url...." if $verbose;;
    my $request = HTTP::Request->new( 'GET', $url );
    my $content = $ua->request( $request ); 
    print "$$content{_msg} $$content{_rc}\n" if $verbose;;
  return $$content{_content};
}

sub testHTTP {
    my ( $url, $text, $on_page ) = @_;
    my $request = HTTP::Request->new( 'HEAD', $url );
    my $content = $ua->request( $request );
    my $answer  = "$$content{_msg} $$content{_rc} $url (Link text: $te
+xt ) (On page: $on_page )";
    print "    $answer\n" if $verbose;
    # if $url also include an anchor load page and check anchor exists
    if ($url =~ m|^(.*/[^/]+)(#[^/]+)$|) {
        my %anchors;
        $anchors{$url}[0] = $text;
        my $old_verbose = $verbose;
        $verbose = 0;
        my $content = get( $url );
        $verbose = $old_verbose;
        check_anchors( $1, \%anchors, $content, $on_page );
    }
  return $answer;
}

sub testFTP {
    my ( $url, $text, $on_page ) = @_;
    my ( $host, $filepath ) = $url =~ m|ftp://([^/]+)(.*)$|;
    my $answer;
    if (my $ftp = Net::FTP->new( $host )) {
        $ftp->login( 'anonymous', 'nobody@nowhere.com' );
        my $size = $ftp->size( $filepath );
        $ftp->quit;
        my $status = defined $size ? "OK" : "Not OK File does not exis
+t";
        $answer  = "$status FTP $url (Link text: $text ) (On page: $on
+_page )";
    }
    else {
        $answer = "$@ FTP $url (Link text: $text ) (On page: $on_page 
+)";
    }
    print "    $answer\n" if $verbose;
  return $answer;
}

sub check_anchors {
    my( $url, $anchor_ref, $content, $on_page ) = @_;
    my $parser = HTML::TokeParser->new( \$content );
    while ( my $token = $parser->get_tag('a') ) {
        my $link = $token->[1]{name} || next;   # next unless named
        $link = "$url#$link";
        $$anchor_ref{$link}[1] = 1 if defined $$anchor_ref{$link};
    }
    for (sort keys %$anchor_ref) {
        $$anchor_ref{$_}[1] = defined $$anchor_ref{$_}[1] ? 1 : 0;
        my $status = $$anchor_ref{$_}[1] ? "OK" : "Not OK";
        my $answer  = "$status ANCHOR $_ (Link text: $$anchor_ref{$_}[
+0] ) (On page: $on_page )";;
        print "      $answer\n" if $verbose;
        $seen_anchors{$on_page.$_} = $answer;
    }
}    

sub init_rfc822 {
    my $rfc_pat = '';
     while (<DATA>) { 
        next unless $_;
        chomp;
        $rfc_pat .= $_ 
    }
    $rfc_pat = qr/$rfc_pat/;
  return $rfc_pat;
}

sub useage {
    print "
    Useage:

    linkcheck.pl -[v,d[depth]] <url>
        -v sets verbose mode
        -dn sets a search depth of 'n' levels where 'n' is a +ve integ
+er
        <url> is the root url in which to start the search

    defaults are quiet mode and depth level 1
    ";
  exit;
}

=head1 NAME

linkcheck.pl v0.001

=head1 SYNOPSIS

    linkcheck.pl -[v,r,d[depth]] <url>
        -v sets verbose mode
        -r resolve email address domains via DNS lookups
        -d[n] sets a search depth of 'n' levels where 'n' is a +ve int
+eger
        <url> is the root url in which to start the search

    defaults are quiet mode, no DNS lookup, and depth level 1

    linkcheck.pl -vrd1000 http://www.yoursite.com > logfile.txt
 
    this will check your entire site (assuming a link depth <= 1000 :-
+) 
    and send the verbose output to the file logfile.txt

=head1 DESCRIPTION

This script extracts and checks links for validity. A HEAD request is 
+made 
for each link found. A valid link will return status OK in the header.

Script performs a recursive search, width first to a user defined dept
+h. The 
default depth of 1 means just check the links on the root page. A 
value of 2 means check all the links on the root and child pages, a le
+vel of 
3 will check links on root, childen, children's children. A depth of 4
+ gets 
to the children's children's children's links and so on...
 
Links that have been checked are not checked again if they occur elsew
+here. 
The pages on which repeat links occur are appended to the original che
+ck data 
so you can find all problem pages in the event of a broken link.

External http links (outside the root domain) are checked for validity
+ but are 
not followed for obvious reasons. External ftp links are checked to en
+sure 
they point at a valid (accessible) file.

Anchors within a page are checked on block as an efficiency hack. Link
+s that 
point to anchors on pages other than the current working page are chec
+ked 
individually.

Mailto links are harvested for (manual) checking. Their format is chec
+ked 
against the RFC822 spec using pattern developed in program by Jeffery 
+Friedl 
in the ORA book "Mastering Regular Expressions". A check can also be m
+ade to 
see the domain can be looked up via DNS to help eliminate domain name 
+typos. 

In verbose mode you see all the links that are checked at each depth. 
+In 
normal mode only the broken links, anchors and mailto harvest are outp
+ut.

There are a number of hardcoded options such as username, password, pr
+oxy, 
DNS name servers. The $follow = qr// defines what sort of links to fol
+low. 
Not all links need to be followed. You are unlikely to find more links
+ by 
following the link to a .jpg image for instance. By default links that
+ end 
in any of:

=over
=item * .htm 
=item * .html 
=item * .cgi 
=item * .pl 
=item * .php 
=item * .asp 
=back

are followed by default but you can add whatever you want. 

=head1 AUTHOR

Dr James Freeman aka tachyon <lt>jfreeman@tassie.net.au<gt>

=head1 LICENSE

This package is free software; you can redistribute it and/or modify i
+t under 
the terms of the "GNU General Public License".

=head1 DISCLAIMER

This script is distributed in the hope that it will be useful, 
but WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the "GNU General Public License" for more details.

=cut

# this data is the RFC822 pattern - leave it alone!
__DATA__
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\
+xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\
+x80-\
xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n
+\015"
]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8
+0-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\
+((?:[
^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\
+x80-\
xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)
+<>@,;
:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff]
+)*"))
*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-
+\xff\
n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015
+()]|\
\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[
+^(\04
0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037
+\x80-
\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]
+|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
+^\x80
-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
+f]|\(
(?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\
+\\[\]
\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?
+:[^\\
\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\0
+00-\0
37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x8
+0-\xf
f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\01
+2-\03
7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015
+()]|\
\[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(?
+:[\04
0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0
+15()]
|\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\
+\[^\x
80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04
+0)<>@
,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-
+\xff]
)|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?
+:[^\\
\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
+-\xff
])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\(
+(?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]
+\000-
\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\
+\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff
+\n\01
5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\)
+)*,(?
:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xf
+f\n\0
15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]
+|\\[^
\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\
+040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8
+0-\xf
f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\(
+(?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x
+80-\x
ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|
+\((?:
[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[
+\]\00
0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^
+\\\x8
0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*)
?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\
+]\000
-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\0
+40\t]
|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()
+]|\\[
^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8
+0-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,
+;:".\
\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])
+|"(?:
[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*@
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\
+xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80
+-\xff
]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\0
+15\[\
]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\
+x80-\
xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]
+|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
+^\x80
-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\04
+0)<>@
,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80
+-\xff
])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^
+\\\x8
0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\x
+ff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*
+\))*
Replies are listed 'Best First'.
Re: Link Checker
by merlyn (Sage) on Oct 10, 2001 at 11:06 UTC
Re: Link Checker
by ajt (Prior) on Oct 10, 2001 at 12:02 UTC
    There is also the W3C's free link checking service, freely available at validator.w3.org/checklink. It works quite well, and you can download the Perl source from CVS, and run it yourself - though I've only got it to run on Solaris and Linux so far, wouldn't run on NT...!
Re: Link Checker
by markjugg (Curate) on Sep 18, 2002 at 21:32 UTC
    I tried this script and merlyn's third iteration, but I found I much preferred linklint because it produces excellent cross referenced HTML reports, and allows you to analyze a site both from a local perspective as well as a remote perspective and has an option to produce a "orphaned files" report.

    -mark

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://117944]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others exploiting the Monastery: (5)
As of 2024-03-19 08:40 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found