#!/usr/bin/perl -w use strict; use HTML::TokeParser; use LWP::UserAgent; use Net::FTP; use Getopt::Std; my ( @urls, %emails, %seen, %seen_anchors, %opts ); my $root = pop @ARGV or &useage; # root of search getopts( 'vrd:', \%opts ); my $level = 0; my $depth = $opts{d} ? $opts{d} : 1; my $verbose = $opts{v} ? 1 : 0; my $res = $opts{r} ? 1 : 0; my $follow = qr/(?:htm|html|cgi|pl|php|asp)/; @{$urls[$level]} = ( $root ); my ($domain)= $urls[0][0] =~ m|http://([^/]+)|i; my $rfc822 = init_rfc822(); my $ua = LWP::UserAgent->new; my $proxy = 'http://proxy.hbt.tassie.net.au:8080'; $ua->proxy( 'http', $proxy ); # $ua->credentials( $netloc, $realm, $uname, $pass ); # uncomment if required # get a DNS resolver object from Net::DNS and set nameservers if ($res) { $^W = 0; # silence warnings about deprecated syntax from Net::DNS eval { require Net::DNS }; $^W = 1; $res = $@ ? 0 : new Net::DNS::Resolver; warn "Net::DNS error $@\n" if $@; $res->nameservers( "203.57.213.18", "203.19.156.68" ) if $res; } for my $level_ref ( @urls ) { # we keep adding levels to @urls in the loop $level++; last if $level > $depth; print "\nLevel $level\n" if $verbose; for my $url ( @$level_ref ) { $url .= '/' if $url =~ m|http://[^/]+$|i; # add trailing / if forgotten my ($root) = $url =~ m|(http://.*/)|i; my $content = get( $url ); print " Checking links....\n" if $verbose; my $parser = HTML::TokeParser->new( \$content ); my %anchors; LINK: while ( my $token = $parser->get_tag(qw( a img )) ) { my $link = $token->[1]{href} || $token->[1]{src} || next; # next unless hyerlink my $type = $token->[0]; my $alt = $token->[1]{alt} || ''; my $text = $type eq 'a' ? $parser->get_trimmed_text('/a') : 'Image alt="'.$alt.'"'; if ( $link =~ m|^\s*mailto:(.*)$|i ) { # mail link $emails{"$text<$1>"} .= defined $emails{"$text<$1>"} ? ", $url" : $url; next LINK; } elsif ( $link =~ m|\Q$domain\E| ) { # internal link do{ &seen( $link, $url ); next LINK } if defined $seen{$link}; $seen{$link} = &testHTTP( $link, $text, $url ); push @{$urls[$level]}, $link if $seen{$link} =~ /^OK.*$follow \(Link/; } elsif ( $link =~ m|^\s*http://|i ) { # external http link do{ &seen( $link, $url ); next LINK } if defined $seen{$link}; $seen{$link} = &testHTTP( $link, $text, $url ); } elsif ( $link =~ m|^\s*ftp://|i ) { # external ftp link do{ &seen( $link, $url ); next LINK } if defined $seen{$link}; $seen{$link} = &testFTP( $link, $text, $url ); } else { # relative link $link =~ s|^\s*/||; # trim link do{ $anchors{$url.$link}[0] = $text; next LINK } if $link =~ m|^#|; my $rel_root = $root; # move back up tree in response to ../ while ( $link =~ s|^\.\./|| ) { $rel_root =~ s|[^/]+/$|| unless $rel_root =~ m|http://[^/]+/$|i # Until can't go any further! } my $rel_link = $rel_root.$link; $rel_link =~ s|/\./|/|g; do{ &seen( $rel_link, $url ); next LINK } if defined $seen{$rel_link}; $seen{$rel_link} = &testHTTP( $rel_link, $text, $url ); push @{$urls[$level]}, $rel_link if $seen{$rel_link} =~ /^OK.*$follow \(Link/; } } if (keys %anchors) { print " Checking anchors...\n Page: $url\n" if $verbose; &check_anchors( $url, \%anchors, $content, $url ); } } } my @dud_links = grep { not /^OK/ } values %seen; # Schwartzian transform - due credit to RS :-) sub page { pop =~ m/On page:'([^']+)'/; $1 } @dud_links = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]}@dud_links; print "\n\nBroken links $urls[0][0]\n"; @dud_links = ('No bad links!') unless @dud_links; print " $_\n" for @dud_links; my @dud_anchors = grep { not /^OK/ } values %seen_anchors; @dud_anchors = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]}@dud_anchors; print "\n\nBroken anchors $urls[0][0]\n"; @dud_anchors = ('No bad anchors!') unless @dud_anchors; print " $_\n" for @dud_anchors; print "\n\nMailto links\n"; for (keys %emails) { my ( $email, $domain ) = $_ =~ m/<([^@]+@([^>]+))>/; my $status = check_rfc822($email) ? 'OK RFC822 - ' : 'Failed RFC822 - '; my $dns = dns_query($domain); $status .= ! defined $dns ? "No DNS check" : $dns ? "OK DNS" : "No DNS listing"; print " $status email: $_ page(s): $emails{$_}\n"; } sub check_rfc822 { my $email = shift; return 1 if $email =~ m/^$rfc822$/o; return 0; } sub dns_query { return undef unless $res; my $domain = shift; for my $dns ( 'MX', 'A' ) { my $packet = $res->send($domain, $dns ) or warn $res->errorstring; return 1 if $packet->header->ancount; } return 0; } sub seen { my ( $link, $on_page ) = @_; $seen{$link} .= ", $on_page"; } sub get { my $url = shift; print " Getting $url...." if $verbose;; my $request = HTTP::Request->new( 'GET', $url ); my $content = $ua->request( $request ); print "$$content{_msg} $$content{_rc}\n" if $verbose;; return $$content{_content}; } sub testHTTP { my ( $url, $text, $on_page ) = @_; my $request = HTTP::Request->new( 'HEAD', $url ); my $content = $ua->request( $request ); my $answer = "$$content{_msg} $$content{_rc} $url (Link text: $text ) (On page: $on_page )"; print " $answer\n" if $verbose; # if $url also include an anchor load page and check anchor exists if ($url =~ m|^(.*/[^/]+)(#[^/]+)$|) { my %anchors; $anchors{$url}[0] = $text; my $old_verbose = $verbose; $verbose = 0; my $content = get( $url ); $verbose = $old_verbose; check_anchors( $1, \%anchors, $content, $on_page ); } return $answer; } sub testFTP { my ( $url, $text, $on_page ) = @_; my ( $host, $filepath ) = $url =~ m|ftp://([^/]+)(.*)$|; my $answer; if (my $ftp = Net::FTP->new( $host )) { $ftp->login( 'anonymous', 'nobody@nowhere.com' ); my $size = $ftp->size( $filepath ); $ftp->quit; my $status = defined $size ? "OK" : "Not OK File does not exist"; $answer = "$status FTP $url (Link text: $text ) (On page: $on_page )"; } else { $answer = "$@ FTP $url (Link text: $text ) (On page: $on_page )"; } print " $answer\n" if $verbose; return $answer; } sub check_anchors { my( $url, $anchor_ref, $content, $on_page ) = @_; my $parser = HTML::TokeParser->new( \$content ); while ( my $token = $parser->get_tag('a') ) { my $link = $token->[1]{name} || next; # next unless named $link = "$url#$link"; $$anchor_ref{$link}[1] = 1 if defined $$anchor_ref{$link}; } for (sort keys %$anchor_ref) { $$anchor_ref{$_}[1] = defined $$anchor_ref{$_}[1] ? 1 : 0; my $status = $$anchor_ref{$_}[1] ? "OK" : "Not OK"; my $answer = "$status ANCHOR $_ (Link text: $$anchor_ref{$_}[0] ) (On page: $on_page )";; print " $answer\n" if $verbose; $seen_anchors{$on_page.$_} = $answer; } } sub init_rfc822 { my $rfc_pat = ''; while () { next unless $_; chomp; $rfc_pat .= $_ } $rfc_pat = qr/$rfc_pat/; return $rfc_pat; } sub useage { print " Useage: linkcheck.pl -[v,d[depth]] -v sets verbose mode -dn sets a search depth of 'n' levels where 'n' is a +ve integer is the root url in which to start the search defaults are quiet mode and depth level 1 "; exit; } =head1 NAME linkcheck.pl v0.001 =head1 SYNOPSIS linkcheck.pl -[v,r,d[depth]] -v sets verbose mode -r resolve email address domains via DNS lookups -d[n] sets a search depth of 'n' levels where 'n' is a +ve integer is the root url in which to start the search defaults are quiet mode, no DNS lookup, and depth level 1 linkcheck.pl -vrd1000 http://www.yoursite.com > logfile.txt this will check your entire site (assuming a link depth <= 1000 :-) and send the verbose output to the file logfile.txt =head1 DESCRIPTION This script extracts and checks links for validity. A HEAD request is made for each link found. A valid link will return status OK in the header. Script performs a recursive search, width first to a user defined depth. The default depth of 1 means just check the links on the root page. A value of 2 means check all the links on the root and child pages, a level of 3 will check links on root, childen, children's children. A depth of 4 gets to the children's children's children's links and so on... Links that have been checked are not checked again if they occur elsewhere. The pages on which repeat links occur are appended to the original check data so you can find all problem pages in the event of a broken link. External http links (outside the root domain) are checked for validity but are not followed for obvious reasons. External ftp links are checked to ensure they point at a valid (accessible) file. Anchors within a page are checked on block as an efficiency hack. Links that point to anchors on pages other than the current working page are checked individually. Mailto links are harvested for (manual) checking. Their format is checked against the RFC822 spec using pattern developed in program by Jeffery Friedl in the ORA book "Mastering Regular Expressions". A check can also be made to see the domain can be looked up via DNS to help eliminate domain name typos. In verbose mode you see all the links that are checked at each depth. In normal mode only the broken links, anchors and mailto harvest are output. There are a number of hardcoded options such as username, password, proxy, DNS name servers. The $follow = qr// defines what sort of links to follow. Not all links need to be followed. You are unlikely to find more links by following the link to a .jpg image for instance. By default links that end in any of: =over =item * .htm =item * .html =item * .cgi =item * .pl =item * .php =item * .asp =back are followed by default but you can add whatever you want. =head1 AUTHOR Dr James Freeman aka tachyon jfreeman@tassie.net.au =head1 LICENSE This package is free software; you can redistribute it and/or modify it under the terms of the "GNU General Public License". =head1 DISCLAIMER This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the "GNU General Public License" for more details. =cut # this data is the RFC822 pattern - leave it alone! __DATA__ (?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n \015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\ xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015" ]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[ ^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\ xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,; :".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")) *(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\ n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\ \[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04 0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80- \xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((? :[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80 -\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\( (?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\] \000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\ \x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\000-\0 37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xf f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\03 7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\ \[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(?:[\04 0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()] |\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x 80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@ ,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff] )|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\ \x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff ])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^ \\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000- \037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80- \xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\01 5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*,(? :[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0 15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^ \x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)< >@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xf f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^ \\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\x ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?: [^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\00 0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x8 0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\xff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*) ?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000 -\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\040\t] |\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[ ^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\ \\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?: [^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*@ (?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n \015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff ]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\ ]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\ xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((? :[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80 -\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@ ,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff ])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x8 0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\xff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*