Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl-Sensitive Sunglasses
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/perl -w use strict; use HTML::TokeParser; use LWP::UserAgent; use Net::FTP; use Getopt::Std; my ( @urls, %emails, %seen, %seen_anchors, %opts ); my $root = pop @ARGV or &useage; # root of search getopts( 'vrd:', \%opts ); my $level = 0; my $depth = $opts{d} ? $opts{d} : 1; my $verbose = $opts{v} ? 1 : 0; my $res = $opts{r} ? 1 : 0; my $follow = qr/(?:htm|html|cgi|pl|php|asp)/; @{$urls[$level]} = ( $root ); my ($domain)= $urls[0][0] =~ m|http://([^/]+)|i; my $rfc822 = init_rfc822(); my $ua = LWP::UserAgent->new; my $proxy = 'http://proxy.hbt.tassie.net.au:8080'; $ua->proxy( 'http', $proxy ); # $ua->credentials( $netloc, $realm, $uname, $pass ); # uncomment if r +equired # get a DNS resolver object from Net::DNS and set nameservers if ($res) { $^W = 0; # silence warnings about deprecated syntax from Net::DNS eval { require Net::DNS }; $^W = 1; $res = $@ ? 0 : new Net::DNS::Resolver; warn "Net::DNS error $@\n" if $@; $res->nameservers( "203.57.213.18", "203.19.156.68" ) if $res; } for my $level_ref ( @urls ) { # we keep adding levels to @urls in the +loop $level++; last if $level > $depth; print "\nLevel $level\n" if $verbose; for my $url ( @$level_ref ) { $url .= '/' if $url =~ m|http://[^/]+$|i; # add trailing / i +f forgotten my ($root) = $url =~ m|(http://.*/)|i; my $content = get( $url ); print " Checking links....\n" if $verbose; my $parser = HTML::TokeParser->new( \$content ); my %anchors; LINK: while ( my $token = $parser->get_tag(qw( a img )) ) { my $link = $token->[1]{href} || $token->[1]{src} || next; + # next unless hyerlink my $type = $token->[0]; my $alt = $token->[1]{alt} || ''; my $text = $type eq 'a' ? $parser->get_trimmed_text('/a') +: 'Image alt="'.$alt.'"'; if ( $link =~ m|^\s*mailto:(.*)$|i ) { # mail link $emails{"$text<$1>"} .= defined $emails{"$text<$1>"} ? + ", $url" : $url; next LINK; } elsif ( $link =~ m|\Q$domain\E| ) { # internal link do{ &seen( $link, $url ); next LINK } if defined $seen +{$link}; $seen{$link} = &testHTTP( $link, $text, $url ); push @{$urls[$level]}, $link if $seen{$link} =~ /^OK.* +$follow \(Link/; } elsif ( $link =~ m|^\s*http://|i ) { # external http li +nk do{ &seen( $link, $url ); next LINK } if defined $seen +{$link}; $seen{$link} = &testHTTP( $link, $text, $url ); } elsif ( $link =~ m|^\s*ftp://|i ) { # external ftp lin +k do{ &seen( $link, $url ); next LINK } if defined $seen +{$link}; $seen{$link} = &testFTP( $link, $text, $url ); } else { # relative link $link =~ s|^\s*/||; # trim link do{ $anchors{$url.$link}[0] = $text; next LINK } if $l +ink =~ m|^#|; my $rel_root = $root; # move back up tree in response to ../ while ( $link =~ s|^\.\./|| ) { $rel_root =~ s|[^/]+/$|| unless $rel_root =~ m|htt +p://[^/]+/$|i # Until can't go any further! } my $rel_link = $rel_root.$link; $rel_link =~ s|/\./|/|g; do{ &seen( $rel_link, $url ); next LINK } if defined $ +seen{$rel_link}; $seen{$rel_link} = &testHTTP( $rel_link, $text, $url ) +; push @{$urls[$level]}, $rel_link if $seen{$rel_link} = +~ /^OK.*$follow \(Link/; } } if (keys %anchors) { print " Checking anchors...\n Page: $url\n" if $verbos +e; &check_anchors( $url, \%anchors, $content, $url ); } } } my @dud_links = grep { not /^OK/ } values %seen; # Schwartzian transform - due credit to RS :-) sub page { pop =~ m/On page:'([^']+)'/; $1 } @dud_links = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)]}@ +dud_links; print "\n\nBroken links $urls[0][0]\n"; @dud_links = ('No bad links!') unless @dud_links; print " $_\n" for @dud_links; my @dud_anchors = grep { not /^OK/ } values %seen_anchors; @dud_anchors = map{$_->[0]}sort{$a->[1] cmp $b->[1]}map{[$_, page($_)] +}@dud_anchors; print "\n\nBroken anchors $urls[0][0]\n"; @dud_anchors = ('No bad anchors!') unless @dud_anchors; print " $_\n" for @dud_anchors; print "\n\nMailto links\n"; for (keys %emails) { my ( $email, $domain ) = $_ =~ m/<([^@]+@([^>]+))>/; my $status = check_rfc822($email) ? 'OK RFC822 - ' : 'Failed RFC82 +2 - '; my $dns = dns_query($domain); $status .= ! defined $dns ? "No DNS check" : $dns ? "OK DNS" : "No + DNS listing"; print " $status email: $_ page(s): $emails{$_}\n"; } sub check_rfc822 { my $email = shift; return 1 if $email =~ m/^$rfc822$/o; return 0; } sub dns_query { return undef unless $res; my $domain = shift; for my $dns ( 'MX', 'A' ) { my $packet = $res->send($domain, $dns ) or warn $res->errorstr +ing; return 1 if $packet->header->ancount; } return 0; } sub seen { my ( $link, $on_page ) = @_; $seen{$link} .= ", $on_page"; } sub get { my $url = shift; print " Getting $url...." if $verbose;; my $request = HTTP::Request->new( 'GET', $url ); my $content = $ua->request( $request ); print "$$content{_msg} $$content{_rc}\n" if $verbose;; return $$content{_content}; } sub testHTTP { my ( $url, $text, $on_page ) = @_; my $request = HTTP::Request->new( 'HEAD', $url ); my $content = $ua->request( $request ); my $answer = "$$content{_msg} $$content{_rc} $url (Link text: $te +xt ) (On page: $on_page )"; print " $answer\n" if $verbose; # if $url also include an anchor load page and check anchor exists if ($url =~ m|^(.*/[^/]+)(#[^/]+)$|) { my %anchors; $anchors{$url}[0] = $text; my $old_verbose = $verbose; $verbose = 0; my $content = get( $url ); $verbose = $old_verbose; check_anchors( $1, \%anchors, $content, $on_page ); } return $answer; } sub testFTP { my ( $url, $text, $on_page ) = @_; my ( $host, $filepath ) = $url =~ m|ftp://([^/]+)(.*)$|; my $answer; if (my $ftp = Net::FTP->new( $host )) { $ftp->login( 'anonymous', 'nobody@nowhere.com' ); my $size = $ftp->size( $filepath ); $ftp->quit; my $status = defined $size ? "OK" : "Not OK File does not exis +t"; $answer = "$status FTP $url (Link text: $text ) (On page: $on +_page )"; } else { $answer = "$@ FTP $url (Link text: $text ) (On page: $on_page +)"; } print " $answer\n" if $verbose; return $answer; } sub check_anchors { my( $url, $anchor_ref, $content, $on_page ) = @_; my $parser = HTML::TokeParser->new( \$content ); while ( my $token = $parser->get_tag('a') ) { my $link = $token->[1]{name} || next; # next unless named $link = "$url#$link"; $$anchor_ref{$link}[1] = 1 if defined $$anchor_ref{$link}; } for (sort keys %$anchor_ref) { $$anchor_ref{$_}[1] = defined $$anchor_ref{$_}[1] ? 1 : 0; my $status = $$anchor_ref{$_}[1] ? "OK" : "Not OK"; my $answer = "$status ANCHOR $_ (Link text: $$anchor_ref{$_}[ +0] ) (On page: $on_page )";; print " $answer\n" if $verbose; $seen_anchors{$on_page.$_} = $answer; } } sub init_rfc822 { my $rfc_pat = ''; while (<DATA>) { next unless $_; chomp; $rfc_pat .= $_ } $rfc_pat = qr/$rfc_pat/; return $rfc_pat; } sub useage { print " Useage: linkcheck.pl -[v,d[depth]] <url> -v sets verbose mode -dn sets a search depth of 'n' levels where 'n' is a +ve integ +er <url> is the root url in which to start the search defaults are quiet mode and depth level 1 "; exit; } =head1 NAME linkcheck.pl v0.001 =head1 SYNOPSIS linkcheck.pl -[v,r,d[depth]] <url> -v sets verbose mode -r resolve email address domains via DNS lookups -d[n] sets a search depth of 'n' levels where 'n' is a +ve int +eger <url> is the root url in which to start the search defaults are quiet mode, no DNS lookup, and depth level 1 linkcheck.pl -vrd1000 http://www.yoursite.com > logfile.txt this will check your entire site (assuming a link depth <= 1000 :- +) and send the verbose output to the file logfile.txt =head1 DESCRIPTION This script extracts and checks links for validity. A HEAD request is +made for each link found. A valid link will return status OK in the header. Script performs a recursive search, width first to a user defined dept +h. The default depth of 1 means just check the links on the root page. A value of 2 means check all the links on the root and child pages, a le +vel of 3 will check links on root, childen, children's children. A depth of 4 + gets to the children's children's children's links and so on... Links that have been checked are not checked again if they occur elsew +here. The pages on which repeat links occur are appended to the original che +ck data so you can find all problem pages in the event of a broken link. External http links (outside the root domain) are checked for validity + but are not followed for obvious reasons. External ftp links are checked to en +sure they point at a valid (accessible) file. Anchors within a page are checked on block as an efficiency hack. Link +s that point to anchors on pages other than the current working page are chec +ked individually. Mailto links are harvested for (manual) checking. Their format is chec +ked against the RFC822 spec using pattern developed in program by Jeffery +Friedl in the ORA book "Mastering Regular Expressions". A check can also be m +ade to see the domain can be looked up via DNS to help eliminate domain name +typos. In verbose mode you see all the links that are checked at each depth. +In normal mode only the broken links, anchors and mailto harvest are outp +ut. There are a number of hardcoded options such as username, password, pr +oxy, DNS name servers. The $follow = qr// defines what sort of links to fol +low. Not all links need to be followed. You are unlikely to find more links + by following the link to a .jpg image for instance. By default links that + end in any of: =over =item * .htm =item * .html =item * .cgi =item * .pl =item * .php =item * .asp =back are followed by default but you can add whatever you want. =head1 AUTHOR Dr James Freeman aka tachyon <lt>jfreeman@tassie.net.au<gt> =head1 LICENSE This package is free software; you can redistribute it and/or modify i +t under the terms of the "GNU General Public License". =head1 DISCLAIMER This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the "GNU General Public License" for more details. =cut # this data is the RFC822 pattern - leave it alone! __DATA__ (?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\ +xff\n \015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\ +x80-\ xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n +\015" ]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8 +0-\xf f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\ +((?:[ ^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\ +x80-\ xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040) +<>@,; :".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff] +)*")) *(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80- +\xff\ n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015 +()]|\ \[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[ +^(\04 0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037 +\x80- \xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t] +|\((? :[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[ +^\x80 -\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf +f]|\( (?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\ +\\[\] \000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(? +:[^\\ \x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\0 +00-\0 37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x8 +0-\xf f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\01 +2-\03 7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015 +()]|\ \[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(? +:[\04 0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0 +15()] |\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\ +\[^\x 80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04 +0)<>@ ,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80- +\xff] )|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((? +:[^\\ \x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80 +-\xff ])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\( +(?:[^ \\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\] +\000- \037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\ +\x80- \xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff +\n\01 5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\) +)*,(? :[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xf +f\n\0 15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()] +|\\[^ \x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\ +040)< >@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8 +0-\xf f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\( +(?:[^ \\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x +80-\x ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]| +\((?: [^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[ +\]\00 0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^ +\\\x8 0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\x +ff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))* +\))*) ?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\ +]\000 -\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\0 +40\t] |\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015() +]|\\[ ^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x8 +0-\xf f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@, +;:".\ \\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) +|"(?: [^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\x +ff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))* +\))*@ (?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\ +xff\n \015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80 +-\xff ]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\0 +15\[\ ]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\ +x80-\ xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t] +|\((? :[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[ +^\x80 -\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\04 +0)<>@ ,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80 +-\xff ])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^ +\\\x8 0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\x +ff\n\ 015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))* +\))*

In reply to Link Checker by tachyon

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others contemplating the Monastery: (5)
As of 2024-04-25 14:36 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found