Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

Re: Detect Broken links

by hawtin (Prior)
on Oct 15, 2009 at 08:36 UTC ( #801315=note: print w/ replies, xml ) Need Help??


in reply to Detect Broken links

I have a very old script that I do that with, it uses LWP to fetch the pages and parses the html with regex (yes, I know better now, but it works).

At the time I failed to identify any module to do what I, needed and later I added various bits and pieces to count words, list external links and so on. So rolling my own turned out to be the best way to go. Essentially it was:

use strict; use LWP::UserAgent; use IO::File; my $ua = LWP::UserAgent->new; $ua->agent("Angler/0.1 "); my $server_root = "/"; my %pages = ( $server_root => "needs_scan", # Note some pages that we don't want to visit "${server_root}login.htm" => "done", "${server_root}logout" => "done", "${server_root}table.xls" => "done", "${server_root}select_view" => "done", "${server_root}sws/error" => "done", ); my $default_host = "127.0.0.1"; my $default_port; my $done_some = 1; while($done_some) { $done_some = 0; foreach my $path (keys %pages) { next if($pages{$path} eq "done"); scan_page($path); $pages{$path} = "done"; $done_some = 1; $total_done++; if(($total_done % 1000) == 0) { print "\rDone $total_done"; } } } print "\nDone $total_done pages\n"; # A whole load of reporting in here exit 0; sub sub scan_page { my($path) = @_; print STDERR "Scanning: $path\n" if($verbose); if(!defined $base_name) { $base_name = "http://$default_host"; $base_name .= ":" . $default_port if($default_port != 80); } # Dir contains the relative path to the page so that # we can correctly adjust relative links $dir = $path; if($dir =~ s#/+[^/]*$#/#) { } else { $dir = $server_root; } %known_content_types = ( # Here are the link types that we know about at the start "text/html" => "HTML", "image/gif" => "ignore", "application/octet-stream" => "ignore", "application/pdf" => "ignore", ) if(!%known_content_types); my $req = HTTP::Request->new(GET => "$base_name$path"); my $res = $ua->request($req); if(!defined $res) { print STDERR "Failed to load $base_name$path\n"; return; } elsif (!$res->is_success) { print STDERR "Status ".$res->status_line." when connecting to +$base_name$path\n"; return; } my $page_contents = $res->content(); if(!defined $page_contents || $page_contents eq "") { print STDERR "Failed to find content $path\n"; return; } if(defined $res->headers()->content_type()) { my $content = $res->headers()->content_type(); if($content =~ s/;.*$//) { # The content type could be something like # "text/html; charset=iso-8859-1" } if(!defined $known_content_types{$content}) { print STDERR "Unknown content type \"$content\" for $path\ +n"; print STDERR " Ref from ".keys(%{$ref_from{$path}})."\n" if(defined $ref_from{$path +}); $known_content_types{$content} = "ignore" if($content ne " +"); return; } return if($known_content_types{$content} ne "HTML"); } else { print STDERR "Cannot find content type for $path\n"; return; } # Parse HTML with regex, this bit needs rebuilding while($page_contents =~ s/href\s*\=\s*\"([^\"]+)\"/**ref_done**/i) { my $link_to = $1; # Remove the within page address $link_to =~ s/#.*$//; next if(!$link_to); if($link_to =~ m#https?:#i) { # Link to explicitly external site if(!defined $external_pages{$link_to}) { if(!defined $external_report) { print STDERR "External page (from $path) $link_to\ +n"; } else { print $external_report "External page (from $path) + $link_to\n"; } $external_pages{$link_to} = "noted"; } next; } elsif($link_to =~ s#^\./+##) { # Change from link to . to local $link_to = $dir.$link_to; } elsif($link_to =~ m#javascript:#) { # I ain't going in there... } elsif($link_to =~ m#^/#) { # Absolute path within the web site } elsif($link_to =~ m#^(\w[\w\d_\-\/\.\s]+)$#) { # Change from rel to absolute $link_to = $dir.$link_to; } else { print STDERR "Unparsed href (from $path) => \"$link_to\"\n +"; next; } if(!defined $pages{$link_to}) { $pages{$link_to} = "needs_scan"; } } while($page_contents =~ s/href(.{0,30})/**ref_done**/) { my $href_val = $1; # These are the special cases that we have found previously an +d # are not interesting to us next if($href_val =~ m#<#); next if($href_val =~ m#^\"$#); next if($href_val =~ m#^=\&quot;#); next if($href_val =~ m#^_values#); next if($href_val =~ m#^\"\s+type=#); next if($href_val =~ m#[34]_p#); print STDERR "Cannot parse out (from $path) \"href$1\"\n"; } } $num_pages++; }

Of course that code is chopped out of a much larger script and not tested, but I think it should give you all the bits you need (well except for extracting href values with a proper parser rather than using regex). Hope it helps.


Comment on Re: Detect Broken links
Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://801315]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chanting in the Monastery: (9)
As of 2014-09-18 21:10 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    How do you remember the number of days in each month?











    Results (124 votes), past polls