Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl Monk, Perl Meditation

comment on

( #3333=superdoc: print w/replies, xml ) Need Help??

I have a very old script that I use for that; it uses LWP to fetch the pages and parses the HTML with regexes (yes, I know better now, but it works).

At the time I failed to identify any module to do what I needed, and later I added various bits and pieces to count words, list external links and so on. So rolling my own turned out to be the best way to go. Essentially it was:

use strict;
use warnings;
use LWP::UserAgent;
use IO::File;

# Simple same-site crawler: starting from $server_root it fetches every
# page it can reach through href="..." attributes, remembers which paths
# have been visited, and reports external links and unparsable hrefs.

my $ua = LWP::UserAgent->new;
$ua->agent("Angler/0.1 ");

my $server_root = "/";

# Crawl state, keyed by site-relative path: "needs_scan" or "done".
my %pages = (
    $server_root => "needs_scan",

    # Note some pages that we don't want to visit
    "${server_root}login.htm"   => "done",
    "${server_root}logout"      => "done",
    "${server_root}table.xls"   => "done",
    "${server_root}select_view" => "done",
    "${server_root}sws/error"   => "done",
);

# NOTE(review): the host is blank in the posted snippet; it has to be
# filled in before the crawler can fetch anything.
my $default_host = "";
my $default_port;    # undef means "use the default HTTP port"

# File-scoped state shared with scan_page().  These must be declared (and
# initialised) before the main loop runs, because the sub is defined after
# "exit 0" and initialisers placed down there would never execute.
my $verbose    = 0;
my $total_done = 0;
my $num_pages  = 0;    # pages that were fetched and parsed as HTML
my $base_name;         # "http://host[:port]", built on first use
my $external_report;   # optional filehandle; external links go to STDERR if unset
my %external_pages;    # external URL => "noted"
my %ref_from;          # path => { referring path => 1 }

# Here are the link types that we know about at the start
my %known_content_types = (
    "text/html"                => "HTML",
    "image/gif"                => "ignore",
    "application/octet-stream" => "ignore",
    "application/pdf"          => "ignore",
);

# Keep sweeping %pages until a complete pass finds nothing left to scan;
# scan_page() adds newly discovered paths while we go.
my $done_some = 1;
while ($done_some) {
    $done_some = 0;
    foreach my $path (keys %pages) {
        next if $pages{$path} eq "done";

        scan_page($path);
        $pages{$path} = "done";
        $done_some = 1;

        $total_done++;
        if (($total_done % 1000) == 0) {
            print "\rDone $total_done";
        }
    }
}
print "\nDone $total_done pages\n";

# A whole load of reporting in here

exit 0;

# scan_page($path)
#
# Fetches "$base_name$path" and, if the response is HTML, extracts every
# href="..." value from it.  Links inside the site are added to %pages as
# "needs_scan"; links containing "http:"/"https:" are reported once each
# as external pages; anything else that cannot be classified is reported
# on STDERR.  Returns nothing.
sub scan_page {
    my ($path) = @_;

    print STDERR "Scanning: $path\n" if $verbose;

    if (!defined $base_name) {
        $base_name = "http://$default_host";

        # Only append a port when one was actually configured.
        $base_name .= ":" . $default_port
            if defined $default_port && $default_port != 80;
    }

    # Dir contains the relative path to the page so that
    # we can correctly adjust relative links
    my $dir = $path;
    $dir = $server_root unless $dir =~ s#/+[^/]*$#/#;

    my $req = HTTP::Request->new(GET => "$base_name$path");
    my $res = $ua->request($req);

    if (!defined $res) {
        print STDERR "Failed to load $base_name$path\n";
        return;
    }
    elsif (!$res->is_success) {
        print STDERR "Status " . $res->status_line
            . " when connecting to $base_name$path\n";
        return;
    }

    my $page_contents = $res->content();
    if (!defined $page_contents || $page_contents eq "") {
        print STDERR "Failed to find content $path\n";
        return;
    }

    # HTTP::Headers::content_type returns the empty string, not undef,
    # when the header is absent, so test for both.
    my $content = $res->headers()->content_type();
    if (!defined $content || $content eq "") {
        print STDERR "Cannot find content type for $path\n";
        return;
    }

    # The content type could be something like
    # "text/html; charset=iso-8859-1"
    $content =~ s/;.*$//;

    if (!defined $known_content_types{$content}) {
        print STDERR "Unknown content type \"$content\" for $path\n";
        print STDERR "  Ref from "
            . join(", ", sort keys %{ $ref_from{$path} }) . "\n"
            if defined $ref_from{$path};

        # Remember the type so that it is only reported once.
        $known_content_types{$content} = "ignore" if $content ne "";
        return;
    }
    return if $known_content_types{$content} ne "HTML";

    # Parse HTML with regex, this bit needs rebuilding.  Each matched
    # attribute is replaced by a marker so the loop always makes progress.
    while ($page_contents =~ s/href\s*\=\s*\"([^\"]+)\"/**ref_done**/i) {
        my $link_to = $1;

        # Remove the within page address
        $link_to =~ s/#.*$//;
        next if !$link_to;

        if ($link_to =~ m#https?:#i) {

            # Link to explicitly external site
            if (!defined $external_pages{$link_to}) {
                if (!defined $external_report) {
                    print STDERR "External page (from $path) $link_to\n";
                }
                else {
                    print {$external_report}
                        "External page (from $path) $link_to\n";
                }
                $external_pages{$link_to} = "noted";
            }
            next;
        }
        elsif ($link_to =~ s#^\./+##) {

            # Change from link to . to local
            $link_to = $dir . $link_to;
        }
        elsif ($link_to =~ m#javascript:#) {

            # I ain't going in there... and it must not be queued as a
            # page to fetch either.
            next;
        }
        elsif ($link_to =~ m#^/#) {

            # Absolute path within the web site
        }
        elsif ($link_to =~ m#^(\w[\w\d_\-\/\.\s]+)$#) {

            # Change from rel to absolute
            $link_to = $dir . $link_to;
        }
        else {
            print STDERR "Unparsed href (from $path) => \"$link_to\"\n";
            next;
        }

        # Record who linked here so that the "Ref from" diagnostic
        # has something to show.
        $ref_from{$link_to}{$path} = 1;

        if (!defined $pages{$link_to}) {
            $pages{$link_to} = "needs_scan";
        }
    }

    # Anything still containing "href" was not matched by the loop above.
    while ($page_contents =~ s/href(.{0,30})/**ref_done**/) {
        my $href_val = $1;

        # These are the special cases that we have found previously and
        # are not interesting to us
        next if $href_val =~ m#<#;
        next if $href_val =~ m#^\"$#;
        next if $href_val =~ m#^=\&quot;#;
        next if $href_val =~ m#^_values#;
        next if $href_val =~ m#^\"\s+type=#;
        next if $href_val =~ m#[34]_p#;

        print STDERR "Cannot parse out (from $path) \"href$href_val\"\n";
    }

    $num_pages++;
    return;
}

Of course that code is chopped out of a much larger script and not tested, but I think it should give you all the bits you need (well except for extracting href values with a proper parser rather than using regex). Hope it helps.

In reply to Re: Detect Broken links by hawtin
in thread Detect Broken links by vishi83

Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":

  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?

What's my password?
Create A New User
Domain Nodelet?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others lurking in the Monastery: (4)
As of 2022-12-04 08:02 GMT
Find Nodes?
    Voting Booth?

    No recent polls found