Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??

I have a very old script that I do that with; it uses LWP to fetch the pages and parses the HTML with regexes (yes, I know better now, but it works).

At the time I failed to identify any module to do what I needed, and later I added various bits and pieces to count words, list external links and so on. So rolling my own turned out to be the best way to go. Essentially it was:

use strict;
use warnings;
use LWP::UserAgent;
use IO::File;

# Simple same-site link crawler: starting from $server_root, fetch every
# page reachable through href="..." attributes, note external links, and
# report anything that could not be fetched or parsed on STDERR.

my $ua = LWP::UserAgent->new;
$ua->agent("Angler/0.1 ");

my $server_root = "/";

# Crawl queue: site path => "needs_scan" or "done".
my %pages = (
    $server_root => "needs_scan",

    # Note some pages that we don't want to visit
    "${server_root}login.htm"   => "done",
    "${server_root}logout"      => "done",
    "${server_root}table.xls"   => "done",
    "${server_root}select_view" => "done",
    "${server_root}sws/error"   => "done",
);

my $default_host = "127.0.0.1";
my $default_port;    # undef means "use the default HTTP port"

# State shared between the driver loop and scan_page().  The original
# excerpt used these without declaring them, which does not compile
# under "use strict".
my $verbose = 0;        # presumably set from the command line in the full script
my $base_name;          # "http://host[:port]", built on first use
my $external_report;    # optional filehandle for the external-link report
my %external_pages;     # external URL => "noted" once it has been reported
my %ref_from;           # target path => { referring path => 1 }
my $total_done = 0;     # paths taken off the queue (successful or not)
my $num_pages  = 0;     # HTML pages that were actually parsed

# Content types we know about at the start; anything else is reported
# once and then remembered as "ignore".
my %known_content_types = (
    "text/html"                => "HTML",
    "image/gif"                => "ignore",
    "application/octet-stream" => "ignore",
    "application/pdf"          => "ignore",
);

# Keep sweeping the queue until a full pass finds nothing left to scan.
# "keys %pages" is expanded once per pass, so pages discovered during a
# pass are picked up by the next one.
my $done_some = 1;
while ($done_some) {
    $done_some = 0;

    foreach my $path (keys %pages) {
        next if $pages{$path} eq "done";

        scan_page($path);
        $pages{$path} = "done";
        $done_some = 1;

        $total_done++;
        if (($total_done % 1000) == 0) {
            print "\rDone $total_done";
        }
    }
}
print "\nDone $total_done pages\n";

# A whole load of reporting in here

exit 0;

# scan_page($path)
#
# Fetch $path from the configured host and, if it is HTML, queue every
# same-site link found in it into %pages.  External (http/https) links are
# reported once each.  Returns nothing; all results are recorded in the
# file-level state declared above.
sub scan_page {
    my ($path) = @_;

    print STDERR "Scanning: $path\n" if $verbose;

    if (!defined $base_name) {
        $base_name = "http://$default_host";

        # Only append a port when one was configured; comparing an
        # undefined port against 80 used to produce "http://host:".
        $base_name .= ":" . $default_port
            if defined $default_port && $default_port != 80;
    }

    # Dir contains the relative path to the page so that
    # we can correctly adjust relative links
    my $dir = $path;
    $dir = $server_root unless $dir =~ s#/+[^/]*$#/#;

    # HTTP::Request is loaded as a dependency of LWP::UserAgent.
    my $req = HTTP::Request->new(GET => "$base_name$path");
    my $res = $ua->request($req);

    if (!defined $res) {
        print STDERR "Failed to load $base_name$path\n";
        return;
    }
    if (!$res->is_success) {
        print STDERR "Status " . $res->status_line
            . " when connecting to $base_name$path\n";
        return;
    }

    my $page_contents = $res->content();
    if (!defined $page_contents || $page_contents eq "") {
        print STDERR "Failed to find content $path\n";
        return;
    }

    my $content = $res->headers()->content_type();
    if (!defined $content) {
        print STDERR "Cannot find content type for $path\n";
        return;
    }

    # The content type could be something like
    # "text/html; charset=iso-8859-1"
    $content =~ s/;.*$//;

    if (!defined $known_content_types{$content}) {
        print STDERR "Unknown content type \"$content\" for $path\n";

        # List the referring pages; the original concatenated keys() in
        # scalar context, which printed only a count.
        print STDERR " Ref from "
            . join(", ", sort keys %{ $ref_from{$path} }) . "\n"
            if defined $ref_from{$path};

        # Remember the type so it is reported only once.
        $known_content_types{$content} = "ignore" if $content ne "";
        return;
    }
    return if $known_content_types{$content} ne "HTML";

    # Parse HTML with regex, this bit needs rebuilding.
    # Each recognised href="..." is replaced by a marker so that the loop
    # terminates and the leftover pass below only sees unparsed hrefs.
    while ($page_contents =~ s/href\s*\=\s*\"([^\"]+)\"/**ref_done**/i) {
        my $link_to = $1;

        # Remove the within page address
        $link_to =~ s/#.*$//;
        next if !$link_to;

        if ($link_to =~ m#https?:#i) {
            # Link to explicitly external site
            if (!defined $external_pages{$link_to}) {
                if (!defined $external_report) {
                    print STDERR "External page (from $path) $link_to\n";
                }
                else {
                    print {$external_report}
                        "External page (from $path) $link_to\n";
                }
                $external_pages{$link_to} = "noted";
            }
            next;
        }
        elsif ($link_to =~ s#^\./+##) {
            # Change from link to . to local
            $link_to = $dir . $link_to;
        }
        elsif ($link_to =~ m#javascript:#) {
            # I ain't going in there...
            # Skip it: without this the pseudo-link fell through and was
            # queued (and later fetched) as if it were a page.
            next;
        }
        elsif ($link_to =~ m#^/#) {
            # Absolute path within the web site
        }
        elsif ($link_to =~ m#^(\w[\w\d_\-\/\.\s]+)$#) {
            # Change from rel to absolute
            $link_to = $dir . $link_to;
        }
        else {
            print STDERR "Unparsed href (from $path) => \"$link_to\"\n";
            next;
        }

        # Record who links here so the "Ref from" diagnostic has data.
        $ref_from{$link_to}{$path} = 1;

        if (!defined $pages{$link_to}) {
            $pages{$link_to} = "needs_scan";
        }
    }

    # Anything still containing "href" was not matched above; report it
    # unless it is one of the known uninteresting shapes.
    while ($page_contents =~ s/href(.{0,30})/**ref_done**/) {
        my $href_val = $1;

        # These are the special cases that we have found previously and
        # are not interesting to us
        next if $href_val =~ m#<#;
        next if $href_val =~ m#^\"$#;
        next if $href_val =~ m#^=\&quot;#;
        next if $href_val =~ m#^_values#;
        next if $href_val =~ m#^\"\s+type=#;
        next if $href_val =~ m#[34]_p#;

        print STDERR "Cannot parse out (from $path) \"href$href_val\"\n";
    }

    $num_pages++;
    return;
}

Of course that code is chopped out of a much larger script and not tested, but I think it should give you all the bits you need (well, except for extracting href values with a proper parser rather than using regex). Hope it helps.


In reply to Re: Detect Broken links by hawtin
in thread Detect Broken links by vishi83

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others contemplating the Monastery: (14)
    As of 2014-12-19 15:13 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      Is guessing a good strategy for surviving in the IT business?





      Results (84 votes), past polls