Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
#!/usr/local/bin/perl -w ############################################################ use strict; use HTML::LinkExtor; #used to extract links use HTTP::Request::Common; use LWP::Simple; #used to check links use LWP::UserAgent; #helps with authorization ### Global Variables ### my (%visitedLinks, %linksToVisit, %linkResults); #declare hashes ### Restricting Variables ### my $restriction = "www.perlmonks.org"; #restricts links to addresses including $restriction my $stripquery = "true"; #true of false, yanks the ?node=whatever,blah=whatever my $stripanchor = "true"; #true of false, yanks the #anchor13 ### some server settings variables ### my $webserver = "http://www.somesite.com/mydir/"; #home web directory my $onserver = "/theserver/mydir/"; #this is where the web address is located on your server... my $localserver = "false"; #if this is true, it will perform some checks # on the server side # using the $webserver and $onserver variables my $dirIndexExt = "index.pl"; #the default page loaded when a directory is requested from the server # i.e. "http://www.perlmonks.org/" # loads "http://www.niwoths.org/index.pl" ### Start Page ### my $startpage = "http://www.perlmonks.org/"; #the page to start checking from -- must be full address # if it's the site name or a directory name, please be sure to include + / ### Site Login Info (if any) ### my %logins = ( ); #for directions on setting up this variable, #please refer to the documentation # "sitename.com"=>"user:pass" ### Log File ### my $linkInfoFile = "linkInfoFile.txt"; #which pages link to what my $updateFreq = 100; #frequency at which link results file is updated ### Control Variables my $sleep = 1; #seconds to pause after displaying output results my $timeout = 30; #timeout in seconds ### Run the Script! ### &main; #run main function &outLinkResults; #when execution finishes, update the results sub main { ############################################################ my($link); $| =1; #unbuffer output $linksToVisit{$startpage} = &linkFormatter($startpage,"_Beginning_"); #starting from the $startpage, # marked as being linked to from "_Beginning_" while ((scalar keys %linksToVisit) > 0) { #while some left to check foreach (sort keys %linksToVisit) { #foreach link $link = $_; &checkLink($link); #check the link if ($linkResults{$link} eq "OK") { #if the page is OK if ($link =~ /${restriction}/) { if ($link =~ /(\.s?html?|\.php|\.(a|j)sp|\/$)/) { #it's a web page &grabLinks($link); #grab the links }#end if }#end if }#end if if ((scalar keys %visitedLinks) % $updateFreq == 0) { #update time &outLinkResults; #output link information sleep($sleep); #sleep }#end if } } ############################################################ } sub grabLinks{ ############################################################ my(@links,@values); #declare local variables my($parser,$base_url,$elt_type); my($attr_name,$attr_value,$value); my $ua = LWP::UserAgent->new; $ua->timeout($timeout); #set timeout for UserAgent my $req; $base_url = $_; #grab the page url $req = HTTP::Request::Common::GET($base_url); #the request LINE: foreach (keys %logins) { if ($base_url =~ /$_/) { $req->authorization_basic(split(":",$logins{$_})); last LINE; }#end if }#end GOTO style loop $parser = HTML::LinkExtor->new(undef, $base_url); #initialize parser $parser->parse($ua->request($req)->as_string)->eof; #grab the page @links = $parser->links; #parse it foreach (@links) { $elt_type = shift @$_; #get tag type if ($elt_type =~ /\b(a|img)\b/) { while (@$_) { ($attr_name, $attr_value) = splice (@$_, 0, 2); if ($attr_value =~ /(https?|ftp|file)/) { $attr_value = &linkFormatter($attr_value,$base_url); if(exists $visitedLinks{$attr_value}) { $value = $visitedLinks{$attr_value}; @values = split("::",$value); unless (grep {$base_url =~ /$_/} @values) { $visitedLinks{$attr_value} = join("::",$value,$base_url); } }#end if else { if(exists $linksToVisit{$attr_value}) { $value = $linksToVisit{$attr_value}; @values = split("::",$value); unless (grep {$base_url =~ /$_/} @values) { $linksToVisit{$attr_value} = join("::",$value,$base_url); return; }#end unless } else { #not marked to visit $linksToVisit{$attr_value} = $base_url; } } } #end if } #end while } #end if } #end foreach ############################################################ } sub checkLink { ############################################################ my $link = $_; #store link value locally my $req; #declare local variable print "Checking ${link}:"; my $ua = LWP::UserAgent->new; #create user agent my $results; $req = HTTP::Request::Common::HEAD($link); #grab head by default if($link =~ /(\/cgi-bin|\.cgi|\.pl)/) { #if cgi, do a post $req = HTTP::Request::Common::POST($link); } LINE: foreach (keys %logins) { if ($link =~ /$_/) { #if it needs a login $req->authorization_basic(split(":",$logins{$_})); #send in login info last LINE; }#end if }#end foreach (and LINE) $ua->timeout($timeout); $results = $ua->request($req)->as_string; #grab it (head or full page) if ($results =~ /\b200\b/) { #request successfull print "OK\n"; $linkResults{$link} = "OK"; } elsif ($results =~ /\b401\b|\b403\b/) { #unauthorized or forbid +den print "UNAUTH\n"; $linkResults{$link} = "UNAUTH"; } elsif ($results =~ /\b500\b/) { #internal server error print "SERVERROR\n"; $linkResults{$link} = "SERVERROR"; } else { print "BAD\n"; #404 (file not found) or other $linkResults{$link} = "BAD"; #add it to results } $visitedLinks{$link} = $linksToVisit{$link}; #add it to list delete $linksToVisit{$link}; #do not need to visit again ############################################################ } sub outLinkResults{ ############################################################ my ($key,$link,$value,$status); #local variables my %stats; $stats{"goodpage"} = 0; #init. values (none unassigned) $stats{"badpage"} = 0; $stats{"unauthpage"} = 0; $stats{"serverror"} = 0; $stats{"goodlink"} = 0; $stats{"badlink"} = 0; $stats{"unauthlink"} = 0; my (@links,@keys); print "-------------------------------\n"; print "Updating Link Information file...\n"; #let them know... open(FILE,">$linkInfoFile"); #open link info file print FILE "##########################################\n"; print FILE "# Link File: Tells ya what links where #\n"; print FILE "##########################################\n"; print FILE "# Note: Totals are listed at the bottom. #\n"; print FILE "##########################################\n"; @keys = sort keys %visitedLinks; #grab the keys foreach $key (@keys) { #go through all the keys $value = $visitedLinks{$key}; #grab the value $status = $linkResults{$key}; chomp($status); push(@links,split(/::/,$value)); print FILE "Information for ${key} --\n"; print FILE "Status -- ${status}\n"; if ($status eq "OK") { $stats{"goodpage"}++; $stats{"goodlink"} += scalar(@links); } elsif ($status eq "UNAUTH") { $stats{"unauthpage"}++; $stats{"unauthlink"} += scalar(@links); } elsif ($status eq "SERVERROR") { $stats{"serverror"}++; $stats{"goodlink"} += scalar(@links); } else { $stats{"badpage"}++; $stats{"badlink"} += scalar(@links); } print FILE "Linked to from " . scalar(@links) . " pages: \n"; foreach $link (@links) { print FILE "$link\n"; } print FILE "-----------------------\n"; @links = qw(); } print FILE " TOTALS\n"; #outputing totals print FILE "-----------------------\n"; print FILE " Good pages: " . $stats{"goodpage"} . "\n"; #good + pages print FILE " Bad pages: " . $stats{"badpage"} . "\n"; #bad +pages print FILE " Unauth pages: " . $stats{"unauthpage"} . "\n"; #unau +thorized print FILE "Server errors: " . $stats{"serverror"} . "\n"; #serv. + error print FILE " Total pages: " . ($stats{"goodpage"} + $stats{"badpage"} + + $stats{"serverror"} + $stats{"unauthpage"}) . "\n"; print FILE "-----------------------\n"; print FILE " Good links: " . $stats{"goodlink"} . "\n"; print FILE " Bad links: " . $stats{"badlink"} . "\n"; print FILE " Unauth links: " . $stats{"unauthlink"} . "\n"; print FILE " Total links: " . ($stats{"goodlink"} + $stats{"badlink"} + $stats{"unauthlink"}) . "\n"; print FILE "-----------------------\n"; print " ...completed.\n"; #let 'em know it's done print "Pages checked: " . (scalar keys % visitedLinks) . "\n"; print "Pages remaining: " . (scalar keys %linksToVisit) . "\n"; print "-------------------------------\n"; close(FILE); ############################################################ } sub linkFormatter { ############################################################ my($address,$from,$path); #set local vars $address = shift; #grab address $from = shift; #grab addr. linked from if ($stripanchor eq "true" && ($address =~ /\#/)) { $address =~ s!\#.*$!!; #discard anchor } if ($stripquery eq "true" && ($address =~ /\?/)) { $address =~ s!\?.*$!!; #discard query } if ($address =~ /\.\.\//) { #if the link includes a relative directory (../something/) #fix it up to the absolute address $from =~ s!/[^/]*?/?$!!; #removes subdirectory $address =~ s!.*/\.\./(.*)!${from}/${1}!; #does it } if ($localserver eq "true" && $address =~ /$webserver/) { #if the link is to a directory, add $dirIndexExt #not done to off server links cause we can't touch 'em $path = $address; $path =~ s|$webserver/?(.*)$|${1}|; $path = $onserver . $path; if (-d $path) { #if it's a directory $address =~ s!(.*?)/?$!${1}/$dirIndexExt!; #add on extension } } return $address; #return the address ############################################################ } ###T#H#E##E#N#D###

In reply to Agent00013's URL Checking Spider by agent00013

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others cooling their heels in the Monastery: (19)
    As of 2015-07-28 20:47 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









      Results (258 votes), past polls