use strict;
use warnings;

use URI;
use URI::ImpliedBase;
use WWW::Mechanize;

# Simple single-site crawler.
#
# Usage: pass the starting URL as the first command-line argument.
# Starting from that URL, the script fetches every page reachable through
# links whose host equals the starting URL's host, and hands each
# successfully fetched page to process(). Each URL taken off the queue is
# logged to STDERR.

my %visited;    # URLs already taken off the queue once (fetched or rejected)
my @queue;      # URLs still waiting to be examined, in discovery order

my $start_url = shift or die "No starting URL supplied";

# The host of the starting URL defines "this site"; links to any other
# host are skipped.
my $extractor  = URI::ImpliedBase->new($start_url);
my $local_site = $extractor->host;

# autocheck is off so a failed request does not abort the crawl;
# success is tested explicitly after each get().
my $mech = WWW::Mechanize->new( autocheck => 0 );

push @queue, $start_url;

CRAWL:
while (@queue) {
    my $next_url = shift @queue;
    next CRAWL unless $next_url;

    print STDERR $next_url, "\n";

    # Record the URL the first time it is dequeued, whatever happens to it
    # afterwards, so a URL that fails to load (or is rejected below) is not
    # re-examined and re-requested every time another page links to it.
    next CRAWL if $visited{$next_url}++;

    # Only follow http/https links. The match is anchored so that a scheme
    # that merely contains "http" is rejected as well.
    # (Original note: this check is not needed with version 0.08 of
    # URI::ImpliedBase; it is kept for older versions.)
    my $scheme = URI->new($next_url)->scheme;
    next CRAWL if $scheme and $scheme !~ m{ \A https? \z }xmsi;

    # Stay on the starting site.
    $extractor = URI::ImpliedBase->new($next_url);
    next CRAWL if $extractor->host ne $local_site;

    $mech->get( $extractor->as_string );
    next CRAWL unless $mech->success;

    # On this site and readable: queue its links, then process the page.
    # url_abs resolves each link against the page it was found on; the raw
    # href text may be relative and would otherwise be resolved later
    # against whatever URL happened to be seen last.
    for my $link ( $mech->links ) {
        my $target = $link->url_abs;

        # "#section" variants name the same document; drop the fragment so
        # the page is not fetched once per anchor.
        $target->fragment(undef);

        push @queue, $target->as_string;
    }

    process( $next_url, $mech->content );
}

# process($url, $page_content)
#
# Hook called once for every page that was fetched successfully.
#   $url          - the URL as it was taken off the queue
#   $page_content - the page body as returned by $mech->content
# The default implementation prints the page content to STDOUT.
sub process {
    my ( $url, $page_content ) = @_;

    # Do as you like with the page content here...
    print $page_content;
}