http://www.perlmonks.org?node_id=334989
Category: Web Stuff
Author/Contact Info mkurtis
Description: This is a web crawler that reads seed URLs from file1.txt, grabs the links on each page and writes them to file2.txt, then visits those links and writes the links it finds to file3.txt, and so on. The content of each fetched page is saved to a numbered file in a separate folder.
#!/usr/bin/perl -w

use strict;
use diagnostics;
use LWP::RobotUA;
use URI::URL;
use HTML::SimpleLinkExtor;

my $a = 0;    # counter for the numbered page files

# Each pass reads the URLs queued in file$u.txt and writes the links
# found on those pages into the next queue file, file$(u+1).txt.
for ( my $u = 1; $u < 1000000000; $u++ ) {
    open( FILE1, "</var/www/links/file$u.txt" ) or next;
    while (<FILE1>) {
        chomp;    # strip the newline so the URL is clean

        my $ua = LWP::RobotUA->new( 'theusefulbot', 'bot@theusefulnet.com' );
        $ua->delay( 10 / 600 );    # delay is in minutes: one second between requests

        my $content = $ua->get($_)->content;

        # save the raw page content to a numbered file
        open( OUTPUT, ">/var/www/data/$a.txt" ) or die "Cannot write /var/www/data/$a.txt: $!";
        print OUTPUT $content;
        close(OUTPUT);

        # extract the <a href> links and append them to the next queue file
        my $extor = HTML::SimpleLinkExtor->new();
        $extor->parse($content);
        my @links = $extor->a;

        $u++;
        open( FILE2, ">>/var/www/links/file$u.txt" ) or die "Cannot write file$u.txt: $!";
        foreach my $link (@links) {
            print FILE2 url("$link")->abs("$_"), "\n";    # make each link absolute
        }
        $u--;

        $a++;
    }
    close(FILE1);
    close(FILE2);
}
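
A note on the request delay: LWP::RobotUA's delay() is given in minutes, so the 10/600 above works out to one second between requests to the same host. A minimal standalone sketch of that setting (the URL is only a placeholder):

#!/usr/bin/perl -w
use strict;
use LWP::RobotUA;

my $ua = LWP::RobotUA->new( 'theusefulbot', 'bot@theusefulnet.com' );
$ua->delay( 1 / 60 );    # one second, expressed as a fraction of a minute
$ua->use_sleep(1);       # sleep until the delay has passed instead of returning an error

my $response = $ua->get('http://www.example.com/');
print $response->status_line, "\n";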
UPDATE: NEW WORKING CODE. Thanks to Kappa for suggesting that it check each URL against a list of already-visited pages.

#!/usr/bin/perl -w
use strict;

use LWP::RobotUA;
use HTML::SimpleLinkExtor;
use URI::URL;

use vars qw/$http_ua/;

sub crawl {
    my @queue = @_;
    my %visited;
    my $a = 0;

    while ( my $url = shift @queue ) {
        next if $visited{$url};    # skip pages that have already been fetched

        my $content = $http_ua->get($url)->content;

        # save the URL and the raw page content to a numbered file
        open FILE, '>' . ++$a . '.txt' or die "Cannot write $a.txt: $!";
        print FILE "$url\n";
        print FILE $content;
        close FILE;

        print qq{Downloaded: "$url"\n};

        # extract the <a href> links (made absolute against $url) and queue them
        push @queue, do {
            my $link_extractor = HTML::SimpleLinkExtor->new($url);
            $link_extractor->parse($content);
            $link_extractor->a;
        };
        $visited{$url} = 1;
    }
}

$http_ua = LWP::RobotUA->new( 'theusefulbot', 'bot@theusefulnet.com' );
$http_ua->delay( 10 / 6000 );    # delay between requests, in minutes

crawl(@ARGV);
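
If the updated version is saved as, say, crawler.pl (the filename is arbitrary), the seed URLs are passed on the command line and each fetched page is written to a numbered file (1.txt, 2.txt, ...) in the current directory:

perl crawler.pl http://www.perlmonks.org/

Each fetch prints a Downloaded: line as it goes. There is no depth limit, so the crawl runs until the queue is exhausted; interrupt it when you have collected enough pages.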