Category: | Web Stuff |
Author/Contact Info | mkurtis |
Description: | This is a web crawler that grabs the links from the web addresses listed in file1.txt and places them in file2.txt, then visits all of those links and places the links it finds in file3.txt, and so on. It saves the content of each page in numbered files in a separate folder. |
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use LWP::RobotUA;
use URI::URL;
#use HTML::Parser ();
use HTML::SimpleLinkExtor;

# Original version: level-by-level crawler driven by numbered link files.
#
# Reads the URLs listed in /var/www/links/file<N>.txt (starting at N = 1),
# saves each fetched page to /var/www/data/<count>.txt, and writes every
# link found on those pages to /var/www/links/file<N+1>.txt, which becomes
# the input of the next level.  Stops when a level's link file cannot be
# opened or when a level produces no links at all.

my $links_dir = '/var/www/links';
my $data_dir  = '/var/www/data';

# One user agent for the whole run, so the robots.txt cache and the
# per-host delay bookkeeping are shared between requests.
my $ua = LWP::RobotUA->new('theusefulbot', 'bot@theusefulnet.com');
$ua->delay(10 / 600);    # delay() is expressed in minutes: 10/600 min = 1 s

my $page_count = 0;      # index of the next data file to write

LEVEL:
for (my $level = 1; ; $level++) {
    my $in_path = "$links_dir/file$level.txt";
    my $in;
    if (!open($in, '<', $in_path)) {
        warn "Stopping: cannot open $in_path: $!\n";
        last LEVEL;
    }

    # Open the next level's link file once per level; reopening it with
    # '>' for every URL would truncate it and keep only the last page's
    # links.
    my $out_path = "$links_dir/file" . ($level + 1) . '.txt';
    open(my $out, '>', $out_path)
        or die "Cannot write $out_path: $!";

    my $links_written = 0;

    URL:
    while (my $url = <$in>) {
        # Strip the line terminator (and stray whitespace) so it does not
        # end up inside the request URL.
        $url =~ s/\A\s+|\s+\z//g;
        next URL if $url eq '';

        my $response = $ua->get($url);
        if (!$response->is_success) {
            warn "Skipping $url: ", $response->status_line, "\n";
            next URL;
        }
        my $content = $response->content;

        my $page_path = "$data_dir/$page_count.txt";
        open(my $page, '>', $page_path)
            or die "Cannot write $page_path: $!";
        print {$page} $content;
        close($page) or die "Cannot close $page_path: $!";
        $page_count++;

        my $extor = HTML::SimpleLinkExtor->new();
        $extor->parse($content);
        for my $link ($extor->a) {
            # Resolve relative hrefs against the page they were found on.
            print {$out} url($link)->abs($url), "\n";
            $links_written++;
        }
    }

    close($in);
    close($out) or die "Cannot close $out_path: $!";

    # An empty link file would only produce further empty link files.
    last LEVEL if $links_written == 0;
}

# UPDATE: NEW WORKING CODE
# Thanks to Kappa for making it check itself against a visited list.

#!/usr/bin/perl
use strict;
use warnings;
use LWP::RobotUA;
use HTML::SimpleLinkExtor;
use URI::URL;

# Shared user agent used by crawl().
my $http_ua = LWP::RobotUA->new('theusefulbot', 'bot@theusefulnet.com');
$http_ua->delay(10 / 6000);    # delay() is in minutes: 10/6000 min = 0.1 s

# crawl(@urls)
#
# Crawl starting from the seed URLs in @urls.  Every successfully fetched
# page is written to "<n>.txt" in the current directory (n counts from 1):
# the first line of the file is the page URL, the rest is the page content.
# The <a> links found on each page are appended to the work queue; a URL is
# requested at most once per run.  Returns nothing.
sub crawl {
    my @queue = @_;
    my %visited;
    my $page_count = 0;

    URL:
    # Test the queue itself rather than the shifted value, so a false
    # entry ('' or '0') does not end the crawl prematurely.
    while (@queue) {
        my $url = shift @queue;
        next URL if !defined $url || $url eq '';

        # Mark before fetching so a failing URL is not retried either.
        next URL if $visited{$url}++;

        my $response = $http_ua->get($url);
        if (!$response->is_success) {
            warn qq{Skipped: "$url" (}, $response->status_line, ")\n";
            next URL;
        }
        my $content = $response->content;

        my $file = ++$page_count . '.txt';
        open(my $out, '>', $file) or die "Cannot write $file: $!";
        print {$out} "$url\n";
        print {$out} $content;
        close($out) or die "Cannot close $file: $!";

        print qq{Downloaded: "$url"\n};

        # The page URL is passed as the base URL so that relative hrefs
        # can be resolved against it.
        my $extractor = HTML::SimpleLinkExtor->new($url);
        $extractor->parse($content);

        # Only queue web links that have not been seen yet; mailto:,
        # javascript: and similar schemes cannot be crawled.
        push @queue,
            grep { m{\Ahttps?://}i && !$visited{$_} } $extractor->a;
    }

    return;
}

crawl(@ARGV);
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: Web Crawler
by matija (Priest) on Mar 09, 2004 at 08:09 UTC |
Back to
Code Catacombs