http://www.perlmonks.org?node_id=127264
Category: Web Stuff
Author/Contact Info: rob_au
Description: Earlier this month, George_Sherston posted a node in which he submitted code for a site indexer and search engine. I took this code and decided to build upon it for my own site, and in evaluating it and the other options available I found HTML::Index. This module offered the ability to create site indexes for both local and remote files (the latter through the use of WWW::SimpleRobot by the same author). The ability to index based upon URL was important to me, as a great deal of the content on the site is dynamic in nature. This was where my journey hit a stumbling block ... WWW::SimpleRobot didn't work!

So I set about writing my own simplified robot code with one and only one function: return a list of crawled URLs from a starting URL.

#!/usr/bin/perl -w

use Local::SiteRobot;
use strict;

my $robot = Local::SiteRobot->new(
    DEPTH        => 10,
    FOLLOW_REGEX => '^http://www.cowsnet.com',
    URLS         => [ 'http://www.cowsnet.com.au' ]
);
my @pages = $robot->crawl;
print STDOUT $_, "\n" foreach @pages;

I feel the code is quite self-explanatory - /msg me if you have any questions on usage.
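
Since the whole point of the exercise was to feed crawled URLs into a site indexer such as HTML::Index, here is a minimal sketch of one way the returned list might be put to use - fetching each crawled page and mirroring it to a local file so that a file-based indexer can work on it. The mirror_pages subroutine, the ./mirror directory and the particular option values are purely illustrative assumptions, not part of the module below.

#!/usr/bin/perl -w
# Minimal sketch only - mirror_pages() and the ./mirror directory are
# illustrative assumptions, not part of Local::SiteRobot itself.
use Local::SiteRobot;
use LWP::Simple;
use URI;
use strict;

my $robot = Local::SiteRobot->new(
    DEPTH        => 5,
    FOLLOW_REGEX => '^http://www.cowsnet.com',
    URLS         => [ 'http://www.cowsnet.com.au' ],
    VERBOSE      => 1
);
mirror_pages($robot->crawl);

sub mirror_pages {
    my @urls = @_;
    mkdir 'mirror' unless -d 'mirror';
    foreach my $url (@urls) {
        my $html = get($url);
        next unless defined $html;
        # Derive a crude flat filename from the URL path
        (my $name = URI->new($url)->path) =~ s/[^\w.-]+/_/g;
        $name = 'index' unless length $name;
        open my $fh, '>', "mirror/$name.html" or next;
        print $fh $html;
        close $fh;
    }
}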

package Local::SiteRobot;

use HTML::LinkExtor;
use LWP::Simple;
use URI;
use strict;
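
# Constructor. Accepted options: DEPTH (maximum recursion depth, undef
# for unlimited), FOLLOW_REGEX (only URLs matching this pattern are
# followed), URLS (reference to an array of start URLs) and VERBOSE
# (print progress messages to STDERR). Dies on any unknown option.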

sub new {
    my $class = shift;
    my %options = (
        DEPTH           =>  undef,
        FOLLOW_REGEX    =>  '',
        URLS            =>  [],
        VERBOSE         =>  0
    );
    my %args = (%options, @_);
    foreach (keys %args) {
        die "Local::SiteRobot->new : Unknown argument option - $_" unl
+ess exists $options{$_};
    };
    my $self = bless \%args, (ref($class) || $class);
    $self->_verbose("Local::SiteRobot->new : Created new Local::SiteRo
+bot object");
    return $self;
}
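
# crawl - the only public method. Crawls from each start URL in turn
# (HTTP only) and returns the combined list of URLs visited.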

sub crawl {
    my $self = shift;
    return unless @{$self->{URLS}};
    my @pages;
    foreach my $url (@{$self->{URLS}}) {
        my $uri = URI->new($url);
        next unless $uri->scheme;
        next unless $uri->scheme eq 'http';
        $self->_verbose("Local::SiteRobot->crawl : Crawling from URL "
+, $uri->canonical->as_string);
        push (@pages, $self->_crawl($uri->canonical->as_string));
        $self->_verbose("Local::SiteRobot->crawl : Crawling from URL "
+, $uri->canonical->as_string, " returned ", scalar(@pages), " pages")
+;
    }
    return @pages;
}
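
# _crawl - recursively fetches a page, records it in $self->{pages} so
# it is not visited twice, extracts its <a href> links with
# HTML::LinkExtor and descends into each link matching FOLLOW_REGEX
# until DEPTH is reached. Returns the list of URLs crawled.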

sub _crawl {
    my ($self, $url, $depth) = @_;
    my @pages;
    my $uri = URI->new($url);
    $self->_verbose("Local::SiteRobot->_crawl : GET ", $uri->canonical
+->as_string);
    my $html = get($uri->canonical->as_string);
    return unless $html;
    return $uri->canonical->as_string if ((defined $self->{DEPTH}) && ($self->{DEPTH} == ($depth || 0)));
    ${$self->{pages}}{$uri->canonical->as_string} = 1;
    push (@pages, $uri->canonical->as_string);
    my $linkextor = HTML::LinkExtor->new(undef, $uri->canonical->as_string);
    $linkextor->parse($html);
    foreach my $link ($linkextor->links) {
        my ($tag, %attr) = @{$link};
        next unless ($tag eq 'a');
        next unless (defined $attr{'href'});
        my $href = URI->new($attr{'href'});
        next unless ($href->canonical->as_string =~ /$self->{FOLLOW_REGEX}/);
        next if exists ${$self->{pages}}{$href->canonical->as_string};
        ${$self->{pages}}{$href->canonical->as_string} = 1;
        push (@pages, $self->_crawl($href->canonical->as_string, ($depth || 0) + 1));
    }
    return @pages;
}
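
# _verbose - prints diagnostic messages to STDERR when VERBOSE is set.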

sub _verbose {
    my $self = shift;
    return unless $self->{VERBOSE};
    print STDERR @_, "\n";
}

1;


__END__