Some days ago, someone was
asking about web robots...
I pointed him to some CPAN modules, including
LWP::Simple and
HTML::Parser. Well, this program isn't precisely a robot, but
it may provide some help.
Furthermore, it is meant to be a plug for
Graphviz and the related Perl module GraphViz
for the other monks. If you want a more complex tool to do such things, use WebStalker.
#!/usr/bin/perl
use strict;
use warnings;
# Did someone say 'robot'?
use LWP::Simple;
use HTML::Parser;
use URI;
use GraphViz;

# URL where the crawl starts.
my $start_page = 'http://localhost/~stefano/';

# Graph that collects one node per explored page and one edge per link.
my $graph = GraphViz->new( width => 8, height => 8 );

# URLs that have already been explored (URL string => visit count).
my %is_known = ();

# Work list of URLs still waiting to be fetched.
my @to_be_explored = ();

# URL of the page currently being parsed; the link handler below reads it
# to know where each edge starts.
my $current;

# Formerly known as $explorer :)
# Rename it according to your sympathies :)
#
# Parser whose start-tag handler records every <a href="..."> it meets:
# it adds an edge from $current to the link target and queues the target
# for exploration.
my $scout = HTML::Parser->new(
    start_h => [
        sub {
            my ( $self, $tagname, $attr ) = @_;
            return unless $tagname eq "a";

            # Anchors such as <a name="..."> carry no href at all.
            my $href = $attr->{href};
            return unless defined $href;

            # Drop the fragment: it addresses a spot inside a page, not
            # a different page.
            $href =~ s/\#.*//;

            # A fragment-only link points back into the page being parsed.
            return if $href eq '';

            # Resolve relative links against the page they were found on,
            # so that the edge, the queue entry and the %is_known key all
            # use the same absolute URL.
            my $target = URI->new_abs( $href, $current )->as_string;

            $graph->add_edge( $current, $target );
            unshift @to_be_explored, $target;
            return;
        },
        "self, tagname, attr"
    ]
);
# Main crawl loop: take URLs off the work list, fetch each one once and
# let $scout record its outgoing links (which may add more work).
push @to_be_explored, $start_page;
while (@to_be_explored) {
    $current = URI->new( shift @to_be_explored );

    # Each URL is explored only once.
    next if $is_known{$current};
    $is_known{$current}++;

    $graph->add_node(
        $current,
        fontsize => 8,
        height   => 0.5,
        width    => 0.5,
    );
    print "Exploring $current ($#to_be_explored)...\n";

    # LWP::Simple::get returns undef on failure. It does not set $!, so
    # report the URL instead and skip the page rather than parsing junk.
    my $html_string = get( $current->abs($start_page) );
    if ( not defined $html_string ) {
        warn "Can't get page! - $current\n";
        next;
    }

    $scout->parse($html_string);

    # Signal end of document so the parser flushes its buffers and is
    # reset before it is reused for the next page.
    $scout->eof;
}
# Write the finished graph to out.png in the current directory.
# Note the low-precedence 'or': with '||' the die would attach to the
# file name string and could never fire.
open my $out, '>', 'out.png'
    or die "Can't open file for output! - $!\n";
binmode $out;    # PNG is binary data
print {$out} $graph->as_png;

# Buffered write errors only surface when the handle is closed.
close $out
    or die "Can't close output file! - $!\n";