http://www.perlmonks.org?node_id=177026
Category: Web Stuff
Author/Contact Info: Matthew Diephouse (elusion)
Contact: matt --at-- diephouse --dot-- com
where "--at--" is @ and "--dot--" is .
Description: This is a script I wrote to grab sites from the web and stick them on my hard drive and/or handheld device. It follows links to a specified depth, and can download images, follow offsite links, and strip some unwanted HTML.
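
For example (the URL here is just a placeholder), a run that follows links two deep, skips images, and saves into ./mirror might look like:

    goose.pl --depth=2 --noimages --path=./mirror http://www.example.com/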

Code reviews/critiques are welcomed and requested.

Updated: Jun. 28, 2002

  • Squashed 2 bugs
  • UserAgent support
  • Proxy support
  • Progress report
#!/usr/bin/perl -w
use strict;

# goose.pl

################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002.  This may be modified and distributed
# on the same terms as Perl.
################

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko/20020310';
my %proxy = (
  host => '',    # http://host.dom:port
  id   => '',    # ntdom\userid
  pass => '',    # empty quotes if no proxy auth
);

use LWP::UserAgent;
use HTTP::Request;

my $ua = LWP::UserAgent->new;
$ua->agent($browser);
$ua->proxy(http => $proxy{host})
    if $proxy{host};    # skip when no proxy host is configured

use Getopt::Long;
# initialize and provide some defaults: images on, compacting on, current dir
my ($images, $compact, $path, $filename, $depth, $offsite, $help)
    = (1, 1, "./", 0, 0);
GetOptions( 'images!'    => \$images,
            'offsite!'   => \$offsite,
            'help'       => \$help,
            'depth=i'    => \$depth,
            'path=s'     => \$path,
            'filename=s' => \$filename,
            'compact!'   => \$compact );

if ($help) {
    print <<EOH;
goose.pl -- a utility to grab a site from the web
---------------------------------
usage: goose.pl [options] location
options:
    --images:
    --noimages:
        whether or not to download images
        defaults to yes
    --offsite:
    --nooffsite:
        whether or not to follow offsite links
        defaults to no
    --help
        prints this help message
    --depth
        the link depth to follow
        defaults to 0
    --path
        where to save the files to
    --filename
        the name of the first file. future names are
        made using the increment operator (++)
    --compact
    --nocompact
        whether or not to remove certain html tags
        defaults to yes
EOH
    exit;    
}

my $href = shift
    or die "Must provide a location to be goosed. Use option --help for usage information.";

$path .= "/"
    if $path !~ /[\/\\]$/;

my %links;    # holds location and filename
$|++;         # unbuffer output; allows progress to be shown
mkdir $path
    if not -e $path;

#------------ setup ends here

{
    my $count = 0;
    sub update_progress {
        print "\rGoosing $href... got $count files";
        $count++
    }
}

#feedit: html (ref), hash of tags to cut mapped to option hash refs
#receive: compacted version
#effect: compacts ref
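# The %tags values are small hash refs saying how each tag is handled
# (a summary of the branches below):
#   closed => 0               strip bare tags such as <meta ...>
#   closed => 1               remove the element and everything inside it,
#                             e.g. <style>...</style>
#   closed => 1, content => 1 strip the open/close tags but keep what's
#                             between them
# e.g. (illustrative) compact( \$html, script => { closed => 1 } ) would
# drop <script> elements and their contents.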
sub compact {
    my ($html, %tags) = @_;
    
    for my $tag (keys %tags) {
        # if it's not closed
        if (not $tags{$tag}->{closed})
        {
            $$html =~ s! < \Q$tag\E .*? > !!xgis;
        }
        
        # if it's closed and we should remove content
        elsif (not $tags{$tag}->{content})
        {
            1 while remove_tag($tag, $html);
        }
        
        # if it's closed and we should leave content
        elsif ($tags{$tag}->{content})
        {
            $$html =~ s! < /? \Q$tag\E .*? > !!xgis;
        }
    }
    return $$html;
}

#feedit: tag name, html (ref)
#receive: success indicator
#effect: removes one <tag>...</tag> block from the html
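# Note: each call deletes at most one <tag>...</tag> block and returns 1,
# returning 0 once nothing more matches; compact() relies on this by looping
# "1 while remove_tag(...)" until every block is gone. The growing $pre
# lookbehind is an attempt to step past nested opening tags of the same kind
# so that an inner close isn't paired with the outer open.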
sub remove_tag {
    my ($tag, $html) = @_;
    my $pre = "";
    my ($open, $content, $close);
    
    while (1) {
        $$html =~ m! (?<= \Q$pre\E )  (< \Q$tag\E [^>]* >) (.*?) (</ \Q$tag\E \s*? >) !xi
            || return 0;
    
        ($open, $content, $close) = ($1, $2, $3);
        
        # keep going only if the content holds a nested tag of the same kind
        last if $content !~ /<\Q$tag\E/;
        
        $pre .= $open;
    }
    
    $$html =~ s/\Q$open$content$close\E//;
    
    return 1;
}


#feedit: options hash: location, depth, images?, offsite?, compact?, path, master (opt)
#receive: number of files
#effect: save files
sub goose {
    my %options = @_;
    $options{location} || die "must give a location to goose";
    $options{master} ||= ($options{location} =~ m!(https?://[^/]+)! && $1); # scheme + host of the starting page

    $links{$options{location}} ||= $filename++ . ".html";

    my $count = 1; # number of files saved
    my $page = get_page( $options{location} );
    return 0 if not $page; # if get_page() fails
    
    if ($options{compact}) {
        compact( \$page, style => { closed => 1 },
                        meta => { closed => 0 },
                        link   => { closed => 0} )
    }

    update_progress();
    
    # receive just the files that still need to be goosed
    my @files = extract_links( \$page,
                        $options{images},
                        $options{offsite},
                        $options{depth},
                        $options{master},
                        $options{location} );
    # get images if specified
    if ($options{images}) {
        my @images = extract_images( \$page, $options{master}, $options{location} );

        for my $image (@images) {
            $count += get_image($image);
        }
    }
        
    open FILE, ">$options{path}$links{$options{location}}"
        or die "couldn't open $links{$options{location}}: $!";
    print FILE $page;
    close FILE;

    for my $file (@files) {
        $count += goose( location => $file,
                         depth    => $options{depth} - 1,
                         images   => $options{images},
                         offsite  => $options{offsite},
                         master   => $options{master},
                         path     => $options{path},
                         compact  => $options{compact} );
    }
    
    return $count;
}

#feedit: the name of the page to be retrieved
#receive: the html
#effect: none
sub get_page {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;
    
    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if $proxy{id};    # only when proxy auth is configured
    my $result = $ua->request($request);
    
    return $result->content # if everything went right
        if $result->is_success;
    
    return get_page($location, 0) #try once more (default once)
        if $tryagain;
    return ""; # cop out
}

#feedit: the name of the image to be retrieved
#receive: the number of files saved
#effect: saves the files
sub get_image {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if $proxy{id};    # only when proxy auth is configured
    my $result = $ua->request($request);

    # if everything went right
    if ($result->is_success) {
        open FILE, ">" . $path . $links{$location}
            or die "couldn't open $links{$location}: $!";
        binmode(FILE);
        print FILE $result->content;
        close FILE;
        update_progress();
        return 1;
    }
    
    return get_image($location, 0) #try once more (default once)
        if $tryagain;
    return 0; # cop out
}

#feedit: the html (ref), images?, offsite?, depth, master, parent location
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
    my ($html, $images, $offsite, $depth, $master, $parent_location) = @_;
    my @pages; # ones that still need to be goosed
    
    $$html =~ s{ ( <a \s+ [^>]+ > ) }
               { new_link($1, "href", $depth, $offsite, $master, $parent_location, \@pages, "html") }xgei;
    
    return @pages;
}

#feedit: html with img tags, master, parent location
#receive: array of images to download
#effect: none
sub extract_images {
    my ($html, $master, $parent) = @_;
    my @images;

    $$html =~ s{ ( <img \s+ [^>]+ > ) }
               { new_link($1, "src", 1, 1, $master, $parent, \@images) }xgei;
    
    return @images
}

#feedit: link, type, depth, offsite, master, parent location, pages (ref), $ext (opt)
#receive: a new link (possibly)
#effect: adds an entry to @pages if necessary
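# Summary of the cascade below: mailto: and javascript: links are returned
# untouched; a link already recorded in %links is rewritten to its local
# filename; once $depth runs out the link is rewritten to its absolute url;
# offsite links are left alone unless offsite fetching was requested;
# anything else is queued in @$pages and assigned a fresh local filename.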
sub new_link {
    my ($tag, $type, $depth, $offsite, $master, $parent_location, $pages, $ext) = @_;
      
    my $link = get_attribute($tag, $type) || return $tag;
    # if ext is provided, get the ext of the file being saved
    $ext ||= ($link =~ m!/[^/]+ \. (\w+)  (?: \? [\w%&;\s#=]+ )?$!x && $1);

    # if it's a mailto: link
    return $tag
        if $link =~ /^mailto:/;
    # if it's javascript
    return $tag
        if $link =~ /^javascript:/;
    
    my $fullpath = fullpath( $link, $parent_location );
    my ($root) = $fullpath =~ m!^(https?://[^/]+)!;

    # if it's already been goosed or is queued up
    return set_attribute($tag, $type, $links{$fullpath})
        if defined $links{$fullpath};
    # if we're done goosing
    return set_attribute($tag, $type, $fullpath)
        if not $depth;
    # if it's an offsite link and we don't want it
    return $tag
        if lc($root) ne lc($master)
          && not $offsite;
    
    # else queue it up
    push @$pages, $fullpath;
    $links{$fullpath} = $filename++ . "." . $ext;
    
    return set_attribute($tag, $type, $links{$fullpath})
}

#feedit: location, location where found
#receive: full location
#effect: none
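# e.g. (illustrative) fullpath('../img/x.gif', 'http://host/a/b/c.html')
# returns 'http://host/a/img/x.gif'; a leading "/" in the first argument
# is resolved against the site root instead.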
sub fullpath {
    my ($relative, $found) = @_;
    
    # add a trailing slash if the location is just a host with no path
    $found .= "/"
        if $found !~ m!https?://.*/.*!i;
    
    return $relative
        if $relative =~ m!^https?://!i;
    $found =~ s![^/]+$!!; # remove filename at end
    return $1 . $relative    # /foo/bar
        if $relative =~ s/^\///
           && $found =~ m!^(https?://.+?/)!i;
    $found =~ s![^/]+/$!! # for relative urls (../whatever)
        while $relative =~ s!^\.\./!!;
    1 while $relative =~ s!^\./!!; # for urls relative to the current directory (./whatever)
    return $found . $relative;
}

#feedit: a tag, attribute
#receive: value of attribute
#effect: none
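# e.g. (illustrative) get_attribute('<a href="index.html">', 'href')
# returns 'index.html'; 0 is returned when the attribute isn't present.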
sub get_attribute {
    my ($tag, $attribute) = @_;
    
    # normal tag
    return $2
        if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
    
    # tag with no quotes
    return $1
        if $tag =~ /\Q$attribute\E=([^\s>]+)/;
    
    # else
    return 0;
}

#feedit: tag, attribute, value
#receive: tag
#effect: none
sub set_attribute {
    my ($tag, $attribute, $value) = @_;

    # normal tag
    return $tag
        if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$value$2/xi;
    
    # tag with no quotes
    return $tag
        if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i;
    
    # else
    die "\nERROR: couldn't set attribute '$attribute' to '$value' for 
+'$tag'\n";
}

update_progress();
my $count = goose( location => $href,
                   depth    => $depth,
                   offsite  => $offsite,
                   images   => $images,
                   path     => $path,
                   compact  => $compact ) || die "it didn't work!";
print "\rGoosing $href... Done",
    " " x (length($count) + 6),
    "\n$count files received\n";
Re: goose.pl
by hacker (Priest) on Jul 18, 2002 at 12:49 UTC
    Since nobody else has commented, I will add mine. WONDERFUL!

    I've got a few ideas for this, and it actually is very close to the spider I've been working on for Plucker in my spare time. Bear with me while I brain-dump these out:

    1. LWP::Parallel::UserAgent to fetch content asynchronously.
    2. HTTP::Cookies for storing client-side cookies into the "jar" (a rough sketch follows this list)
    3. Link rewrite rules, so gathering/spidering the content locally after fetching can be done with relative/absolute links, without breaking anything. (URI::URL can help here)
    4. Ability to forge Referer and UserAgent strings through the use of --referer and --useragent. Trivial to add.
    5. More verbose progress reporting (use LWP::Debug qw(+);)
    6. Options for staying on the same host, same domain, or staying below a certain fragment of the URI. Something like:
      # url http://www.domain.com/foo/bar/blort/quux --staybelow http://www.domain.com/foo/bar/ --stayonhost www.domain.com --stayondomain domain.com
    7. Ability to update the "cache" on multiple runs. Compare the remote file with the local file, and fetch if newer.
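
    A rough sketch of how 2 and 4 might hang off the existing $ua (the modules are real; the option plumbing, URLs, and jar filename are made up for illustration):

      use LWP::UserAgent;
      use HTTP::Cookies;
      use HTTP::Request;

      # values that would come from hypothetical --useragent/--referer options
      my $useragent = 'Mozilla/4.0 (compatible)';
      my $referer   = 'http://www.example.com/';

      my $ua = LWP::UserAgent->new;
      $ua->agent($useragent);
      $ua->cookie_jar( HTTP::Cookies->new(
          file     => 'goose_cookies.txt',   # cookie jar file (made-up name)
          autosave => 1,
      ) );

      my $request = HTTP::Request->new( GET => 'http://www.example.com/' );
      $request->header( Referer => $referer );    # forged Referer (suggestion 4)
      my $result  = $ua->request($request);       # cookies sent/stored via the jar (suggestion 2)

    And for 3, URI::URL already knows how to absolutize relative links, e.g. URI::URL->new('../img/x.gif', 'http://www.example.com/a/b/page.html')->abs gives http://www.example.com/a/img/x.gif, so much of fullpath() could lean on it.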

    Expect some patches from me to come flying in within the next few weeks on this one. Great work!