Description: This is a script I wrote to grab sites from the web to stick on my hard drive and/or handheld device. It follows links to a certain depth, can download images, follow offsite links, and remove some unwanted html.

Code reviews/critiques are welcomed and requested.

Updated: Jun. 28, 2002

  • Squashed 2 bugs
  • UserAgent support
  • Proxy support
  • Progress report
#!/usr/bin/perl -w
use strict;


# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
# Copyright 2002.  This may be modified and distributed
# on the same terms as Perl.

# User-Agent header sent with every request.  Note that LWP appends its own
# "libwww-perl/x.y" token to an agent string that ends in a space.
my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko/20020310 ';

# Proxy settings; leave every value empty when no proxy is needed.
my %proxy = (
    host => '',    # http://host.dom:port
    id   => '',    # ntdom\userid
    pass => '',    # empty quotes if no proxy auth
);

use LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->agent($browser);
# An empty host means "no proxy".  A defined() test is not enough here,
# because the empty string is defined and would be installed as the proxy.
$ua->proxy(http => $proxy{host})
    if length $proxy{host};

use Getopt::Long;
# Option defaults; GetOptions overwrites the ones given on the command line.
my $images   = 1;       # download images
my $compact  = 1;       # strip unwanted tags
my $path     = "./";    # directory the files are saved to
my $filename = 0;       # name of the first file; later names come from ++
my $depth    = 0;       # link depth to follow
my $offsite  = 0;       # follow links to other hosts
my $help     = 0;       # print usage
GetOptions(
    'images!'    => \$images,
    'offsite!'   => \$offsite,
    'help'       => \$help,
    'depth=i'    => \$depth,
    'path=s'     => \$path,
    'filename=s' => \$filename,
    'compact!'   => \$compact,
) or die "Invalid options. Use option --help for usage information.\n";

# Print the usage text and stop.  The option names listed here mirror the
# specification handed to GetOptions.
if ($help) {
    print <<"EOH";
$0 -- a utility to grab site from the web
usage: $0 [options] location

    --[no]images
        whether or not to download images
        defaults to yes
    --[no]offsite
        whether or not to follow offsite links
        defaults to no
    --help
        prints this help message
    --depth=N
        the link depth to follow
        defaults to 0
    --path=DIR
        where to save the files to
    --filename=NAME
        the name of the first file. future names are
        made using the increment operator (++)
    --[no]compact
        whether or not to remove certain html tags
        defaults to use
EOH
    # without this the script would go on to complain about a missing location
    exit 0;
}

# The one positional argument is the address of the first page to fetch.
my $href = shift @ARGV
    or die "Must provide a location to be goosed. Use option --help for usage information.";

# make sure the save directory ends in a separator so names can be appended
$path .= "/"
    if $path !~ /[\/\\]$/;

my %links;    # holds location and filename
$| = 1;       # unbuffer output; allows progress to be shown

# Create the target directory when needed and stop early if that fails,
# rather than failing later on every single file write.
if (not -d $path) {
    mkdir $path
        or die "couldn't create directory $path: $!";
}

#------------ setup ends here

# The bare block keeps the counter private to update_progress.
{
    my $count = 0;    # number of files reported so far

    #feedit: nothing
    #receive: nothing useful
    #effect: counts one more file and rewrites the progress line in place
    sub update_progress {
        $count++;
        # "\r" returns to the start of the line so each report overwrites the last
        print "\rGoosing $href... got $count files";
        return;
    }
}

#feedit: html (ref), list of tag name => options (hash ref) pairs
#        options: closed  => true if the tag has a closing tag
#                 content => true if the text between the tags is to be kept
#receive: compacted version
#effect: compacts ref
sub compact {
    my ($html, %tags) = @_;
    for my $tag (keys %tags) {
        my $option = $tags{$tag};
        # \b keeps a tag name from matching a longer one (meta vs. metadata)
        if (not $option->{closed}) {
            # if it's not closed: drop the lone tag
            $$html =~ s! < \Q$tag\E \b [^>]* > !!xgi;
        }
        elsif (not $option->{content}) {
            # if it's closed and we should remove content
            1 while remove_tag($tag, $html);
        }
        else {
            # if it's closed and we should leave content: drop only the
            # opening and closing tags themselves
            $$html =~ s! < /? \Q$tag\E \b [^>]* > !!xgi;
        }
    }
    return $$html;
}

#feedit: tag name, html (ref)
#receive: success indicator (1 if an element was removed, 0 if none was found)
#effect: removes one element of that kind (tags and content) from html;
#        with nested elements the innermost goes first, so calling this
#        until it returns 0 removes them all
sub remove_tag {
    my ($tag, $html) = @_;
    # /s lets the content span several lines, /i matches any letter case
    my $removed = $$html =~ s{
        < \Q$tag\E \b [^>]* >
        (?: (?! < \Q$tag\E \b ) . )*?
        </ \Q$tag\E \s* >
    }{}xis;
    # The middle line of the pattern refuses to step over another opening
    # tag of the same kind, so an opening tag is always paired with its
    # own closing tag.
    return $removed ? 1 : 0;
}

#feedit: named options: location, depth, images?, offsite?, path, compact?,
#        master (opt; scheme and host of the site being goosed)
#receive: number of files saved (0 if the page could not be fetched)
#effect: saves the page, its images and the pages it links to
sub goose {
    my %options = @_;
    $options{location} || die "must give a location to goose";
    # The root that links are compared with to spot offsite ones.  https is
    # accepted as well, because new_link() compares against https?:// roots.
    $options{master} ||= ($options{location} =~ m!^(https?://[^/]+)!i && $1);

    $links{$options{location}} ||= $filename++ . ".html";

    my $count = 1; # number of files saved
    my $page = get_page( $options{location} );
    return 0 if not $page; # if get_page() fails
    if ($options{compact}) {
        compact( \$page, style => { closed => 1 },
                         meta  => { closed => 0 },
                         link  => { closed => 0 } );
    }

    # receive just the files that still need to be goosed;
    # the argument order follows extract_links()' parameter list
    my @files = extract_links( \$page,
                        $options{images},
                        $options{offsite},
                        $options{depth},
                        $options{master},
                        $options{location} );
    # get images if specified
    if ($options{images}) {
        my @images = extract_images( \$page, $options{master}, $options{location} );

        for my $image (@images) {
            $count += get_image($image);
        }
    }

    # "or" instead of "||": with "||" the die was attached to the file name
    # string and could never fire
    my $file = $options{path} . $links{$options{location}};
    open my $out, '>', $file
        or die "couldn't open $file: $!";
    print {$out} $page;
    close $out
        or die "couldn't write $file: $!";
    update_progress();

    for my $file (@files) {
        $count +=
             goose( location => $file,
                    depth    => ($options{depth} || 0) - 1,
                    images   => $options{images},
                    offsite  => $options{offsite},
                    master   => $options{master},
                    path     => $options{path},
                    compact  => $options{compact} );
    }
    return $count;
}

#feedit: the name of the page to be retrieved, try again? (opt, default yes)
#receive: the html, or "" if every attempt failed
#effect: none
sub get_page {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    # one attempt, plus a single retry unless the caller turned it off
    my $attempts = $tryagain ? 2 : 1;
    for my $attempt (1 .. $attempts) {
        my $request = HTTP::Request->new(GET => $location);
        # the default id is an empty string, which is defined, so test its
        # length to avoid sending credentials when none are configured
        $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
            if defined $proxy{id} and length $proxy{id};
        my $result = $ua->request($request);
        return $result->content # if everything went right
            if $result->is_success;
    }
    return ""; # cop out
}

#feedit: the name of the image to be retrieved, try again? (opt, default yes)
#receive: the number of files saved (1 or 0)
#effect: saves the image under the name recorded for it in %links
sub get_image {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    # one attempt, plus a single retry unless the caller turned it off
    my $attempts = $tryagain ? 2 : 1;
    for my $attempt (1 .. $attempts) {
        my $request = HTTP::Request->new(GET => $location);
        # the default id is an empty string, which is defined, so test its
        # length to avoid sending credentials when none are configured
        $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
            if defined $proxy{id} and length $proxy{id};
        my $result = $ua->request($request);
        next if not $result->is_success;

        # if everything went right
        my $file = $path . $links{$location};
        # "or" instead of "||": with "||" the die was attached to the file
        # name and could never fire
        open my $out, '>', $file
            or die "couldn't open $file: $!";
        binmode $out;    # image data is binary; no newline translation
        print {$out} $result->content;
        close $out
            or die "couldn't write $file: $!";
        update_progress();
        return 1;
    }
    return 0; # cop out
}

#feedit: the html (ref), images?, offsite?, depth, master, parent location
#        (images? is accepted but not used here)
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
    my ($html, $images, $offsite, $depth, $master, $parent_location) = @_;
    my @pages; # ones that still need to be goosed
    # every <a ...> tag is replaced by whatever new_link() returns for it;
    # new_link() also adds to @pages the pages that still have to be fetched
    $$html =~ s{ ( <a \s+ [^>]+ > ) }
               { new_link($1, "href", $depth, $offsite, $master,
                          $parent_location, \@pages, "html") }xgei;
    return @pages;
}

#feedit: html with img tags (ref), master, parent location
#receive: array of images to download
#effect: changes the src attributes of the img tags in html
sub extract_images {
    my ($html, $master, $parent) = @_;
    my @images;

    # depth and offsite are both passed as 1 here, so new_link() never
    # drops an image for being too deep or for living on another host
    $$html =~ s{ ( <img \s+ [^>]+ > ) }
               { new_link($1, "src", 1, 1, $master, $parent, \@images) }xgei;
    return @images;
}

#feedit: link (the whole tag), type (attribute name), depth, offsite, master,
#        parent location, pages (ref), $ext (opt)
#receive: a new link (possibly); the tag comes back unchanged when the link
#         is not one we follow
#effect: adds an entry to @pages (and to %links) if necessary
sub new_link {
    my ($tag, $type, $depth, $offsite, $master, $parent_location, $pages, $ext) = @_;
    my $link = get_attribute($tag, $type) or return $tag;
    # if ext is not provided, use the ext of the file being linked to
    $ext ||= ($link =~ m!/[^/]+ \. (\w+)  (?: \? [\w%&;\s#=]+ )?$!x && $1);

    # if it only points inside the current page
    return $tag
        if $link =~ /^#/;
    # if it's mailto:, javascript: or any other scheme we cannot fetch
    return $tag
        if $link =~ m{^(?!https?:)[a-z][a-z0-9+.\-]*:}i;

    my $fullpath = fullpath( $link, $parent_location );
    # take the root from this match directly; reading $1 afterwards would
    # pick up a stale capture whenever the match fails
    my ($root) = $fullpath =~ m!^(https?://[^/]+)!i;
    $root   = '' if not defined $root;
    $master = '' if not defined $master;

    # if it's already been goosed or is queued up
    return set_attribute($tag, $type, $links{$fullpath})
        if defined $links{$fullpath};
    # if we're done goosing
    return set_attribute($tag, $type, $fullpath)
        if not $depth;
    # if it's an offsite link and we don't want it
    return $tag
        if lc($root) ne lc($master) and not $offsite;
    # else queue it up
    push @$pages, $fullpath;
    $links{$fullpath} = $filename++ . "." . $ext;
    return set_attribute($tag, $type, $links{$fullpath});
}

#feedit: location, location where found
#receive: full location
#effect: none
sub fullpath {
    my ($relative, $found) = @_;

    # already a full location
    return $relative
        if $relative =~ m!^https?://!i;

    # a query string or fragment on the parent is not part of its path
    $found =~ s![?#].*$!!s;
    # if it's not a filename and doesn't have a /
    $found .= "/"
        if $found !~ m!^https?://[^/]*/!i;

    # root is "scheme://host/", scheme is "http:" or "https:"
    my ($root, $scheme) = $found =~ m!^((https?:)//[^/]*/)!i;

    # //host/path: same scheme as the parent, different host
    return $scheme . $relative
        if defined $scheme and $relative =~ m!^//!;
    # ?query or #fragment alone refers to the parent page itself
    return $found . $relative
        if $relative =~ m!^[?#]!;
    # /foo/bar
    if ($relative =~ s!^/!!) {
        return $root . $relative
            if defined $root;
    }

    (my $dir = $found) =~ s![^/]+$!!; # remove filename at end
    my $floor = defined $root ? length $root : 0;
    # resolve leading ./ and ../ in whatever order they appear
    while ($relative =~ s!^(\.\.?)/!!) {
        next if $1 eq '.';           # ./ stays in the current directory
        $dir =~ s![^/]+/$!!          # ../ goes up one directory...
            if length($dir) > $floor;    # ...but never above the host
    }
    return $dir . $relative;
}

#feedit: a tag, attribute
#receive: value of attribute, or 0 if the tag has no such attribute or
#         the attribute is empty
#effect: none
sub get_attribute {
    my ($tag, $attribute) = @_;
    # The look-behind keeps "src" from matching inside "data-src" and the like.
    # normal tag (quoted value); the value may be empty, which counts as
    # "no value" instead of swallowing the attributes that follow
    if ($tag =~ / (?<![\w-]) \Q$attribute\E \s* = \s* (['"]) (.*?) \1 /xis) {
        return length $2 ? $2 : 0;
    }
    # tag with no quotes
    return $1
        if $tag =~ / (?<![\w-]) \Q$attribute\E \s* = \s* ([^\s>'"][^\s>]*) /xi;
    # else
    return 0;
}

#feedit: tag, attribute, value
#receive: tag with the attribute's value replaced
#effect: none; dies if the tag has no such attribute
sub set_attribute {
    my ($tag, $attribute, $value) = @_;

    # attribute name up to and including "="; the look-behind keeps "src"
    # from matching inside "data-src" and the like
    my $name = qr/ (?<![\w-]) \Q$attribute\E \s* = \s* /xi;

    # normal tag: keep the quotes that are already there
    return $tag
        if $tag =~ s/ ($name (['"])) .*? (\2) /$1$value$3/xs;
    # tag with no quotes: quote the new value, which may hold characters
    # that are not safe in an unquoted attribute
    return $tag
        if $tag =~ s/ ($name) [^\s>'"][^\s>]* /$1"$value"/x;
    # else
    die "\nERROR: couldn't set attribute '$attribute' to '$value' for '$tag'";
}

# Fetch the starting page and everything reachable from it; a false
# result means not even the first page could be retrieved.
my %request = (
    location => $href,
    depth    => $depth,
    offsite  => $offsite,
    images   => $images,
    path     => $path,
    compact  => $compact,
);
my $count = goose(%request)
    or die " it didn't work!";

# The padding blanks out what is left of the longer progress line.
my $padding = " " x (length($count) + 6);
print "\rGoosing $href... Done" . $padding . "\n$count files received\n";