http://www.perlmonks.org?node_id=177026
Category: Web Stuff
Author/Contact Info Matthew Diephouse elusion
Contact: matt --at-- diephouse --dot-- com
where "--at--" is @ and "--dot--" is .
Description: This is a script I wrote to grab sites from the web to stick on my hard drive and/or handheld device. It follows links to a certain depth, can download images, follow offsite links, and remove some unwanted html.

Code reviews/critiques are welcomed and requested.

Updated: Jun. 28, 2002

  • Squashed 2 bugs
  • UserAgent support
  • Proxy support
  • Progress report
#!/usr/bin/perl -w
use strict;

# goose.pl

################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002.  This may be modified and distributed
# on the same terms as Perl.
################

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko
+/20020310 ';
my %proxy = (
  host => '',    # http://host.dom:port
  id   => '',    # ntdom\userid
  pass => '',    # empty quotes if no proxy auth
);

use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent($browser);
$ua->proxy(http => "$proxy{host}")
    if defined $proxy{host}; 

use Getopt::Long;
my ($images, $compact, $path, $filename, $depth, $offsite, $help) = (1
+, 1, "./"); # initialize and provide some defaults
GetOptions(  'images!'    => \$images,
            'offsite!'    => \$offsite,
            'help'        => \$help,
            'depth=i'    => \$depth,
            'path=s'    => \$path,
            'filename=s' => \$filename,
            'compact!'    => \$compact);

if ($help) {
    print <<EOH;
goose.pl -- a utility to grab site from the web
---------------------------------
usage: goose.pl [options] location
options:
    --images:
    --noimages:
        whether or not to download images
        defaults to yes
    --offsite:
    --nooffsite:
        whether or not to follow offsite links
        defaults to no
    --help
        prints this help message
    --depth
        the link depth to follow
        defaults to 0
    --path
        where to save the files to
    --filename
        the name of the first file. future names are
        made using the increment operator (++)
    --compact
    --nocompact
        whether or not to remove certain html tags
        defaults to use
EOH
    exit;    
}

my $href = shift or die "Must provide a location to be goosed. Use opt
+ion --help for usage information.";

$path .= "/"
    if $path !~ /[\/\\]$/;

my %links;    # holds location and filename
$|++;         # unbuffer output; allows progress to be shown
mkdir $path
    if not -e $path;

#------------ setup ends here

{
    my $count = 0;
    sub update_progress {
        print "\rGoosing $href... got $count files";
        $count++
    }
}

#feedit: html (ref), hash with tags to cut and options (hash ref)
#receive: compacted version
#effect: compacts ref
sub compact {
    my ($html, %tags) = @_;
    
    for my $tag (keys %tags) {
        # if it's not closed
        if (not $tags{$tag}->{closed})
        {
            $$html =~ s! < \Q$tag\E .*? > !!xgis;
        }
        
        # if it's closed and we should remove content
        elsif (not $tags{$tag}->{content})
        {
            1 while remove_tag($tag, $html);
        }
        
        # if it's not closed and we should leave content
        elsif ($tags{$tag}->{content})
        {
            $$html =~ s! < /? \Q$tag\E .*? > !!xgis;
        }
    }
    return $$html;
}

#feedit: tag name, html (ref)
#receive: success indicator
#effect: remove's tag from html
sub remove_tag {
    my ($tag, $html) = @_;
    my $pre = "";
    my ($open, $content, $close);
    
    while (1) {
        $$html =~ m! (?<= \Q$pre\E )  (< \Q$tag\E [^>]* >) (.*?) (</ \
+Q$tag\E \s*? >) !xi
            || return 0;
    
        ($open, $content, $close) = ($1, $2, $3);
        
        # if it has a nested tag of the same kind
        last if $content !~ /<\Q$tag\E/;
        
        $pre .= $open;
    }
    
    $$html =~ s/\Q$open$content$close\E//;
    
    return 1;
}


#feedit: location, depth, images?, offsite?, path
#receive: number of files
#effect: save files
sub goose {
    my %options = @_;
    $options{location} || die "must give a location to goose";
    $options{master} ||= ($options{location} =~ m!(http://[^/]+)! && $
+1);

    $links{$options{location}} ||= $filename++ . ".html";

    my $count = 1; # number of files saved
    my $page = get_page( $options{location} );
    return 0 if not $page; # if get_page() fails
    
    if ($options{compact}) {
        compact( \$page, style => { closed => 1 },
                        meta => { closed => 0 },
                        link   => { closed => 0} )
    }

    update_progress();
    
    # receive just the files that still need to be goosed
    my @files = extract_links( \$page,
                        $options{images},
                        $options{offsite},
                        $options{depth},
                        $options{master},
                        $options{location} );
    # get images if specified
    if ($options{images}) {
        my @images = extract_images( \$page, $options{master}, $option
+s{location} );

        for my $image (@images) {
            $count += get_image($image);
        }
    }
        
    open FILE, ">$options{path}$links{$options{location}}" || die "cou
+ldn't open $links{$options{location}}";
    print FILE $page;
    close FILE;

    for my $file (@files) {
        $count +=
             goose( location => $file,
                depth    => $options{depth} - 1,
                images    => $options{images},
                offsite    => $options{offsite},
                master    => $options{master},
                path        => $options{path},
                compact    => $options{compact});
    }
    
    return $count;
}

#feedit: the name of the page to be retrieved
#receive: the html
#effect: none
sub get_page {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;
    
    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);
    
    return $result->content # if everything went right
        if $result->is_success;
    
    return get_page($location, 0) #try once more (default once)
        if $tryagain;
    return ""; # cop out
}

#feedit: the name of the image to be retrieved
#receive: the number of files saved
#effect: saves the files
sub get_image {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);

    # if everything went right
    if ($result->is_success) {
        open FILE, ">" . $path . $links{$location} || die "couldn't op
+en $links{$location}";
        binmode(FILE);
        print FILE $result->content;
        close FILE;
        update_progress();
        return 1;
    }
    
    return get_image($location, 0) #try once more (default once)
        if $tryagain;
    return 0; # cop out
}

#feedit: the html (ref), images?, offsite?, depth
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
    my ($html, $images, $offsite, $depth, $master, $parent_location) =
+ @_;
    my @pages; # ones that still need to be goosed
    
    $$html =~ s{ ( <a \s+ [^>]+ > ) }
             { new_link($1, "href", $depth, $offsite, $master, $parent
+_location, \@pages, "html") }xgei;
    
    return @pages;
}

#feedit: html with img tags, master, parent location
#receive: array of images do download
#effect: none
sub extract_images {
    my ($html, $master, $parent) = @_;
    my @images;

    $$html =~ s{ ( <img \s+ [^>]+ > ) }
                 {new_link($1, "src", 1, 1, $master, $parent, \@images
+)}xgei;
    
    return @images
}

#feedit: link, type, depth, offsite, master, parent location, pages (r
+ef), $ext (opt)
#receive: a new link (possibly)
#effect: adds an entry to @pages if necessary
sub new_link {
    my ($tag, $type, $depth, $offsite, $master, $parent_location, $pag
+es, $ext) = @_;
      
    my $link = get_attribute($tag, $type) || return $tag;
    # if ext is provided, get the ext of the file being saved
    $ext ||= ($link =~ m!/[^/]+ \. (\w+)  (?: \? [\w%&;\s#=]+ )?$!x &&
+ $1);

    # if it's a mailto: link
    return $tag
        if $link =~ /^mailto:/;
    # if it's javascript
    return $tag
        if $link =~ /^javascript:/;
    
    my $fullpath = fullpath( $link, $parent_location );
    $fullpath =~ m!^(https?://[^/]+)!;
    my $root = $1;

    # if it's already been goosed or is queued up
    return set_attribute($tag, $type, $links{$fullpath})
        if defined $links{$fullpath};
    # if we're done goosing
    return set_attribute($tag, $type, $fullpath)
        if not $depth;
    # if it's an offsite link and we don't want it
    return $tag
        if lc($root) ne lc($master)
          && not $offsite;
    
    # else queue it up
    push @$pages, $fullpath;
    $links{$fullpath} = $filename++ . "." . $ext;
    
    return set_attribute($tag, $type, $links{$fullpath})
}

#feedit: location, location where found
#receive: full location
#effect: none
sub fullpath {
    my ($relative, $found) = @_;
    
    #if it's not a filename and doesn't have a /
    $found .= "/"
        if $found !~ m!https?://.*/.*!i;
    
    return $relative
        if $relative =~ m!^https?://!i;
    $found =~ s![^/]+$!!; # remove filename at end
    return $1 . $relative    # /foo/bar
        if $relative =~ s/^\///
           && $found =~ m!^(https?://.+?/)!i;
    $found =~ s![^/]+/$!! # for relative urls (../whatever)
        while $relative =~ s!^\.\./!!;
    1 while $relative =~ s!^\./!!; # for urls relative to the current 
+directory ( ./whatever)
    return $found . $relative;
}

#feedit: a tag, attribute
#receive: value of attribute
#effect: none
sub get_attribute {
    my ($tag, $attribute) = @_;
    
    # normal tag
    return $2
        if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
    
    # tag with no quotes
    return $1
        if $tag =~ /\Q$attribute\E=([^\s>]+)/;
    
    # else
    return 0;
}

#feedit: tag, attribute, value
#receive: tag
#effect: none
sub set_attribute {
    my ($tag, $attribute, $value) = @_;

        # normal tag
    return $tag
        if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$va
+lue$2/xi;
    
    # tag with no quotes
    return $tag
        if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i;
    
    # else
    die "\nERROR: couldn't set attribute '$attribute' to '$value' for 
+'$tag'\n";
}

update_progress();
my $count = goose( location => $href,
                depth    => $depth,
                offsite   => $offsite,
                images  => $images,
                path         => $path,
                compact => $compact) || die " it didn't work!";
print "\rGoosing $href... Done",
    " " x (length($count) + 6),
    "\n$count files received\n";