goose.pl

Category:	Web Stuff
Author/Contact Info	Matthew Diephouse elusion Contact: matt --at-- diephouse --dot-- com where "--at--" is @ and "--dot--" is .
Description:	This is a script I wrote to grab sites from the web to stick on my hard drive and/or handheld device. It follows links to a certain depth, can download images, follow offsite links, and remove some unwanted html. Code reviews/critiques are welcomed and requested. Updated: Jun. 28, 2002 Squashed 2 bugs UserAgent support Proxy support Progress report
#!/usr/bin/perl -w use strict; # goose.pl ################ # Written by Matthew Diephouse # Contact at "matt --at-- diephouse --dot-- com" # where "--at--" is @ and "--dot--" is . # # Copyright 2002. This may be modified and distributed # on the same terms as Perl. ################ my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko +/20020310 '; my %proxy = ( host => '', # http://host.dom:port id => '', # ntdom\userid pass => '', # empty quotes if no proxy auth ); use LWP::UserAgent; my $ua = new LWP::UserAgent; $ua->agent($browser); $ua->proxy(http => "$proxy{host}") if defined $proxy{host}; use Getopt::Long; my ($images, $compact, $path, $filename, $depth, $offsite, $help) = (1 +, 1, "./"); # initialize and provide some defaults GetOptions( 'images!' => \$images, 'offsite!' => \$offsite, 'help' => \$help, 'depth=i' => \$depth, 'path=s' => \$path, 'filename=s' => \$filename, 'compact!' => \$compact); if ($help) { print <<EOH; goose.pl -- a utility to grab site from the web --------------------------------- usage: goose.pl [options] location options: --images: --noimages: whether or not to download images defaults to yes --offsite: --nooffsite: whether or not to follow offsite links defaults to no --help prints this help message --depth the link depth to follow defaults to 0 --path where to save the files to --filename the name of the first file. future names are made using the increment operator (++) --compact --nocompact whether or not to remove certain html tags defaults to use EOH exit; } my $href = shift or die "Must provide a location to be goosed. Use opt +ion --help for usage information."; $path .= "/" if $path !~ /[\/\\]$/; my %links; # holds location and filename $\|++; # unbuffer output; allows progress to be shown mkdir $path if not -e $path; #------------ setup ends here { my $count = 0; sub update_progress { print "\rGoosing $href... got $count files"; $count++ } } #feedit: html (ref), hash with tags to cut and options (hash ref) #receive: compacted version #effect: compacts ref sub compact { my ($html, %tags) = @_; for my $tag (keys %tags) { # if it's not closed if (not $tags{$tag}->{closed}) { $$html =~ s! < \Q$tag\E .? > !!xgis; } # if it's closed and we should remove content elsif (not $tags{$tag}->{content}) { 1 while remove_tag($tag, $html); } # if it's not closed and we should leave content elsif ($tags{$tag}->{content}) { $$html =~ s! < /? \Q$tag\E .? > !!xgis; } } return $$html; } #feedit: tag name, html (ref) #receive: success indicator #effect: remove's tag from html sub remove_tag { my ($tag, $html) = @_; my $pre = ""; my ($open, $content, $close); while (1) { $$html =~ m! (?<= \Q$pre\E ) (< \Q$tag\E [^>]* >) (.?) (</ \ +Q$tag\E \s? >) !xi \|\| return 0; ($open, $content, $close) = ($1, $2, $3); # if it has a nested tag of the same kind last if $content !~ /<\Q$tag\E/; $pre .= $open; } $$html =~ s/\Q$open$content$close\E//; return 1; } #feedit: location, depth, images?, offsite?, path #receive: number of files #effect: save files sub goose { my %options = @_; $options{location} \|\| die "must give a location to goose"; $options{master} \|\|= ($options{location} =~ m!(http://[^/]+)! && $ +1); $links{$options{location}} \|\|= $filename++ . ".html"; my $count = 1; # number of files saved my $page = get_page( $options{location} ); return 0 if not $page; # if get_page() fails if ($options{compact}) { compact( \$page, style => { closed => 1 }, meta => { closed => 0 }, link => { closed => 0} ) } update_progress(); # receive just the files that still need to be goosed my @files = extract_links( \$page, $options{images}, $options{offsite}, $options{depth}, $options{master}, $options{location} ); # get images if specified if ($options{images}) { my @images = extract_images( \$page, $options{master}, $option +s{location} ); for my $image (@images) { $count += get_image($image); } } open FILE, ">$options{path}$links{$options{location}}" \|\| die "cou +ldn't open $links{$options{location}}"; print FILE $page; close FILE; for my $file (@files) { $count += goose( location => $file, depth => $options{depth} - 1, images => $options{images}, offsite => $options{offsite}, master => $options{master}, path => $options{path}, compact => $options{compact}); } return $count; } #feedit: the name of the page to be retrieved #receive: the html #effect: none sub get_page { my ($location, $tryagain) = @_; $tryagain = 1 if not defined $tryagain; my $request = HTTP::Request->new(GET => $location); $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} ) if defined $proxy{id}; my $result = $ua->request($request); return $result->content # if everything went right if $result->is_success; return get_page($location, 0) #try once more (default once) if $tryagain; return ""; # cop out } #feedit: the name of the image to be retrieved #receive: the number of files saved #effect: saves the files sub get_image { my ($location, $tryagain) = @_; $tryagain = 1 if not defined $tryagain; my $request = HTTP::Request->new(GET => $location); $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} ) if defined $proxy{id}; my $result = $ua->request($request); # if everything went right if ($result->is_success) { open FILE, ">" . $path . $links{$location} \|\| die "couldn't op +en $links{$location}"; binmode(FILE); print FILE $result->content; close FILE; update_progress(); return 1; } return get_image($location, 0) #try once more (default once) if $tryagain; return 0; # cop out } #feedit: the html (ref), images?, offsite?, depth #receive: the names of pages yet to be goosed #effect: change links of html sub extract_links { my ($html, $images, $offsite, $depth, $master, $parent_location) = + @_; my @pages; # ones that still need to be goosed $$html =~ s{ ( <a \s+ [^>]+ > ) } { new_link($1, "href", $depth, $offsite, $master, $parent +_location, \@pages, "html") }xgei; return @pages; } #feedit: html with img tags, master, parent location #receive: array of images do download #effect: none sub extract_images { my ($html, $master, $parent) = @_; my @images; $$html =~ s{ ( <img \s+ [^>]+ > ) } {new_link($1, "src", 1, 1, $master, $parent, \@images +)}xgei; return @images } #feedit: link, type, depth, offsite, master, parent location, pages (r +ef), $ext (opt) #receive: a new link (possibly) #effect: adds an entry to @pages if necessary sub new_link { my ($tag, $type, $depth, $offsite, $master, $parent_location, $pag +es, $ext) = @_; my $link = get_attribute($tag, $type) \|\| return $tag; # if ext is provided, get the ext of the file being saved $ext \|\|= ($link =~ m!/[^/]+ \. (\w+) (?: \? [\w%&;\s#=]+ )?$!x && + $1); # if it's a mailto: link return $tag if $link =~ /^mailto:/; # if it's javascript return $tag if $link =~ /^javascript:/; my $fullpath = fullpath( $link, $parent_location ); $fullpath =~ m!^(https?://[^/]+)!; my $root = $1; # if it's already been goosed or is queued up return set_attribute($tag, $type, $links{$fullpath}) if defined $links{$fullpath}; # if we're done goosing return set_attribute($tag, $type, $fullpath) if not $depth; # if it's an offsite link and we don't want it return $tag if lc($root) ne lc($master) && not $offsite; # else queue it up push @$pages, $fullpath; $links{$fullpath} = $filename++ . "." . $ext; return set_attribute($tag, $type, $links{$fullpath}) } #feedit: location, location where found #receive: full location #effect: none sub fullpath { my ($relative, $found) = @_; #if it's not a filename and doesn't have a / $found .= "/" if $found !~ m!https?://./.!i; return $relative if $relative =~ m!^https?://!i; $found =~ s![^/]+$!!; # remove filename at end return $1 . $relative # /foo/bar if $relative =~ s/^\/// && $found =~ m!^(https?://.+?/)!i; $found =~ s![^/]+/$!! # for relative urls (../whatever) while $relative =~ s!^\.\./!!; 1 while $relative =~ s!^\./!!; # for urls relative to the current +directory ( ./whatever) return $found . $relative; } #feedit: a tag, attribute #receive: value of attribute #effect: none sub get_attribute { my ($tag, $attribute) = @_; # normal tag return $2 if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi; # tag with no quotes return $1 if $tag =~ /\Q$attribute\E=([^\s>]+)/; # else return 0; } #feedit: tag, attribute, value #receive: tag #effect: none sub set_attribute { my ($tag, $attribute, $value) = @_; # normal tag return $tag if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$va +lue$2/xi; # tag with no quotes return $tag if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i; # else die "\nERROR: couldn't set attribute '$attribute' to '$value' for +'$tag'\n"; } update_progress(); my $count = goose( location => $href, depth => $depth, offsite => $offsite, images => $images, path => $path, compact => $compact) \|\| die " it didn't work!"; print "\rGoosing $href... Done", " " x (length($count) + 6), "\n$count files received\n";

Back to Code Catacombs