#!/usr/bin/perl

use strict;
use vars qw($ref_image @err_images $CURRENT_FILE);

use LWP::UserAgent;
use HTTP::Request::Common qw(GET);
use HTML::LinkExtor;
use Storable;
use Term::ProgressBar;
use Getopt::Long qw(:config bundling_override);
use Pod::Usage;

use constant CHUNK => 4096;

$| = 1;
$SIG{INT} = \&store_remaining;

our $user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98)';
our $file_type  = '/\.jpe?g$/i';
our $filter     = '';
our $prepend    = 0;

my ($dump, $resume);
our $STORE_FILE = '.getimages.dat';

GetOptions(
    "u|ua|user_agent=s"    => \$user_agent,
    "t|type|file_type=s"   => \$file_type,
    "f|filter=s"           => \$filter,
    "d|dump"               => \$dump,
    "r|resume"             => \$resume,
    "p|prepend=i"          => \$prepend,
    "h|help|?"             => sub { pod2usage(-verbose => 1) },
    "m|man"                => sub { pod2usage(-verbose => 2) },
    "s|store|store_file=s" => \$STORE_FILE,
) || pod2usage(-verbose => 0);

# Read URLs from STDIN when none were given on the command line
# and STDIN is not a terminal.
unless (@ARGV or -t) {
    chomp(@ARGV = <>);
}

our @rem_files  = @ARGV;
our @rem_images = ();

my $use_store;
if ($resume) {
    my $ref = retrieve($STORE_FILE)
        || die "Cannot retrieve ", $STORE_FILE, $/;
    unshift @rem_files, @{ $ref->{files} };
    @rem_images = @{ $ref->{images} };
    @err_images = @{ $ref->{err_images} };
    $file_type  = $ref->{file_type};
    $filter     = $ref->{filter};
    $user_agent = $ref->{user_agent};
    $prepend    = $ref->{prepend};
    $use_store  = 1;
}

# Both -t and -f take Perl code that is eval'ed with $_ set to the URL.
my $sfile_type = sub {
    local $_ = shift;
    eval $file_type;
};
my $sfilter = sub {
    local $_ = shift;
    eval $filter;
    return $_;
};

my $ua = LWP::UserAgent->new;
$ua->agent($user_agent);

getimages( \@rem_images ) if @rem_images;

while (@rem_files) {
    $CURRENT_FILE = shift @rem_files;
    my $response = $ua->get($CURRENT_FILE);
    print STDERR "Error retrieving html file: ", $response->status_line, "\n"
        and next unless $response->is_success;
    my ($baseurl) = $CURRENT_FILE =~ /^(.+\/).*?$/;
    my $parser = HTML::LinkExtor->new(undef, $baseurl);
    $parser->parse( $response->content );
    @rem_images = map  { $sfilter->( $_->[2] ) }
                  grep { $_->[0] eq 'a' and $sfile_type->( $_->[2] ) }
                  $parser->links;
    print join("\n", @rem_images), "\n" and next if $dump;
    getimages( \@rem_images );
}

exit if $dump;

if (@err_images) {
    print STDERR "There were errors retrieving some images. Gonna retry now.\n";
    @rem_images = @err_images;
    @err_images = ();
    getimages( \@rem_images );
    if (@err_images) {
        print STDERR "Still some errors...\n";
        store_remaining();
    }
}

unlink $STORE_FILE if $use_store;
print STDERR "Done.\n";

# SIGINT handler: save the pending work so -r can pick it up later.
sub store_remaining {
    unshift @rem_files, $CURRENT_FILE unless @rem_images;
    store {
        files      => [@rem_files],
        images     => [map {"$_"} ($ref_image, @rem_images)],
        err_images => [map {"$_"} @err_images],
        file_type  => $file_type,
        filter     => $filter,
        user_agent => $user_agent,
        prepend    => $prepend,
    }, $STORE_FILE;
    print <<EOT;

Stopped. Run "$0 -r" to resume.
EOT
    exit;
}

sub getimage {
    my $url = shift;

    # Build the local filename, prepending up to $prepend path
    # components ('/' becomes '_').
    my @path  = split m|/|, $url;
    my $image = join '_',
        splice(@path, -( abs($prepend) > $#path ? $#path : abs($prepend) ) - 1);
    $image = uniname($image);
    my $inc_image = "__INCOMPLETE__$image";

    my ($i, $bytes, $size, $code, $meter) = (0, 0, 0, undef, undef);
    my $saveimage = sub {
        my ($data, $response, $protocol) = @_;
        unless ($i++) {    # first chunk: open the file, set up the meter
            open(FH, ">", $inc_image) || die "Cannot write $image: $!\n";
            binmode FH;    # image data is binary
            $size = $response->headers->content_length;
            $code = $response->code;
            my $name = sprintf "$image (%s)",
                $size ? sprintf("%.1fkb", $size / 1024) : 'unknown size';
            if ($size) {
                $meter = Term::ProgressBar->new({
                    name  => $name,
                    count => $size,
                    ETA   => 'linear',
                });
                $meter->minor(0);
            }
        }
        print FH $data;
        if ($size) {
            $bytes += length $data;
            $meter->update($bytes);
        }
    };

    my $request = GET($url);
    $request->referer($CURRENT_FILE);
    my $response = $ua->request($request, $saveimage, CHUNK);
    close(FH);

    # Keep the file only if the download completed: either the size
    # matches the Content-Length, or no Content-Length was sent.
    if ($code == 200 and ($size == -s $inc_image xor !$size)) {
        rename $inc_image, $image;
    }
    else {
        printf "\nERROR: %d %s", $code || $response->code, $image;
        push @err_images, $url;
    }
    print "\n";
}

sub getimages {
    my $ref = shift;
    for ($ref_image = shift @$ref; $ref_image; $ref_image = shift @$ref) {
        getimage($ref_image);
    }
}

# Return a filename that does not clash with an existing file.
sub uniname {
    my $fn = shift;
    return $fn unless -e $fn;
    my $ufn;
    for (my $i = 0; -e ($ufn = sprintf "%03d_$fn", $i); $i++) {}
    return $ufn;
}

__END__

=head1 NAME

getimages - gets images (and more) whose links are in an HTML page

=head1 SYNOPSIS

getimages [OPTIONS] [URLs...]

If no URL is specified getimages will try to read URLs from STDIN.

Options:

    (-t|-type) REGEXP    Download just the files whose names match
                         REGEXP (defaults to /\.jpe?g$/i)
    (-f|-filter) CODE    Modify URLs to download (in $_) according
                         to CODE
    (-d|-dump)           Prints the list of extracted URLs without
                         downloading them
    (-r|-resume)         Continue downloading from where you left
                         off last time
    (-ua|-user_agent)    Changes the user-agent
    (-p|-prepend)        Prepend a path to the filename downloaded
    (-s|-store)          Changes the status save file (defaults to
                         ".getimages.dat")
    (-h|-help|-?)        Display a help message on STDOUT
    (-m|-man)            Display the full man page

    CTRL+C               Stops the program (saves state in
                         ".getimages.dat", unless otherwise specified
                         with the "-s" option)

=head1 OPTIONS

=over 8

=item B<-t|-type> REGEXP

This option lets you specify a REGEXP (but that's really a code
block) that will be evaluated in a context where $_ is the filename.
The program will download just the filenames for which the code block
returns a true value.

=item B<-f|-filter> CODE

This option will apply CODE in a context where $_ is the filename.
You can modify it as you like; see the example after this list.

=item B<-p|-prepend> INTEGER

This is the number of directories in the URL that will be prepended
to the filename. It defaults to 0. A translation between '/' and '_'
will also take place. For example, if the file you are trying to
download is "http://www.domain.com/foo/bar/00.jpg" and you set the
prepend parameter to 1, the file will be saved as bar_00.jpg.

=back
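Since the values of B<-t> and B<-f> are handed to C<eval> with $_ set
to each extracted URL, any Perl expression will do, not just a single
match or substitution. A sketch (the URL and the "thumb"/"full"
naming are made up for illustration):

    getimages -t '/\.jpe?g$/i and not /banner/i' \
              -f 's/thumb/full/' http://foo.org/gallery.html

Here B<-t> keeps JPEG links whose names do not mention "banner", and
B<-f> rewrites each thumbnail link to its assumed full-size
counterpart before downloading.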
=head1 DESCRIPTION

getimages was born to let you easily download the big images that
stand behind thumbnails in most webpages. It then evolved to let you
include chunks of code from the command line and to download whatever
kind of file you like.

=head1 EXAMPLES

Download images linked at http://foo.org

    getimages http://foo.org/

Download MPEG audio files from a list of URLs you got in a file

    getimages -t '/\.mp\d$/i' < mympegURLlist.txt

Go deeper one level and we want every pic

    getimages -dump -t '/pics.+?\.html?$/' http://foo.org/ | \
    getimages -t '/\.(gif|png|jpe?g)$/'

Every image is displayed through a CGI page but we don't want to
download every page

    getimages -dump -t 'm!/pictures/.+\.shtml?$!' http://foo.org/ | \
    getimages -f 's!cgi-bin/view(\w+)?\.cgi\?!pictures/!'

=head1 COPYRIGHT

Copyright 2002 Giulio Motta.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut