Swimsuits2006

by merlyn (Sage)
on Feb 14, 2006 at 15:40 UTC


in reply to Swimsuits2005
in thread Swimsuits2004

Mere minutes after the 2006 site went live, I reverse-engineered it for this. This doesn't get the "subscription exclusive" shots the way the prior versions did... they finally got smart and put those in a separate index. Nor does it grab the videos and a few of the other odd extras. "Until next time, enjoy!"
#!/usr/bin/perl
use strict;
$|++;

use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

# fetch the main 2006 index and walk every allstar/model gallery page
my $all_model_index = get "http://sportsillustrated.cnn.com/features/2006_swimsuit/";

while ($all_model_index =~ /(\/features\/2006_swimsuit\/(allstar|models)\/[-\w]+\.html)/g) {
  doit("$1");
}
doit("/features/2006_swimsuit/allstar/allstar_reunion.html");
doit("/features/2006_swimsuit/athletes/");
doit("/features/2006_swimsuit/painting/");

sub doit {
  my $model_index = get "http://sportsillustrated.cnn.com/" . shift;
  # each thumbnail URL ends in "t.jpg"; dropping the "t" gives the full-size image
  while ($model_index =~ /\'(http:\/\/i.a.cnn.net\/si\/features\/2006_swimsuit\/images\/gallery\/photos\/)([\w.\-]+)t.jpg\'/g) {
    my $url  = "$1$2.jpg";
    my $file = "RESULTS/$2.jpg";
    if (-e $file) {
      print "$url => $file: ";
      print "skip\n";
    } else {
      print "$url => $file: ";
      print mirror($url, $file), "\n";
    }
  }
}
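A note for anyone adapting this: LWP::Simple's mirror() does the polite part of the job. It sends an If-Modified-Since header based on the local file's timestamp and returns the HTTP status code, so the trace prints 200 for a fresh download and 304 for a file that is already current. A minimal sketch (the example.com URL is a placeholder):

    use LWP::Simple;

    # mirror() only rewrites the local file when the server copy is newer.
    my $rc = mirror("http://example.com/photo.jpg", "RESULTS/photo.jpg");
    print "downloaded\n"       if $rc == 200;  # fresh copy fetched
    print "already current\n"  if $rc == 304;  # local file left untouched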

-- Randal L. Schwartz, Perl hacker
Be sure to read my standard disclaimer if this is a reply.

Replies are listed 'Best First'.
Swimsuits2007
by merlyn (Sage) on Feb 15, 2007 at 14:01 UTC
    The annual tradition:
#!/usr/bin/env perl
use strict;
$|++;

use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "http://sportsillustrated.cnn.com/features/2007_swimsuit/models/";

while ($all_model_index =~ /(\/features\/2007_swimsuit\/(models|painting|onlocation)\/[-\w]+\/)/g) {
  doit("$1");
}
doit("/features/2007_swimsuit/beyonce/");
doit("/features/2007_swimsuit/3d/");

sub doit {
  my $base = shift;
  print "$base =>\n";
  # prefer the paginated index2.html; fall back to the directory index
  my $model_index = get "http://sportsillustrated.cnn.com/$base/index2.html";
  unless ($model_index) {
    $model_index = get "http://sportsillustrated.cnn.com/$base/";
  }
  while ($model_index =~ /\"(http:\/\/i.a.cnn.net\/si\/features\/2007_swimsuit\/images\/photos\/)([\w.\-]+)t.jpg\"/g) {
    my $url  = "$1$2.jpg";
    my $file = "RESULTS/$2.jpg";
    if (-e $file) {
      print "$url => $file: ";
      print "skip\n";
    } else {
      print "$url => $file: ";
      print mirror($url, $file), "\n";
    }
  }
}
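The one new wrinkle over the 2006 version is the fallback fetch: the 2007 galleries serve a paginated index2.html, so the script tries that first and falls back to the plain directory index when it is missing. The pattern generalizes to "return the first candidate page that actually fetches"; a sketch with a hypothetical first_page() helper and placeholder URLs:

    use LWP::Simple;

    # Hypothetical helper: return the body of the first URL that fetches.
    sub first_page {
        for my $url (@_) {
            my $html = get $url;
            return $html if defined $html;
        }
        return;  # none of the candidates answered
    }

    # prefer the paginated index, fall back to the directory listing
    my $index = first_page(
        "http://example.com/gallery/index2.html",
        "http://example.com/gallery/",
    );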
      The annual tradition:
#!/usr/bin/env perl
use strict;
$|++;

use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "http://sportsillustrated.cnn.com/features/2008_swimsuit/models/";

# coverage check: print any 2008 index2.html links the pattern below would miss
for ($all_model_index =~ m{"/features/2008[^"]+/index2\.html"}g) {
  next if /(\/features\/2008_swimsuit\/(models|athleteswives|painting|cheerleaders|onlocation)\/[-\w]+\/)/;
  print "$_\n";
}
# exit 0;

while ($all_model_index =~ /(\/features\/2008_swimsuit\/(models|athleteswives|painting|cheerleaders|onlocation)\/[-\w]+\/)/g) {
  doit("$1");
}
doit("/features/2008_swimsuit/selfportraits/");
doit("/features/2008_swimsuit/heidi-klum/");
doit("/features/2008_swimsuit/danica-patrick/");

sub doit {
  my $base = shift;
  print "$base =>\n";
  my $model_index = get "http://sportsillustrated.cnn.com/$base/index2.html";
  unless ($model_index) {
    $model_index = get "http://sportsillustrated.cnn.com/$base/";
  }
  while ($model_index =~ m{\"(http://i.a.cnn.net/si/pr/subs/swimsuit/images/)([\w.\-]+)_t.jpg\"}g) {
    my $url  = "$1$2.jpg";
    my $file = "RESULTS/$2.jpg";
    if (-e $file) {
      print "$url => $file: ";
      print "skip\n";
    } else {
      print "$url => $file: ";
      print mirror($url, $file), "\n";
    }
  }
}
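(A workflow note: the commented-out exit 0; lets you run only the coverage check above it. Uncomment it, run once to confirm the gallery pattern catches every index2.html link, then comment it back out for the real download.)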
        This script is a bit messy, but it seems to be grabbing all of the core images. No video downloads yet... I'm not that smart.
#!/usr/bin/env perl
use strict;
$|++;

use LWP::Simple;

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "http://sportsillustrated.cnn.com/2009_swimsuit/models/";

# coverage check: print any 2009 index2.html links the pattern below would miss
for ($all_model_index =~ m{"/2009[^"]+/index2\.html"}g) {
  next if /(\/2009_swimsuit\/(models|dancers|tennis|onlocation)\/[-\w]+\/)/;
  print "$_\n";
}
# exit 0;

while ($all_model_index =~ /(\/2009_swimsuit\/(models|dancers|tennis|onlocation)\/[-\w]+\/)/g) {
  doit("$1");
}
doit("/2009_swimsuit/painting/$_/")
  for qw(brooklyn-decker julie-henderson irina-shayk jessica-white);

my %done;

sub doit {
  my $base = shift;
  return if $done{$base}++;   # fetch each gallery at most once
  print "$base =>\n";
  my $model_index = get "http://sportsillustrated.cnn.com/$base/index2.html";
  unless ($model_index) {
    $model_index = get "http://sportsillustrated.cnn.com/$base/";
  }
  while ($model_index =~ m{\"(http://i.cdn.turner.com/si/pr/subs/swimsuit/images/)([\w.\-]+)_t.jpg\"}g) {
    my $url  = "$1$2.jpg";
    my $file = "RESULTS/$2.jpg";
    if (-e $file) {
      print "$url => $file: ";
      print "skip\n";
    } else {
      print "$url => $file: ";
      print mirror($url, $file), "\n";
    }
  }
}
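(New for 2009: the %done guard, presumably because the explicit painting list can overlap galleries the model index already matched; doit() now returns early on any base it has seen before.)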
Re: Swimsuits2006
by Jenda (Abbot) on Feb 14, 2006 at 17:31 UTC

    While we are at it:

#!perl
use strict;
use warnings;
no warnings 'uninitialized';
use LWP::UserAgent;
use HTTP::Cookies;

if (!@ARGV or $ARGV[0] =~ m{^[-/]h(elp)?$} or $ARGV[0] =~ m{^[-/]\?$}) {
    print <<'*END*';
getimagesx.pl [-r referer] url [prefix [-r referer] url prefix] ...
  downloads all files in the sequence = finds the last number in the URL
  and keeps incrementing the number and downloading until it fails
  ten times in a row. The file will be named according to the part of
  the URL following the last slash.

  If you specify just the prefix then the prefix will be prepended to
  the file names.

  If you specify the -r followed by a URL then that URL will be
  downloaded, cookies remembered and the URL will be sent to the server
  as the HTTP_REFERER with the image requests.
*END*
    exit;
}

my $ua = LWP::UserAgent->new(
    env_proxy  => 1,
    keep_alive => 1,
    timeout    => 60,
    agent      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; (R1 1.3); .NET CLR 1.0.3705)',
    cookie_jar => HTTP::Cookies->new(),
);

our $referer = '';
while ($ARGV[0] =~ m{^[-/]r}) {
    shift(@ARGV);
    $referer = shift(@ARGV);
    # a referer without a ## placeholder is fetched once, for the cookies
    if ($referer !~ /##/) {
        our $result = $ua->get($referer, ':content_file' => 'referer.html');
        if ($result->code() != 200) {
            die "Failed to load the referer $referer\n";
        }
    }
}

while (@ARGV) {
    my $url = shift(@ARGV);
    if ($ARGV[0] =~ m{^[-/]r}) {
        shift(@ARGV);
        $referer = shift(@ARGV);
    }
    my $prefix = shift(@ARGV);
    if ($ARGV[0] =~ m{^[-/]r}) {
        shift(@ARGV);
        $referer = shift(@ARGV);
    }
    if (! defined $prefix and $url =~ m{([^/]+)/\d+\.\w+$}) {
        $prefix = $1;
        print "Automatic prefix '$prefix'\n";
    }
    my $suffix = '';
    if ($prefix =~ m{^(.*)/(.*)$}) {
        ($prefix, $suffix) = ($1, $2);
    }

    if ($url =~ m{^(.*/)(.*?)(\d+)([^0-9]*)$}) {
        # split the URL around its last run of digits, then count upward
        my ($url_beg, $file, $num, $file_end) = ($1, $2, $3, $4);
        my ($result, $errors);
        #print "Referer: $referer\n\n";
        while ($errors < 10) {   # give up after ten consecutive failures
            local $referer = $referer;
            if ($referer =~ s/##/$num/g) {
                #print "Load referer: $referer\n";
                $result = $ua->get($referer, ':content_file' => 'referer.html');
                if ($result->code() != 200) {
                    die "Failed to load the referer $referer\n";
                }
            }
            if ($file =~ /[\&\?]/) {
                print $url_beg.$file.$num.$file_end, " => ", $prefix.$num.$file_end.$suffix, "\n";
                $result = getstore($url_beg.$file.$num.$file_end, $prefix.$num.$file_end.$suffix);
            } else {
                print $url_beg.$file.$num.$file_end, " => ", $prefix.$file.$num.$file_end.$suffix, "\n";
                $result = getstore($url_beg.$file.$num.$file_end, $prefix.$file.$num.$file_end.$suffix);
            }
            if ($result == 200) {
                $errors = 0;
            } else {
                $errors++;
                print "\t$result\n";
            }
            $num++;   # magic string increment keeps any leading zeros
        }
        print "LAST RESULT: $result\n";
    } else {
        my $file = $url;
        $file =~ s{^.*/}{};
        $file = $prefix . $file;
        if ($file) {
            getstore($url, $file);
        } else {
            print STDERR "Can't download directories! ($url)\n";
        }
    }
}

use Data::Dumper;

sub getstore {
    my ($url, $file) = @_;
    # bump the target name to file~1.jpg, file~2.jpg, ... rather than overwrite
    $file =~ s{(?:~(\d+))?(\.\w+)$}{'~' . ($1 + 1) . $2}e while (-e $file);
    my $result = $ua->get($url, ':content_file' => $file, referer => $referer);
    return $result->status_line() if $result->code() != 200;
    if (wantarray()) {
        return (200, $result->content_type);
    } else {
        if ($result->content_type =~ /^text/i) {
            print "Bloody bastards. They do not return proper HTTP status!\n";
            # unlink $file;
            return 404;
        }
        return 200;
    }
}
    Just in case you wanted all images in a series ;-)
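
A hypothetical invocation, with placeholder URLs, assuming a gallery that numbers its images sequentially:

    perl getimagesx.pl -r http://example.com/gallery/ http://example.com/img/pic001.jpg vacation_

This fetches the gallery page once (for cookies, and to send as the Referer header), then downloads pic001.jpg, pic002.jpg, ... as vacation_pic001.jpg, vacation_pic002.jpg, ... until ten fetches in a row fail; Perl's string increment keeps the leading zeros, and if a target name already exists the new download is saved under a ~1, ~2, ... variant instead of overwriting it.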
