Description: |
I run a daily email list that provides
headlines of Sci-Fi related news. One of the things
I include when available is info on new Sci-Fi movies as
they show up in the IMDB data feeds.
This is the script
I cron to get the new movie data. It can be generalized
to get info
about new movies in any IMDB category.
Since the IMDB data is updated semi-haphazardly, I
cron the script extremely frequently, thus I made it
'smart' enough to knot do anything unless it needs to.
My crontab looks something like this...
# This is important for the new-movies script.
FTP_PASSIVE=YES
# New movies .. several times a night, run the program.
# If it's output file allready exists, it won't do anything
#
# (defaults to 'last friday')
0 0-2,22,23 * * * cd $HOME/new-movies-work; $HOME/bin/new-movies
+.pl --lazy
# New movies ...
# Again, not intensive cause it does nothing if the output file has
# allready been created by the OTHER call (above)
#
# "a week ago last friday"
33 0-2,22,23 * * * cd $HOME/new-movies-work; $HOME/bin/new-movies
+.pl --lazy --datemod "- 1week"
|
#!/usr/local/bin/perl -w
### Description
#
# Download the imdb.com diff for the specified day (defaults to today)
# and pick the "new movies" to write to a file.
use strict;
use Getopt::Long;
use Date::Manip;
use Net::FTP;
use LWP;
use Compress::Zlib; # Compress::Zlib not used directly, but Archive::T
+ar
use Archive::Tar; # doesn't require it properly
### Configuration
use vars qw($ftplogin $ftppass $ftphost $ftpdir
$year_threshhold $hard_year_threshhold
$fudge_factor $fraction_power);
# ftp stuff
$ftplogin = 'anonymous';
$ftppass = 'yourname@yourhost';
$ftphost = 'ftp.imdb.com';
$ftpdir = '/pub/interfaces/diffs/';
# the number of years old a move can be and still be considered semi n
+ewish
$year_threshhold = 1;
# automatically junk movies this old -- no matter how cool they are
$hard_year_threshhold = 3;
# the number of "votes" to consider "inconsequential"
# and subtract from all counts to help baseline at '0'
$fudge_factor = 0;
# This is an exponent that will be aplied to the movie_year / this_yea
+r
# fraction .. the bigger it is, the bigger a deal the date
# difference is to the score
$fraction_power = 25;
### Options
use vars qw($verbose $date $datemod
$opt_remote $opt_local $opt_http
$file $outfile $lazy);
# options that default to false
$opt_http = $verbose = 0;
# options that default to true
$lazy = $opt_local = 1;
# options that have values
$date = 'last friday';
$datemod = "0"; # date offset, passed to DateCalc
$outfile = undef; # set to '-' for STDIN
$opt_remote = undef;
GetOptions ('verbose!' => \$verbose, # verbose
'remote!' => \$opt_remote, # get the .tgz file
'local!' => \$opt_local, # untar, parse, gen scores
'lazy!' => \$lazy, # be lazy, do nothing if outfile ex
+ist
'http!' => \$opt_http, # use http instead of ftp
'date=s' => \$date, # effective date
'datemod=s' => \$datemod, # modification to the date (if need
+ed)
'outfile=s' => \$outfile, # output file
);
$date = DateCalc($date,$datemod);
$file = UnixDate($date, 'diffs-%y%m%d.tar.gz');
$outfile = UnixDate($date, 'output-%y%m%d.txt') unless defined $outfil
+e;
if ($verbose) {
warn "Verbose Mode\n";
warn "Effective Date: $date\n";
warn "Output File: $outfile\n";
}
### Should we even bother?
if ($outfile ne '-' and $lazy and -e $outfile and -s _) {
warn "Output File exists, aborting.\n" if $verbose;
exit();
}
### Set $opt_remote if we don't allready know it
if ( ! defined $opt_remote ) {
# if we don't have explicit instructions, then look for the .tgz f
+ile
# if we've allready got it, don't bother downloading.
warn "Detectinging [no]remote behavior...\n" if $verbose;
if (-e $file) {
warn " Tar file found, 'noremote' behavior\n" if $verbose;
$opt_remote = 0;
} else {
warn " Tar file NOT found, 'remote' behavior\n" if $verbose;
$opt_remote = 1;
}
}
### Get the Tar File
if ($opt_remote) {
warn "Downloading file: $file\n" if $verbose;
# Pick a method for getting hte file
if ($opt_http) {
# HTTP support isn't the greatest ...
# * won't work if passive mode is neccessary (firewall)
# * can't autodetect if file isn't available
my ($ua, $req, $resp, $url);
$url = "ftp://${ftphost}${ftpdir}${file}";
warn "HTTP Download - $url\n" if $verbose;
$ua = new LWP::UserAgent;
$req = new HTTP::Request('GET', $url);
$resp = $ua->request($req, "./$file");
if ($resp->is_error()) {
# this might
die "HTTP Failed: " . $resp->status_line() . "\n";
}
} else {
# the GOOD way to get the file
warn "FTP Download\n" if $verbose;
my ($ftp, $status, $mod_date);
$ftp = Net::FTP->new($ftphost, Debug => $verbose);
die "FTP Failed: $@\n" unless defined $ftp;
$ftp->login($ftplogin, $ftppass)
or die "FTP Failed (Login)\n";
$ftp->cwd($ftpdir)
or die "FTP Failed: (cd $ftpdir)\n";
$ftp->binary()
or die "FTP Failed: (binary)\n";
$mod_date = $ftp->mdtm($file);
if (!defined($mod_date)) {
# It's not an error for the file to not exist
warn "FTP '$file' not found\n" if $verbose;
$ftp->quit;
exit();
}
$status = $ftp->get($file);
die "FTP Failed (get $file)\n" unless defined $status;
$ftp->quit;
}
}
### "Do Stuff" with teh Tar file
if ($opt_local) {
warn "Uncompressing $file\n" if $verbose;
my ($status, $tar);
$tar = new Archive::Tar;
$status = $tar->extract_archive($file, 1);
die "UNTar Failed: " . &Archive::Tar::error() . "\n"
unless defined $status;
}
### Meat of the Burger ... figure out who's new
if ($opt_local) {
# Because of the format of the diff.tgz files, there is no easy wa
+y of
# knowing if a movie is completley new or not -- and even if we di
+d,
# it might not get marked "sci-fi" when it's brand new ... that co
+uld
# come later.
# So.... what we pay attention to, is all movies that have had "sc
+i-fi"
# added to their genre's this time, that are either within the pas
+t X
# years OR appear to be new based on the movies.list file (which c
+ould
# mean their title was just edited .. but since the title might ha
+ve
# been ANYTHING, anywhere in the diff file -- we'll just cope with
+ that)
my ($genre_file, $movies_file)
= ('genres.list', 'movies.list');
my $this_year = UnixDate($date,'%Y');
# take into account the threshholds
my $magic_year = $this_year - $year_threshhold;
my $hard_year = $this_year - $hard_year_threshhold;
my %movies; # a hash of hashes - key by movie, then file.
my %years; # year in the movie title - key by movie.
my %scores; # absctract score of movies relevancy - key by movie.
# iterators
my $title;
my $file;
my $line;
warn "Doing GENRE file\n" if $verbose;
open GENRE, "diffs/$genre_file"
or die "couldn't open file '$genre_file': $!\n";
while (<GENRE>) {
next unless m{
^\> \s+ # starts with "> "
(.*? # main part of title [$1 = title]
\((\d+)(/.*?)?\) # year inside parens, might be (1999/I)
# [$2 = year, $3 = crap]
(\s+\(.*?\))? # [$4 = crap .. movies might be tv/vids/games
+]
)\t+Sci\-Fi$ # must end in Sc-Fi
}x;
my ($y, $t) = ($2, $1);
# don't even bother if it's really old
if ($hard_year > $y) {
warn " Ignoring Really Old: '$t'\n" if $verbose;
next;
}
$movies{$t} = { $genre_file => 1 };
$years{$t} = $y;
}
close GENRE;
warn "Doing MOVIE file\n" if $verbose;
open MOVIES, "diffs/$movies_file"
or die "couldn't open movie file '$movies_file': $!";
while ($line = <MOVIES>) {
next unless 0 == index $line, '> ';
foreach $title (keys %movies) {
next unless -1 < index $line, $title;
# the movies list is really important, treat it like 2 files.
$movies{$title}{$movies_file} = 1;
$movies{$title}{$movies_file.'dup'} = 1;
}
}
close MOVIES;
# drop any movie that's "old" and not in the movies file.
warn "Dropping Oldies...\n" if $verbose;
foreach $title (keys %movies) {
if ($magic_year > $years{$title} and
1 == scalar keys %{$movies{$title}})
{
delete $years{$title};
delete $movies{$title};
warn " Ignoring '$title'\n" if $verbose;
}
}
warn "Looping over all the list files...\n" if $verbose;
foreach $file (<diffs/*>) {
next if $file =~ m/$genre_file|$movies_file/o;
warn " ... $file\n" if $verbose;
open FILE, $file or die "couldn't open '$file': $!\n";
while ($line = <FILE>) {
next unless 0 == index $line, '> ';
foreach $title (keys %movies) {
next unless -1 < index $line, $title;
$movies{$title}{$file} = 1;
}
}
close FILE;
}
warn "Generating Scores\n" if $verbose;
foreach $title (keys %movies) {
# the more files it's in, and the later the year, the higher the s
+core.
# negative scores should be dropped (even if they are "new enough"
+,
# they aren't "new enough with enough data"
# basic scoring: year + files - today
#$scores{$title} = $years{$title} +
# scalar(keys %{$movies{$title}}) - $this_year - $fudge_factor;
warn " $title\n" if $verbose;
$scores{$title} = (scalar(keys %{$movies{$title}})
- $fudge_factor + $years{$title});
warn " Base Score: $scores{$title}\n" if $verbose;
$scores{$title} *= (($years{$title} / $this_year) ** $fraction_pow
+er);
warn " Score w/ratio: $scores{$title}\n" if $verbose;
$scores{$title} -= $this_year;
warn " Normalized: $scores{$title}\n" if $verbose;
}
warn "Outputing Scores\n" if $verbose;
my $outhandle = \*STDOUT;
if ($outfile ne '-') {
open $outhandle, ">$outfile" or die "Can't open '$outfile': $!";
}
foreach $title (sort { $scores{$b} <=> $scores{$a} } keys %scores)
+ {
print $outhandle "$scores{$title}\t$title\n";
}
}
|