Beefy Boxes and Bandwidth Generously Provided by pair Networks
No such thing as a small change
 
PerlMonks  

Harvest New Sci-Fi Movie Titles

by hossman (Prior)
on Jul 11, 2002 at 06:02 UTC ( #180953=sourcecode: print w/ replies, xml ) Need Help??

Category: Fun Stuff
Author/Contact Info hossman
Description: I run a daily email list that provides headlines of Sci-Fi related news. One of the things I include when available is info on new Sci-Fi movies as they show up in the IMDB data feeds. This is the script I cron to get the new movie data. It can be generalized to get info about new movies in any IMDB category.

Since the IMDB data is updated semi-haphazardly, I cron the script extremely frequently, thus I made it 'smart' enough to knot do anything unless it needs to.

My crontab looks something like this...

# This is important for the new-movies script. FTP_PASSIVE=YES # New movies .. several times a night, run the program. # If it's output file allready exists, it won't do anything # # (defaults to 'last friday') 0 0-2,22,23 * * * cd $HOME/new-movies-work; $HOME/bin/new-movies +.pl --lazy # New movies ... # Again, not intensive cause it does nothing if the output file has # allready been created by the OTHER call (above) # # "a week ago last friday" 33 0-2,22,23 * * * cd $HOME/new-movies-work; $HOME/bin/new-movies +.pl --lazy --datemod "- 1week"
#!/usr/local/bin/perl -w

### Description
#
# Download the imdb.com diff for the specified day (defaults to today)
# and pick the "new movies" to write to a file.

use strict;
use Getopt::Long;
use Date::Manip;
use Net::FTP;
use LWP;
use Compress::Zlib; # Compress::Zlib not used directly, but Archive::T
+ar
use Archive::Tar;   # doesn't require it properly

### Configuration

use vars qw($ftplogin $ftppass $ftphost $ftpdir
        $year_threshhold $hard_year_threshhold
        $fudge_factor $fraction_power);

# ftp stuff
$ftplogin   = 'anonymous';
$ftppass    = 'yourname@yourhost';
$ftphost    = 'ftp.imdb.com';
$ftpdir     = '/pub/interfaces/diffs/';

# the number of years old a move can be and still be considered semi n
+ewish
$year_threshhold  = 1;
# automatically junk movies this old -- no matter how cool they are
$hard_year_threshhold = 3;

# the number of "votes" to consider "inconsequential"
# and subtract from all counts to help baseline at '0'
$fudge_factor = 0;

# This is an exponent that will be aplied to the movie_year / this_yea
+r
# fraction .. the bigger it is, the bigger a deal the date
# difference is to the score
$fraction_power = 25;

### Options

use vars qw($verbose $date $datemod
        $opt_remote $opt_local $opt_http
        $file $outfile $lazy);

# options that default to false
$opt_http = $verbose = 0;
# options that default to true
$lazy = $opt_local = 1;
# options that have values
$date = 'last friday';
$datemod = "0"; # date offset, passed to DateCalc
$outfile = undef; # set to '-' for STDIN
$opt_remote = undef;

GetOptions ('verbose!' => \$verbose,   # verbose
        'remote!' => \$opt_remote, # get the .tgz file
        'local!' => \$opt_local,   # untar, parse, gen scores
        'lazy!' => \$lazy,         # be lazy, do nothing if outfile ex
+ist
        'http!' => \$opt_http,     # use http instead of ftp
        'date=s' => \$date,        # effective date
        'datemod=s' => \$datemod,  # modification to the date (if need
+ed)
        'outfile=s' => \$outfile,  # output file
        );

$date = DateCalc($date,$datemod);
$file = UnixDate($date, 'diffs-%y%m%d.tar.gz');
$outfile = UnixDate($date, 'output-%y%m%d.txt') unless defined $outfil
+e;

if ($verbose) {
    warn "Verbose Mode\n";
    warn "Effective Date: $date\n";
    warn "Output File: $outfile\n";
}

### Should we even bother?

if ($outfile ne '-' and $lazy and -e $outfile and -s _) {
    warn "Output File exists, aborting.\n" if $verbose;
    exit();
}

### Set $opt_remote if we don't allready know it

if ( ! defined $opt_remote ) {
    # if we don't have explicit instructions, then look for the .tgz f
+ile
    # if we've allready got it, don't bother downloading.
    warn "Detectinging [no]remote behavior...\n" if $verbose;
    if (-e $file) {
    warn "  Tar file found, 'noremote' behavior\n" if $verbose;
    $opt_remote = 0;
    } else {
    warn "  Tar file NOT found, 'remote' behavior\n" if $verbose;
    $opt_remote = 1;
    }
}

### Get the Tar File

if ($opt_remote) {
    warn "Downloading file: $file\n" if $verbose;

    # Pick a method for getting hte file

    if ($opt_http) {

    # HTTP support isn't the greatest ...
    #  * won't work if passive mode is neccessary (firewall)
    #  * can't autodetect if file isn't available
    
    my ($ua, $req, $resp, $url);
    $url = "ftp://${ftphost}${ftpdir}${file}";
    
    warn "HTTP Download - $url\n" if $verbose;
    
    $ua = new LWP::UserAgent;
    $req = new HTTP::Request('GET', $url);
    $resp = $ua->request($req, "./$file");

    if ($resp->is_error()) {
        # this might 
        die "HTTP Failed: " . $resp->status_line() . "\n";
    }
    
    } else {

    # the GOOD way to get the file
    
    warn "FTP Download\n" if $verbose;
    
    my ($ftp, $status, $mod_date);
    
    $ftp = Net::FTP->new($ftphost, Debug => $verbose);
    die "FTP Failed: $@\n" unless defined $ftp;
    
    $ftp->login($ftplogin, $ftppass)
        or die "FTP Failed (Login)\n";
    
    $ftp->cwd($ftpdir)
        or die "FTP Failed: (cd $ftpdir)\n";
    
    $ftp->binary()
        or die "FTP Failed: (binary)\n";

    $mod_date = $ftp->mdtm($file);
    if (!defined($mod_date)) {
        # It's not an error for the file to not exist

        warn "FTP '$file' not found\n" if $verbose;
        $ftp->quit;
        exit();
    }
    
    $status = $ftp->get($file);
    die "FTP Failed (get $file)\n" unless defined $status;
    
    $ftp->quit;
    
    }

}

### "Do Stuff" with teh Tar file

if ($opt_local) {
    warn "Uncompressing $file\n" if $verbose;

    my ($status, $tar);

    $tar = new Archive::Tar;
    $status = $tar->extract_archive($file, 1);
    die "UNTar Failed: " . &Archive::Tar::error() . "\n"
    unless defined $status;

}

### Meat of the Burger ... figure out who's new

if ($opt_local) {
    # Because of the format of the diff.tgz files, there is no easy wa
+y of
    # knowing if a movie is completley new or not -- and even if we di
+d,
    # it might not get marked "sci-fi" when it's brand new ... that co
+uld
    # come later.
    # So.... what we pay attention to, is all movies that have had "sc
+i-fi"
    # added to their genre's this time, that are either within the pas
+t X
    # years OR appear to be new based on the movies.list file (which c
+ould
    # mean their title was just edited .. but since the title might ha
+ve
    # been ANYTHING, anywhere in the diff file -- we'll just cope with
+ that)

    my ($genre_file, $movies_file)
    = ('genres.list', 'movies.list');
    
    my $this_year = UnixDate($date,'%Y');

    # take into account the threshholds
    my $magic_year = $this_year - $year_threshhold;
    my $hard_year  = $this_year - $hard_year_threshhold;
    
    my %movies; # a hash of hashes - key by movie, then file.
    my %years;  # year in the movie title - key by movie.
    my %scores; # absctract score of movies relevancy - key by movie.

    # iterators
    my $title;
    my $file;
    my $line;

    
    warn "Doing GENRE file\n" if $verbose;
    open GENRE, "diffs/$genre_file"
    or die "couldn't open file '$genre_file': $!\n";
    
    while (<GENRE>) {
    next unless m{
        ^\> \s+              # starts with "> "
        (.*?             # main part of title [$1 = title]
        \((\d+)(/.*?)?\) # year inside parens, might be (1999/I)
                         # [$2 = year, $3 = crap]
        (\s+\(.*?\))?    # [$4 = crap .. movies might be tv/vids/games
+]
        )\t+Sci\-Fi$     # must end in Sc-Fi
        }x;
    
    my ($y, $t) = ($2, $1);
    
    # don't even bother if it's really old
    if ($hard_year > $y) {
        warn "   Ignoring Really Old: '$t'\n" if $verbose;
        next;
    }
    
    $movies{$t} = { $genre_file => 1 };
    $years{$t} = $y;
    }

    close GENRE;

    warn "Doing MOVIE file\n" if $verbose;
    open MOVIES, "diffs/$movies_file"
    or die "couldn't open movie file '$movies_file': $!";

    while ($line = <MOVIES>) {
    next unless 0 == index $line, '> ';

    foreach $title (keys %movies) {
        next unless -1 < index $line, $title;

        # the movies list is really important, treat it like 2 files.
        $movies{$title}{$movies_file} = 1;
        $movies{$title}{$movies_file.'dup'} = 1; 
    }
    
    }

    close MOVIES;

    # drop any movie that's "old" and not in the movies file.
    warn "Dropping Oldies...\n" if $verbose;
    foreach $title (keys %movies) {
    if ($magic_year > $years{$title} and
        1 == scalar keys %{$movies{$title}})
    {
        delete $years{$title};
        delete $movies{$title};
        warn "   Ignoring '$title'\n" if $verbose;
    }
    }

    warn "Looping over all the list files...\n" if $verbose;
    foreach $file (<diffs/*>) {
    next if $file =~ m/$genre_file|$movies_file/o;

    warn " ... $file\n" if $verbose;
    open FILE, $file or die "couldn't open '$file': $!\n";
    
    while ($line = <FILE>) {
        
        next unless 0 == index $line, '> ';

        foreach $title (keys %movies) {
        next unless -1 < index $line, $title;
        
        $movies{$title}{$file} = 1;
        }
    }
    close FILE;
    }

    warn "Generating Scores\n" if $verbose;
    foreach $title (keys %movies) {
    # the more files it's in, and the later the year, the higher the s
+core.
    # negative scores should be dropped (even if they are "new enough"
+,
    # they aren't "new enough with enough data"

    # basic scoring:  year + files - today
    #$scores{$title} = $years{$title} +
    #    scalar(keys %{$movies{$title}}) - $this_year - $fudge_factor;
    
    warn "  $title\n" if $verbose;

    $scores{$title} = (scalar(keys %{$movies{$title}})
               - $fudge_factor + $years{$title});
    warn "     Base Score:    $scores{$title}\n" if $verbose;
    
    $scores{$title} *= (($years{$title} / $this_year) ** $fraction_pow
+er);
    warn "     Score w/ratio: $scores{$title}\n" if $verbose;

    $scores{$title} -= $this_year;
    warn "     Normalized:    $scores{$title}\n" if $verbose;
    
    }

    warn "Outputing Scores\n" if $verbose;
    my $outhandle = \*STDOUT;
    if ($outfile ne '-') {
    open $outhandle, ">$outfile" or die "Can't open '$outfile': $!";
    }
    
    foreach $title (sort { $scores{$b} <=> $scores{$a} } keys %scores)
+ {
    print $outhandle  "$scores{$title}\t$title\n";
    }
}


Comment on Harvest New Sci-Fi Movie Titles
Download Code

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://180953]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others cooling their heels in the Monastery: (10)
As of 2014-07-29 18:35 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    My favorite superfluous repetitious redundant duplicative phrase is:









    Results (226 votes), past polls