#!perl
use strict;
use warnings;
use Cache::FileCache;
# use Data::Dump::Streamer;
use DBM::Deep;
use HTML::FormatText;
use Mail::Sendmail 'sendmail';
use Params::Validate ':all';
use WWW::Mechanize;
use XML::RSS;
use vars ( '$DB', '$CACHE', '%CONFIG' );
main( sendmail => { smtp => '...',
From => '...',
To => '...' } );
exit;
sub main {
my %p = @_;
initialize( %p );
do_jobs_perl_org( category => 'MN perl',
url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
constraints => [ sub { /minnesota/i } ] );
do_jobs_perl_org( category => 'Tele perl',
url => 'http://jobs.perl.org/rss/telecommute.rss
+?limit=20' );
do_jobs_perl_org( category => 'Domino perl',
url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
constraints => [ sub { /\b(?:domino|lotus|notes)
+\b/i } ] );
# do_mn_job_bank( category => 'MN perl',
# constraints => [ sub { /\bperl\b/i } ],
# username => '...',
# password => '...',
# miles => 25,
# zip => ...,
# keyword => 'perl' );
#do_mn_job_bank( category => 'MN Domino',
# constraints => [ sub { /\b(?:domino|lotus\s+notes)\b/i
+ } ],
# username => '...',
# password => '...' );
return;
}
sub initialize {
my %p = validate( @_,
{ test => { type => SCALAR,
optional => 1,
default => 0 },
cache_namespace => { type => SCALAR,
optional => 1,
default => '1 week' },
cache_autopurge_interval => { type => SCALAR,
optional => 1,
default => '1 ho
+ur' },
db_filename => { type => SCALAR,
optional => 1,
default => "$ENV{HOME}/.jobwa
+tcher.db" },
wget_expiration => { type => SCALAR,
optional => 1,
default => '1 hour' },
sendmail => { type => HASHREF } } );
$DB = DBM::Deep->new( $p{db_filename} );
$CACHE = Cache::FileCache->new( { namespace => delete $p{cache_nam
+espace},
autopurge_interval => delete $p{
+cache_autopurge_interval} } );
@CONFIG{keys %p} = values %p;
return;
}
sub do_mn_job_bank {
my %p = validate( @_,
{ category => { type => SCALAR },
constraints => { type => ARRAYREF,
default => [ sub { 1 } ] },
username => { type => SCALAR },
password => { type => SCALAR },
zip => { type => SCALAR },
miles => { type => SCALAR },
keyword => { type => SCALAR },} );
my $www = WWW::Mechanize->new;
# Log in
$www->get( 'http://www.mnworks.org/jsli.cfm' );
$www->submit_form( fields => { li_user_name => $p{username},
li_password => $p{password} },
button => 'Submit' );
# Search
$www->get( 'http://www1.mnworks.org/jobseeker/js_srch_pg.cfm' );
$www->submit_form( fields => { i_geo_search_flag => 'zip_yes',
zip_miles => $p{miles},
zip_code => $p{zip},
Submit => 'More Search Options' } )
+;
print $www->content;
$www->submit_form( fields => { # i_geo_search_flags => 'zip_yes',
# zip_miles => $p{miles},
# zip_code => $p{zip},
i_job_description => $p{keyword} }
+);
}
sub do_jobs_perl_org {
my %p = validate( @_,
{ category => { type => SCALAR },
url => { type => SCALAR },
constraints => { type => ARRAYREF,
default => [ sub { 1 } ] } }
+);
my $rssurl = $p{url};
my $feed = rss_feed_from_url( url => $rssurl );
for my $job ( @{$feed->{items}} ) {
my $url = $job->{link};
my $title = $job->{title};
my $desc = html2text( html => get( url => $url ) );
# Remove spaces @ the start of lines
{
my %spaces;
++ $spaces{$_}
for
map length(),
$desc =~ /^( +)/gm;
my $spaces_to_remove
= ( sort { $spaces{$b} <=> $spaces{$a} }
keys %spaces )[0];
my $space_to_remove = " " x $spaces_to_remove;
$desc =~ s/^$space_to_remove//mg;
}
# Remove boilerplate & navigation at the top
$desc =~ s/
(?s:.+?)
(.+)[\r\n]+
[ \t]*=+[ \t]*[\r\n]+
/$1\n/x;
# Now collapse the description fields
$desc =~ s((.+)(?=^Description:)){
# Now I have a handle on just the key/value area ahead of
+the
# description
local $_ = $1;
# Collapse field lines together
s/^(.+:)[ \t]*[\r\n]+
(.+)/$1 $2/mgx;
# Remove empty lines
s/^[ \t]*[\r\n]+//gm;
# Remove trailing / leading whitespace
s/^\s+//;
s/\s+$//;
# Add in the \n I want.
"$_\n";
}sem;
# Finally remove the boilerplate from the botton.
$desc = reverse $desc;
$desc =~ s/(?s:.+?)
^ [ \t]* -+ [ \t]* [\r\n]+
^ [ \t]* [\r\n]+
//xsm;
# $desc =~ s/.+?^[ \t]*-{30,}[ \t]*[\r\n]+^\s+//s;
$desc = reverse $desc;
$desc =~ s/\s+$//;
$desc .= "\n";
if ( passes_constraints( constraints => $p{constraints},
title => $title,
desc => $desc ) ) {
notify( id => $url,
title => $title,
desc => "$p{category} $desc" );
}
}
return;
}
sub passes_constraints {
my %p = validate( @_,
{ constraints => { type => ARRAYREF },
title => { type => SCALAR },
desc => { type => SCALAR } } );
my @constraints = @{delete $p{constraints}};
if ( not @constraints ) {
@constraints = sub { 1 };
}
local $_ = "$p{title}\n$p{desc}";
# When
if ( /Job no longer in database/ ) {
return 0;
}
for my $constraint ( @constraints ) {
if ( $constraint->() ) {
return 1;
}
}
return 0;
}
sub notify {
my %p = validate( @_,
{ id => { type => SCALAR },
title => { type => SCALAR },
desc => { type => SCALAR } } );
return if $DB->{notified}{$p{id}};
if ( $CONFIG{test} ) {
print "NOTIFY $p{id}\n $p{title}\n";
return;
}
$DB->{notified}{$p{id}} = 1;
sendmail( %{$CONFIG{sendmail}},
Subject => $p{title},
Text => $p{desc} );
}
sub rss_feed_from_url {
my %p = validate( @_,
{ url => { type => SCALAR } } );
my $url = delete $p{url};
my $parser = XML::RSS->new;
$parser->parse( get( url => $url ) );
return $parser;
}
sub html2text {
my %p = validate( @_,
{ html => { type => SCALAR } } );
return HTML::FormatText->format_string( $p{html} );
}
sub get {
my %p = validate( @_,
{ url => { type => SCALAR } } );
my $url = delete $p{url};
my $content = $CACHE->get( $url );
if ( not defined $content ) {
my $www = WWW::Mechanize->new;
$www->get( $url );
$content = $www->content;
$CACHE->set( $url, $content, $CONFIG{wget_expiration} );
}
return $content;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.