This script checks select pages on some popular review sites for the latest review, and writes the date of the most recent review from each site to a file. Each time it is run, it checks against the previous results and sends an email notification with the date and link to page(s) with fresh reviews.
"mailx" is used to send email. I suspect that it may not be available on Windows; I tested only on Mac OS X and Ubuntu.
The following script has the pages hard-coded, as it was written for my school. Those pages (and your email addresses) could easily be replaced to suit your requirements.
I found "The 10-minute XPath Tutorial" (from "Automating System Administration with Perl", 2nd ed.) very helpful in understanding XPath. Thanks also to my fellow Perl monks for their help!
By the way, "EB" and "MA" are shorthand for two separate campuses within our school.
Update 2014-07-28 - I ran perlcritic and fixed some potential problems
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use Text::CSV;
use Carp;
use LWP::Simple qw(get);
use Text::Unidecode qw(unidecode);
use HTML::TreeBuilder::XPath;
# Email settings used by send_email(): the notification subject line and
# the recipient list (a single comma-separated string).
my %email = (
    'subject' => 'New ECDS reviews found',
    'to'      => 'me@example.com,you@example.com',
);
# Review sites to check. Each entry holds:
#   site         - display name, used to build the "<campus>_<site>" record key
#   sub          - reference to the checker that extracts the newest review
#                  date from that site's parsed HTML tree
#   review_pages - campus shorthand => URL of the page to fetch
# Each URL must be a single unbroken string: a line break inside the quotes
# becomes part of the URL and the request fails.
my $review_sites = [
    {
        site         => 'Yelp',
        sub          => \&yelp_checker,
        review_pages => {
            'EB' =>
'http://www.yelp.com/biz/encinitas-country-day-school-encinitas?sort_by=date_desc',
            'MA' =>
'http://www.yelp.com/biz/encinitas-country-day-school-encinitas-2?sort_by=date_desc',
        },
    },
    {
        site         => 'GreatSchools',
        sub          => \&gs_checker,
        review_pages => {
            'MA' =>
'http://www.greatschools.org/california/encinitas/9670-Encinitas-Country-Day-School/?tab=reviews',
        },
    },
    {
        site         => 'PrivateSchoolReview',
        sub          => \&psr_checker,
        review_pages => {
            'MA' =>
              'http://www.privateschoolreview.com/school_ov/school_id/2039',
        },
    },
    {
        site         => 'Kudzu',
        sub          => \&kudzu_checker,
        review_pages => {
            'MA' =>
              'http://www.kudzu.com/m/Encinitas-Country-Day-School-13571675',
        },
    },
    {
        site         => 'MerchantCircle',
        sub          => \&mc_checker,
        review_pages => {
            'MA' =>
'http://www.merchantcircle.com/business/Encinitas.Country.Day.School.760-942-1111?sort=created&dir=desc',
        },
    },
];
# Placeholder date used when a page yields no recognisable review date.
my $default_date = '00-00-0000';
# Lookup table: English month name => two-digit month number ('01'..'12').
my %month;
@month{
    qw(
      January February March     April   May      June
      July    August   September October November December
    )
} = map { sprintf '%02d', $_ } 1 .. 12;
# File holding the dates recorded by the previous run.
my $reviews_filepath = 'reviews.txt';
# File that receives the text of the alert message.
my $msg_filepath = 'msg.txt';
# Load the dates recorded by the previous run, keyed by "<campus>_<site>".
my $old_reviews = hash_from_csv($reviews_filepath);
# Results of this run: "<campus>_<site>" => [ date, url ].
my %new_reviews;
# Fetch every configured page and extract the date of its newest review.
for my $review_site (@$review_sites) {
    my $pages = $review_site->{review_pages};
    for my $campus ( sort keys %$pages ) {
        my $url  = $pages->{$campus};
        my $html = get($url) or croak("Can't reach $url $!\n");
        # Transliterate non-ASCII runs to ASCII before parsing.
        $html =~ s/([^[:ascii:]]+)/unidecode($1)/ge;
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($html) or croak("Parse failed: $!\n");
        # Signal end of document so the parser flushes pending content.
        $tree->eof;
        my ($date) = $review_site->{'sub'}->($tree);
        # The extracted date is a plain string, so the tree can be released.
        $tree->delete;
        $date = $default_date unless defined $date;
        # Record key is built from the campus and review site names.
        my $campus_site = $campus . '_' . $review_site->{site};
        $new_reviews{$campus_site} = [ $date, $url ];
    }
}
# Build the alert text: one entry per page whose date differs from the
# recorded one. A page with no previous record counts as new.
my $msg = '';
for my $item ( sort keys %new_reviews ) {
    my ( $date, $url ) = @{ $new_reviews{$item} };
    my $old_date =
      exists $old_reviews->{$item} ? $old_reviews->{$item}[0] : undef;
    next if defined $old_date && $old_date eq $date;
    $msg .= "New review on $date: \n $url\n";
}
# Save message (the file is rewritten even when the message is empty).
open my $fh, ">:encoding(utf8)", "$msg_filepath"
  or croak("cannot open $msg_filepath: $!");
print {$fh} $msg or croak("Can't print message:\n$msg\n$!");
# Buffered write errors only surface at close, so check it.
close $fh or croak("cannot close $msg_filepath: $!");
# Email first and record the new dates afterwards: if sending fails the
# script dies here, the old dates stay on disk, and the next run reports
# the same reviews again instead of silently dropping the notification.
send_email($msg) if length($msg);
# Write new review data to file.
hash_to_csv( \%new_reviews, $reviews_filepath );
######## SUBROUTINES #######
# Import the previous run's data from a CSV file.
#   $filepath - path of the CSV file; the first column of each row is the key
# Returns a hash reference mapping each key to an array reference holding
# the remaining columns of its row. Returns an empty hash reference when the
# file does not exist yet (first run). Croaks if an existing file cannot be
# opened.
sub hash_from_csv {
    my $filepath = shift;
    my %hash;
    # No file yet means there is simply nothing recorded so far.
    return \%hash unless -e $filepath;
    open my $fh, "<:encoding(utf8)", "$filepath"
      or croak("cannot open $filepath: $!");
    my $csv = Text::CSV->new( { binary => 1 } );
    for my $row ( @{ $csv->getline_all($fh) } ) {
        my $key = shift @{$row};
        # Skip blank rows rather than storing an undef/empty key.
        next unless defined $key && length $key;
        $hash{$key} = $row;
    }
    close $fh;
    return \%hash;
}
# Write the new review data to a CSV file, replacing its contents.
#   $hash     - hash reference: key => array reference whose first element
#               is the value to record
#   $filepath - path of the CSV file to write
# Writes one "key,value" row per key, in sorted key order so the file is
# stable between runs. Returns nothing. Croaks if the file cannot be
# opened, written or closed.
sub hash_to_csv {
    my ( $hash, $filepath ) = @_;
    open my $fh, ">:encoding(utf8)", "$filepath"
      or croak("cannot open $filepath: $!");
    my $csv = Text::CSV->new( { binary => 1, eol => "\n" } );
    for my $key ( sort keys %$hash ) {
        $csv->print( $fh, [ $key, $hash->{$key}[0] ] )
          or croak( "cannot write to $filepath: " . $csv->error_diag );
    }
    # Buffered write errors only surface at close, so check it.
    close $fh or croak("cannot close $filepath: $!");
    return;
}
# Send an email notification through /usr/bin/mailx.
#   $body - text of the message
# The subject and recipients come from %email. mailx is started with a
# list-form pipe open, so no shell is involved. Returns nothing. Croaks if
# the pipe cannot be opened or written, or if mailx exits non-zero.
sub send_email {
    my ($body) = @_;
    open my $pipe, '|-', '/usr/bin/mailx', '-s', $email{subject},
      $email{to}
      or croak("can't open pipe to mailx: $!\n");
    print {$pipe} $body or croak("can't write to mailx: $!\n");
    # close() on a pipe waits for the child; it returns false both for a
    # system error ($! set) and for a non-zero exit status ($? set).
    unless ( close $pipe ) {
        croak("can't close pipe to mailx: $!\n") if $!;
        croak("mailx exited with a non-zero status: $?\n");
    }
    return;
}
# Extract the date of the most recent review from a GreatSchools tree.
#   $tree - parsed page; must support findnodes() with an XPath expression
# The page shows dates as 'Month dd, yyyy'; they are returned as
# 'yyyy-mm-d' (the day is not zero-padded). Text that does not match that
# pattern, or whose month name is unknown, is returned unchanged. Returns
# $default_date when no date node is found or its text is empty.
sub gs_checker {
    my $tree  = shift;
    my $xpath = '//div[contains(@class,"media mbs")]'
      . '/div[(@class="author small make-999999 fl pbn mbn")]';
    my $dates = $tree->findnodes($xpath);
    return $default_date unless $$dates[0];
    my $date = $$dates[0]->as_trimmed_text();
    return $default_date unless defined $date;
    my ( $name, $day, $year ) =
      $date =~ /(\w{3,9})\s+(\d{1,2}),\s+(\d{4})/;
    # Reformat only when the month name is one we can translate.
    if ( defined $name && exists $month{$name} ) {
        $date = $year . '-' . $month{$name} . '-' . $day;
    }
    return ( $date || $default_date );
}
# Extract the date of the most recent review from a Yelp tree.
#   $tree - parsed page; must support findnodes() with an XPath expression
# Reads the 'content' attribute of the first datePublished <meta> node,
# which holds the date as 'yyyy-mm-dd'. Returns $default_date when no such
# node exists or the attribute is missing or empty.
sub yelp_checker {
    my $tree  = shift;
    my $dates = $tree->findnodes('//meta[@itemprop="datePublished"][1]');
    my $date;
    $date = $$dates[0]->attr('content') if $$dates[0];
    return ( $date || $default_date );
}
# Extract the date of the most recent review from a PrivateSchoolReview
# tree.
#   $tree - parsed page; must support findnodes() with an XPath expression
# Reads the 'content' attribute of the first datePublished <meta> node,
# which holds the date as 'yyyy-mm-dd'. Returns $default_date when no such
# node exists or the attribute is missing or empty.
sub psr_checker {
    my $tree  = shift;
    my $dates = $tree->findnodes('//meta[@itemprop="datePublished"][1]');
    my $date;
    $date = $$dates[0]->attr('content') if $$dates[0];
    return ( $date || $default_date );
}
# Extract the date of the most recent review from a Kudzu tree.
#   $tree - parsed page; must support findnodes() with an XPath expression
# The page shows dates as 'mm/dd/yyyy'; they are returned as 'yyyy-mm-dd'
# using the digits exactly as shown (no zero-padding is added). Text that
# does not match that pattern is returned unchanged. Returns $default_date
# when no date node is found or its text is empty.
sub kudzu_checker {
    my $tree = shift;
    my $xpath =
      '//div[@class="review_post_date"]/p/span[@class="rp-date"]';
    my $dates = $tree->findnodes($xpath);
    return $default_date unless $$dates[0];
    my $date = $$dates[0]->as_trimmed_text();
    return $default_date unless defined $date;
    if ( $date =~ /(\d{1,2})\/(\d{1,2})\/(\d{4})/ ) {
        $date = $3 . '-' . $1 . '-' . $2;
    }
    return ( $date || $default_date );
}
# Extract the date of the most recent review from a MerchantCircle tree.
#   $tree - parsed page; must support findnodes() with an XPath expression
# The page shows dates as 'Month dd, yyyy at hh:mm PM'; they are returned
# as 'yyyy-mm-d' (the day is not zero-padded). Text that does not match
# that pattern, or whose month name is unknown, is returned unchanged.
# Returns $default_date when no date node is found or its text is empty.
sub mc_checker {
    my $tree  = shift;
    my $xpath = '//span[@itemprop="datePublished"][1]';
    my $dates = $tree->findnodes($xpath);
    return $default_date unless $$dates[0];
    my $date = $$dates[0]->as_trimmed_text();
    return $default_date unless defined $date;
    # /x lets the pattern span lines; all whitespace is matched via \s.
    my ( $name, $day, $year ) = $date =~ m{
        \s* (\w{3,9}) \s* (\d{1,2}) \s* \, \s* (\d{4})
        \s+ at \s+ \d{1,2} \: \d{2} \s+ [AP]M
    }x;
    # Reformat only when the month name is one we can translate.
    if ( defined $name && exists $month{$name} ) {
        $date = $year . '-' . $month{$name} . '-' . $day;
    }
    return ( $date || $default_date );
}