#!/usr/bin/env perl use strict; use warnings; use utf8; use Text::CSV; use Carp; use LWP::Simple qw(get); use Text::Unidecode qw(unidecode); use HTML::TreeBuilder::XPath; # Email Settings my %email = ( to => 'me@example.com,you@example.com', subject => 'New ECDS reviews found' ); # Reviews subroutine and URLs to check my $review_sites = [ { site => 'Yelp', sub => \&yelp_checker, review_pages => { 'EB' => 'http://www.yelp.com/biz/encinitas-country-day-school-encinitas?sort_by=date_desc', 'MA' => 'http://www.yelp.com/biz/encinitas-country-day-school-encinitas-2?sort_by=date_desc' } }, { site => 'GreatSchools', sub => \&gs_checker, review_pages => { 'MA' => 'http://www.greatschools.org/california/encinitas/9670-Encinitas-Country-Day-School/?tab=reviews' } }, { site => 'PrivateSchoolReview', sub => \&psr_checker, review_pages => { 'MA' => 'http://www.privateschoolreview.com/school_ov/school_id/2039' } }, { site => 'Kudzu', sub => \&kudzu_checker, review_pages => { 'MA' => 'http://www.kudzu.com/m/Encinitas-Country-Day-School-13571675' } }, { site => 'MerchantCircle', sub => \&mc_checker, review_pages => { 'MA' => 'http://www.merchantcircle.com/business/Encinitas.Country.Day.School.760-942-1111?sort=created&dir=desc' } } ]; # Default date if no record my $default_date = '00-00-0000'; # Month name to number conversion my %month = ( January => '01', February => '02', March => '03', April => '04', May => '05', June => '06', July => '07', August => '08', September => '09', October => '10', November => '11', December => '12' ); # Where is the reviews file? my $reviews_filepath = "reviews.txt"; # Where is the alert message file? my $msg_filepath = "msg.txt"; # Slurp hash from reviews file my $old_reviews = hash_from_csv($reviews_filepath); my %new_reviews; # Iterate through each site for my $review_site (@$review_sites) { my $pages = $review_site->{review_pages}; # iterate through each campus html and collect xpath nodes while ( my ( $campus, $url ) = each %$pages ) { my $html = get $url or croak("Can't reach $url $!\n"); $html =~ s/([^[:ascii:]]+)/unidecode($1)/ge; my $tree = HTML::TreeBuilder::XPath->new; $tree->parse($html) or croak("Parse failed: $!\n"); my ($date) = $review_site->{'sub'}->($tree); # create hash keys from campus and review site names my $campus_site = $campus . '_' . $$review_site{'site'}; push( @{ $new_reviews{$campus_site} }, $date ); push( @{ $new_reviews{$campus_site} }, $url ); } } # Write message if new reviews my $msg = ''; while ( my ( $item, $data ) = each %new_reviews ) { unless ( $$old_reviews{$item}[0] eq $$data[0] ) { $msg .= "New review on $$data[0]: \n $$data[1]\n"; } } # Save message. open my $fh, ">:encoding(utf8)", "$msg_filepath" or croak("cannot open $msg_filepath: $!"); print {$fh} $msg or croak("Can't print message:\n$msg\n$!"); close $fh; # Write new review data to file. hash_to_csv( \%new_reviews, $reviews_filepath ); # Email message if exists send_email($msg) if length($msg); ######## SUBROUTINES ####### # import old data from file sub hash_from_csv { my $filepath = shift; open my $fh, "<:encoding(utf8)", "$filepath" or croak("cannot open $filepath: $!"); my $csv = Text::CSV->new( { binary => 1 } ); my %hash; map { $hash{ shift @{$_} } = $_ } @{ $csv->getline_all($fh) }; close $fh; return \%hash; } # write new data to file sub hash_to_csv { my ( $hash, $filepath ) = @_; open my $fh, ">:encoding(utf8)", "$filepath" or croak("cannot open $filepath: $!"); my $csv = Text::CSV->new( { binary => 1, eol => "\n" } ); for ( keys %$hash ) { my $colref = [ $_, $$hash{$_}->[0] ]; $csv->print( $fh, $colref ); } close $fh; return; } # send email notifications sub send_email { my ($body) = @_; open my $pipe, '|-', '/usr/bin/mailx', '-s', $email{subject}, $email{to} or croak("can't open pipe to mailx: $!\n"); print $pipe $body; close $pipe; croak("mailx exited with a non-zero status: $?\n") if $?; return; } # extract date of most recent review from GreatSchools tree sub gs_checker { my $tree = shift; my $xpath = '//div[contains(@class,"media mbs")]/div[(@class="author small make-999999 fl pbn mbn")]'; my $dates = $tree->findnodes($xpath); # dates returned as 'month dd, yyyy' my $date; $date = $$dates[0]->as_trimmed_text() if ( $$dates[0] ); if ( $date =~ /(\w{3,9})\s+(\d{1,2}),\s+(\d{4})/ ) { $date = $3 . '-' . $month{$1} . '-' . $2; } return ( $date || $default_date ); } # extract date of most recent review from Yelp tree sub yelp_checker { my $tree = shift; my $xpath = '//meta[@itemprop="datePublished"][1]'; my $dates = $tree->findnodes($xpath); # dates returned as 'yyyy-mm-dd' if ( $$dates[0] ) { return $$dates[0]->attr('content'); } else { return ( $$dates[0] || $default_date ); } } # extract date of most recent review from PrivateSchoolReview tree sub psr_checker { my $tree = shift; my $xpath = '//meta[@itemprop="datePublished"][1]'; my $dates = $tree->findnodes($xpath); # dates returned as 'yyyy-mm-dd' if ( $$dates[0] ) { return $$dates[0]->attr('content'); } else { return ( $$dates[0] || $default_date ); } } # extract date of most recent review from Kudzu tree sub kudzu_checker { my $tree = shift; my $xpath = '//div[@class="review_post_date"]/p/span[@class="rp-date"]'; my $dates = $tree->findnodes($xpath); # date returned as 'mm/dd/yyyy' my $date; $date = $$dates[0]->as_trimmed_text() if ( $$dates[0] ); if ( $date =~ /(\d{1,2})\/(\d{1,2})\/(\d{4})/ ) { $date = $3 . '-' . $1 . '-' . $2; } return ( $date || $default_date ); } # extract date of most recent review from MerchantCircle tree sub mc_checker { my $tree = shift; my $xpath = '//span[@itemprop="datePublished"][1]'; my $dates = $tree->findnodes($xpath); # dates returned as 'Month dd, yyyy at hh:mm PM' my $date; $date = $$dates[0]->as_trimmed_text() if ( $$dates[0] ); if ( $date =~ /\s*(\w{3,9})\s*(\d{1,2})\s*\,\s*(\d{4})\s+at\s+\d{1,2}\:\d{2}\s+[AP]M/ ) { $date = $3 . '-' . $month{$1} . '-' . $2; } return ( $date || $default_date ); }