#!/usr/bin/perl
# Read in a file with search terms and a list of .pdf files, and output the
# search term and the .pdf file the term was found in as .csv output.
# Can be expanded to potentially also list

use warnings;
use strict;
use Parallel::Loops;
use Data::Dumper;    # only needed for ad-hoc debugging

my $lookup_file = shift;
my $par         = shift // 0;    # 1 = run the PDF conversions in parallel
my @pdf_files   = @ARGV;
my %results;                     # search term => arrayref of matching PDF files

# Read in the lookup file and store query terms in a hash
# (just in case the values can be useful later).
open( my $lookup_fh, "<", $lookup_file )
    || die "Can't open the lookup file '$lookup_file': $!";
my %lookup = map { chomp; $_ => 1 } <$lookup_fh>;
close($lookup_fh);

if ( $par == 1 ) {
    par_proc( \%lookup, \@pdf_files, \%results );
}
else {
    std_proc( \%lookup, \@pdf_files, \%results );
}

# Print out the results as CSV: search_term,pdf_file
for my $search_term ( sort keys %results ) {
    print join( ",", $search_term, $_ ), "\n"
        for sort @{ $results{$search_term} };
}

sub std_proc {
    # Serial version: convert each PDF to text, then record every search term
    # that matches at least one line of that text.
    my $lookup  = shift;
    my $pdfs    = shift;
    my $results = shift;

    foreach my $pdf (@$pdfs) {
        # List-form pipe open avoids the shell, so filenames with spaces or
        # metacharacters are passed to pdftotext intact.
        open( my $pdf_fh, "-|", "pdftotext", $pdf, "-" )
            || die "Error converting file '$pdf': $!";
        my @data = <$pdf_fh>;
        close($pdf_fh);

        for my $search_term ( keys %$lookup ) {
            # Search terms are treated as regular expressions.
            push( @{ $results->{$search_term} }, $pdf )
                if grep { /$search_term/ } @data;
        }
    }
}

sub par_proc {
    # Parallel version: fan the PDFs out over child processes.
    my $lookup  = shift;
    my $pdfs    = shift;
    my $results = shift;

    # Set up parallel loop processing.
    my $maxProcs = 12;
    my $pl       = Parallel::Loops->new($maxProcs);

    # Parallel::Loops only propagates top-level assignments on a shared hash
    # back to the parent; pushing onto a nested array inside a child is lost.
    # Sharing a hash keyed by PDF also means each child writes its own key,
    # so concurrent children never clobber each other's results.
    my %terms_by_pdf;
    $pl->share( \%terms_by_pdf );

    $pl->foreach(
        $pdfs,
        sub {
            my $pdf = $_;
            open( my $pdf_fh, "-|", "pdftotext", $pdf, "-" )
                || die "Error converting file '$pdf': $!";
            my @data = <$pdf_fh>;
            close($pdf_fh);

            my @matched;
            for my $search_term ( keys %$lookup ) {
                push( @matched, $search_term )
                    if grep { /$search_term/ } @data;
            }
            $terms_by_pdf{$pdf} = \@matched if @matched;
        }
    );

    # Re-key the shared results as search_term => [ PDF files ] for printing.
    for my $pdf ( keys %terms_by_pdf ) {
        push( @{ $results->{$_} }, $pdf ) for @{ $terms_by_pdf{$pdf} };
    }
}
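
# Example usage (illustrative only; the script and data file names below are
# assumptions, not defined anywhere in this code). The pdftotext command-line
# tool must be available on $PATH, and terms.txt holds one search term
# (treated as a regex) per line:
#
#   ./pdf_term_search.pl terms.txt 1 docs/*.pdf > matches.csv   # parallel
#   ./pdf_term_search.pl terms.txt 0 docs/*.pdf > matches.csv   # serial
#
# Each output line has the form: search_term,pdf_file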