#!/usr/bin/perl
# Read in a file with search terms and a list of .pdf files, and output the
# search term and the .pdf file the term was found in as .csv output.
# Can be expanded to potentially also list

use warnings;
use strict;
use Parallel::Loops;
use Data::Dumper;    # only needed for ad-hoc debugging

my $lookup_file = shift;
my $par         = shift // 0;    # 1 = run the PDF conversions in parallel
my @pdf_files   = @ARGV;
my %results;                     # search term => arrayref of matching PDF files

# Read in the lookup file and store query terms in a hash
# (just in case the values can be useful later).
open( my $lookup_fh, "<", $lookup_file )
    || die "Can't open the lookup file '$lookup_file': $!";
my %lookup = map { chomp; $_ => 1 } <$lookup_fh>;
close($lookup_fh);

if ( $par == 1 ) {
    par_proc( \%lookup, \@pdf_files, \%results );
}
else {
    std_proc( \%lookup, \@pdf_files, \%results );
}

# Print out the results as CSV: search_term,pdf_file
for my $search_term ( sort keys %results ) {
    print join( ",", $search_term, $_ ), "\n"
        for sort @{ $results{$search_term} };
}

sub std_proc {
    # Serial version: convert each PDF to text, then record every search term
    # that matches at least one line of that text.
    my $lookup  = shift;
    my $pdfs    = shift;
    my $results = shift;

    foreach my $pdf (@$pdfs) {
        # List-form pipe open avoids the shell, so filenames with spaces or
        # metacharacters are passed to pdftotext intact.
        open( my $pdf_fh, "-|", "pdftotext", $pdf, "-" )
            || die "Error converting file '$pdf': $!";
        my @data = <$pdf_fh>;
        close($pdf_fh);

        for my $search_term ( keys %$lookup ) {
            # Search terms are treated as regular expressions.
            push( @{ $results->{$search_term} }, $pdf )
                if grep { /$search_term/ } @data;
        }
    }
}

sub par_proc {
    # Parallel version: fan the PDFs out over child processes.
    my $lookup  = shift;
    my $pdfs    = shift;
    my $results = shift;

    # Set up parallel loop processing.
    my $maxProcs = 12;
    my $pl       = Parallel::Loops->new($maxProcs);

    # Parallel::Loops only propagates top-level assignments on a shared hash
    # back to the parent; pushing onto a nested array inside a child is lost.
    # Sharing a hash keyed by PDF also means each child writes its own key,
    # so concurrent children never clobber each other's results.
    my %terms_by_pdf;
    $pl->share( \%terms_by_pdf );

    $pl->foreach(
        $pdfs,
        sub {
            my $pdf = $_;
            open( my $pdf_fh, "-|", "pdftotext", $pdf, "-" )
                || die "Error converting file '$pdf': $!";
            my @data = <$pdf_fh>;
            close($pdf_fh);

            my @matched;
            for my $search_term ( keys %$lookup ) {
                push( @matched, $search_term )
                    if grep { /$search_term/ } @data;
            }
            $terms_by_pdf{$pdf} = \@matched if @matched;
        }
    );

    # Re-key the shared results as search_term => [ PDF files ] for printing.
    for my $pdf ( keys %terms_by_pdf ) {
        push( @{ $results->{$_} }, $pdf ) for @{ $terms_by_pdf{$pdf} };
    }
}
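
# Example usage (illustrative only; the script and data file names below are
# assumptions, not defined anywhere in this code). The pdftotext command-line
# tool must be available on $PATH, and terms.txt holds one search term
# (treated as a regex) per line:
#
#   ./pdf_term_search.pl terms.txt 1 docs/*.pdf > matches.csv   # parallel
#   ./pdf_term_search.pl terms.txt 0 docs/*.pdf > matches.csv   # serial
#
# Each output line has the form: search_term,pdf_file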