http://www.perlmonks.org?node_id=1041118

hyu968 has asked for the wisdom of the Perl Monks concerning the following question:

Good evening Monks, I would like to ask for your help in possible optimization of the code below. I need to run this on a few million of text and HTML files, and I must say it runs pretty slow. If there are ways to optimize it, I am all ears. The bottleneck here is deHTMLing of the files. If I get a text file, it searches within the file for the string very fast and goes onto the next. Now once it hits HTML file, it takes some time to process it. So maybe there is a way to optimize the deHTMLing portion of the code below? I run multiple threads but each of them can't go through more than 5,000 files in 24 hours which I find ridiculously slow. Looking forward to your help! Thanks, GM
#!/usr/bin/perl -w #use strict; use Benchmark; # Get the HTML-Format package from the package manager. use HTML::Formatter; # Get the HTML-TREE from the package manager use HTML::TreeBuilder; use HTML::FormatText; use File::Copy; $startTime = new Benchmark; # Specifies the directory of the output file my $outfile="D:\\output\\2009\\2009.01.xls"; my $outdir="D:\\output\\2009\\01"; # Specifies the directory with the 8-Ks my $direct="D:\\8K\\2009\\01"; my $anchorstring='(chief\s*exec|ceo|chairman|founder|president)(.){0,2 +00}(\s+dies|\s+died|\s+passed|\s+death|(medical(.){0,30}leave)|(leave +(.){0,30}medical)|(diagnos(.){0,30}cancer)|(leave\s*of\s*absence)|(pe +rsonal\s*reasons)|(medical\s*treatment))'; my $anchorstring2='(death|passing)(.){0,200}(chief\s*exec|ceo|chairman +|founder|president)'; my $icounter=1; # Open the directory containing the files to read # Store the names of each file in an array @New1 opendir(DIR1, "$direct") || die "Can't open directory"; my @New1=readdir(DIR1); # Open (and overwrite) the output file # Print first line of output file open(OUTPUT, ">$outfile") || die "can't open $outfile: $!"; print OUTPUT "file \t form_type \t HTML \t cik \t report_date \t file_ +date \t name \t text \n"; # Loop over each file in the arry foreach $file(@New1) { print " File number $icounter out of $#New1 \n"; $icounter++; # Don't read the directories . and .. if ($file=~/^\./) {next;} # Initialize the variables # CHECK WHICH I NEED my $cik=-99; my $report_date=-99; my $file_date=-99; #my $file_number=-99; my $form_type="Not Found"; my $name=""; my $sic=-99; my $HTML=0; my $announcement_text='Not Found'; my $ao="Not Found"; my $tree="Empty"; my $data=""; # Open the file and put the content in variable $data # $data contains the entire filing { # Remove the default end of line character (\n) so that the entire + file can be read at once local $/; open (SLURP, "$direct\\"."$file") or die "can't open $file: $!"; # Read the contents into $data $data = <SLURP>; } close SLURP or die "cannot close $file: $!"; # The following steps obtain basic data from the filings if($data=~m/<HTML>/i) {$HTML=1;} if($data=~m/^\s*FORM\s*TYPE:\s*(.*$)/m) {$form_type=$1;} if($data=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) {$cik=$1;} if($data=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) {$rep +ort_date=$1;} if($data=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) {$file_date=$1; +} #if($data=~m/^\s*SEC\s*FILE\s*NUMBER:\s*([0-9-]*)/m) {$file_number +=$1;} if($data=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) {$name=$1;} if($data=~m/^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?\[(\d{4} +)/m) {$sic=$1;} # The following steps extract the audit opinion (or whatever secti +on of text you want) # The first if statement determines whether the filing is in HTML +format or plain text. if($HTML==0) { if($data=~m/((?:.){0,200}$anchorstring(?:.){0,200})/is) {$announcement_text=$1;} } else { if($data=~m/((?:.){0,200}$anchorstring(?:.){0,200})/is) { $ao=$1; # Clean up HTML tags, etc. $tree=HTML::TreeBuilder->new->parse($ao); $formatter=HTML::FormatText->new(leftmargin=> 0, right +margin=>60); $announcement_text=$formatter->format($tree); $tree->delete(); } } if($announcement_text eq 'Not Found') { if($HTML==0) { if($data=~m/((?:.){0,200}$anchorstring2(?:.){0,200})/is) {$announcement_text=$1;} } else { if($data=~m/((?:.){0,200}$anchorstring2(?:.){0,200})/is) { $ao=$1; # Clean up HTML tags, etc. $tree=HTML::TreeBuilder->new->parse($ao); $formatter=HTML::FormatText->new(leftmargin=> 0, right +margin=>60); $announcement_text=$formatter->format($tree); $tree->delete(); } } } #if($announcement_text eq 'Not Found') {$announcement_text='Not Fo +und'} if($announcement_text eq 'Not Found') {next;} # Clean up a bit $announcement_text=~s/[^[:ascii:]]+//g; $announcement_text=~s/\s+/ /mg; print OUTPUT "$file \t $form_type \t $HTML \t $cik \t $report_date + \t $file_date \t $name \t $announcement_text \n"; copy("$direct\\"."$file", "$outdir\\"."$file") or die; } close(OUTPUT); # Show how long it took to run the program $endTime = new Benchmark; $runTime = timediff($endTime, $startTime); print ("Processing files took ", timestr($runTime));