Re: Extracting Data from a File

in reply to Extracting Data from a File

Here is a sample program using the modules mentioned to process all HTML files in your directory structure, parse the HTML headers for description and keyword values, and write the results to CSV files. Please note that description and keyword values are written to separate CSV files, since it did not seem to make sense to mix them in one file. Hope this helps.

#!/usr/bin/perl
######################################################################
# Name: extract_sample.pl
# Desc: Sample program to extract HTML header data as CSV files.
######################################################################
use strict;
use warnings;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;

######################################################################
# Create objects for each CSV file to be created.
# 'binary => 1' allows field values that contain non-ASCII characters
# or embedded newlines; eol("\n") terminates every row that is printed.
# error_diag() is concatenated to a string on purpose: in list context
# it returns several values, which 'die' would run together.
######################################################################
my $csv1 = Text::CSV->new ( { binary => 1 } )
   or die "Cannot use CSV: " . Text::CSV->error_diag();
my $csv2 = Text::CSV->new ( { binary => 1 } )
   or die "Cannot use CSV: " . Text::CSV->error_diag();
$csv1->eol ("\n");
$csv2->eol ("\n");

######################################################################
# Open CSV files for output (created in the current directory).
######################################################################
my $dfile = 'description.csv';
my $kfile = 'keyword.csv';
open my $fh1, '>:encoding(UTF-8)', $dfile or die "Error opening $dfile: $!";
open my $fh2, '>:encoding(UTF-8)', $kfile or die "Error opening $kfile: $!";

######################################################################
# Set directory (and sub-directories) for File::Find to search.
# no_chdir keeps the working directory fixed during the traversal, so
# the paths File::Find reports remain valid relative to where the
# script was started (by default it would chdir into each directory).
######################################################################
my $dir = '.';
find ({ wanted => \&HTML_Files, no_chdir => 1 }, $dir);

# Buffered write errors only surface at close, so check both handles.
close $fh1 or die "Error closing $dfile: $!";
close $fh2 or die "Error closing $kfile: $!";
exit;

######################################################################
# This subroutine is called by File::Find for each entry in the
# directories searched.  Regular files whose name ends in .htm or .html
# are handed to Parse_HTML_Header; everything else is skipped.
#
# The path passed on is $_ rather than $File::Find::name: File::Find
# sets $_ so that it can always be opened from the current working
# directory (the bare file name when it has chdir'ed into the
# directory, the full path when running with no_chdir).
######################################################################
sub HTML_Files {
   # -f keeps out directories that merely happen to be named *.html.
   return unless /\.html?\z/ && -f $_;
   Parse_HTML_Header($_);
   return;
}


sub Parse_HTML_Header {
   ###################################################################
   # Parse the <head> section of one HTML file and append its
   # description and keywords values to the CSV output files.
   #
   # Argument: path of the HTML file to read.
   # Dies if the file cannot be opened or a CSV row cannot be written.
   # Uses the file-level CSV objects ($csv1, $csv2) and output handles
   # ($fh1, $fh2).
   ###################################################################
   my $ifile = shift;

   ###################################################################
   # The 'parse' method below expects the HTML to be in a variable,
   # so we slurp the file contents into $text.  'local' restores the
   # input record separator when the do-block ends, so slurp mode does
   # not leak into the rest of the program.
   ###################################################################
   open(my $fh0, '<', $ifile) or die "Error opening $ifile: $!\n";
   my $text = do { local $/; <$fh0> };
   close $fh0;
   $text = '' unless defined $text;

   ###################################################################
   # Parse HTML header.  HTML::HeadParser stores what it finds in the
   # HTTP::Headers object passed to its constructor.
   ###################################################################
   my $h = HTTP::Headers->new;
   my $p = HTML::HeadParser->new($h);
   $p->parse($text);

   ###################################################################
   # Write results to separate CSV files for description and keywords.
   # Each matching header becomes one CSV row; the header value is
   # split on commas and surrounding blanks are dropped, so that
   # "a, b, c" yields the fields "a", "b" and "c".
   ###################################################################
   for my $field ($h->header_field_names) {
      my ($csv, $fh);
      if ($field =~ /description/i) {
         ($csv, $fh) = ($csv1, $fh1);
      } elsif ($field =~ /keywords/i) {
         ($csv, $fh) = ($csv2, $fh2);
      } else {
         next;
      }

      # Scalar context: repeated headers come back as one joined string.
      my $value = $h->header($field);
      $value = '' unless defined $value;
      $value =~ s/^\s+|\s+$//g;
      my @values = split /\s*,\s*/, $value;

      $csv->print ($fh, \@values)
         or die "Error writing CSV row for $ifile: " . $csv->error_diag();
   }
   return;
}
[download]

"It's not how hard you work, it's how much you get done."

In Section Seekers of Perl Wisdom