I have about 85 XML files, each 25-30 MB, that I am trying to process with XML::Twig. The problem is that I cannot get XML::Twig to release the memory it consumes, so my script bloats and dies from memory exhaustion after a number of files. In a nutshell, this is what I am doing:
- Parse file using XML::Twig
- Get just the IDs for each record
- Lookup that ID in database and do some stuff
- Process next record until end of XML file and repeat for next file.
I ran it under the debugger and I can see the memory grow as each piece of XML is parsed. Can anyone see anything wrong with this code?
#!/usr/bin/perl -w
use strict;
use XML::Twig;
use DBI;
use DBD::Pg;
use SQL::Abstract;
use File::Copy;
use File::Basename;
# Input file to verify. Its name must contain "data_<start>_<end>", which
# process() uses to derive the ID range to look up in the database.
my $inFile = 'data_100000_100500.xml';
if ( ! $inFile ) {
    die("No input file specified");
}
if ( ! -f $inFile ) {
    die("file '$inFile' not found");
}

# Database connection settings.
my $dbname = "test";
my $user = "test";
my $password = "test";
my $host = "test03.server.com";
my $port = "5432";

# Fail immediately if the connection cannot be made; otherwise the first
# method call on an undefined $dbh would die with a far less helpful message.
my $dbh = DBI->connect("dbi:Pg:dbname=$dbname;host=$host;port=$port",
    $user, $password, {AutoCommit => 0})
    or die("Cannot connect to database: $DBI::errstr");
my $sql = SQL::Abstract->new(quote_char=>'"');

# State shared with process(), BIOG() and checkBiog().
my @missing;        # IDs missing from the database, accumulated over all files
my @localmissing;   # IDs missing from the database for the current file
my $trust = 0;
my $localcount;     # number of BIOG records seen in the current file
my $count;
my $sth;            # prepared lookup statement, reused across calls
my $fileStartID;    # first ID of the range encoded in the file name
my $fileEndID;      # last ID of the range encoded in the file name
my %BIOG;           # IDs found in the database for the current range

process($inFile);

if (@missing) {
    # Append, so results from earlier runs are kept.
    open(my $missing_fh, '>>', 'missing.txt')
        or die("Cannot open missing.txt for appending: $!");
    foreach my $missing (@missing) {
        print {$missing_fh} $missing . "\n";
    }
    # Buffered write errors only surface at close time.
    close($missing_fh)
        or die("Cannot close missing.txt: $!");
    print "\nUNVERIFIED see missing.txt for missing records.\n";
} else {
    print "\nVerified 100%\n";
}

# Only SELECTs were issued; end the open transaction explicitly so DBI does
# not warn about an implicit rollback when the handle is destroyed.
$dbh->rollback();
$dbh->disconnect();
exit 0;
#
# Process the file
#
sub process {
    # Verify one XML file: parse it with XML::Twig, let the BIOG handler
    # check every record against the database, then report the result.
    #
    # Parameters:
    #   $file - path of the XML file; its name must contain
    #           "data_<start>_<end>". Defaults to the file-level $inFile
    #           when omitted.
    #
    # Side effects: resets %BIOG, @localmissing and $localcount, sets
    # $fileStartID / $fileEndID, appends to @missing, and moves a fully
    # verified file from a "data_done" folder to "data_verified".
    my ($file) = @_;
    $file = $inFile if !defined $file;

    # Per-file state must start clean, otherwise IDs missing from an
    # earlier file would be reported again for this one.
    %BIOG = ();
    @localmissing = ();
    $localcount = 0;

    # $1/$2 are only meaningful after a successful match.
    $file =~ /data_(\d+)_(\d+)/
        or die("Cannot determine ID range from file name '$file'");
    ($fileStartID, $fileEndID) = ($1, $2);

    print "Processing file " . $file . "\t";

    my $t = XML::Twig->new( TwigHandlers => { BIOG => \&BIOG } );
    # Make sure the twig is disposed of even when parsing dies.
    my $parsed = eval { $t->parsefile( $file ); 1 };
    my $parse_error = $@;
    $t->dispose();
    die($parse_error) if !$parsed;

    if ( @localmissing ) {
        push(@missing, @localmissing);
        my $missing = @localmissing;
        print "Missing $missing/$localcount \n";
    } else {
        print "Verified 100%\n";
        my $source_folder = dirname($file);
        (my $folder = $source_folder) =~ s/data_done/data_verified/;
        # dirname() returns no trailing separator, so one must be added.
        # When the folder name did not change there is nowhere to move to.
        if ( $folder ne $source_folder ) {
            move($file, $folder . '/' . basename($file))
                or warn("Cannot move '$file' to '$folder': $!\n");
        }
    }
}
#
# BIOG is XML element we are triggering
#
sub BIOG {
    # Twig handler called by XML::Twig with the twig and the BIOG element.
    # Counts the record and remembers its BIOG_NBR in @localmissing when
    # checkBiog() does not find it in the database.
    # Returns 1.
    my ($t, $BIOG) = @_;
    ++$localcount;

    my $biog_nbr = $BIOG->field('BIOG_NBR');
    if ( ! checkBiog($biog_nbr) ) {
        push(@localmissing, $biog_nbr);
    }

    # Tell XML::Twig to dispose of the rest of the tree we don't care about.
    $t->purge();
    return 1;
}
#
# Check database for ID
#
sub checkBiog {
    # Report whether an ID exists in the database.
    #
    # On the first call after %BIOG has been emptied, every BIOG_NBR
    # between $fileStartID and $fileEndID is loaded into %BIOG with a single
    # query; later calls are plain hash lookups.
    #
    # Parameters:
    #   $biog - the BIOG_NBR to look up.
    # Returns 1 when the ID was found, 0 otherwise.
    # Dies when the statement cannot be prepared or executed.
    my ($biog) = @_;

    # NOTE(review): when the range has no rows in the database, %BIOG stays
    # empty and this query is repeated for every record of the file.
    if ( !%BIOG ) {
        my %where = (
            BIOG_NBR => { -between => [ $fileStartID, $fileEndID ] },
        );
        my ($stmt, @bind) = $sql->select('BIOG', '"BIOG_NBR"', \%where);

        # The statement text is the same for every range, so prepare once.
        if (!$sth) {
            $sth = $dbh->prepare($stmt)
                or die("Cannot prepare statement: " . $dbh->errstr);
        }
        $sth->execute(@bind)
            or die("Cannot execute statement: " . $sth->errstr);

        while (my $data = $sth->fetchrow_hashref()) {
            $BIOG{$data->{BIOG_NBR}} = 1;
        }
    }

    return defined($BIOG{$biog}) ? 1 : 0;
}