I've been doing a bit of research here and have found a number of nodes with some helpful information, but none that I believe quite covers what I'm trying to accomplish.
I have two similar large files, each 1GB+ in size. I need to compare file1 to file2 on their common keys to find differences between the values in file1 and the values in file2. My initial thought was to load both into separate hashes and iterate over hash1, looking each key up in hash2. However, with files of this size I cannot hold both in memory. I've read about a few modules that may be of help, and seen many suggestions to use an RDBMS. I'm looking for some help and direction. Thanks!
# Read one listing file into the hash referenced by $records.
#
# Each non-blank line is split on whitespace into five fields:
#   path, type, size, link, nfiles
# and stored as $records->{path}{Type|Size|Link|NFILES}.  Fields missing
# from a short line are stored as undef; extra fields are ignored.
#
# Dies if the file cannot be opened or closed.
sub _load_listing {
    my ($file, $records) = @_;

    open(my $in, '<', $file)
        or die "Cannot open '$file' for reading: $!";

    # 'while' pulls one line at a time; 'foreach (<FH>)' would evaluate the
    # readline in list context and build a list of every line up front.
    while (my $line = <$in>) {
        # split ' ' skips leading whitespace, so an indented line does not
        # produce an empty path with every other field shifted by one.
        my ($path, $type, $size, $link, $nfiles) = split ' ', $line;
        next unless defined $path;    # blank line

        @{ $records->{$path} }{qw(Type Size Link NFILES)}
            = ($type, $size, $link, $nfiles);
    }

    close($in) or die "Cannot close '$file': $!";
    return;
}

# Load today's listing ($out) into %today and yesterday's listing ($comp)
# into %yesterday, then run diff() to write the comparison report.
#
# Takes no arguments; reads the file-level $out and $comp and fills the
# file-level %today and %yesterday.  Returns whatever diff() returns.
sub process {
    _load_listing($out,  \%today);
    _load_listing($comp, \%yesterday);

    # diff() works on %today / %yesterday directly and never looks at @_,
    # so the hashes are not flattened into an argument list here.
    return diff();
}
# Compare today's listing against yesterday's and write a report.
#
# Arguments are ignored; the sub works on file-level variables:
#   reads   %today, %yesterday   path => { Size => ..., NFILES => ..., ... }
#           $final               path of the report file (overwritten)
#           $max_size_diff       size-change threshold, in percent
#           $max_file_diff       file-count growth threshold
#   writes  %diffOut             path => { SizeYest, SizeToday, SizeDiff,
#                                          FileDiff }
#
# Report lines (tab separated):
#   <path> YEST:<size> TOD:<size> DIFF:<percent>   size changed >= threshold
#   <path> FDIFF:<count>                           directory gained files
#   <path> YEST:0 TOD:<size> DIFF:New              path absent yesterday
#
# Paths that exist only in %yesterday are not reported.
# Dies if the report file cannot be opened or closed; returns 1 otherwise.
sub diff {
    open(my $report, '>', $final)
        or die "Cannot open '$final' for writing: $!";

    foreach my $key (keys %today) {
        my $size_t = $today{$key}{'Size'};

        if (!exists $yesterday{$key}) {
            # New path.  The size is taken from this path's own record, not
            # from whatever the previous loop iteration left behind.
            $diffOut{$key}{'SizeToday'} = $size_t;
            $diffOut{$key}{'SizeYest'}  = 0;
            $diffOut{$key}{'SizeDiff'}  = "New";
            print {$report} "$key\tYEST:$diffOut{$key}{'SizeYest'}\tTOD:$diffOut{$key}{'SizeToday'}\tDIFF:$diffOut{$key}{'SizeDiff'}\n";
            next;
        }

        my $size_y   = $yesterday{$key}{'Size'};
        my $nfiles_t = $today{$key}{'NFILES'};
        my $nfiles_y = $yesterday{$key}{'NFILES'};

        # Entries with a zero (or non-numeric) size on either day are skipped.
        next unless $size_y > 0 && $size_t > 0;

        if ($size_t != $size_y) {
            # Percentage change relative to the larger of the two sizes,
            # so growth and shrinkage are measured on the same 0-100 scale.
            my ($smaller, $larger) = $size_t > $size_y
                                   ? ($size_y, $size_t)
                                   : ($size_t, $size_y);
            my $pct = (1 - ($smaller / $larger)) * 100;

            if ($pct >= $max_size_diff) {
                $diffOut{$key}{'SizeYest'}  = $size_y;
                $diffOut{$key}{'SizeToday'} = $size_t;
                $diffOut{$key}{'SizeDiff'}  = $pct;
                print {$report} "$key\tYEST:$diffOut{$key}{'SizeYest'}\tTOD:$diffOut{$key}{'SizeToday'}\tDIFF:$diffOut{$key}{'SizeDiff'}\n";
            }
        }

        # File counts are only compared for paths that are directories on
        # the local filesystem right now, and only growth is flagged.
        if (-d $key) {
            if ($nfiles_y > 0 && $nfiles_t > 0) {
                my $file_growth = $nfiles_t - $nfiles_y;
                if ($file_growth > $max_file_diff) {
                    $diffOut{$key}{'FileDiff'} = $file_growth;
                    print {$report} "$key\tFDIFF:$diffOut{$key}{'FileDiff'}\n";
                }
            }
        }
    }

    # Buffered write errors only surface at close, so check it.
    close($report) or die "Cannot close '$final': $!";
    return 1;
}