http://www.perlmonks.org?node_id=1209441


in reply to Re^2: creating and managing many hashes
in thread creating and managing many hashes

To give us some more detail, run this code against your data and post the output summary it prints (not the report.txt file).

#!/usr/bin/perl
# Summarise a whitespace-separated product file whose lines hold
# (date, product, price, quantity).  For each product the mean and the
# population standard deviation (divisor = record count) of price and
# quantity are written, tab-separated, to report.txt.  A short summary
# (product, date and record counts plus run time) is printed to STDOUT.
use strict;
use warnings;

my $t0      = time();
my $infile  = 'products.txt';
my $outfile = 'report.txt';

my %data;           # $data{$date}{$product} = { price => ..., qu => ... }
my %total;          # $total{$product} = { count, price => {...}, qu => {...} }
my $records = 0;

open my $in, '<', $infile or die "Could not open $infile $!";
while ( my $line = <$in> ) {

    # split ' ' ignores leading whitespace and the trailing newline
    my ( $date, $product, $price, $qu ) = split ' ', $line;

    # skip blank or incomplete lines instead of summing undef values
    next unless defined $qu;

    $data{$date}{$product}{'price'} = $price;
    $data{$date}{$product}{'qu'}    = $qu;

    $total{$product}{'count'}        += 1;
    $total{$product}{'price'}{'sum'} += $price;
    $total{$product}{'qu'}{'sum'}    += $qu;
    ++$records;
}
close $in;

# calculate stats
open my $out, '>', $outfile or die "Could not open $outfile $!";
for my $prod ( keys %total ) {
    my $count = $total{$prod}{'count'};

    # mean
    my $price_mean = $total{$prod}{'price'}{'sum'} / $count;
    my $qu_mean    = $total{$prod}{'qu'}{'sum'} / $count;
    $total{$prod}{'price'}{'mean'} = $price_mean;
    $total{$prod}{'qu'}{'mean'}    = $qu_mean;

    # std dev squared
    my ( $sum_x2, $sum_y2 ) = ( 0, 0 );
    for my $date ( keys %data ) {

        # Only dates that actually carry a record for this product may
        # contribute; the exists test also stops autovivification of
        # empty $data{$date}{$prod} entries.
        next unless exists $data{$date}{$prod};

        my $x = $data{$date}{$prod}{'price'} - $price_mean;
        $sum_x2 += $x * $x;

        my $y = $data{$date}{$prod}{'qu'} - $qu_mean;
        $sum_y2 += $y * $y;
    }
    $total{$prod}{'price'}{'stddev'} = sprintf "%.4f", sqrt( $sum_x2 / $count );
    $total{$prod}{'qu'}{'stddev'}    = sprintf "%.4f", sqrt( $sum_y2 / $count );

    my $line = join "\t", $prod,
        $total{$prod}{'price'}{'mean'},
        $total{$prod}{'price'}{'stddev'},
        $total{$prod}{'qu'}{'mean'},
        $total{$prod}{'qu'}{'stddev'};
    print {$out} $line . "\n";
}

# buffered write errors (e.g. disk full) only surface at close
close $out or die "Could not close $outfile $!";

# summary
my $dur = time - $t0;
printf "\n Products : %d\n Dates : %d\n Records : %d\n Run Time : %d s\n",
    scalar( keys %total ), scalar( keys %data ), $records, $dur;

Update - code to create a 75MB test file

# Create the test file 'products.txt': one tab-separated line
# (date, product, price, quantity) for each of the products
# product_0001 .. product_2000 on every day of 2015-2017, with a random
# integer price below 500 and a random integer quantity below 90_000.
use strict;
use warnings;

my $outfile = 'products.txt';
open my $out, '>', $outfile or die "Could not open $outfile: $!";

# days per month, indexed 1..12 (index 0 unused); February is set per year
my @days_in = ( 0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 );

# the string range uses magical increment, keeping the leading zeros
for my $p ( '0001' .. '2000' ) {
    my $product = "product_$p";
    for my $y ( 2015 .. 2017 ) {

        # the divisible-by-4 test is sufficient for this range of years
        $days_in[2] = ( $y % 4 ) ? 28 : 29;

        for my $m ( 1 .. 12 ) {
            for my $day ( 1 .. $days_in[$m] ) {
                my $date  = sprintf "%04d-%02d-%02d", $y, $m, $day;
                my $price = int rand(500);
                my $qu    = int rand(90_000);
                print {$out} "$date\t$product\t$price\t$qu\n";
            }
        }
    }
}

# buffered write errors (e.g. disk full) only surface at close
close $out or die "Could not close $outfile: $!";

On my i5 desktop, it takes about 5 seconds to correlate the price of 1 product against the other 1999, so I guess all 2 million pairs would take less than 2 hours.

poj