`#!/usr/bin/perl
use strict;
use warnings;
my $t0 = time();
my $infile = 'products.txt';
# $data{$date}{$product}{'price'|'qu'} : raw values read for each date/product pair.
my %data = ();
# $total{$product}{'count'} and $total{$product}{'price'|'qu'}{'sum'|'mean'|'stddev'}.
my %total = ();
my $records = 0;

# Read whitespace-separated records: date, product, price, quantity.
# NOTE(review): a repeated date/product pair overwrites the earlier values in
# %data but is still added to the sums and count in %total - confirm the input
# never contains duplicates.
open my $in_fh,'<',$infile or die "Could not open $infile $!";
while (my $line = <$in_fh>){
  # split ' ' (unlike /\s+/) skips leading whitespace, so an indented line
  # does not shift every field one position to the right.
  my ($date, $product, $price, $qu) = split ' ',$line;
  # Ignore blank or truncated lines instead of storing undef keys and values.
  next unless defined $qu;
  $data{$date}{$product}{'price'} = $price;
  $data{$date}{$product}{'qu'} = $qu;
  $total{$product}{'count'} += 1;
  $total{$product}{'price'}{'sum'} += $price;
  $total{$product}{'qu'}{'sum'} += $qu;
  ++$records;
}
close $in_fh;

# calculate stats
# One tab-separated line per product is written to the report:
# product, price mean, price stddev, quantity mean, quantity stddev.
my $outfile = 'report.txt';
open my $out_fh,'>',$outfile or die "Could not open $outfile $!";
# Sorted so the report order is the same on every run.
for my $prod (sort keys %total){
  # Always >= 1: a product only gets a %total entry when a record was counted.
  my $count = $total{$prod}{'count'};
  # mean
  my $price_mean = $total{$prod}{'price'}{'sum'}/$count;
  my $qu_mean = $total{$prod}{'qu'}{'sum'}/$count;
  $total{$prod}{'price'}{'mean'} = $price_mean;
  $total{$prod}{'qu'}{'mean'} = $qu_mean;
  # std dev squared (sum of squared deviations from the mean)
  my ($sum_x2,$sum_y2) = (0,0);
  for my $date (keys %data){
    # A product may have no record on some dates; testing with exists avoids
    # autovivifying an empty entry and adding a bogus (0 - mean)**2 term.
    next unless exists $data{$date}{$prod};
    my $rec = $data{$date}{$prod};
    my $x = $rec->{'price'} - $price_mean;
    $sum_x2 += ($x*$x);
    my $y = $rec->{'qu'} - $qu_mean;
    $sum_y2 += ($y*$y);
  }
  # Population standard deviation (divides by the record count).
  $total{$prod}{'price'}{'stddev'} = sprintf "%.4f",sqrt($sum_x2/$count);
  $total{$prod}{'qu'}{'stddev'} = sprintf "%.4f",sqrt($sum_y2/$count);
  my $line = join "\t",$prod,
    $total{$prod}{'price'}{'mean'},
    $total{$prod}{'price'}{'stddev'},
    $total{$prod}{'qu'}{'mean'},
    $total{$prod}{'qu'}{'stddev'};
  print {$out_fh} $line."\n";
}
# Buffered write errors only surface at close, so check it.
close $out_fh or die "Could not close $outfile $!";

# summary
my $dur = time - $t0;
printf "
Products : %d
Dates : %d
Records : %d
Run Time : %d s",scalar(keys %total), scalar(keys %data), $records, $dur;
`
Update - code to create a 75MB test file
`open OUT,'>','products.txt' or die "$!";
# Write one record per product per calendar day to the already-open OUT
# handle, as: date <TAB> product <TAB> price <TAB> quantity.
# Days per month, indexed 1..12 (slot 0 is unused); February is set per year below.
my @days_in_month = (0,31,28,31,30,31,30,31,31,30,31,30,31);
# The leading zero makes '0001'..'2000' a string range, so the zero padding
# is preserved in the product names.
for my $p ('0001'..'2000'){
  my $product = "product_$p";
  for my $year (2015..2017){
    # Full Gregorian leap-year rule, so the year range can be changed safely.
    my $is_leap = ($year % 4 == 0 && $year % 100 != 0) || $year % 400 == 0;
    $days_in_month[2] = $is_leap ? 29 : 28;
    for my $month (1..12){
      for my $day (1..$days_in_month[$month]){
        my $date = sprintf "%04d-%02d-%02d",$year,$month,$day;
        my $price = int rand(500);
        my $qu = int rand(90_000);
        print OUT "$date\t$product\t$price\t$qu\n";
      }
    }
  }
}
# Buffered write errors (e.g. a full disk) only surface at close, so check it.
close OUT or die "Could not close products.txt: $!";
`
On my i5 desktop it takes about 5 seconds to correlate the price of one product against the other 1999, so I guess all 2 million pairs would take less than 2 hours.
poj |
Comment on Re: creating and managing many hashes