good chemistry is complicated,and a little bit messy -LW PerlMonks

### Re^5: creating and managing many hashes

by poj (Abbot)
 on Feb 24, 2018 at 13:40 UTC ( #1209899=note: print w/replies, xml ) Need Help??

in reply to Re^4: creating and managing many hashes
in thread creating and managing many hashes

You would need to detail the correlation algorithm you want to use. The simple one I tried was this where \$v would be either 'price' or 'qu'. This assumes for any date you have the full data, so you need to decide how to handle the dates where you don't have 2 prices or qu to correlate.

```sub correlate {
my (\$p1,\$p2,\$v) = @_;
return if \$p1 eq \$p2;
my (\$xy,\$x2,\$y2);
for my \$date (sort keys %data){
my \$x = \$data{\$date}{\$p1}{\$v} - \$total{\$p1}{\$v}{'mean'} ;
my \$y = \$data{\$date}{\$p2}{\$v} - \$total{\$p2}{\$v}{'mean'} ;
\$xy += \$x * \$y;
\$x2 += \$x * \$x;
\$y2 += \$y * \$y;
}
my \$cor = \$xy / sqrt(\$x2 * \$y2);
print "\$p1 \$p2 \$v \$cor\n";
}

Update - after seeing Re^2: creating and managing many hashes

```#!/usr/bin/perl
use strict;
use warnings;

my \$infile = 'products.txt';
my %data  = ();
my %total = ();
my \$records = 0;

my \$t0 = time();
open IN,'<',\$infile or die "Could not open \$infile \$!";
while (<IN>){
my (\$date, \$product, \$price, \$qu) = split /\s+/,\$_;
\$data{\$date}{\$product}{'price'}   = \$price;
\$data{\$date}{\$product}{'qu'}      = \$qu;

\$total{\$product}{'count'}        += 1;
\$total{\$product}{'price'}{'sum'} += \$price;
\$total{\$product}{'qu'}{'sum'}    += \$qu;
++\$records;
}
close IN;

# calculate stats
my \$outfile = 'report.txt';
open OUT,'>',\$outfile or die "Could not open \$outfile";
for my \$prod (keys %total){
my \$count = \$total{\$prod}{'count'};

# mean
\$total{\$prod}{'price'}{'mean'} = \$total{\$prod}{'price'}{'sum'}/\$coun
+t;
\$total{\$prod}{'qu'}{'mean'}    = \$total{\$prod}{'qu'}{'sum'}/\$count;

# std dev squared
my (\$sum_x2,\$sum_y2);
for my \$date (keys %data){
my \$x = \$data{\$date}{\$prod}{'price'} - \$total{\$prod}{'price'}{'mea
+n'};
\$sum_x2 += (\$x*\$x);
my \$y = \$data{\$date}{\$prod}{'qu'} - \$total{\$prod}{'qu'}{'mean'};
\$sum_y2 += (\$y*\$y);
}

\$total{\$prod}{'price'}{'stddev'} = sqrt(\$sum_x2/\$count);
\$total{\$prod}{'qu'}{'stddev'}    = sqrt(\$sum_y2/\$count);

my \$line = join "\t",\$prod,
\$total{\$prod}{'price'}{'mean'},
\$total{\$prod}{'price'}{'stddev'},
\$total{\$prod}{'qu'}{'mean'},
\$total{\$prod}{'qu'}{'stddev'};

print OUT \$line."\n";

}
close OUT;

# correlate first 5 products against the rest
\$t0 = time;
for (1..5){
my \$p1 = sprintf 'product_%04d',\$_;
print "Correlating \$p1\n";
open OUT,'>',"correlate_\${p1}.txt" or die "\$!";
for my \$p2 (sort keys %total){
correlate(\$p1,\$p2,'price');
}
close OUT;
}

# summary
my \$dur = time - \$t0;
printf "
Products : %d
Dates    : %d
Records  : %d
Run Time : %d s",0+keys %total, 0+keys %data, \$records, \$dur;

sub correlate {
my (\$p1,\$p2,\$v) = @_;
return if \$p1 eq \$p2;
my \$sigma;
my \$count = 0;
for my \$date (sort keys %data){
# skip missing dataset
next unless (defined \$data{\$date}{\$p1} && defined \$data{\$date}{\$p2
+});

my \$zee1 = ( \$data{\$date}{\$p1}{\$v} - \$total{\$p1}{\$v}{'mean'} ) / \$
+total{\$p1}{\$v}{'stddev'} ;
my \$zee2 = ( \$data{\$date}{\$p2}{\$v} - \$total{\$p2}{\$v}{'mean'} ) / \$
+total{\$p2}{\$v}{'stddev'} ;
\$sigma += ( \$zee1 * \$zee2 );
++\$count;
}
my \$cor = \$sigma / ( \$count - 1 ) ;
print OUT "\$p1 \$p2 \$v \$cor\n";
}
poj

Create A New User
Node Status?
node history
Node Type: note [id://1209899]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others musing on the Monastery: (3)
As of 2021-06-20 00:53 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
What does the "s" stand for in "perls"? (Whence perls)

Results (93 votes). Check out past polls.

Notices?