Problems? Is your data what you think it is? PerlMonks

### comment on

 Need Help??

You would need to detail the correlation algorithm you want to use. The simple one I tried was this where \$v would be either 'price' or 'qu'. This assumes for any date you have the full data, so you need to decide how to handle the dates where you don't have 2 prices or qu to correlate.

```sub correlate {
my (\$p1,\$p2,\$v) = @_;
return if \$p1 eq \$p2;
my (\$xy,\$x2,\$y2);
for my \$date (sort keys %data){
my \$x = \$data{\$date}{\$p1}{\$v} - \$total{\$p1}{\$v}{'mean'} ;
my \$y = \$data{\$date}{\$p2}{\$v} - \$total{\$p2}{\$v}{'mean'} ;
\$xy += \$x * \$y;
\$x2 += \$x * \$x;
\$y2 += \$y * \$y;
}
my \$cor = \$xy / sqrt(\$x2 * \$y2);
print "\$p1 \$p2 \$v \$cor\n";
}

Update - after seeing Re^2: creating and managing many hashes

```#!/usr/bin/perl
use strict;
use warnings;

my \$infile = 'products.txt';
my %data  = ();
my %total = ();
my \$records = 0;

my \$t0 = time();
open IN,'<',\$infile or die "Could not open \$infile \$!";
while (<IN>){
my (\$date, \$product, \$price, \$qu) = split /\s+/,\$_;
\$data{\$date}{\$product}{'price'}   = \$price;
\$data{\$date}{\$product}{'qu'}      = \$qu;

\$total{\$product}{'count'}        += 1;
\$total{\$product}{'price'}{'sum'} += \$price;
\$total{\$product}{'qu'}{'sum'}    += \$qu;
++\$records;
}
close IN;

# calculate stats
my \$outfile = 'report.txt';
open OUT,'>',\$outfile or die "Could not open \$outfile";
for my \$prod (keys %total){
my \$count = \$total{\$prod}{'count'};

# mean
\$total{\$prod}{'price'}{'mean'} = \$total{\$prod}{'price'}{'sum'}/\$coun
+t;
\$total{\$prod}{'qu'}{'mean'}    = \$total{\$prod}{'qu'}{'sum'}/\$count;

# std dev squared
my (\$sum_x2,\$sum_y2);
for my \$date (keys %data){
my \$x = \$data{\$date}{\$prod}{'price'} - \$total{\$prod}{'price'}{'mea
+n'};
\$sum_x2 += (\$x*\$x);
my \$y = \$data{\$date}{\$prod}{'qu'} - \$total{\$prod}{'qu'}{'mean'};
\$sum_y2 += (\$y*\$y);
}

\$total{\$prod}{'price'}{'stddev'} = sqrt(\$sum_x2/\$count);
\$total{\$prod}{'qu'}{'stddev'}    = sqrt(\$sum_y2/\$count);

my \$line = join "\t",\$prod,
\$total{\$prod}{'price'}{'mean'},
\$total{\$prod}{'price'}{'stddev'},
\$total{\$prod}{'qu'}{'mean'},
\$total{\$prod}{'qu'}{'stddev'};

print OUT \$line."\n";

}
close OUT;

# correlate first 5 products against the rest
\$t0 = time;
for (1..5){
my \$p1 = sprintf 'product_%04d',\$_;
print "Correlating \$p1\n";
open OUT,'>',"correlate_\${p1}.txt" or die "\$!";
for my \$p2 (sort keys %total){
correlate(\$p1,\$p2,'price');
}
close OUT;
}

# summary
my \$dur = time - \$t0;
printf "
Products : %d
Dates    : %d
Records  : %d
Run Time : %d s",0+keys %total, 0+keys %data, \$records, \$dur;

sub correlate {
my (\$p1,\$p2,\$v) = @_;
return if \$p1 eq \$p2;
my \$sigma;
my \$count = 0;
for my \$date (sort keys %data){
# skip missing dataset
next unless (defined \$data{\$date}{\$p1} && defined \$data{\$date}{\$p2
+});

my \$zee1 = ( \$data{\$date}{\$p1}{\$v} - \$total{\$p1}{\$v}{'mean'} ) / \$
+total{\$p1}{\$v}{'stddev'} ;
my \$zee2 = ( \$data{\$date}{\$p2}{\$v} - \$total{\$p2}{\$v}{'mean'} ) / \$
+total{\$p2}{\$v}{'stddev'} ;
\$sigma += ( \$zee1 * \$zee2 );
++\$count;
}
my \$cor = \$sigma / ( \$count - 1 ) ;
print OUT "\$p1 \$p2 \$v \$cor\n";
}
poj

In reply to Re^5: creating and managing many hashes by poj
in thread creating and managing many hashes by Gtforce

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":

• Are you posting in the right place? Check out Where do I post X? to know for sure.
• Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
• Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
• Want more info? How to link or or How to display code and escape characters are good places to start.

Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others browsing the Monastery: (1)
As of 2021-08-02 12:10 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
My primary motivation for participating at PerlMonks is: (Choices in context)

Results (24 votes). Check out past polls.

Notices?