perlquestion
Perl_Noob2021
Need help for possible efficient perl code to count, rank and get the percentage. I have the following code, already running for 6hours and not yet complete. The target filed is a csv (all text) and is about 14m to 14.5m rows, and around 1100 to 1500columns and 62gig size
A run of 4hours is acceptable.
what it does:
- do a count (like a countif in excel)
- get the percent (based on 14m rows)
- get the rank based on count
Appreciate any help.
<code> $x="Room_reserve.csv";
$in = "D:\\package properties\\${x}.csv";
$in = "D:\\package properties\\${x}.csv";
$out = "D:\\package properties\\output\\${x}_output.csv";
open($fh, '<', $in) or die "Could not open file '$file' $!";
@data = <$fh>;
close($fh);
%counts;
@columns;
$first = 1;
#counter
foreach $dat (@data) {
chomp($dat);
@rows = split(',',$dat);
if ($first == 1) {
$first = 0;
next;
}
else {
$count = 1;
foreach $i (0..$#rows) {
if ( exists($columns[$i]{$rows[$i]}) ) {
$columns[$i]{$rows[$i]}++;
}
else {
$columns[$i]{$rows[$i]} = int($count);
}
}
}
}
#output
$first = 1;
open($fh, '>', $out) or die "Could not open file '$file' $!";
foreach $dat (@data) {
chomp($dat);
@rows = split(',',$dat);
foreach $i (0..$#rows) {
if ($i > 6) {
#for modifying name
if ( $first == 1 ) {
$line = join( ",", "Rank_$rows[$i]", "Percent_$rows[$i]", "Count_$rows[$i]", $rows[$i]);
print $fh "$line,";
if ( $i == $#rows ) {
$first = 0;
}
}
else {
@dat_val = reverse sort { $a <=> $b } values %{$columns[$i]};
%ranks = {};
$rank_cnt = 0;
foreach $val (@dat_val) {
if ( ! exists($ranks{$val}) ) {
$rank_cnt++;
}
$ranks{$val} = $rank_cnt;
}
$rank = $ranks{$columns[$i]{$rows[$i]}};
$cnt = $columns[$i]{$rows[$i]};
$ave = ($cnt / 14000000) * 100;
$line = join( ",", $rank, $ave, $cnt, $rows[$i]);
print $fh "$line,";
}
}
else {
print $fh "$rows[$i],";
}
}
print $fh "\n";
}
close($fh); </code>