#! perl -slw
use strict;
use Time::HiRes qw[ time ];
use Math::Random::MT qw[ srand rand ];

# rndStr( $count, @alphabet )
# Build a string of $count characters, each one drawn at random (with
# replacement) from @alphabet using whichever rand() is in scope.
sub rndStr {
    my( $count, @alphabet ) = @_;

    # scalar() is spelled out so the element count, not the list itself,
    # reaches rand() even if rand is an imported sub without a prototype.
    return join '', map { $alphabet[ rand( scalar @alphabet ) ] } 1 .. $count;
}

# Tunables; each can be overridden on the command line through perl's -s
# switch (for example -GIGS=2).
our $LEN    //= 64;     # nominal bytes per record; only used to derive the record count
our $GIGS   //= 1;      # volume of data to generate, in units of 1024**3 bytes
our $RL     //= 1e6;    # length of each pre-built pool of random characters

# Number of records that make up the requested volume.
my $recs = $GIGS * 1024**3 / $LEN;

# Fixed seed, so every run draws the same random sequence.
srand( 1 );

# One long pool of random characters per field type. Fields are later cut
# out of these pools at random offsets instead of being built char by char.
# The pools are built in this order to keep the random sequence unchanged.
my $digitPool = rndStr( $RL, 0 .. 9 );
my $alphaPool = rndStr( $RL, 'a'..'z' );
my $hexPool   = rndStr( $RL, 0 ..9, 'A' .. 'F' );
my $bytePool  = rndStr( $RL, map { chr } 32 .. 254 );

my $t0 = time;
for my $recNo ( 1 .. $recs ) {
    # Progress report on STDERR every 10000 records.
    if( $recNo % 10000 == 0 ) {
        printf STDERR "\r%f%%\t", $recNo * 100 / $recs;
    }

    # Record layout: 6 letters, 8 digits, 7 hex digits, then 36 characters
    # from the chr(32)..chr(254) pool wrapped in single quotes.
    printf "%s,%s,%s,'%s'\n",
        substr( $alphaPool, rand( $RL -  6 ),  6 ),
        substr( $digitPool, rand( $RL -  8 ),  8 ),
        substr( $hexPool,   rand( $RL -  7 ),  7 ),
        substr( $bytePool,  rand( $RL - 36 ), 36 );
}

printf STDERR "Took %.f seconds for $recs records\n", time() - $t0;

__END__
C:\test>hugeDeDup-gen > 1GB.csv
99.956989%      Took 246 seconds for 16777216 records

## ---- end of the generator script (hugeDeDup-gen); the bucketing script (Ibufd.pl) follows ----

#! perl -slw
use strict;

# Distribute the lines of the file named in $ARGV[0] over bucket files
# "<key>.out", where <key> is the three characters at offset 7 of the line,
# taken as a number modulo 600.
#
# Tunables (set on the command line through perl's -s switch):
#   -NBUF=n   flush a bucket once more than n lines are buffered for it
#   -IBUF=n   number of bytes requested from the input file per read()
our $NBUF //= 5000;
our $IBUF //= 2e6;

my $start  = time;

my @outFHs;     # bucket key -> output handle, opened on the bucket's first flush
my @outBufs;    # bucket key -> array ref of lines waiting to be written
my $n = 0;      # number of records processed

# flushBucket( $key )
# Write every line buffered for bucket $key to "$key.out" (creating the file
# on first use) and empty the buffer. Dies if the file cannot be created or
# written.
sub flushBucket {
    my( $key ) = @_;

    unless( defined $outFHs[ $key ] ) {
        open $outFHs[ $key ], '>', "$key.out"
            or die "Cannot create $key.out: $!";
    }

    # The #! line's -l switch sets $\ to "\n"; clear it locally so print does
    # not append an extra empty line after every flushed batch.
    local $\;
    print { $outFHs[ $key ] } @{ $outBufs[ $key ] }
        or die "Cannot write to $key.out: $!";
    @{ $outBufs[ $key ] } = ();
    return;
}

defined $ARGV[0]
    or die "usage: $0 [-NBUF=lines] [-IBUF=bytes] inputfile\n";
open my $disk, '<', $ARGV[0] or die "Cannot open $ARGV[0]: $!";

my $buf = '';
my $o   = 0;    # length of the unfinished line carried at the front of $buf

while( 1 ) {
    # Read the next chunk in behind any partial line kept from the last one.
    my $got = read( $disk, $buf, $IBUF, $o );
    defined $got or die "Error reading $ARGV[0]: $!";
    last unless $got;

    # Nothing is carried forward unless this chunk turns out to end mid-line.
    # Without this reset a stale prefix would be glued onto the next chunk
    # whenever a chunk happens to end exactly on a newline.
    $o = 0;

    # Split the chunk into lines by reading it through an in-memory handle.
    open my $ram, '<', \$buf or die "Cannot open in-memory handle: $!";
    while( my $line = <$ram> ) {
        # An unterminated line is a partial record unless the input is
        # exhausted, in which case it is the file's last record and is
        # processed like any other.
        if( $line !~ /\n$/ and !eof( $disk ) ) {
            close $ram;
            $buf = $line;
            $o = length $buf;
            last;
        }

        ++$n;
        my $key = substr( $line, 7, 3 ) % 600;
        my $pending = push @{ $outBufs[ $key ] }, $line;

        # Flush when the buffer is over its limit. A bucket that has no file
        # yet is also flushed at a random fill level (presumably to stagger
        # the buckets so they do not all flush at the same time).
        if(
            $pending > $NBUF
        or
            !$outFHs[ $key ] && $pending > rand( $NBUF )
        ) {
            flushBucket( $key );
        }
    }
}

# Write out whatever is still buffered. Keys that never occurred have no
# buffer at all, and buckets that never reached a flush have no file yet.
for my $key ( 0 .. $#outBufs ) {
    next unless $outBufs[ $key ] and @{ $outBufs[ $key ] };
    flushBucket( $key );
}

# Buffered write errors only surface at close, so check it.
for my $fh ( grep { defined } @outFHs ) {
    close $fh or die "Error closing bucket file: $!";
}
close $disk;

{
    # As in flushBucket: keep -l's "\n" from being appended to the report.
    local $\;
    printf "Took %d seconds for %d records\n", time() - $start, $n;
}

__END__
C:\test>Ibufd.pl -IBUF=20971520 -NBUF=6000 1GB.csv
Took 114 seconds for 16777216 records