#! perl -slw
use strict;
use Time::HiRes qw[ time ];
use Math::Random::MT qw[ srand rand ];

## Test-data generator.
## Writes $GIGS GiB worth of CSV records to STDOUT and progress to STDERR.
## Each record has four fields: 6 letters, 8 digits, 7 hex characters and a
## single-quoted 36-character field drawn from chr(32) .. chr(254).
## Settings come from -s command line switches: -LEN=, -GIGS=, -RL=.

## rndStr( N, ALPHABET... ): returns N characters picked at random from
## ALPHABET.  The count is shifted off @_ while the range is being built,
## i.e. before the map block runs, so scalar @_ counts the alphabet only.
sub rndStr{ join'', @_[ map{ rand( scalar @_ ) } 1 .. shift ] }

our $LEN  //= 64;     ## nominal record length used to size the output
our $GIGS //= 1;      ## amount of data to generate, in GiB
our $RL   //= 1e6;    ## length of each pre-built random pool string

my $recs = $GIGS * 1024**3 / $LEN;

srand( 1 );           ## fixed seed, so every run produces the same data

## Build one large random pool per character class up front; records are
## then cut out of the pools with substr instead of being generated char
## by char.
my $numeric = rndStr( $RL, 0 .. 9 );
my $alpha   = rndStr( $RL, 'a'..'z' );
my $hex     = rndStr( $RL, 0 ..9, 'A' .. 'F' );
my $bin     = rndStr( $RL, map chr(), 32 .. 254 );

my $start = time;
for ( 1 .. $recs ) {
    ## Progress indicator every 10000 records.
    printf STDERR "\r%f%%\t", $_ * 100 / $recs unless $_ % 10000;

    ## The upper bound of each rand() keeps the slice inside the pool.
    my $f1 = substr( $alpha,   rand( $RL -  6 ),  6 );
    my $f2 = substr( $numeric, rand( $RL -  8 ),  8 );
    my $f3 = substr( $hex,     rand( $RL -  7 ),  7 );
    my $f4 = substr( $bin,     rand( $RL - 36 ), 36 );
    printf "%s,%s,%s,'%s'\n", $f1, $f2, $f3, $f4;
}
printf STDERR "Took %.f seconds for $recs records\n", time() - $start;

__END__
C:\test>hugeDeDup-gen > 1GB.csv
99.956989% Took 246 seconds for 16777216 records
####
#! perl -slw
use strict;

## Partitioner (a separate program; the transcript below runs it as Ibufd.pl).
## Reads the file named on the command line in $IBUF-byte chunks and spreads
## its lines over the files "0.out" .. "599.out", chosen by the three
## characters at offset 7 of each line taken modulo 600.  Lines are held in
## per-bucket buffers and written in batches.
## Settings come from -s command line switches: -NBUF=, -IBUF=.

our $NBUF //= 5000;   ## lines buffered per bucket before a flush
our $IBUF //= 2e6;    ## bytes requested from the input per read

my $start = time;

my @outFHs;           ## bucket number -> output file handle (opened lazily)
my @outBufs;          ## bucket number -> array ref of pending lines
my $n = 0;            ## number of records routed so far

## flushBucket( KEY ): writes the pending lines of bucket KEY to "KEY.out",
## opening the file on first use, and empties the buffer.  Does nothing for
## a bucket that has no pending lines.
sub flushBucket {
    my( $key ) = @_;
    my $pending = $outBufs[ $key ];
    return unless $pending and @$pending;

    unless( defined $outFHs[ $key ] ) {
        open $outFHs[ $key ], '>', "$key.out" or die "$key.out: $!";
    }

    ## -l sets $\ to "\n"; the buffered lines still carry their own
    ## newline, so without this every flush would append a blank line.
    local $\;
    print { $outFHs[ $key ] } @$pending or die "$key.out: $!";
    @$pending = ();
}

## routeLine( LINE ): counts LINE, appends it to its bucket and flushes the
## bucket when it exceeds $NBUF lines.  While a bucket's file has not been
## opened yet, it is also flushed once its size exceeds rand( $NBUF ), which
## spreads the first flushes of the buckets out over time.
sub routeLine {
    my( $line ) = @_;
    ++$n;
    my $key     = substr( $line, 7, 3 ) % 600;
    my $pending = $outBufs[ $key ] //= [];
    push @$pending, $line;
    flushBucket( $key )
        if @$pending > $NBUF
        or !$outFHs[ $key ] && @$pending > rand( $NBUF );
}

die "usage: $0 [-NBUF=n] [-IBUF=n] file\n" unless @ARGV;

my $buf = '';         ## current chunk, prefixed by any carried-over partial line
my $o   = 0;          ## length of that carried-over prefix

open my $disk, '<', $ARGV[0] or die "$ARGV[0]: $!";
while( read( $disk, $buf, $IBUF, $o ) ) {
    my $tail = '';
    open my $ram, '<', \$buf or die $!;
    while( my $line = <$ram> ) {
        ## Only the last line of a chunk can lack its newline; keep it so
        ## the next read appends the rest of the record to it.
        unless( $line =~ /\n$/ ) {
            $tail = $line;
            last;
        }
        routeLine( $line );
    }
    close $ram;

    ## Recomputed for every chunk: a chunk that ends exactly on a newline
    ## must make the next read start at offset 0 again.
    $buf = $tail;
    $o   = length $tail;
}
close $disk;

## A final record without a trailing newline is still a record.
routeLine( "$buf\n" ) if $o;

## Write out whatever is still buffered, including buckets whose file was
## never opened during the main loop.
flushBucket( $_ ) for 0 .. $#outBufs;
close $_ or die "close: $!" for grep { defined } @outFHs;

printf "Took %d seconds for $n records\n", time() - $start;

__END__
C:\test>Ibufd.pl -IBUF=20971520 -NBUF=6000 1GB.csv
Took 114 seconds for 16777216 records