use strict; use warnings; my $size = 12; use Sys::Mmap; use Sort::Key::Merger 'keymerger'; $| = 1; open BIG, '<', 'big.txt' or die "unable to open big.txt"; mmap(my $big, 0, PROT_READ, MAP_SHARED, *BIG) or die "unable to mmap big.txt"; my $chunk = 8 * 1024 * 1024; my @files; my $n = 0; my $lenbig = length $big; for (my $i = 0; $i < $lenbig; $i += $chunk) { my $top = $i + $chunk; $top = $lenbig if $lenbig < $top; print STDERR "sorting from $i to $top\n"; my @s = sort { ( substr($big, $a, 30) cmp substr($big, $b, 30) or substr($big, $a, 10_000) cmp substr($big, $b, 10_000) or substr($big, $a, 16_000_000) cmp substr($big, $b, 16_000_000) or substr($big, $a) cmp substr($big, $b) ) } $i..($top); my $file = "small.$i"; push @files, $file; open F, '>', $file or die "unable to open $file"; print F pack(N => $_) for @s; close F; } my @fh; for (@files) { open my $fh, '<', $_ or die "unable to open $_"; push @fh, $fh; } my $merger = keymerger { if (read $_, my $data, 4) { my $i = unpack N => $data; return (substr($big, $i, $size), $i); } else { return () } } @fh; my $last = $merger->(); my $count = 1; while (defined(my $i = $merger->())) { # print substr($big, $i, $size), "\n"; if (substr($big, $i, $size) eq substr($big, $last, $size)) { $count++; } else { if ($count > 1) { print substr($big, $last, $size), " appears $count times\n"; } $last = $i; $count = 1; } } if ($count > 1) { print substr($big, $last, $size), " appears $count times\n"; }