#!/usr/bin/perl use strict; use warnings; use constant NEW_LET => 1; use constant LEN => 2; use Inline C =>; my $file = $ARGV[0] || 'phase1.data'; open(my $fh, '<', $file) or die "Unable to open '$file' for reading: $!"; my @word; while (<$fh>) { chomp; push @word, [split /\t/]; } my %seen_nform; for my $i (0 .. $#word - 1) { my ($nform1, $str1) = @{$word[$i]}; my (@new_word, %seen); for my $j ($i + 1 .. $#word) { my ($nform2, $str2) = @{$word[$j]}; my $new_let = diff($nform2, $nform1); next if ! $new_let || $seen{$new_let}++; push @new_word, [$str2, $new_let, length($new_let)]; } @new_word = sort { $b->[LEN] <=> $a->[LEN] } @new_word; for my $i2 (0 .. $#new_word - 1) { next if ! defined $new_word[$i2]; for my $j2 ($i2 + 1 .. $#new_word) { next if ! defined $new_word[$j2]; $new_word[$j2] = undef if ! distinct($new_word[$i2][NEW_LET], $new_word[$j2][NEW_LET]); } } for (grep defined, @new_word) { my $str2 = $_->[0]; my %uniq = map {$_ => undef} split //, $nform1 . $str2; my $new_nform = join '', sort keys %uniq; next if $seen_nform{$new_nform}++; print join "\t", $new_nform, $str1, $str2; print "\n"; } } __END__ __C__ SV* diff ( char *str1, char *str2 ) { SV *sv= newSVpvn( "", 0 ); int result_index= 0; char *result= SvGROW( sv, 257); /* identify all chars present in str2 */ while ( *str1 && *str2 ) { if ( *str1 < *str2) result[ result_index++ ]= *str1++; while ( *str1 && *str1 == *str2) { str1++; str2++; } if ( *str1 > *str2 ) str2++; } while (*str1) result[ result_index++ ]= *str1++; result[ result_index ]= 0; SvCUR_set( sv, result_index ); return sv; } int distinct(unsigned char *str1, unsigned char *str2) { /* Actual code has 256 0s - truncated for post */ char exists[256] = {}; /* Turn array into a hash */ while (*str1) { exists[*str1++] = 1; } /* Determine if str2 contains any chars str1 does not */ while (*str2) { if (! exists[*str2++]) return 1; } return 0; }