This is what I used. It is probably written in an older style and could be cleaned up a bit, but it works (I think!):
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
# Group gene pairs into clusters of connected genes.
#
# Arguments: a list of input lines. Each line holds two gene names in its
# first two comma-separated fields, optionally followed by further
# comma-separated values; everything after the second comma is kept as one
# string.
#
# Returns two array references that run in parallel, one entry per cluster
# in the order the clusters were first created:
#   - the gene names of each cluster, in the order they were first seen
#   - the value strings of every line that was assigned to that cluster
#
# Blank lines are ignored, and a line with fewer than two fields is skipped
# with a warning.
sub cluster_gene_pairs {
    my @records = @_;

    my @clusters;      # every cluster ever created, in creation order
    my %cluster_of;    # gene name => cluster (hash ref) it belongs to

    RECORD:
    for my $record (@records) {
        next RECORD if !defined $record;

        # Remove the line ending and trailing blanks before splitting, so a
        # line without a value list leaves no whitespace on the second gene.
        ( my $text = $record ) =~ s/\s+\z//;
        next RECORD if $text eq q{};

        my ( $first, $second, $values ) = split /,/, $text, 3;
        if ( !defined $second ) {
            warn "Skipping line without two genes: $text\n";
            next RECORD;
        }

        # Remember where both genes lived *before* this line is applied.
        my ( $home_first, $home_second ) = @cluster_of{ $first, $second };

        my $home;
        if (   $home_first
            && $home_second
            && $home_first->{id} != $home_second->{id} )
        {
            # The pair links two separate clusters: fold the one created
            # later into the one created earlier so that no gene ends up
            # listed in two clusters.
            my ( $keep, $drop ) =
                $home_first->{id} < $home_second->{id}
                ? ( $home_first,  $home_second )
                : ( $home_second, $home_first );

            push @{ $keep->{genes} },  @{ $drop->{genes} };
            push @{ $keep->{values} }, @{ $drop->{values} };
            $cluster_of{$_} = $keep for @{ $drop->{genes} };

            # An emptied cluster is filtered out of the result below.
            @{$drop}{qw(genes values)} = ( [], [] );
            $home = $keep;
        }
        else {
            $home = $home_first || $home_second;
        }

        if ( !$home ) {
            $home = { id => scalar @clusters, genes => [], values => [] };
            push @clusters, $home;
        }

        # Both checks use the state from before this line, so a line that
        # pairs a not-yet-seen gene with itself lists that gene twice.
        push @{ $home->{genes} }, $first  if !$home_first;
        push @{ $home->{genes} }, $second if !$home_second;
        @cluster_of{ $first, $second } = ( $home, $home );

        push @{ $home->{values} }, $values if defined $values;
    }

    my @live = grep { @{ $_->{genes} } } @clusters;
    return ( [ map { $_->{genes} } @live ], [ map { $_->{values} } @live ] );
}

# read in data
# Assumes 2 genes per line in the first and second position!
my ( $g_arr, $value_arr ) = cluster_gene_pairs(<DATA>);

# One output line per cluster: "gene,gene,... : values,values,..."
for my $cluster_idx ( 0 .. $#{$g_arr} ) {
    print join( ',', @{ $g_arr->[$cluster_idx] } ), ' : ',
        join( ',', @{ $value_arr->[$cluster_idx] } ), "\n";
}
__DATA__
Gene1,Gene2,spc1,spc2
Gene3,Gene1,spc1,spc2,spc4
Gene4,Gene1,spc1,spc2,spc5,spc3,spc1
Gene2,Gene3,spc1,spc2
Gene2,Gene4,spc2,spc3
Gene3,Gene4,spc1,spc2
GeneA,GeneB,spc4,spc5
GeneB,GeneC,spc1,spc2
GeneC,GeneD,spc1,spc2
GeneD,GeneE,spc4,spc2
GeneE,GeneF,spc3,spc1
GeneX,GeneY,spc6,spc8
GeneX,GeneP,spc6,spc7
GeneUnknown.,GeneUnknown.,spc1,spc2
Outputs
Gene1,Gene2,Gene3,Gene4 : spc1,spc2,spc1,spc2,spc4,spc1,spc2,spc5,spc3,spc1,spc1,spc2,spc2,spc3,spc1,spc2
GeneA,GeneB,GeneC,GeneD,GeneE,GeneF : spc4,spc5,spc1,spc2,spc1,spc2,spc4,spc2,spc3,spc1
GeneX,GeneY,GeneP : spc6,spc8,spc6,spc7
GeneUnknown.,GeneUnknown. : spc1,spc2
Update: Added comment