use strict;
use warnings;

# Per-gene tally built while reading: gene => site => patient => 1.
# Keying the innermost level on the patient ID means repeated rows for the
# same patient at the same site collapse into a single entry.
my %site_length_catch;

# Set of every site number seen for any gene (site => 1).
my %sites;

# Largest site number seen so far.
my $maxsite = 0;

<DATA>; # skip header

# Read one record at a time rather than slurping the whole section into a list.
while ( my $line = <DATA> ) {
    chomp $line;

    # split ' ' discards leading whitespace, so an indented row still lines up
    # with the right columns. The diagnosis (3rd) and protein length (5th)
    # columns are not used by this script.
    my ( $gene, $patient, undef, $mutation ) = split ' ', $line;

    # Blank or truncated rows have no mutation column; skip them instead of
    # storing an undefined site.
    next unless defined $mutation;

    # Rows whose mutation string carries no number cannot be placed on a site.
    my $sit = _site_of($mutation);
    next unless defined $sit;

    # store patient to avoid double counting
    $site_length_catch{$gene}{$sit}{$patient} = 1;

    # store all sites with mutations
    $sites{$sit} = 1;
    $maxsite = $sit if $sit > $maxsite;
}

# Extract the site number from a mutation string such as "L661I" or "p.E17K".
# Returns the first run of digits, or undef (empty list in list context) when
# the string contains no digit. Taking only the first run keeps a string with
# several numbers (e.g. "p.L52_R54del") from being glued into one bogus site.
sub _site_of {
    my ($mutation) = @_;
    return unless $mutation =~ /(\d+)/;
    return $1;
}

# Collapse each per-site patient set into the number of distinct patients.
# After this pass $site_length_catch{gene}{site} holds a plain count instead
# of a hash reference.
for my $gene_name ( keys %site_length_catch ) {
    my $by_site = $site_length_catch{$gene_name};
    for my $site ( keys %$by_site ) {
        # keys in scalar context yields the number of keys
        $by_site->{$site} = scalar keys %{ $by_site->{$site} };
    }
}

# Print a tab-separated table: one column per site, one row per gene.
# Uncomment one of the following two lines to choose the column layout.
my @sitesprinted = sort { $a <=> $b } keys %sites;   # sparse printing
#my @sitesprinted = 1..$maxsite;                      # full printing

# Header row first.
print join( "\t", 'Gene', map { "site $_" } @sitesprinted ), "\n";

# Now the data. Genes are sorted by name so the row order is the same on
# every run; plain hash order is not stable between runs.
for my $gene ( sort keys %site_length_catch ) {
    my $value_at = $site_length_catch{$gene};

    # Each cell is the value stored for this gene and site, or 0 when the
    # gene has no entry for that site.
    print join( "\t", $gene, map { $value_at->{$_} // 0 } @sitesprinted ), "\n";
}

__DATA__
Gene Name    Patient ID    Patient Diagnosis    Ammino Acid Mutation and Sit    Protein Length 
AAK1    19679    adenocarcinoma    L661I    21265
AAK1    19679    adenocarcinoma    L664T    21265
AAK1    19679    adenocarcinoma    L664T    21265
AAK1    19679    adenocarcinoma    L664T    21265
AAK1    19679    adenocarcinoma    L664T    21265
AAK1    19679    adenocarcinoma    L664T    21265
AAK1    19676    adenocarcinoma    L664T    21265
AAK1    19677    adenocarcinoma    L64F    21265
AAK1    19678    adenocarcinoma    L64R    21265
FKT1    101063    ER-PR-sitive_carcinoma    p.L52R    2773
FKT1    103872    ER-PR-sitive_carcinoma    p.E17K    2773
FKT1    107590    ER-PR-sitive_carcinoma    p.E17K    2773
FKT1    107600    ER-PR-sitive_carcinoma    p.E17K    2773
FKT1    1135911    NS    E17K    2773
TET3    152    chronic_lymocytic_leukaemia    p.R401H    10982
TET3    587220    adenocarcinoma    M935V    10982
TET3    587220    adenocarcinoma    R1534Q    10982
TET3    587256    adenocarcinoma    G1356R    10982
TET3    587338    adenocarcinoma    G1356W    10982