Hi Readers and helpers,
I am trying to count the occurrences of the various elements stored under the keys of a hash. The same key occurs more than once in the input, so I need to group the entries by key and get the number of occurrences of each of their values.
use XML::TreeBuilder;
# tokenizefast($line)
#
# Split a line of text on whitespace and return the words followed by every
# bigram and then every trigram of adjacent words, each n-gram joined with a
# single space:
#
#   tokenizefast("a b c")  ->  ("a", "b", "c", "a b", "b c", "a b c")
#
# Leading and trailing whitespace is ignored.  An undefined, empty or
# all-whitespace line yields an empty list.  In scalar context the number of
# tokens is returned.
#
# All working variables are lexical, so calls no longer overwrite package
# globals, and nothing is printed: the function only computes its result.
# The ($) prototype is kept so existing call sites parse exactly as before.
sub tokenizefast ($) {
    my ($line) = @_;
    $line = '' unless defined $line;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;

    # Splitting into words first keeps n-grams aligned on word boundaries.
    my @words         = split /\s+/, $line;
    my @ngrams        = @words;            # unigrams come first
    my @ngram_lengths = (2, 3);

    foreach my $length (@ngram_lengths) {
        # The range is empty when there are fewer words than $length.
        for my $start (0 .. @words - $length) {
            push @ngrams, join(' ', @words[ $start .. $start + $length - 1 ]);
        }
    }
    return @ngrams;
}
## end of tokenize function
# Parse the corpus and group the n-grams of every utterance by the value of
# the utterance's 'da' attribute.  %textHash maps each 'da' value to an array
# reference holding all tokens (words, bigrams, trigrams) collected for it.
my $file = 'swbd_50k_42tags.xml';
my $tree = XML::TreeBuilder->new();
my %textHash;
$tree->parse_file($file);
foreach my $dialogue ($tree->find_by_tag_name('dialogue')) {
    # Walk the dialogue's <utt> elements directly, so each utterance is
    # processed exactly once.  The previous code searched the whole dialogue
    # for <utt> once per <turn>, repeating every utterance for every turn.
    # The per-dialogue/per-turn attributes (name, no, speaker, id) were read
    # but never used, so they are no longer fetched.
    foreach my $utt ($dialogue->find_by_tag_name('utt')) {
        my $da        = $utt->attr_get_i('da');
        my @textarray = tokenizefast($utt->as_text);

        # Push onto the stored array itself (autovivified on first use).
        # Pushing onto a copy, as before, silently discarded every
        # utterance after the first one seen for a given 'da' value.
        push @{ $textHash{$da} }, @textarray;
    }
}
# The tokens have been copied out, so the element tree can be released.
$tree->delete;
# extract different elements of xml doc
# print the final grouping (to the default output handle, not to a file)
# Emit one line per key of %textHash: the key followed by its collected
# tokens, comma-separated inside braces.  Key order is whatever keys() yields.
foreach my $key (keys %textHash) {
    my $tokens = $textHash{$key};
    printf " the key: %s text {%s}\n", $key, join(", ", @{$tokens});
}
20050404 Edit by ysth: code tags
Retitled by BazB from 'Cry for Help'.