The code is tested with a toy example!
#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXML;
use List::Util qw(sum);
use diagnostics;
my $RTE = shift;
my $file_in_es=shift;
my $file_in_en=shift;
my $out =shift;
my $no;
my %lemma_hypo=();
my %stem_hypo=();
my %token_hypo=();
#reading RTE corpus
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file( $RTE );
#create hash files from ngram statistics
my %hash_en=to_hash($file_in_en);
my %hash_es=to_hash($file_in_es);
#open to write the results!
open (OUTPUT, ">$out");
for my $n (1..800){
my $entailment = $doc->find( '//pair[@id = '.$n.']/@entailment' );
# READ LEMMA, TOKENS and STEMS of TEXT.
my $lemma_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at
+tribute[@name="lemma"]' );
my @lemma_text = &to_Array($lemma_text);
@lemma_text = remove_punc(@lemma_text);
my $token_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at
+tribute[@name="token"]' );
my @token_text = &to_Array($token_text);
@token_text = remove_punc(@token_text);
my $stem_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/att
+ribute[@name="stem"]' );
my @stem_text = &to_Array($stem_text);
@stem_text = remove_punc(@stem_text);
my $hypo = $doc->find( '//pair[@id = '.$n.']/h' );
my @hypo = to_Array($hypo);
my @MI = ();
my $MI = 0;
#FOR EACH HYPO
for my $x (0..$#hypo) {
my @MI_x = ();
# READ LEMMA, TOKENS and STEMS of EACH HYPOTHESIS.
my $no=$x+1;
my $lemma_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation
+[@no = '.$no.']/word/attribute[@name="lemma"]' );
@{$lemma_hypo{$x}} = &to_Array($lemma_hypo);
@{$lemma_hypo{$x}} = remove_punc(@{$lemma_hypo{$x}});
my $token_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation
+[@no = '.$no.']/word/attribute[@name="token"]' );
@{$token_hypo{$x}} = &to_Array($token_hypo);
@{$token_hypo{$x}} = remove_punc(@{$token_hypo{$x}});
my $stem_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation[
+@no = '.$no.']/word/attribute[@name="stem"]' );
@{$stem_hypo{$x}} = &to_Array($stem_hypo);
@{$stem_hypo{$x}} = remove_punc(@{$stem_hypo{$x}});
for my $i (0..$#{$token_hypo{$x}}) {
my $current_token_hypo = lc($token_hypo{$x}[$i]);
my $current_stem_hypo = lc($stem_hypo{$x}[$i]);
my $current_lemma_hypo = lc($lemma_hypo{$x}[$i]);
$MI_x[$i]=0;
my $MI_token_hypo= my $MI_T = 0;
if (exists $hash_es{$current_token_hypo}) {
foreach $token_text (@token_text) {
$token_text=lc($token_text);
$MI_T=0;
if (exists $hash_en{$token_text}) {
$MI_T=MI($current_token_hypo,$token_text,\%hash_es
+,\%hash_en);
}
$MI_token_hypo = $MI_token_hypo + $MI_T;
}
$MI_x[$i]=$MI_token_hypo/$#token_text;
}
elsif (exists $hash_es{$current_lemma_hypo}) {
foreach $token_text (@token_text) {
$MI_T=0;
if (exists $hash_en{$token_text}) {
$MI_T=MI($current_lemma_hypo,$token_text,\%hash_es
+,\%hash_en);
}
$MI_token_hypo = $MI_token_hypo + $MI_T;
}
$MI_x[$i]=$MI_token_hypo/$#token_text;
}
elsif (exists $hash_es{$current_stem_hypo}) {
foreach $token_text (@token_text) {
$MI_T=0;
if (exists $hash_en{$token_text}) {
$MI_T=MI($current_stem_hypo,$token_text,\%hash_es,
+\%hash_en);
}
$MI_token_hypo = $MI_token_hypo + $MI_T;
}
$MI_x[$i]=$MI_token_hypo/$#token_text;
}
}
push @MI,mean(@MI_x);
if ($x==0) {$MI=$MI_x[0];}
elsif ($MI[$x] >= $MI[$x-1]) {$MI=$MI[$x];}
}
#$MI = sprintf("%.15f", $MI);
$MI = $MI*1000000;
$MI = sprintf("%.4f", $MI);
print OUTPUT "$n\t$entailment\t$MI\n";
}
close OUTPUT;
#=====================================================================
+===============================
# ***************** ALL FUNCTIONS AND SUBROUTINES ARE HERE *****
+***************************** =
#=====================================================================
+===============================
sub mean {
return sum(@_)/@_;
}
sub to_hash {
my %hash;
my $file = shift;
open(FILE, "<$file");
foreach my $l (<FILE>) {
my ($ngram,$line) = split /\t/, $l;
push(@{ $hash{$ngram} }, $line);
}
close FILE;
return %hash;
}
sub MI {
my ($string_es,$string_en,$hash_es,$hash_en)=@_;
my @array_es= my @array_en = my @intersection = ();
@array_es = @{$hash_es{$string_es}};
@array_en = @{$hash_en{$string_en}};
my $prob_es = ($#array_es+1)/6939873;
my $prob_en = ($#array_en+1)/6939873;
@intersection= Intersection(@array_es,@array_en);
my $prob_es_en= ($#intersection+1)/6939873;
$prob_es_en = ($prob_es_en + ($prob_es*$prob_en*0.1))/1.1;
my $mi= $prob_es_en* log($prob_es_en/($prob_es*$prob_en));
return $mi;
}
sub Intersection {
my (@array1,@array2)=@_;
my @union = my @intersection = my @difference = ();
my %count = ();
foreach my $element (@array1, @array2) { $count{$element}++ }
foreach my $element (keys %count) {
push @union, $element;
push @{ $count{$element} > 1 ? \@intersection : \@difference }, $e
+lement;
}
return @intersection;
}
sub to_Array {
my $string = shift;
my @array;
if (my @arraynodes = $string->get_nodelist) {
@array = map($_->string_value, @arraynodes);}
return @array;
}
sub remove_punc {
my @array = @_;
my @filtered;
for my $i (0..$#array){
unless ($array[$i] =~ m/[[:punct:]]/ ){
push @filtered,$array[$i];
}
}
return @filtered;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.