comment on

The code is tested with a toy example!

#!/usr/bin/perl

use strict;
use warnings;
use XML::LibXML;
use List::Util qw(sum);
use diagnostics;

my $RTE = shift;
my $file_in_es=shift;
my $file_in_en=shift;
my $out =shift;

my $no;
my %lemma_hypo=();
my %stem_hypo=();
my %token_hypo=();


#reading RTE corpus
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file( $RTE );

#create hash files from ngram statistics
my %hash_en=to_hash($file_in_en);
my %hash_es=to_hash($file_in_es);
#open to write the results!
open (OUTPUT, ">$out");




for my $n (1..800){

my $entailment = $doc->find( '//pair[@id = '.$n.']/@entailment' );

# READ LEMMA, TOKENS and STEMS of TEXT.
my $lemma_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at
+tribute[@name="lemma"]' );
my @lemma_text = &to_Array($lemma_text);
@lemma_text = remove_punc(@lemma_text);

my $token_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at
+tribute[@name="token"]' );
my @token_text = &to_Array($token_text);
@token_text = remove_punc(@token_text);

my $stem_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/att
+ribute[@name="stem"]' );
my @stem_text = &to_Array($stem_text);
@stem_text = remove_punc(@stem_text);

my $hypo = $doc->find( '//pair[@id = '.$n.']/h' );
my @hypo = to_Array($hypo);


my @MI = ();
my $MI = 0;
#FOR EACH HYPO
for my $x (0..$#hypo) {
    my @MI_x = ();
        
# READ LEMMA, TOKENS and STEMS of EACH HYPOTHESIS.

        my $no=$x+1;
    
        my $lemma_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation
+[@no = '.$no.']/word/attribute[@name="lemma"]' );
        @{$lemma_hypo{$x}} = &to_Array($lemma_hypo);
        @{$lemma_hypo{$x}} = remove_punc(@{$lemma_hypo{$x}});
        
        my $token_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation
+[@no = '.$no.']/word/attribute[@name="token"]' );
        @{$token_hypo{$x}} = &to_Array($token_hypo);
        @{$token_hypo{$x}} = remove_punc(@{$token_hypo{$x}});
               
        my $stem_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation[
+@no = '.$no.']/word/attribute[@name="stem"]' );
        @{$stem_hypo{$x}} = &to_Array($stem_hypo);
        @{$stem_hypo{$x}} = remove_punc(@{$stem_hypo{$x}});
    
    for my $i (0..$#{$token_hypo{$x}}) {
                my $current_token_hypo = lc($token_hypo{$x}[$i]);
                my $current_stem_hypo = lc($stem_hypo{$x}[$i]);
                my $current_lemma_hypo = lc($lemma_hypo{$x}[$i]);
        
        $MI_x[$i]=0;
        my $MI_token_hypo= my $MI_T = 0;
        
        if (exists $hash_es{$current_token_hypo}) {
            foreach $token_text (@token_text) {
                $token_text=lc($token_text);
                $MI_T=0;
                if (exists $hash_en{$token_text}) {
                    $MI_T=MI($current_token_hypo,$token_text,\%hash_es
+,\%hash_en);
                }
                $MI_token_hypo = $MI_token_hypo + $MI_T;
            }
        $MI_x[$i]=$MI_token_hypo/$#token_text;
        }
        elsif (exists $hash_es{$current_lemma_hypo}) {
            foreach $token_text (@token_text) {
                $MI_T=0;
                if (exists $hash_en{$token_text}) {
                    $MI_T=MI($current_lemma_hypo,$token_text,\%hash_es
+,\%hash_en);
                }
                $MI_token_hypo = $MI_token_hypo + $MI_T;
            }
        $MI_x[$i]=$MI_token_hypo/$#token_text;    
        }
        elsif (exists $hash_es{$current_stem_hypo}) {
            foreach $token_text (@token_text) {
                $MI_T=0;
                if (exists $hash_en{$token_text}) {
                    $MI_T=MI($current_stem_hypo,$token_text,\%hash_es,
+\%hash_en);
                }
                $MI_token_hypo = $MI_token_hypo + $MI_T;
            }
        $MI_x[$i]=$MI_token_hypo/$#token_text;    
        }
        
    }
    
    push @MI,mean(@MI_x);
    

    if ($x==0) {$MI=$MI_x[0];}
        elsif ($MI[$x] >= $MI[$x-1]) {$MI=$MI[$x];}
                        
}
 #$MI = sprintf("%.15f", $MI);
 $MI = $MI*1000000;
 $MI = sprintf("%.4f", $MI);
print OUTPUT "$n\t$entailment\t$MI\n";
}
close OUTPUT;


#=====================================================================
+===============================
# *****************      ALL FUNCTIONS AND SUBROUTINES ARE HERE  *****
+***************************** =
#=====================================================================
+===============================

sub mean {
    return sum(@_)/@_;
}


sub to_hash {
    my %hash;
my $file = shift;
open(FILE, "<$file");
foreach my $l (<FILE>) {
    my ($ngram,$line) = split /\t/, $l;
    push(@{ $hash{$ngram} }, $line);
    }
close FILE;
return %hash;
}


sub MI {
    my ($string_es,$string_en,$hash_es,$hash_en)=@_;
    my @array_es= my @array_en = my @intersection = ();
    @array_es = @{$hash_es{$string_es}};
    @array_en = @{$hash_en{$string_en}};
    my $prob_es = ($#array_es+1)/6939873;
    my $prob_en = ($#array_en+1)/6939873;
    @intersection= Intersection(@array_es,@array_en);
    my $prob_es_en= ($#intersection+1)/6939873;
    $prob_es_en = ($prob_es_en + ($prob_es*$prob_en*0.1))/1.1;
    my $mi= $prob_es_en* log($prob_es_en/($prob_es*$prob_en));
    return $mi;
}

sub Intersection {
    my (@array1,@array2)=@_;
    my @union = my @intersection = my @difference = ();
    my %count = ();
    foreach my $element (@array1, @array2) { $count{$element}++ }
    foreach my $element (keys %count) {
    push @union, $element;
    push @{ $count{$element} > 1 ? \@intersection : \@difference }, $e
+lement;
    }
    return @intersection;

}

sub to_Array {
    my $string = shift;
    my @array;
    if (my @arraynodes = $string->get_nodelist) {
    @array = map($_->string_value, @arraynodes);}
    return @array;
}

sub remove_punc {
    my @array = @_;
    my @filtered;
    for my $i (0..$#array){
            unless ($array[$i] =~ m/[[:punct:]]/ ){
            push @filtered,$array[$i];
        }
        
    }
    
    return @filtered; 
}
[download]

In reply to Re^10: statistics of a large text by perl_lover_always
in thread statistics of a large text by perl_lover_always

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


good chemistry is complicated, and a little bit messy -LW
	PerlMonks