Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
The code is tested with a toy example!
#!/usr/bin/perl use strict; use warnings; use XML::LibXML; use List::Util qw(sum); use diagnostics; my $RTE = shift; my $file_in_es=shift; my $file_in_en=shift; my $out =shift; my $no; my %lemma_hypo=(); my %stem_hypo=(); my %token_hypo=(); #reading RTE corpus my $parser = XML::LibXML->new(); my $doc = $parser->parse_file( $RTE ); #create hash files from ngram statistics my %hash_en=to_hash($file_in_en); my %hash_es=to_hash($file_in_es); #open to write the results! open (OUTPUT, ">$out"); for my $n (1..800){ my $entailment = $doc->find( '//pair[@id = '.$n.']/@entailment' ); # READ LEMMA, TOKENS and STEMS of TEXT. my $lemma_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at +tribute[@name="lemma"]' ); my @lemma_text = &to_Array($lemma_text); @lemma_text = remove_punc(@lemma_text); my $token_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at +tribute[@name="token"]' ); my @token_text = &to_Array($token_text); @token_text = remove_punc(@token_text); my $stem_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/att +ribute[@name="stem"]' ); my @stem_text = &to_Array($stem_text); @stem_text = remove_punc(@stem_text); my $hypo = $doc->find( '//pair[@id = '.$n.']/h' ); my @hypo = to_Array($hypo); my @MI = (); my $MI = 0; #FOR EACH HYPO for my $x (0..$#hypo) { my @MI_x = (); # READ LEMMA, TOKENS and STEMS of EACH HYPOTHESIS. my $no=$x+1; my $lemma_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation +[@no = '.$no.']/word/attribute[@name="lemma"]' ); @{$lemma_hypo{$x}} = &to_Array($lemma_hypo); @{$lemma_hypo{$x}} = remove_punc(@{$lemma_hypo{$x}}); my $token_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation +[@no = '.$no.']/word/attribute[@name="token"]' ); @{$token_hypo{$x}} = &to_Array($token_hypo); @{$token_hypo{$x}} = remove_punc(@{$token_hypo{$x}}); my $stem_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation[ +@no = '.$no.']/word/attribute[@name="stem"]' ); @{$stem_hypo{$x}} = &to_Array($stem_hypo); @{$stem_hypo{$x}} = remove_punc(@{$stem_hypo{$x}}); for my $i (0..$#{$token_hypo{$x}}) { my $current_token_hypo = lc($token_hypo{$x}[$i]); my $current_stem_hypo = lc($stem_hypo{$x}[$i]); my $current_lemma_hypo = lc($lemma_hypo{$x}[$i]); $MI_x[$i]=0; my $MI_token_hypo= my $MI_T = 0; if (exists $hash_es{$current_token_hypo}) { foreach $token_text (@token_text) { $token_text=lc($token_text); $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_token_hypo,$token_text,\%hash_es +,\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } elsif (exists $hash_es{$current_lemma_hypo}) { foreach $token_text (@token_text) { $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_lemma_hypo,$token_text,\%hash_es +,\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } elsif (exists $hash_es{$current_stem_hypo}) { foreach $token_text (@token_text) { $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_stem_hypo,$token_text,\%hash_es, +\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } } push @MI,mean(@MI_x); if ($x==0) {$MI=$MI_x[0];} elsif ($MI[$x] >= $MI[$x-1]) {$MI=$MI[$x];} } #$MI = sprintf("%.15f", $MI); $MI = $MI*1000000; $MI = sprintf("%.4f", $MI); print OUTPUT "$n\t$entailment\t$MI\n"; } close OUTPUT; #===================================================================== +=============================== # ***************** ALL FUNCTIONS AND SUBROUTINES ARE HERE ***** +***************************** = #===================================================================== +=============================== sub mean { return sum(@_)/@_; } sub to_hash { my %hash; my $file = shift; open(FILE, "<$file"); foreach my $l (<FILE>) { my ($ngram,$line) = split /\t/, $l; push(@{ $hash{$ngram} }, $line); } close FILE; return %hash; } sub MI { my ($string_es,$string_en,$hash_es,$hash_en)=@_; my @array_es= my @array_en = my @intersection = (); @array_es = @{$hash_es{$string_es}}; @array_en = @{$hash_en{$string_en}}; my $prob_es = ($#array_es+1)/6939873; my $prob_en = ($#array_en+1)/6939873; @intersection= Intersection(@array_es,@array_en); my $prob_es_en= ($#intersection+1)/6939873; $prob_es_en = ($prob_es_en + ($prob_es*$prob_en*0.1))/1.1; my $mi= $prob_es_en* log($prob_es_en/($prob_es*$prob_en)); return $mi; } sub Intersection { my (@array1,@array2)=@_; my @union = my @intersection = my @difference = (); my %count = (); foreach my $element (@array1, @array2) { $count{$element}++ } foreach my $element (keys %count) { push @union, $element; push @{ $count{$element} > 1 ? \@intersection : \@difference }, $e +lement; } return @intersection; } sub to_Array { my $string = shift; my @array; if (my @arraynodes = $string->get_nodelist) { @array = map($_->string_value, @arraynodes);} return @array; } sub remove_punc { my @array = @_; my @filtered; for my $i (0..$#array){ unless ($array[$i] =~ m/[[:punct:]]/ ){ push @filtered,$array[$i]; } } return @filtered; }

In reply to Re^10: statistics of a large text by perl_lover_always
in thread statistics of a large text by perl_lover_always

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others browsing the Monastery: (5)
As of 2024-04-19 03:06 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found