Beefy Boxes and Bandwidth Generously Provided by pair Networks
P is for Practical
 
PerlMonks  

Re^9: statistics of a large text

by BrowserUk (Pope)
on Feb 10, 2011 at 15:54 UTC ( #887484=note: print w/ replies, xml ) Need Help??


in reply to Re^8: statistics of a large text
in thread statistics of a large text

Do you think it has something to do with my code?

It is beginning to look that way. Can you post the latest version of your code?


Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.


Comment on Re^9: statistics of a large text
Re^10: statistics of a large text
by perl_lover_always (Acolyte) on Feb 10, 2011 at 16:10 UTC
    The code is tested with a toy example!
    #!/usr/bin/perl use strict; use warnings; use XML::LibXML; use List::Util qw(sum); use diagnostics; my $RTE = shift; my $file_in_es=shift; my $file_in_en=shift; my $out =shift; my $no; my %lemma_hypo=(); my %stem_hypo=(); my %token_hypo=(); #reading RTE corpus my $parser = XML::LibXML->new(); my $doc = $parser->parse_file( $RTE ); #create hash files from ngram statistics my %hash_en=to_hash($file_in_en); my %hash_es=to_hash($file_in_es); #open to write the results! open (OUTPUT, ">$out"); for my $n (1..800){ my $entailment = $doc->find( '//pair[@id = '.$n.']/@entailment' ); # READ LEMMA, TOKENS and STEMS of TEXT. my $lemma_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at +tribute[@name="lemma"]' ); my @lemma_text = &to_Array($lemma_text); @lemma_text = remove_punc(@lemma_text); my $token_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/at +tribute[@name="token"]' ); my @token_text = &to_Array($token_text); @token_text = remove_punc(@token_text); my $stem_text = $doc->find( '//pair[@id = '.$n.']/tAnnotation/word/att +ribute[@name="stem"]' ); my @stem_text = &to_Array($stem_text); @stem_text = remove_punc(@stem_text); my $hypo = $doc->find( '//pair[@id = '.$n.']/h' ); my @hypo = to_Array($hypo); my @MI = (); my $MI = 0; #FOR EACH HYPO for my $x (0..$#hypo) { my @MI_x = (); # READ LEMMA, TOKENS and STEMS of EACH HYPOTHESIS. my $no=$x+1; my $lemma_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation +[@no = '.$no.']/word/attribute[@name="lemma"]' ); @{$lemma_hypo{$x}} = &to_Array($lemma_hypo); @{$lemma_hypo{$x}} = remove_punc(@{$lemma_hypo{$x}}); my $token_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation +[@no = '.$no.']/word/attribute[@name="token"]' ); @{$token_hypo{$x}} = &to_Array($token_hypo); @{$token_hypo{$x}} = remove_punc(@{$token_hypo{$x}}); my $stem_hypo = $doc->find( '//pair[@id = '.$n.']/hAnnotation[ +@no = '.$no.']/word/attribute[@name="stem"]' ); @{$stem_hypo{$x}} = &to_Array($stem_hypo); @{$stem_hypo{$x}} = remove_punc(@{$stem_hypo{$x}}); for my $i (0..$#{$token_hypo{$x}}) { my $current_token_hypo = lc($token_hypo{$x}[$i]); my $current_stem_hypo = lc($stem_hypo{$x}[$i]); my $current_lemma_hypo = lc($lemma_hypo{$x}[$i]); $MI_x[$i]=0; my $MI_token_hypo= my $MI_T = 0; if (exists $hash_es{$current_token_hypo}) { foreach $token_text (@token_text) { $token_text=lc($token_text); $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_token_hypo,$token_text,\%hash_es +,\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } elsif (exists $hash_es{$current_lemma_hypo}) { foreach $token_text (@token_text) { $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_lemma_hypo,$token_text,\%hash_es +,\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } elsif (exists $hash_es{$current_stem_hypo}) { foreach $token_text (@token_text) { $MI_T=0; if (exists $hash_en{$token_text}) { $MI_T=MI($current_stem_hypo,$token_text,\%hash_es, +\%hash_en); } $MI_token_hypo = $MI_token_hypo + $MI_T; } $MI_x[$i]=$MI_token_hypo/$#token_text; } } push @MI,mean(@MI_x); if ($x==0) {$MI=$MI_x[0];} elsif ($MI[$x] >= $MI[$x-1]) {$MI=$MI[$x];} } #$MI = sprintf("%.15f", $MI); $MI = $MI*1000000; $MI = sprintf("%.4f", $MI); print OUTPUT "$n\t$entailment\t$MI\n"; } close OUTPUT; #===================================================================== +=============================== # ***************** ALL FUNCTIONS AND SUBROUTINES ARE HERE ***** +***************************** = #===================================================================== +=============================== sub mean { return sum(@_)/@_; } sub to_hash { my %hash; my $file = shift; open(FILE, "<$file"); foreach my $l (<FILE>) { my ($ngram,$line) = split /\t/, $l; push(@{ $hash{$ngram} }, $line); } close FILE; return %hash; } sub MI { my ($string_es,$string_en,$hash_es,$hash_en)=@_; my @array_es= my @array_en = my @intersection = (); @array_es = @{$hash_es{$string_es}}; @array_en = @{$hash_en{$string_en}}; my $prob_es = ($#array_es+1)/6939873; my $prob_en = ($#array_en+1)/6939873; @intersection= Intersection(@array_es,@array_en); my $prob_es_en= ($#intersection+1)/6939873; $prob_es_en = ($prob_es_en + ($prob_es*$prob_en*0.1))/1.1; my $mi= $prob_es_en* log($prob_es_en/($prob_es*$prob_en)); return $mi; } sub Intersection { my (@array1,@array2)=@_; my @union = my @intersection = my @difference = (); my %count = (); foreach my $element (@array1, @array2) { $count{$element}++ } foreach my $element (keys %count) { push @union, $element; push @{ $count{$element} > 1 ? \@intersection : \@difference }, $e +lement; } return @intersection; } sub to_Array { my $string = shift; my @array; if (my @arraynodes = $string->get_nodelist) { @array = map($_->string_value, @arraynodes);} return @array; } sub remove_punc { my @array = @_; my @filtered; for my $i (0..$#array){ unless ($array[$i] =~ m/[[:punct:]]/ ){ push @filtered,$array[$i]; } } return @filtered; }

      Hm. Your doing rather more than just creating two big hashes aren't you.

      There are also some pretty iffy programming practices in your code that mean you're using far more memory than you need to.

      For example, in the routine Intersection(), you're build two arrays, @intersection and @difference. But you never use the latter, so why construct it?

      And then, when you return @intersection to the caller, you return as a list assigning it to an array in the caller. It's not possible to tell by inspection how big that array is, but by returning this way, you are consuming at 3 times as much memory as is necessary. The memory for the array inside the subroutine; then as much again (and more) to turn that into a list on the stack; then finally you treble it when you assign it to the final array. If you returned a reference to the array, you would avoid all that duplication for what is a very minor change in syntax when you use it.

      Another problem with Intersection(), is the way you pass data into it. You pass it two arrays, and assign to two arrays inside, but all the data from both arrays will end up in the first of those. Ie:

      sub x{ my( @a, @b ) = @_; print "a:[@a]\nb:[@b]"; }; @x = 1..10; @y = 'a'..'f';; print "x:[@x]\ny:[@y]";; x:[1 2 3 4 5 6 7 8 9 10] y:[a b c d e f] x( @x, @y );; a:[1 2 3 4 5 6 7 8 9 10 a b c d e f] b:[]

      This is especially wasteful as all you do with the returned array is use its size! Even more wasteful when you realise that the two arrays you pass into Intersection() are derived from the keys in two hashes.

      All you really want from all that expensive (time & space) processing is a count of the keys that are common to both hashes. And that can be done far more economically and simply. This refactoring of MI() will save time and space and allow you to discard Intersection() completely:

      sub MI { my( $string_es, $string_en, $hash_es, $hash_en ) = @_; my $prob_es = ( keys %$hash_es ) / 6939873; my $prob_en = ( keys %$hash_en ) / 6939873; my $common = 0; exists $hash_en->{ $_ } and ++$common for keys %$hash_es; my $prob_es_en= ( $common ) / 6939873; $prob_es_en = ( $prob_es_en + ( $prob_es * $prob_en * 0.1) ) / 1.1 +; my $mi = $prob_es_en * log( $prob_es_en / ( $prob_es * $prob_en ) +); return $mi; }

      And don't you find your code more readable with the extra horizontal white-space?

      There are similar problems with to_hash().

      You build a huge hash inside a subroutine and then return it to the caller by flattening it to a (huge)list where you then have to re-build the huge hash again. Not only is this very expensive in terms of cpu, it requires at least 3 times as much memory as necessary to hold the hash.

      The simplest way to avoid that with minimal changes to your existing code is to pass a reference into your subroutine and have it populate it:

      #create hash files from ngram statistics my %hash_en; to_hash( \%hash_en, $file_in_en ); my %hash_es; to_hash( \%hash_es, $file_in_es ); ... sub to_hash { my( $href, $file ) = @_; open(FILE, "<$file"); foreach my $l (<FILE>) { my( $ngram, $line ) = split /\t/, $l; push @{ $href->{ $ngram } }, $line; } close FILE; }

      I suspect that if you made those changes to your subroutines, you might find that it would run without blowing your memory.

      If not, then you are processing what I suspect is a very large XML document using a parser that stores everything in memory. I've no personal experience of LibXML, but they are notorious for consuming prodigious amounts of memory. Switching to the excellent XML::Twig which is specifically designed for handling huge XML structures in a modest amount of memory might help.


      Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
      "Science is about questioning the status quo. Questioning authority".
      In the absence of evidence, opinion is indistinguishable from prejudice.
        Thanks a lot! very good points! I agree for the "sub to_hash" but for the "sub MI", I dont want the count of keys for hashes. instead of
        my $prob_es = ( keys %$hash_es ) / 6939873; my $prob_en = ( keys %$hash_en ) / 6939873;
        I want :
        my $prob_es = ( $#{$hash_es{$string_es}} + 1) / 6939873; my $prob_en = ( $#{$hash_es{$string_en}} + 1 ) / 6939873;
        which is the number of elements of the value array of each key!

        The same goes for MI, I need the intersection between the array of both value arrays for the keys to thw two hashes.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://887484]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others making s'mores by the fire in the courtyard of the Monastery: (9)
As of 2014-12-25 11:33 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (160 votes), past polls