Hi dear monks
I have written some code and it works. Its purpose is to record which word occurred in which lines. I have a very large text (more than one gigabyte), and I want my program to be very efficient at creating the hash and storing it for later use.
Please tell me how I can make it more efficient and faster when dealing with large data. Also, is it possible to store the hash in a more efficient, less space-consuming way so that it can be retrieved faster?
Thanks
use strict;
use warnings;
use Encode;
use Storable;
use Lingua::StopWords qw( getStopWords );
my $stopwords = getStopWords('en');
# Build an inverted index that maps every n-gram returned by produce_ngrams()
# to the list of (1-based) line numbers it occurs on, then serialise the index
# to disk with Storable.
#
# Usage: script INPUT_FILE OUTPUT_FILE
my $file_in  = shift;
my $file_out = shift;
die "Usage: $0 INPUT_FILE OUTPUT_FILE\n"
    unless defined $file_in && defined $file_out;

# The decoding layer must be written as ':encoding(UTF-8)'; a bare '<utf8'
# is rejected by open() as an unknown mode.
open(my $in_fh, '<:encoding(UTF-8)', $file_in)
    or die "Cannot open '$file_in' for reading: $!\n";

my $line_count = 0;
my %hash;

# Read with while() so only one line is held at a time; for (<FH>) evaluates
# the readline in list context and loads the whole file before iterating.
while (my $line = <$in_fh>) {
    $line_count++;
    for my $ngram (produce_ngrams($line)) {
        push @{ $hash{$ngram} }, $line_count;
    }
}
close($in_fh) or die "Cannot close '$file_in': $!\n";

# store() returns undef on I/O problems, so check it explicitly.
store(\%hash, $file_out)
    or die "Cannot store index to '$file_out'\n";

# Read the index straight back from disk. Note that this holds a second full
# copy of the index in memory alongside %hash.
my %stat = %{ retrieve($file_out) };
#foreach my $n (keys %stat) { print "$n: @{$stat{$n}}\n"; }
# Return every word n-gram (n = 1 .. $max_n) found in one piece of text.
#
# Arguments:
#   $string - the text; POSIX punctuation characters are removed, the text is
#             lower-cased and then split on whitespace into words.
#   $max_n  - optional; length of the longest n-gram to produce. Defaults to 5.
#             Unigrams are always returned.
#
# Returns a flat list: all unigrams, then all bigrams, then all trigrams, and
# so on, each group in left-to-right order of appearance. Returns an empty
# list when $string is undefined or contains no words.
sub produce_ngrams {
    my ($string, $max_n) = @_;
    return unless defined $string;
    $max_n = 5 unless defined $max_n;

    $string =~ s/[[:punct:]]//g;
    my @unigrams = split ' ', lc $string;
    # Optional stop-word filter (disabled):
    #@unigrams = grep { !$stopwords->{$_} } @unigrams;

    my @ngrams = @unigrams;
    for my $n (2 .. $max_n) {
        # Index of the last word that can start an n-gram of this length;
        # negative when the text has fewer than $n words, which makes the
        # range below empty.
        my $last_start = $#unigrams - $n + 1;
        for my $i (0 .. $last_start) {
            push @ngrams, join ' ', @unigrams[ $i .. $i + $n - 1 ];
        }
    }
    return @ngrams;
}