This is what I ended up with and am using (incorporating
fizbin's points).
I've changed the definition of a word.
- Hyphenated words are now separated and treated separately
- Accented words are now ascii-ised
The other difference is that HTML entities are decoded while extracting text from the HTML.
All the text (apart from curly punctuation) is Latin 1.
#!/usr/bin/perl
use strict;
use warnings;
use locale;
my $config = {
stop => 'data/stop.txt',
minw => 3,
maxw => 20,
};
my $words = q(
and cat's O'Rielly counter-productive crèche
2000 1980s 777777 x0000
repeat repeat repeat
);
my %keywords = get_word(lc $words);
for (keys %keywords){
print "$_\n";
}
sub get_word{
my ($text) = @_;
my ($min, $max) = ($config->{minw}, $config->{maxw});
my %stop = get_stop($config->{stop});
my %acc = get_accent();
my %pat = get_patterns();
my %keywords;
for ($text){
s/$pat{rsqu}/'/g;
s/$pat{hyph}/ /g;
s/(.)/$acc{$1}?$acc{$1}:$1/eg;
}
my @words = ($text =~ /$pat{word}/g);
my %seen;
for (@words){
s/$pat{posv}//;
s/$pat{apos}//g;
next if length() < $min or length() > $max;
next if /$pat{numb}/ and not /$pat{date}/;
s/$pat{tail}// if /$pat{numb}/;
next if exists $stop{$_};
unless ($seen{$_}){
$seen{$_}++;
$keywords{$_} = undef;
}
}
return %keywords;
}
sub get_patterns{
my %pat = (
posv => q/'s$/, # possessive s
apos => qr/'/, # apostrophe
hyph => qr/-/, # hyphen
numb => qr/\d/, # number
date => qr/^[12]\d{3}s?$/, # date like 1960, 1990s
rsqu => qr/\x{2019}/, # right single quote
tail => qr/s$/, # trailing s for stripping off numbers
word => qr/([\w']+)/, # word that may contain apostrophe
);
return %pat;
}
sub get_accent{
return qw(
À A Á A Â A Ã A Ä A Å A Æ AE
Ç C
È E É E Ê E Ë E
Ì I Í I Î I Ï I
Ð TH Ñ N
Ò O Ó O Ô O Õ O Ö O Ø O
Ù U Ú U Û U Ü U
Ý U Þ TH ß ss
à a á a â a ã a ä a å a æ ae
ç c
è e é e ê e ë e
ì i í i î i ï i
ð th ñ n
ò o ó o ô o õ o ö o ø o
ù u ú u û u ü u
ý y þ th ÿ y
);
}
sub get_stop {
# load stop word file
# sample
my %stop = (
and => undef,
the => undef,
any => undef,
);
return %stop;
}
output:
---------- Capture Output ----------
> "C:\Perl\bin\perl.exe" clean_words.pl
1980
orielly
productive
creche
cat
counter
2000
repeat
> Terminated with exit code 0.