#!/usr/bin/perl
use strict;
use warnings;
use HTML::Entities;

# Word-length limits applied by get_word(): words shorter than minw or
# longer than maxw are not indexed.
my $config = {
    minw => 3,
    maxw => 20,
};

my $stop = get_stop();    # stop-word set (hashref, tested with exists)
my $punc = get_punc();    # punctuation-entity set (hashref, tested with exists)

my $text = q|
cat's dogs O’Reilly ad-hoc “broken” hyphen- the and
|;

# Index built by get_word(): word => arrayref of the file keys it was seen with.
my $words_all = {};

# contrived loop to show usage
for my $t ($text) {
    get_word($t);
}

# Hash key order is unspecified, so the words are printed in no particular order.
print "$_\n" for keys %{$words_all};

# get_word($text, $file_key)
#
# Splits $text into candidate words, cleans each one, and records every word
# that passes the filters in the file-level index $words_all by pushing
# $file_key onto @{ $words_all->{$word} }.
#
#   $text     - text to scan; may contain HTML entities.
#   $file_key - value stored for each accepted word. Optional: when omitted
#               (as in the usage loop above) undef is what gets pushed.
#
# The return value is not meaningful; the sub works by side effect on
# $words_all and reads $config, $stop and $punc.
sub get_word {
    my ($text, $file_key) = @_;
    my ($min, $max) = ($config->{minw}, $config->{maxw});

    # Nothing to index; also avoids "uninitialized" warnings below.
    return if !defined $text;

    for ($text) {
        # Normalise the curly apostrophe to a plain one so that words such as
        # O'Reilly stay in one piece. The entity spellings are handled too:
        # left alone they would decode to a non-word character and split the
        # word at the apostrophe.
        # NOTE(review): the literal character is matched exactly as it is
        # encoded in this source file (there is no `use utf8`) - confirm the
        # input uses the same encoding.
        s/’|&(?:rsquo|#8217|#[xX]2019);/'/g;

        # Entities listed in $punc become a space; all others are kept for
        # decode_entities() below.
        s/(&#?\w+;)/exists $punc->{$1} ? ' ' : $1/eg;
    }

    # Called in void context, so $text is decoded in place.
    decode_entities($text);

    # Everything except word characters, apostrophes and hyphens separates words.
    $text =~ s/[^\w'-]/ /g;

    for my $word (split ' ', $text) {
        # Strip the whole run of leading quotes/hyphens. The previous
        # s/^['-]//g removed a single character only, because /g cannot
        # repeat a match anchored at the start of the string.
        $word =~ s/\A['-]+//;

        # Strip a trailing quote/hyphen with an optional possessive "s"
        # (cat's -> cat, hyphen- -> hyphen), then any quotes/hyphens that
        # are still left at the end (foo-- -> foo).
        $word =~ s/['-]s?\z//;
        $word =~ s/['-]+\z//;

        next if $word eq '';
        next if length($word) < $min or length($word) > $max;

        # The stop list is lower case; compare case-insensitively so that a
        # sentence-initial "The" is dropped as well.
        next if exists $stop->{ lc $word };

        # Words containing digits are dropped, except four-digit tokens
        # starting with 1 or 2, optionally followed by "s" (1990, 1990s).
        next if $word =~ /\d/ and $word !~ /^[12]\d{3}s?$/;

        # Words with a double hyphen inside are dropped.
        next if $word =~ /--/;

        push @{ $words_all->{$word} }, $file_key;
    }

    return;
}

# get_stop()
#
# Returns a hashref whose keys are the (lower-case) stop words; only key
# existence is tested by get_word(). Values are empty strings. The previous
# qw( and '' ... ) form stored the literal two-character string '' instead,
# because qw// does not interpret quotes.
sub get_stop {
    # sample
    my @stop_words = qw( and any the they );
    return { map { $_ => '' } @stop_words };
}

# get_punc()
#
# Returns a hashref whose keys are looked up by get_word() with the text of
# each entity matched by /&#?\w+;/; only key existence is tested.
# NOTE(review): the sample keys below are literal quote characters, which can
# never equal an entity string such as "&#8220;", so as written none of them
# can match - confirm whether the keys were meant to be entity spellings.
sub get_punc {
    # sample
    return {
        '’' => undef,
        '‘' => undef,
        '”' => undef,
        '“' => undef,
    };
}
__DATA__
---------- Capture Output ----------
> "C:\Perl\bin\perl.exe" _new.pl
hyphen
cat
ad-hoc
O'Reilly
broken
dogs

> Terminated with exit code 0.