TIMTOWTDI
#!/usr/bin/perl
use strict; # https://perlmonks.org/?node_id=11148202
use warnings;
use List::AllUtils qw( rev_nsort_by );
# Tag every known word in a corpus with its id(s).
#
# Input:
#   $wordfile    one "word<TAB>id" entry per line; a word may be listed
#                several times, in which case all of its ids are kept.
#   $corpusfile  free text, processed line by line.
#
# Output (STDOUT): the corpus with " id..." inserted directly after each
# occurrence of a listed word (matched case-insensitively, longest word
# first), followed by a "Not Found" section listing, sorted, the words that
# never occurred in the corpus.
my $corpusfile = '/tmp/d.11148202.corpus'; # FIXME filename
my $wordfile = '/tmp/d.11148202.words'; # FIXME filename

# word (lower-cased) => " id1 id2 ..." (every id is preceded by one space)
my %words2ids;
{
    local @ARGV = $wordfile;    # let <> read the word list
    while ( <> ) {
        my ($key, $value) = split /[\t\n]/;

        # A blank line would otherwise create the key '', i.e. an empty
        # alternative in the pattern that matches at every word boundary.
        next if !defined $key || $key eq '';

        if ( !defined $value ) {
            warn sprintf "%s line %d: no id for '%s', skipped\n",
                $wordfile, $., $key;
            next;
        }
        $words2ids{lc $key} .= " $value";
    }
}

# One alternation of all words, longest first, so that a multi-word entry
# such as "izinyo lomhlathi" wins over its prefix "izinyo".  Left undefined
# when the word list is empty: an empty alternation would match everywhere.
my $pat;
if ( %words2ids ) {
    my $alternatives = join '|',
        map { quotemeta($_) } rev_nsort_by { length } keys %words2ids;
    $pat = qr/($alternatives)/i;
}

my %found;    # lower-cased word => number of occurrences in the corpus
{
    local @ARGV = $corpusfile;  # let <> read the corpus
    while ( my $line = <> ) {
        # \b ... (?!\w) restricts matches to whole words; \K keeps the
        # matched word itself and inserts the ids right behind it.
        $line =~ s/\b$pat(?!\w)\K/ $found{lc $1}++; $words2ids{lc $1} /ge
            if defined $pat;
        print $line;
    }
}

delete @words2ids{ keys %found };   # what remains was never seen
print join "\n", '', "---------------- Not Found:", sort(keys %words2ids), '';
Outputs:
Lokho udebe <ZUL-SIL-0016-n> kukwenze isilomo.
Ukuzihlola izinyo <ZUL-SIL-0018-n> <ZUL-SIL-0018-n-other> kungahlenga izinyo lomhlathi <ZUL-SIL-0019-n> yakho.
Amakhala agxiza amafinyila.
Ulimi <ZUL-SIL-0017-n> amafutha ulimi <ZUL-SIL-0017-n> wonke ULIMI <ZUL-SIL-0017-n> amabheringi.
Sebenzisa amafutha ulimi <ZUL-SIL-0017-n>.
Zama ukugwema ukudla okuncinca udebe <ZUL-SIL-0016-n>.
---------------- Not Found:
ingemuva lomqala
umphimbo