#!/usr/bin/env perl
# Annotate a corpus with tags taken from a tab-separated tagset file.
#
# Inputs (in the current directory):
#   $corpusname.example.tagset.txt   one "text<TAB>tag" entry per line
#   $corpusname.txt                  the corpus to annotate
# Outputs:
#   $corpusname.possible-annotation_example.txt
#       the corpus, with every case-insensitive occurrence of a tagset text
#       followed by a space and its tag
#   $corpusname.tags-not-found_example.txt
#       "text<TAB>tag" for every tagset entry that never matched (the text is
#       written in its case-folded form)
#
# Any I/O failure is fatal (autodie).
use 5.016;      # enables strict, say, fc and unicode_strings
use warnings;
use autodie;

my $corpusname = 'GFSEBcorpus.zul_selected-sentences_original';

# Case-folded tagset text => tag.
my %words2ids;
{
    # Decode explicitly: fc and (?i:...) only fold case correctly on decoded
    # characters, not on raw UTF-8 bytes.
    open my $fh, '<:encoding(UTF-8)', "$corpusname.example.tagset.txt";
    while (my $line = <$fh>) {
        $line =~ s/\R\z//;              # strip LF and CRLF line endings alike
        next if $line eq '';            # blank lines carry no entry

        my ($text, $token) = split /\t/, $line;

        # An entry needs a non-empty text and a tag. An empty text would
        # become an empty regex alternative that matches at every position.
        if (!defined $text || $text eq '' || !defined $token) {
            warn "skipping malformed tagset line $.\n";
            next;
        }
        $words2ids{fc $text} = $token;
    }
    close $fh;
}

# Longest alternatives first so a longer text wins over any of its prefixes;
# the cmp tiebreak keeps the pattern independent of hash ordering.
# quotemeta makes every tagset text match literally.
my $alt = join '|',
    map  { quotemeta }
    sort { length($b) <=> length($a) or $a cmp $b }
    keys %words2ids;

# With an empty tagset use a pattern that can never match, so the corpus is
# copied through unchanged.
my $re = length $alt ? qr{(?i:($alt))} : qr{(?!)};

# Case-folded texts that occurred at least once in the corpus.
my %found;
{
    open my $in_fh,  '<:encoding(UTF-8)', "$corpusname.txt";
    open my $out_fh, '>:encoding(UTF-8)',
        "$corpusname.possible-annotation_example.txt";

    while (my $line = <$in_fh>) {
        $line =~ s{$re}{
            my $key = fc $1;
            $found{$key}++;
            "$1 $words2ids{$key}";
        }eg;
        print {$out_fh} $line;
    }

    close $in_fh;
    close $out_fh;      # explicit close so autodie reports buffered write errors
}

# Whatever is left in the tagset was never seen in the corpus.
delete @words2ids{ keys %found };
{
    open my $fh, '>:encoding(UTF-8)',
        "$corpusname.tags-not-found_example.txt";
    for my $text (sort keys %words2ids) {
        say {$fh} "$text\t$words2ids{$text}";
    }
    close $fh;
}