#!/usr/bin/perl =head1 NAME stopword-filter =head1 SYNOPSIS stopword-filter [-e encoding] stop.list text.file stopword-filter -t # (runs a simple test on internal utf8 data) =head1 DESCRIPTION The stop.list file should contain a set of white-space-separated words that should be removed from the text file. The remaining words in the text file (after splitting on non-letter/non-mark characters and removing stop words) will be printed to STDOUT, one word per line. The two files need to have the same character encoding, and STDOUT will be in that same encoding. The default encoding is utf8. =cut use strict; use warnings; use Getopt::Std; my %opt; my $Usage = "Usage: $0 -t # (to test)\n or: $0 [-e enc] stop.list text.file\n"; getopts( 'e:t', \%opt ) and ( $opt{t} || @ARGV == 2 ) or die $Usage; my ( $stoptext, $textdata ); my $enc = $opt{e} || 'utf8'; binmode STDOUT, ":encoding($enc)"; if ( $opt{t} ) { local $/ = ""; # empty string = "paragraph mode" for reading binmode DATA, ":encoding($enc)"; $stoptext = ; $textdata = ; if ( $stoptext =~ /\&#\d+;/ ) { # posting code on PM does this to data s/\&#(\d+);/chr($1)/eg for ( $stoptext, $textdata ); } # so turn numeric character entities back into utf8 characters } else { local $/; # undef = "slurp mode" for reading open( STOP, "<:encoding($enc)", $ARGV[0] ) or die "open failed for stoplist $ARGV[0]: $!\n"; $stoptext = ; close STOP; open( TEXT, "<:encoding($enc)", $ARGV[1] ) or die "open failed for textdata $ARGV[1]: $!\n"; $textdata = ; close TEXT; } my %stopword = map { $_ => undef } ( split ' ', $stoptext ); for my $word ( split /[^\pL\pM]+/, $textdata ) { next if ( exists( $stopword{$word} )); print "$word\n"; } __DATA__ فُو بَر بَز فَلُزِن برلكو، فُو تِدِّلِي بَر. سُكُون بَز مَلرِي؟ فُو! بَر، نَد بَز مِس.