#!/usr/bin/perl

=head1 NAME

stopword-filter

=head1 SYNOPSIS

 stopword-filter [-e encoding] stop.list  text.file

 stopword-filter -t  # (runs a simple test on internal utf8 data)

=head1 DESCRIPTION

The stop.list file should contain a set of white-space-separated words
that should be removed from the text file.  The remaining words in the
text file (after splitting on non-letter/non-mark characters and removing
stop words) will be printed to STDOUT, one word per line.

The two files need to have the same character encoding, and STDOUT
will be in that same encoding.  The default encoding is utf8.

=cut

use strict;
use warnings;
use Getopt::Std;

my %opt;
my $Usage = "Usage: $0 -t  # (to test)\n   or: $0 [-e enc] stop.list text.file\n";

# Either the self-test flag (-t) or exactly two file arguments is required.
getopts( 'e:t', \%opt ) and ( $opt{t} || @ARGV == 2 ) or die $Usage;

my ( $stoptext, $textdata );
my $enc = $opt{e} || 'utf8';
binmode STDOUT, ":encoding($enc)";

if ( $opt{t} ) {
    # Self-test: the stop list and the text are the first two paragraphs
    # of the __DATA__ section.
    local $/ = "";  # empty string = "paragraph mode" for reading
    binmode DATA, ":encoding($enc)";
    $stoptext = <DATA>;
    $textdata = <DATA>;

    if ( $stoptext =~ /\&#\d+;/ ) {  # posting code on PM does this to data
        s/\&#(\d+);/chr($1)/eg for ( $stoptext, $textdata );
    }        # so turn numeric character entities back into utf8 characters
}
else {
    $stoptext = slurp_file( $ARGV[0], 'stoplist', $enc );
    $textdata = slurp_file( $ARGV[1], 'textdata', $enc );
}

# Read a whole file and return its decoded content as one string.
#   $path     - name of the file to read
#   $label    - short description of the file, used in the error message
#   $encoding - character encoding used to decode the file
# Dies with "open failed for <label> <path>: <reason>" if the file cannot
# be opened.  A lexical filehandle is used so that no package-global
# bareword handle is left behind.
sub slurp_file {
    my ( $path, $label, $encoding ) = @_;

    local $/;  # undef = "slurp mode" for reading
    open( my $fh, "<:encoding($encoding)", $path )
         or die "open failed for $label $path: $!\n";
    my $content = <$fh>;
    close $fh;

    return $content;
}

# Return, in their original order, the words of $text that do not appear
# in $stop_list.
#   $stop_list - string of white-space-separated stop words
#   $text      - text to filter; a word is a maximal run of Unicode
#                letter (\pL) and mark (\pM) characters
# Words are collected with a global match instead of split(): split()
# produces an empty leading field whenever the text starts with a
# separator (white space, punctuation, a digit), and that empty field
# would be emitted as a spurious blank line.
sub remaining_words {
    my ( $stop_list, $text ) = @_;

    # Only key existence matters, so a hash slice with no values suffices.
    my %is_stopword;
    @is_stopword{ split ' ', $stop_list } = ();

    return grep { !exists $is_stopword{$_} } ( $text =~ /[\pL\pM]+/g );
}

# One surviving word per output line.
print "$_\n" for remaining_words( $stoptext, $textdata );

__DATA__
&#1601;&#1615;&#1608;
&#1576;&#1614;&#1585;
&#1576;&#1614;&#1586;

&#1601;&#1614;&#1604;&#1615;&#1586;&#1616;&#1606; &#1576;&#1585;&#1604;&#1603;&#1608;&#1548; &#1601;&#1615;&#1608; &#1578;&#1616;&#1583;&#1616;&#1617;&#1604;&#1616;&#1610; &#1576;&#1614;&#1585;. &#1587;&#1615;&#1603;&#1615;&#1608;&#1606;
 &#1576;&#1614;&#1586; &#1605;&#1614;&#1604;&#1585;&#1616;&#1610;&#1567; &#1601;&#1615;&#1608;! &#1576;&#1614;&#1585;&#1548; &#1606;&#1614;&#1583; &#1576;&#1614;&#1586; &#1605;&#1616;&#1587;.