#!/usr/bin/perl

=head1 NAME

grepp -- Perl version of gnu/unix "grep"

=head1 SYNOPSIS

  grepp [-chlnivz] [-d delim] [-r enc] {regex or -f rex.file} [file_name(s)]

=head1 DESCRIPTION

These days, with more text files containing complicated foreign
languages and structured markup, the old standard "grep" utility just
isn't good enough, and "grepp" (written in Perl, hence the extra "p"
in the name) provides the extra abilities that have become daily
needs.

At present, grepp replicates only the most commonly used features of
grep (in terms of the "standard" command line options available), but
to compensate, grepp provides some important things that would be
impossible with grep.  In particular:

Text files that contain non-ASCII data, ranging from "vanilla"
single-byte (ISO Latin1, CP1252) to variable-width "tutti-frutti"
(ShiftJIS, Big5, GB, KSC, Unicode...) are hard to probe reliably,
unless you can specify the characters or regular expressions you're
looking for in the native language of the text, and grepp lets you do
that.

For non-Unicode encodings, it may be easiest to store the patterns you
want to find in a text file, one pattern per line, and use the "-f
file.name" option, instead of trying to enter a foreign language
pattern on the command line.  However, if you have access to a
suitable terminal for using your foreign language of choice
(e.g. kterm, hanterm, cxterm, etc), you could switch input modes while
typing the grepp command line, in order to specify a search pattern in
Japanese, Korean, Chinese, etc, and grepp will do the right thing with
that.

Another alternative is to use Unicode 4-digit hexadecimal code points
(e.g. '\x{00f5}') in the regular expression to match characters
according to their Unicode definitions.  This is possible for any
language -- and any input character encoding -- because grepp converts
all text data (and search expressions as well) into Unicode
internally, in order to look for matches.

By using Unicode for matching, grepp avoids "false alarms" that are
possible when searching for a given multi-byte character; for example,
if you search GB text for a Chinese character whose two-byte sequence
is, say "\xA1\xB4", a typical "grep" search would find "hits" on lines
that contain GB character pairs like "\xC2\xA1\xB4\xD1".  Since grepp
converts all text data into Unicode, the true boundaries between
multi-byte characters are always respected and maintained.

Of course, grepp always writes its output using the same encoding as
the input; the text you get back is just like the text you put in
(minus the parts you do not want).

The use of Unicode also provides many handy "shortcuts" for finding
classes of characters that might otherwise be hard to enumerate, like
'\p{Punctuation}', '\p{Hebrew}', '\p{CurrencySymbol}', etc.  A
complete list of Unicode-based regex terms can be found in the
"perlunicode" man page.

Naturally, all of the traditional Perl regex shortcuts can be used as
well: '\s' (any white-space), '\d' (any ASCII digit), '\b' ("zero
width" word boundary condition), etc; the "perlre" man page provides a
complete explanation of these and many other useful extensions beyond
the scope of the standard "grep" program.  This makes grepp attractive
even for ASCII-only jobs.

Another novel feature is the ability to specify a "delimiter" string
of your own choosing, instead of the default line-feed "\n" character.
If this is 'undef', grepp will read the entire input text as a single
string, and will return it all if it happens to match the given search
condition(s).  You can also specify things like mark-up tags
(e.g. '</p>\n' or '<br>') or any specific character sequence
(e.g. '\t' or '/').  This allows a search pattern to extend across
line breaks, and allows the output to be multi-line or partial-line,
instead of just single, whole lines.  Note that the delimiter cannot
be treated as a regular expression; it must be a literal string.

And, as a convenience, grepp will handle compressed files or data
streams as the input "text"; if you give it a file name ending in
".gz", ".z" or ".Z", or if you use the "-z" option when piping
compressed data on stdin, grepp will automatically uncompress the data
as it reads.  Any matches will be printed as uncompressed text (in the
specified encoding).

=cut

require 5.008_000;

use strict;
use warnings;

use Getopt::Long;
use PerlIO::gzip;
use Encode;

# Program name without any leading directory, for the usage message.
( my $me = $0 ) =~ s{.*/}{};

my $Usage = "
$me [-chlnivz] [-d delim] [-r enc] {regex or -f rex.file} [file_name(s)]

  like 'grep', but handles perl regexes, wide characters, compressed text, ...

  -c -- print total number of matches (not the matching text)
  -h -- print lines of matching data only, not prefixed by file name
  -l -- print just the file names that contain matches (not the data)
  -n -- include line numbers when listing matches
        (note that these first four options are mutually exclusive)
  -i -- ignore case distinctions when matching
  -v -- invert search logic (print lines that do not match)
  -z -- treat input data stream(s) as compressed text (1)
  -f rex.file -- read search patterns from rex.file, one per line (2)
  -d delim    -- use 'delim' as input record separator instead of '\\n' (3)
  -r enc      -- regex(es) and input data are in character set 'enc' (4)

NOTES:
 (1) file names '*.Z', '*.z' and '*.gz' are treated as compressed by default
 (2) multiple regexes from a file are combined using 'or'; when used with
     '-v', we only list lines that match none of the regexes
 (3) delim may be 'undef' or include \\n,\\t and/or \\r, but is NOT a regex
 (4) use '-r help' for a list of supported character sets\n\n";

# A kluge to allow multiple single-letter flags to be combined as a single
# leading argument on the command line (e.g. "-nvi"), just like grep.
if ( @ARGV > 1 && $ARGV[0] =~ /^-[chilnvz]{2,}$/ ) {
    my @flags = map { "-$_" } split //, substr( $ARGV[0], 1 );
    splice( @ARGV, 0, 1, @flags );
}

my %opt;
GetOptions( \%opt, 'n', 'l', 'v', 'i', 'h', 'z', 'c', 'd=s', 'f=s', 'r=s' )
    or die $Usage;

# -n, -l, -h and -c are mutually exclusive, and we need something to search
# for (a regex argument or a -f file) unless the user only asked for the
# encoding list via "-r help".
my $n_exclusive = grep { $opt{$_} } qw(n l h c);
my $have_ptn_file = defined $opt{f} && length $opt{f};
my $wants_enc_help = defined $opt{r} && $opt{r} eq 'help';
die $Usage
    unless $n_exclusive < 2
    && ( @ARGV >= 1 || $have_ptn_file || $wants_enc_help );

# All character sets known to Encode; used by listEncodings().
my @enclist = Encode->encodings(":all");

# Resolve the requested encoding through Encode itself, so that aliases and
# case variants ("UTF-8", "latin1", "utf-16le", ...) are accepted as well as
# canonical names.  Anything Encode cannot resolve (including "help") shows
# the list of encodings and exits.
my $enc_name;
if ( defined $opt{r} && length $opt{r} ) {
    my $enc = find_encoding( $opt{r} );
    listEncodings() unless $enc;    # this will exit when done
    $enc_name = $enc->name;
}

# Collect the search pattern exactly as supplied (raw bytes).
my $raw_ptn;
if ($have_ptn_file) {
    open( my $ptn_fh, '<', $opt{f} )
        or die "Unable to open regex file $opt{f}: $!\n$Usage";
    my @ptns;
    while ( defined( my $line = <$ptn_fh> ) ) {
        $line =~ s/[\r\n]+\z//;    # strip the line terminator only
        push @ptns, $line;
    }
    close $ptn_fh;
    $raw_ptn = join( '|', @ptns );    # multiple patterns are OR-ed together
}
else {
    $raw_ptn = shift @ARGV;
}

# Matching is done on decoded (Unicode) text, so decode the pattern too.
my $ptn = defined $enc_name ? decode( $enc_name, $raw_ptn ) : $raw_ptn;
my $regex = $opt{i} ? qr/$ptn/i : qr/$ptn/;

# Input record separator.  UTF-16LE needs a two-byte separator so that the
# trailing NUL of the line feed stays with its own record; an explicit -d
# always takes precedence over that default.
my $is_utf16le = defined $enc_name && $enc_name eq 'UTF-16LE';
if ( defined $opt{d} && length $opt{d} ) {
    my %esc = ( n => "\n", r => "\r", t => "\t" );
    ( my $delim = $opt{d} ) =~ s/\\([nrt])/$esc{$1}/g;
    if ( $delim eq 'undef' ) {
        $/ = undef;    # slurp the whole input as one record
    }
    else {
        $/ = $is_utf16le ? encode( 'UTF-16LE', $delim ) : $delim;
    }
}
elsif ($is_utf16le) {
    $/ = "\x0a\x00";
}

my $exstat = 1;    # grep convention: 0 = something selected, 1 = nothing
$| = 1;            # turn off buffering for STDOUT

if ( @ARGV == 0 ) {
    if ( $opt{z} ) {
        # binmode takes a bare layer name; "<" is only valid in open() modes.
        binmode( STDIN, ':gzip' )
            or die "Unable to uncompress data on stdin: $!\n";
    }
    my $selected = scan_stream( \*STDIN, "$raw_ptn found on stdin", '' );
    print "$selected\n" if $opt{c};
    exit( $selected ? 0 : 1 );
}

# Prefix output with the file name only when several files are searched
# and -h was not given.
my $show_name = $opt{h} ? 0 : ( @ARGV > 1 );

for my $file (@ARGV) {
    my $mode = ( $file =~ /\.g?z$/i ) ? '<:gzip' : '<';
    my $in;
    unless ( open( $in, $mode, $file ) ) {
        warn "Unable to open input file $file: $!\n";
        next;
    }
    my $prefix = $show_name ? "$file:" : '';
    my $selected = scan_stream( $in, $file, $prefix );
    close $in;
    $exstat = 0 if $selected;
    print $prefix, "$selected\n" if $opt{c};
}
exit($exstat);

# scan_stream( $fh, $label, $prefix )
#   Reads records from $fh (split on the current $/), decodes each one when
#   an encoding was requested, and selects those that match the search regex
#   (or that do not match, under -v).
#     -l : prints "$label\n" on the first selected record and stops reading
#     -c : prints nothing here; the caller prints the returned count
#     otherwise : prints $prefix, the record number if -n, and the record
#                 exactly as read (original bytes, not the decoded copy)
#   Returns the number of records selected (at most 1 under -l).
sub scan_stream {
    my ( $fh, $label, $prefix ) = @_;
    my $selected = 0;
    while ( defined( my $record = <$fh> ) ) {
        my $text =
            defined $enc_name ? decode( $enc_name, $record ) : $record;
        my $matched = ( $text =~ $regex ) ? 1 : 0;
        next unless ( $opt{v} xor $matched );
        $selected++;
        if ( $opt{l} ) {
            print "$label\n";
            last;
        }
        next if $opt{c};
        print $prefix;
        print "$.:" if $opt{n};
        print $record;
    }
    return $selected;
}

# listEncodings()
#   The user asked for help (or named an unknown encoding): print the usage
#   message followed by every encoding in @enclist, laid out column-major in
#   as many columns as fit in 80 characters, then exit with status 0.
sub listEncodings {
    my $widest = 0;
    for my $name (@enclist) {
        $widest = length($name) if length($name) > $widest;
    }
    my $colwidth = $widest + 2;
    my $ncol = int( 80 / $colwidth ) || 1;    # always at least one column
    my $nrow = int( @enclist / $ncol );
    $nrow++ if @enclist % $ncol;

    print $Usage;
    for my $r ( 0 .. $nrow - 1 ) {
        my $line = '';
        for my $c ( 0 .. $ncol - 1 ) {
            my $i = $c * $nrow + $r;
            last if $i > $#enclist;    # last column may be short
            $line .= sprintf( "%-${colwidth}s", $enclist[$i] );
        }
        $line =~ s/\s+\z//;
        print "$line\n";
    }
    exit(0);
}