#!/bin/sh -
# Convert every regular file under the current directory from ISO-8859-1
# to UTF-8, in place.
#
# find passes the file names to the inner shell as arguments, so names
# containing whitespace or glob characters are handled intact.  A file is
# replaced only after iconv has succeeded for it; when iconv fails, the
# original is left untouched and the partial output is removed.
find . -type f -exec sh -c '
    status=0
    for f do
        if iconv -f ISO-8859-1 -t UTF-8 "$f" > "$f.tmp"; then
            mv "$f.tmp" "$f" || status=1
        else
            rm -f "$f.tmp"
            status=1
        fi
    done
    exit "$status"
' sh {} +
##

##

#!/usr/bin/env perl
#
# unicount - count code points in input
# Tom Christiansen 

use v5.12;
use strict;
use sigtrap;
use warnings;
use charnames ();

use Carp                qw(carp croak confess cluck);
use List::Util          qw(max);
use Unicode::UCD        qw(charinfo charblock);

# Forward declarations: the prototypes must be known before the calls below
# are compiled (process_input is invoked with a bare block).
sub fix_extension;
sub process_input   (&) ;
sub set_encoding    (*$);
sub yuck            ($) ;

# Running tallies, filled in by the callback handed to process_input.
my $total = 0;      # total length of everything read
my %seen  = ();     # character => number of occurrences

# process_input calls this block once per input, with $_ aliased to that
# input's entire contents.
process_input {
    $total += length;
    $seen{$_}++ for split //;
};

# Column widths for the report.  The leading 0 keeps max() defined when no
# characters were read at all; without it, empty input would hand undef to
# sprintf and raise an "uninitialized" warning.
my $dec_width = length($total);
my $highest   = max(0, map { ord($_) } keys %seen);
my $hex_width = max(4, length sprintf("%x", $highest));

# One report line per distinct character, in code point order.
for my $char (sort keys %seen) {
    my $code  = ord($char);
    my $count = $seen{$char};

    # charinfo() returns undef for unassigned code points and for
    # non-characters; report those as general category Cn instead of
    # dereferencing undef.
    my $info = charinfo($code);
    my $gcat = $info ? $info->{category} : "Cn";
    my $name = charnames::viacode($code) || "";

    printf "%*d U+%0*X GC=%2s %s\n",
            $dec_width, $count,
            $hex_width, $code,
            $gcat,      $name;
}

exit;

##################################################

# yuck MESSAGE
#
# Reports MESSAGE on STDERR, prefixed with the program name ($0).  A
# newline is appended when a non-empty MESSAGE does not already end in one.
sub yuck($) {
    my ($message) = @_;

    if (length($message) && substr($message, -1) ne "\n") {
        $message .= "\n";
    }

    print STDERR "$0: $message";
}

# process_input BLOCK
#
# Calls BLOCK once for each input named in @ARGV, or for standard input
# when @ARGV is empty, with $_ aliased to that input's entire contents as
# read through the layer chosen by set_encoding.
#
# Arguments that exist but are not plain files are skipped.  Names are run
# through fix_extension and then opened with two-argument ("magic") open,
# which is what lets a returned "command |" string be read as a pipe.
# Failures to open, decode, or close an input are reported with yuck and
# processing moves on to the next input.
sub process_input(&) {
    my $function = shift();

    if (@ARGV == 0 && -t STDIN && -t STDERR) {
        print STDERR "$0: reading from stdin, type ^D to end or ^C to kill.\n";
    }

    unshift(@ARGV, "-") if @ARGV == 0;

FILE:

    for my $arg (@ARGV) {
        # don't let magic open make an output handle
        next FILE if -e $arg && ! -f _;

        my $quasi_filename = fix_extension($arg);
        $quasi_filename =~ s/^(?=\s*[>|])/< /;

        # Human-readable name for messages.  This is a separate variable so
        # that the loop alias is not used to rewrite the caller's @ARGV.
        my $file = $arg eq q(-) ? "standard input" : $arg;

        no strict "refs";
        my $fh = $file;   # symbolic filehandle named after the input
        unless (open($fh, $quasi_filename)) {
            yuck("couldn't open $quasi_filename: $!");
            next FILE;
        }

        unless (set_encoding($fh, $file)) {
            # set_encoding has already reported the problem; release the
            # handle so failed inputs do not accumulate open descriptors.
            close($fh);
            next FILE;
        }

        my $whole_file = eval {
            # could just do this a line at a time, but not if counting \R's
            use warnings "FATAL" => "all";
            local $/;
            scalar <$fh>;
        };

        if ($@) {
            # $. must be read before the handle is closed.
            $@ =~ s/ at \K.*? line \d+.*/$file line $./;
            yuck($@);
            close($fh);
            next FILE;
        }

        {
            # much faster to alias than to copy
            local *_ = \$whole_file;
            $function->();
        }

        unless (close $fh) {
            yuck("couldn't close $quasi_filename at line $.: $!");
            next FILE;
        }

    } # foreach file

    return;
}

# Encoding set to (after unzipping):
#    if file.pod => use whatever =encoding says
#    elsif file.ENCODING for legal encoding name -> use that one
#    elsif file is binary => use bytes
#    else => use utf8
#
# Note that gzipped stuff always shows up as bytes this way, but
#   its internal unzipped bytes are still counted after unzipping
#
# set_encoding HANDLE, PATH
#
# Applies the layer chosen by the rules above to HANDLE with binmode.
# Returns 1 on success.  On failure the reason is reported through yuck
# and a false value is returned.
sub set_encoding(*$) {
    my ($handle, $path) = @_;

    my $enc_name = (-f $path && -B $path) ? "bytes" : "utf8";

    if ($path && $path =~ m{ \. ([^\s.]+) \z }x) {
        my $ext = $1;

        if ($ext eq "pod") {
            # Read the declaration in-process; PATH never reaches a shell.
            my $declared = _pod_declared_encoding($path);
            $ext = $declared if defined $declared && length $declared;
        }

        require Encode;
        if (my $enc_obj = Encode::find_encoding($ext)) {
            my $name = $enc_obj->name || $ext;
            $enc_name = "encoding($name)";
        }
    }

    return 1 if eval {
        use warnings FATAL => "all";
        no strict "refs";           # HANDLE may be a symbolic handle name
        binmode($handle, ":$enc_name") || die "binmode to $enc_name failed";
        1;
    };

    # Replace the "at FILE line N." location with the input's own name.
    my $why = $@;
    $why =~ s/ at .* line \d+\.//;
    $why =~ s/$/ for $path/;

    yuck("set_encoding: $why");

    return;
}

# _pod_declared_encoding PATH
#
# Returns the second whitespace-separated word of the first paragraph in
# the file at PATH that begins with "=encoding".  Returns nothing when the
# file cannot be opened or holds no such paragraph.
sub _pod_declared_encoding {
    my ($path) = @_;

    open(my $pod, "<:raw", $path) or return;

    local $/ = "";                  # paragraph mode
    while (my $paragraph = <$pod>) {
        next unless $paragraph =~ /^=encoding/;
        my (undef, $name) = split " ", $paragraph;
        close($pod);
        return $name;
    }

    close($pod);
    return;
}

# fix_extension PATH
#
# Returns a string suitable for two-argument ("magic") open.  When PATH
# ends in a recognized compression extension, the result is a
# "PROGRAM PATH |" pipe command that decompresses the file to its standard
# output; otherwise PATH is returned unchanged.
#
# In the pipe form, a PATH containing characters the shell would interpret
# is single-quoted, and a PATH starting with "-" is prefixed with "./" so
# that the decompressor does not take it for an option.  Ordinary names
# appear in the command exactly as given.
sub fix_extension {
    my $path = shift();

    # extension => program used for uncompressing
    state $decompressor_for = {
        Z       =>  "zcat",
        z       => "gzcat",
        gz      => "gzcat",
        bz      => "bzcat",
        bz2     => "bzcat",
        bzip    => "bzcat",
        bzip2   => "bzcat",
        lzma    => "lzcat",
    };

    my ($ext) = $path =~ m{ \. ( [^.\s] +) \z }x;
    return $path unless defined $ext;

    my $prog = $decompressor_for->{$ext};
    return $path unless $prog;

    my $arg = $path;
    $arg = "./$arg" if $arg =~ /\A-/;

    # Quote only when needed, so simple names stay readable in messages.
    if ($arg =~ m{[^A-Za-z0-9_./+:,=\@%-]}) {
        $arg =~ s/'/'\\''/g;
        $arg = "'$arg'";
    }

    return "$prog $arg |";
}

# At program exit, close the standard input and output streams explicitly
# and die with the system error if either close fails.
END {
    unless (close(STDIN)) {
        die "couldn't close stdin: $!";
    }
    unless (close(STDOUT)) {
        die "couldn't close stdout: $!";
    }
}

# Install the signal and warning handlers as soon as this compilation unit
# has been compiled.
UNITCHECK {
    # Exit when a SIGPIPE arrives.
    $SIG{PIPE} = sub { exit };

    # A warning raised outside any eval aborts with a full stack trace;
    # inside an eval ($^S true) this handler does nothing.
    $SIG{__WARN__} = sub {
        return if $^S;
        confess "trapped uncaught warning";
    };
}

##

##

utf8 "\x99" does not map to Unicode at ./word_lets.cgi line 1