#!/bin/sh -
# Re-encode every regular file below the current directory from ISO-8859-1
# to UTF-8, in place.
#
# find hands the names straight to the inner shell as arguments, so names
# containing blanks, globs or newlines are processed correctly.  Each
# conversion goes to a scratch file outside the tree first, and the original
# is overwritten only when iconv succeeded; a failed conversion leaves the
# original untouched.  Writing the result back with cat (instead of rm + mv)
# keeps the original file mode and does not add directory entries while find
# is still walking the tree.
find . -type f -exec sh -c '
    for f
    do
        tmp=$(mktemp "${TMPDIR:-/tmp}/to-utf8.XXXXXX") || exit 1
        if iconv -f ISO-8859-1 -t UTF-8 < "$f" > "$tmp"
        then
            cat "$tmp" > "$f" || echo "could not rewrite $f" >&2
        else
            echo "iconv failed on $f; file left unchanged" >&2
        fi
        rm -f "$tmp"
    done
' sh {} +
####
#!/usr/bin/env perl
#
# unicount - count code points in input
# Tom Christiansen
use v5.12;
use strict;
use sigtrap;
use warnings;
use charnames ();
use Carp qw(carp croak confess cluck);
use List::Util qw(max);
use Unicode::UCD qw(charinfo charblock);
sub fix_extension;
sub process_input (&) ;
sub set_encoding (*$);
sub yuck ($) ;
# Main program: tally every code point in the input, then print one line
# per distinct code point with its count, U+ number, general category and
# character name.
my $total = 0;      # number of code points read across all inputs
my %seen  = ();     # character => number of occurrences

# process_input runs the block once per input, with $_ aliased to that
# input's entire decoded contents.
process_input {
    $total += length;
    $seen{$_}++ for split //;
};

# No single count can be wider than the grand total.
my $dec_width = length($total);

# Seed the inner max() with 0: with empty input %seen has no keys, max()
# would return undef, and feeding undef to sprintf raises a warning that
# this program's __WARN__ handler turns into a fatal stack trace.
my $hex_width = max(4, length sprintf("%x", max(0, map { ord } keys %seen)));

for my $char (sort keys %seen) {
    my $code = ord $char;

    # charinfo() can return undef instead of a hash for code points that
    # are unassigned; report those as general category Cn rather than
    # dying on an undef dereference.
    my $info = charinfo($code);
    my $gcat = $info ? $info->{category} : "Cn";

    my $name = charnames::viacode($code) || "";

    printf "%*d U+%0*X GC=%2s %s\n",
        $dec_width => $seen{$char},
        $hex_width => $code,
        $gcat      => $name;
}
exit;
##################################################
# yuck MESSAGE
#
# Write MESSAGE to STDERR, prefixed with the program name ($0).  A newline
# is appended when MESSAGE is non-empty and does not already end in one.
# Unlike die, this only reports; the caller decides how to carry on.
# Returns whatever print returns.
sub yuck($) {
    my ($errmsg) = @_;

    # The lookbehind requires a preceding non-newline character, so an
    # empty message and an already-terminated message are left alone.
    $errmsg =~ s/(?<=[^\n])\z/\n/;

    return print STDERR "$0: $errmsg";
}
# process_input BLOCK
#
# Run BLOCK once for every input named in @ARGV, or once for standard
# input when @ARGV is empty.  For each input the file is opened (through
# fix_extension, so recognised compressed files are read through a
# decompressor pipe), given an input layer by set_encoding, slurped whole,
# and BLOCK is then called with $_ aliased to the complete contents.
#
# Problems with one input (cannot open, cannot set the encoding, decoding
# error while reading, failing close) are reported through yuck and that
# input is skipped; processing continues with the next one.
#
# Side effect: an @ARGV element equal to "-" is rewritten in place to
# "standard input", because the loop variable aliases @ARGV.
sub process_input(&) {
    my $function = shift();

    if (@ARGV == 0 && -t STDIN && -t STDERR) {
        print STDERR "$0: reading from stdin, type ^D to end or ^C to kill.\n";
    }
    unshift(@ARGV, "-") if @ARGV == 0;

  FILE:
    for my $file (@ARGV) {
        # don't let magic open make an output handle
        next if -e $file && ! -f _;

        my $quasi_filename = fix_extension($file);
        $file = "standard input" if $file eq q(-);

        # A name starting with ">" or "|" would make two-argument open
        # write or pipe out; forcing a leading "< " keeps it a plain read.
        $quasi_filename =~ s/^(?=\s*[>|])/< /;

        # The handle is deliberately a symbolic one whose name is the file
        # name, hence the relaxed strictures here.
        no strict "refs";
        my $fh = $file;
        unless (open($fh, $quasi_filename)) {
            yuck("couldn't open $quasi_filename: $!");
            next FILE;
        }

        unless (set_encoding($fh, $file)) {
            # set_encoding has already reported the problem; just make
            # sure the handle (and any decompressor child) is released.
            close($fh);
            next FILE;
        }

        my $whole_file = eval {
            # could just do this a line at a time, but not if counting \R's
            use warnings "FATAL" => "all";
            local $/;
            scalar <$fh>;
        };
        if ($@) {
            # Point the message at the input file, not at this script.
            $@ =~ s/ at \K.*? line \d+.*/$file line $./;
            yuck($@);
            # Release the handle on this error path as well.
            close($fh);
            next FILE;
        }

        do {
            # much faster to alias than to copy
            local *_ = \$whole_file;
            $function->();
        };

        unless (close $fh) {
            yuck("couldn't close $quasi_filename at line $.: $!");
            next FILE;
        }
    } # foreach file
}
# set_encoding HANDLE, PATH
#
# Put an input layer on HANDLE, chosen from PATH.
#
# Encoding set to (after unzipping):
#    if file.pod                                  => use whatever =encoding says
#    elsif file.ENCODING for legal encoding name  => use that one
#    elsif file is binary                         => use bytes
#    else                                         => use utf8
#
# Note that gzipped stuff always shows up as bytes this way, but
# its internal unzipped bytes are still counted after unzipping.
#
# Returns 1 on success.  On failure the problem is reported through yuck
# and a false value is returned.
sub set_encoding(*$) {
    my ($handle, $path) = @_;

    # Fallback when the extension does not name an encoding.
    my $enc_name = (-f $path && -B $path) ? "bytes" : "utf8";

    if ($path && $path =~ m{ \. ([^\s.]+) \z }x) {
        my $ext = $1;
        if ($ext eq "pod") {
            # A pod file states its own encoding; treat that name as if
            # it had been the extension.
            my $int_enc = _pod_declared_encoding($path);
            $ext = $int_enc if $int_enc;
        }
        require Encode;
        if (my $enc_obj = Encode::find_encoding($ext)) {
            my $name = $enc_obj->name || $ext;
            $enc_name = "encoding($name)";
        }
    }

    return 1 if eval {
        use warnings FATAL => "all";
        # HANDLE may be a symbolic (by-name) filehandle.
        no strict "refs";
        binmode($handle, ":$enc_name") || die "binmode to $enc_name failed";
        1;
    };

    # Drop the script location from the message and name the input instead.
    for ($@) {
        s/ at .* line \d+\.//;
        s/$/ for $path/;
    }
    yuck("set_encoding: $@");
    return;
}

# _pod_declared_encoding PATH
#
# Return the encoding name from the first paragraph of PATH that starts
# with "=encoding", or nothing when there is no such paragraph, it names
# no encoding, or PATH cannot be opened.  The file is read in-process as
# raw bytes, so PATH never passes through a shell and may contain blanks
# or shell metacharacters.
sub _pod_declared_encoding {
    my ($path) = @_;

    open(my $pod, "<:raw", $path) or return;
    local $/ = "";                      # paragraph mode

    my $declared;
    while (my $para = <$pod>) {
        next unless $para =~ /^=encoding/;
        # Second whitespace-separated field is the encoding name.
        (undef, $declared) = split " ", $para;
        last;
    }
    close($pod);

    return $declared;
}
# fix_extension PATH
#
# Return the string to hand to two-argument ("magic") open for PATH.
# When PATH ends in a recognised compression extension the result is a
# "PROGRAM PATH |" pipe-open string that reads the decompressed data;
# otherwise PATH is returned unchanged.
#
# The pipe string is run by the shell, so a PATH containing anything
# outside a conservative set of safe characters is single-quoted first.
# Plain names come back exactly as before, e.g. "gzcat foo.gz |".
sub fix_extension {
    my $path = shift();

    # extension => program that writes the decompressed data to stdout
    my %Compress = (
        Z     => "zcat",
        z     => "gzcat",   # for uncompressing
        gz    => "gzcat",
        bz    => "bzcat",
        bz2   => "bzcat",
        bzip  => "bzcat",
        bzip2 => "bzcat",
        lzma  => "lzcat",
    );

    if ($path =~ m{ \. ( [^.\s] +) \z }x) {
        if (my $prog = $Compress{$1}) {
            my $arg = $path;
            if ($arg !~ m{\A[A-Za-z0-9_./+:,\@%=-]+\z}) {
                # Close the quote, emit an escaped quote, reopen the quote.
                $arg =~ s/'/'\\''/g;
                $arg = "'$arg'";
            }
            # Trailing "|" makes magic open read from the command.
            return "$prog $arg |";
        }
    }
    return $path;
}
# At program exit, close the standard input and output handles explicitly
# so that a failure to close either one is reported rather than going
# unnoticed.
END {
    unless (close(STDIN)) {
        die "couldn't close stdin: $!";
    }
    unless (close(STDOUT)) {
        die "couldn't close stdout: $!";
    }
}
# Install the signal and warning handlers as soon as this compilation unit
# has been compiled, before any of the main program runs.
UNITCHECK {
    # Exit when SIGPIPE arrives.
    $SIG{PIPE} = sub { exit };

    # A warning raised outside of any eval is promoted to a fatal error
    # with a full stack trace; inside an eval ($^S true) the handler does
    # nothing.
    $SIG{__WARN__} = sub {
        return if $^S;
        confess "trapped uncaught warning";
    };
}
##
##
utf8 "\x99" does not map to Unicode at ./word_lets.cgi line 1