#!/bin/sh -
# Convert every regular file below the current directory from ISO-8859-1
# to UTF-8 in place.  File names are handed over by find itself (no word
# splitting), and the original is only replaced when iconv succeeded.
find . -type f -exec sh -c '
    for f do
        if iconv -f ISO-8859-1 -t UTF-8 "$f" > "$f.tmp"; then
            mv "$f.tmp" "$f"
        else
            rm -f "$f.tmp"
        fi
    done
' sh {} +
####
#!/usr/bin/env perl
#
# unicount - count code points in input
# Tom Christiansen
#
# Reads each file named on the command line (or standard input when there
# are none), tallies every code point seen, and prints one line per
# distinct code point: count, U+XXXX, general category, character name.

use v5.12;
use strict;
use sigtrap;
use warnings;
use charnames ();

use Carp qw(carp croak confess cluck);
use List::Util qw(max);
use Unicode::UCD qw(charinfo charblock);

sub fix_extension;
sub process_input (&) ;
sub set_encoding (*$);
sub yuck ($) ;

my $total = 0;
my %seen  = ();

# deep magic here: the block runs once per file with $_ aliased to the
# whole decoded contents of that file
process_input {
    $total += length;
    $seen{$_}++ for split //;
};

my $dec_width = length($total);
# The leading 0 keeps max() from returning undef on empty input; the
# resulting warning would otherwise be made fatal by the __WARN__ handler.
my $hex_width = max(4, length(sprintf("%x", max(0, map { ord } keys %seen))));

for my $char (sort keys %seen) {
    my $code  = ord($char);
    my $count = $seen{$char};
    # charinfo() returns undef for unassigned code points and
    # non-characters; report those as general category Cn.
    my $info  = charinfo($code);
    my $gcat  = $info ? $info->{category} : "Cn";
    my $name  = charnames::viacode($code) || "";
    printf "%*d U+%0*X GC=%2s %s\n",
        $dec_width => $count,
        $hex_width => $code,
        $gcat      => $name;
}

exit;

##################################################

# yuck(MESSAGE)
# Print MESSAGE on STDERR prefixed with the program name, adding a
# trailing newline when MESSAGE does not already end in one.
sub yuck($) {
    my $errmsg = $_[0];
    $errmsg =~ s/(?<=[^\n])\z/\n/;
    print STDERR "$0: $errmsg";
}

# process_input BLOCK
# Run BLOCK once for every file in @ARGV ("-" / standard input when @ARGV
# is empty) with $_ aliased to the slurped contents of the file.  Files
# that cannot be opened, decoded, or closed are reported through yuck()
# and skipped.  Existing non-regular files are silently skipped.
sub process_input(&) {
    my $function = shift();

    if (@ARGV == 0 && -t STDIN && -t STDERR) {
        print STDERR "$0: reading from stdin, type ^D to end or ^C to kill.\n";
    }

    unshift(@ARGV, "-") if @ARGV == 0;

  FILE:
    for my $arg (@ARGV) {
        # work on a copy so renaming "-" below does not rewrite @ARGV
        my $file = $arg;

        next if -e $file && ! -f _;

        my $quasi_filename = fix_extension($file);

        $file = "standard input" if $file eq q(-);

        # don't let magic open make an output handle
        $quasi_filename =~ s/^(?=\s*[>|])/< /;

        no strict "refs";
        # The handle is a symbolic one named after the file.
        my $fh = $file;

        unless (open($fh, $quasi_filename)) {
            yuck("couldn't open $quasi_filename: $!");
            next FILE;
        }

        unless (set_encoding($fh, $file)) {
            close($fh);     # don't leave the handle open on failure
            next FILE;
        }

        my $whole_file = eval {
            # could just do this a line at a time, but not if counting \R's
            use warnings "FATAL" => "all";
            local $/;
            scalar <$fh>;
        };

        if ($@) {
            # rewrite the "at SCRIPT line N..." tail to name the input file
            $@ =~ s/ at \K.*? line \d+.*/$file line $./;
            yuck($@);
            close($fh);     # don't leave the handle open on failure
            next FILE;
        }

        do {
            # much faster to alias than to copy
            local *_ = \$whole_file;
            $function->();
        };

        unless (close $fh) {
            yuck("couldn't close $quasi_filename at line $.: $!");
            next FILE;
        }

    } # foreach file
}

# Encoding set to (after unzipping):
#   if file.pod => use whatever =encoding says
#   elsif file.ENCODING for legal encoding name -> use that one
#   elsif file is binary => use bytes
#   else => use utf8
#
# Note that gzipped stuff always shows up as bytes this way, but
# its internal unzipped bytes are still counted after unzipping
#
# set_encoding(HANDLE, PATH)
# Apply the layer chosen by the rules above to HANDLE with binmode.
# Returns 1 on success; on failure reports through yuck() and returns
# false.
sub set_encoding(*$) {
    my ($handle, $path) = @_;

    my $enc_name = (-f $path && -B $path) ? "bytes" : "utf8";

    if ($path && $path =~ m{ \. ([^\s.]+) \z }x) {
        my $ext = $1;
        die unless defined $ext;

        if ($ext eq "pod") {
            # Read the declaration in-process instead of interpolating
            # $path into a shell command line.
            my $int_enc = _pod_encoding($path);
            if ($int_enc) {
                $ext = $int_enc;
                ##print STDERR "$0: reset encoding to $ext on $path\n";
            }
        }

        require Encode;
        if (my $enc_obj = Encode::find_encoding($ext)) {
            my $name = $enc_obj->name || $ext;
            $enc_name = "encoding($name)";
        }
    }

    my $ok = eval {
        use warnings FATAL => "all";
        no strict "refs";
        ##print STDERR qq(binmode($handle, ":$enc_name")\n);
        binmode($handle, ":$enc_name") || die "binmode to $enc_name failed";
        1;
    };
    return 1 if $ok;

    # drop the "at FILE line N." tail and say which input was affected
    for ($@) {
        s/ at .* line \d+\.//;
        s/$/ for $path/;
    }

    yuck("set_encoding: $@");
    return;
}

# _pod_encoding(PATH)
# Return the second whitespace-separated word of the first paragraph in
# PATH that starts with "=encoding", or nothing when the file cannot be
# opened or has no such paragraph.
sub _pod_encoding {
    my ($path) = @_;

    open(my $pod, "<", $path) or return;
    local $/ = "";      # paragraph mode

    while (my $para = <$pod>) {
        next unless $para =~ /^=encoding/;
        my (undef, $name) = split " ", $para;
        return $name;
    }
    return;
}

# fix_extension(PATH)
# When PATH ends in a known compression suffix, return a
# "PROGRAM PATH |" string for magic open that decompresses it on the
# fly; otherwise return PATH unchanged.
sub fix_extension {
    my $path = shift();
    my %Compress = (
        Z       => "zcat",
        z       => "gzcat",    # for uncompressing
        gz      => "gzcat",
        bz      => "bzcat",
        bz2     => "bzcat",
        bzip    => "bzcat",
        bzip2   => "bzcat",
        lzma    => "lzcat",
    );

    if ($path =~ m{ \. ( [^.\s]+ ) \z }x) {
        if (my $prog = $Compress{$1}) {
            # HIP HIP HURRAY! for magic open!!!
            # The command goes through the shell, so single-quote any
            # path containing characters outside a conservative safe set.
            my $arg = $path;
            if ($arg =~ m{[^A-Za-z0-9_./+:\@%,-]}) {
                $arg =~ s/'/'\\''/g;
                $arg = "'$arg'";
            }
            return "$prog $arg |";
        }
    }

    return $path;
}

END {
    close(STDIN)  || die "couldn't close stdin: $!";
    close(STDOUT) || die "couldn't close stdout: $!";
}

UNITCHECK {
    $SIG{ PIPE }   = sub { exit };
    # any warning raised outside an eval is treated as fatal
    $SIG{__WARN__} = sub {
        confess "trapped uncaught warning" unless $^S;
    };
}
####
utf8 "\x99" does not map to Unicode at ./word_lets.cgi line 1