while(my $uchar = $us->chop) {
my $utf8 = $uchar->utf8;
$rslt = (($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8) . $rslt;
}
####
while(my $uchar = $us->substr( $offset++, 1 )) {
my $utf8 = $uchar->utf8;
$rslt .= ($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8;
}
##
##
# This does what Unicode::Escape::escape does, except a lot less.
# It assumes the input is utf8 encoded.
# It assumes the input is defined.
# Invalid utf8 is deleted silently.
sub utf8_escape {
# Short circuit if it's all seven bit ASCII.
return $_[0] unless $_[0] =~ /[\x80-\xff]/;
my $s = shift;
$s =~ s{ ([\xc2-\xdf]) ([\x80-\xbf]) }{
'\\u' .
sprintf( '%04x',
( ( 0b00011111 & ord $1 ) << 6 )
| ( 0b00111111 & ord $2 )
)
}exmsg;
$s =~ s{ ([\xe0-\xef]) ([\x80-\xbf]) ([\x80-\xbf]) }{
'\\u' .
sprintf( '%04x',
( ( 0b00001111 & ord $1 ) << 12 )
| ( ( 0b00111111 & ord $2 ) << 6 )
| ( 0b00111111 & ord $3 )
)
}exmsg;
# valid utf8 that can't be encoded in \uXXXX
$s =~ s{ [\xf0-\xf4] [\x80-\xbf]{3} }{\\ufffd}xmsg;
# invalid utf8
$s =~ tr/\x80-\xff//d;
return $s;
}
##
##
use strict;
use warnings;
use Test::More;
use Unicode::Escape;
my @test_texts = (
{
text => "\x{e3}\x{81}\x{82}"
. "\x{e3}\x{81}\x{84}"
. "\x{e3}\x{81}\x{86}"
. "\x{e3}\x{81}\x{88}"
. "\x{e3}\x{81}\x{8a}",
name => 'utf8 test from Unicode::Escape-0.0.2',
},
{
text => q{},
name => 'empty string',
},
{
text => '0',
name => 'zero (false-looking)',
},
{
text => "\x{c2}\x{a2}"
. "\x{c2}\x{a3}"
. "\x{c2}\x{a4}",
name => 'some two-byte utf8',
},
# Unicode::Escape escapes this as '\udbea\udfcd' What's that?
# {
# text => "\x{f4}\x{8a}\x{af}\x{8d}",
# name => 'four-byte utf8',
# },
{
text => "one: X, two: \x{c2}\x{a5}, three: \x{e3}\x{81}\x{8a}",
name => 'mixed character length utf8',
},
);
plan 'tests' => scalar @test_texts;
foreach my $t ( @test_texts ) {
die 'bad test data' if grep { ! defined $t->{$_} } qw( text name );
my $text = $t->{text};
my $canonical = Unicode::Escape::escape( $text );
my $test = utf8_escape( $text );
# I use 'ok' with 'eq' instead of 'is' so that a failure doesn't
# puke a lot of unintelligible yuck.
ok( $canonical eq $test, "correct escaping for '$t->{name}'" );
}
##
##
use strict;
use warnings;
use Unicode::Escape;
use Benchmark qw( cmpthese timethese );
my $subs = {
'utf8esc' => sub { utf8_escape( $content ) },
'Uni::Esc' => sub { Unicode::Escape::escape( $content ) },
};
my @texts = (
[ 'abc123ABC456' x 10_000, 'no utf8' ],
[ "\x{e3}\x{81}\x{82}\x{e3}\x{81}\x{84}\x{e3}\x{81}\x{86}\x{e3}\x{81}\x{88}\x{e3}\x{81}\x{8a}" x 10_000, 'all 3-byte utf8' ],
);
foreach my $txt ( @texts ) {
$test_text = $txt->[0];
print "*** $txt->[1]\n";
cmpthese( -30, $subs );
}
##
##
sub utf8_escape_2 {
my $s = shift;
$s =~ s{
(
[\xc2-\xdf] [\x80-\xbf]
|
[\xe0-\xef] [\x80-\xbf]{2}
|
[\xf0-\xf4] [\x80-\xbf]{3}
)
}
{
my @c = map ord, split //, $1;
'\\u' .
(
( 2 == @c ) ?
sprintf( '%04x',
( ( 0b00011111 & $c[0]) << 6 )
| ( 0b00111111 & $c[1] )
)
: ( 3 == @c ) ?
sprintf( '%04x',
( ( 0b00001111 & $c[0] ) << 12 )
| ( ( 0b00111111 & $c[1] ) << 6 )
| ( 0b00111111 & $c[2] )
)
: 'fffd'
);
}xemsg;
$s =~ tr/\x80-\xff//d; # invalid utf8
return $s;
}