while(my $uchar = $us->chop) { my $utf8 = $uchar->utf8; $rslt = (($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8) . $rslt; } #### while(my $uchar = $us->substr( $offset++, 1 )) { my $utf8 = $uchar->utf8; $rslt .= ($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8; } #### # This does what Unicode::Escape::escape does, except a lot less. # It assumes the input is utf8 encoded. # It assumes the input is defined. # Invalid utf8 is deleted silently. sub utf8_escape { # Short circuit if it's all seven bit ASCII. return $_[0] unless $_[0] =~ /[\x80-\xff]/; my $s = shift; $s =~ s{ ([\xc2-\xdf]) ([\x80-\xbf]) }{ '\\u' . sprintf( '%04x', ( ( 0b00011111 & ord $1 ) << 6 ) | ( 0b00111111 & ord $2 ) ) }exmsg; $s =~ s{ ([\xe0-\xef]) ([\x80-\xbf]) ([\x80-\xbf]) }{ '\\u' . sprintf( '%04x', ( ( 0b00001111 & ord $1 ) << 12 ) | ( ( 0b00111111 & ord $2 ) << 6 ) | ( 0b00111111 & ord $3 ) ) }exmsg; # valid utf8 that can't be encoded in \uXXXX $s =~ s{ [\xf0-\xf4] [\x80-\xbf]{3} }{\\ufffd}xmsg; # invalid utf8 $s =~ tr/\x80-\xff//d; return $s; } #### use strict; use warnings; use Test::More; use Unicode::Escape; my @test_texts = ( { text => "\x{e3}\x{81}\x{82}" . "\x{e3}\x{81}\x{84}" . "\x{e3}\x{81}\x{86}" . "\x{e3}\x{81}\x{88}" . "\x{e3}\x{81}\x{8a}", name => 'utf8 test from Unicode::Escape-0.0.2', }, { text => q{}, name => 'empty string', }, { text => '0', name => 'zero (false-looking)', }, { text => "\x{c2}\x{a2}" . "\x{c2}\x{a3}" . "\x{c2}\x{a4}", name => 'some two-byte utf8', }, # Unicode::Escape escapes this as '\udbea\udfcd' What's that? # { # text => "\x{f4}\x{8a}\x{af}\x{8d}", # name => 'four-byte utf8', # }, { text => "one: X, two: \x{c2}\x{a5}, three: \x{e3}\x{81}\x{8a}", name => 'mixed character length utf8', }, ); plan 'tests' => scalar @test_texts; foreach my $t ( @test_texts ) { die 'bad test data' if grep { ! defined $t->{$_} } qw( text name ); my $text = $t->{text}; my $canonical = Unicode::Escape::escape( $text ); my $test = utf8_escape( $text ); # I use 'ok' with 'eq' instead of 'is' so that a failure doesn't # puke a lot of unintelligible yuck. ok( $canonical eq $test, "correct escaping for '$t->{name}'" ); } #### use strict; use warnings; use Unicode::Escape; use Benchmark qw( cmpthese timethese ); my $subs = { 'utf8esc' => sub { utf8_escape( $content ) }, 'Uni::Esc' => sub { Unicode::Escape::escape( $content ) }, }; my @texts = ( [ 'abc123ABC456' x 10_000, 'no utf8' ], [ "\x{e3}\x{81}\x{82}\x{e3}\x{81}\x{84}\x{e3}\x{81}\x{86}\x{e3}\x{81}\x{88}\x{e3}\x{81}\x{8a}" x 10_000, 'all 3-byte utf8' ], ); foreach my $txt ( @texts ) { $test_text = $txt->[0]; print "*** $txt->[1]\n"; cmpthese( -30, $subs ); } #### sub utf8_escape_2 { my $s = shift; $s =~ s{ ( [\xc2-\xdf] [\x80-\xbf] | [\xe0-\xef] [\x80-\xbf]{2} | [\xf0-\xf4] [\x80-\xbf]{3} ) } { my @c = map ord, split //, $1; '\\u' . ( ( 2 == @c ) ? sprintf( '%04x', ( ( 0b00011111 & $c[0]) << 6 ) | ( 0b00111111 & $c[1] ) ) : ( 3 == @c ) ? sprintf( '%04x', ( ( 0b00001111 & $c[0] ) << 12 ) | ( ( 0b00111111 & $c[1] ) << 6 ) | ( 0b00111111 & $c[2] ) ) : 'fffd' ); }xemsg; $s =~ tr/\x80-\xff//d; # invalid utf8 return $s; }