while(my $uchar = $us->chop) { my $utf8 = $uchar->utf8; $rslt = (($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8) . $rslt; } ##

##

    while(my $uchar = $us->substr( $offset++, 1 )) {
        my $utf8 = $uchar->utf8;
        $rslt .= ($utf8 =~ /[\x80-\xff]/) ? '\\u'.unpack('H4', $uchar->utf16be) : $utf8;
    }

##

##

# This does what Unicode::Escape::escape does, except a lot less.
# It assumes the input is utf8 encoded.
# It assumes the input is defined.
# Invalid utf8 is deleted silently.

sub utf8_escape {
    # Short circuit if it's all seven bit ASCII.
    return $_[0] unless $_[0] =~ /[\x80-\xff]/;

    my $s = shift;

    $s =~ s{ ([\xc2-\xdf]) ([\x80-\xbf]) }{
        '\\u' .
        sprintf( '%04x',
              ( ( 0b00011111 & ord $1 ) << 6 )
            |   ( 0b00111111 & ord $2 )
        )
    }exmsg;
    $s =~ s{ ([\xe0-\xef]) ([\x80-\xbf]) ([\x80-\xbf]) }{
        '\\u' . 
        sprintf( '%04x',
              ( ( 0b00001111 & ord $1 ) << 12 )
            | ( ( 0b00111111 & ord $2 ) <<  6 )
            |   ( 0b00111111 & ord $3 )
        )
    }exmsg;

    # valid utf8 that can't be encoded in \uXXXX
    $s =~ s{ [\xf0-\xf4] [\x80-\xbf]{3} }{\\ufffd}xmsg;

    # invalid utf8
    $s =~ tr/\x80-\xff//d;

    return $s;
}

##

##

use strict;
use warnings;

use Test::More;

use Unicode::Escape;

my @test_texts = (
    {
        text => "\x{e3}\x{81}\x{82}"
              . "\x{e3}\x{81}\x{84}"
              . "\x{e3}\x{81}\x{86}"
              . "\x{e3}\x{81}\x{88}"
              . "\x{e3}\x{81}\x{8a}",
        name => 'utf8 test from Unicode::Escape-0.0.2',
    },
    {
        text => q{},
        name => 'empty string',
    },
    {
        text => '0',
        name => 'zero (false-looking)',
    },
    {
        text => "\x{c2}\x{a2}"
              . "\x{c2}\x{a3}"
              . "\x{c2}\x{a4}",
        name => 'some two-byte utf8',
    },
    # Unicode::Escape escapes this as '\udbea\udfcd'  What's that?
#    {
#        text => "\x{f4}\x{8a}\x{af}\x{8d}",
#        name => 'four-byte utf8',
#    },
    {
        text => "one: X, two: \x{c2}\x{a5}, three: \x{e3}\x{81}\x{8a}",
        name => 'mixed character length utf8',
    },
);

plan 'tests' => scalar @test_texts;

foreach my $t ( @test_texts ) {
    die 'bad test data' if grep { ! defined $t->{$_} } qw( text name );

    my $text      = $t->{text};
    my $canonical = Unicode::Escape::escape( $text );
    my $test      = utf8_escape( $text );

    # I use 'ok' with 'eq' instead of 'is' so that a failure doesn't
    # puke a lot of unintelligible yuck.
    ok( $canonical eq $test, "correct escaping for '$t->{name}'" );
}

##

##

use strict;
use warnings;

use Unicode::Escape;

use Benchmark qw( cmpthese timethese );

my $subs = {
    'utf8esc'    => sub { utf8_escape( $content ) },
    'Uni::Esc'   => sub { Unicode::Escape::escape( $content ) },
};

my @texts = (
    [ 'abc123ABC456' x 10_000, 'no utf8' ],
    [ "\x{e3}\x{81}\x{82}\x{e3}\x{81}\x{84}\x{e3}\x{81}\x{86}\x{e3}\x{81}\x{88}\x{e3}\x{81}\x{8a}" x 10_000, 'all 3-byte utf8' ],
    );

foreach my $txt ( @texts ) {
    $test_text = $txt->[0];
    print "*** $txt->[1]\n";
    cmpthese( -30, $subs );
}

##

##

sub utf8_escape_2 {
    my $s = shift;

    $s =~ s{
               (
                   [\xc2-\xdf] [\x80-\xbf]
                   |
                   [\xe0-\xef] [\x80-\xbf]{2}
                   |
                   [\xf0-\xf4] [\x80-\xbf]{3}
               )
           }
           {
               my @c = map ord, split //, $1;
               '\\u' .
               (
                  ( 2 == @c ) ?
                      sprintf( '%04x',
                               ( ( 0b00011111 & $c[0]) << 6 )
                               | ( 0b00111111 & $c[1] )
                      )
                : ( 3 == @c ) ?
                      sprintf( '%04x',
                               ( ( 0b00001111 & $c[0] ) << 12 )
                               | ( ( 0b00111111 & $c[1] ) << 6 )
                               | ( 0b00111111 & $c[2] )
                      )
                : 'fffd'
               );
           }xemsg;

    $s =~ tr/\x80-\xff//d; # invalid utf8

    return $s;
}