The following code creates the test script and runs it. I don't like that, so I rewrote it:
#!/usr/bin/perl --
use strict; use warnings;
use Data::Dump;
# Two byte strings built from hex octets. Each holds three newline-terminated
# lines of UTF-8 encoded text: "aa" followed by U+0159 (c5 99), and a lone
# U+010D (c4 8d).
my $utf1 = pack 'H*', join '', qw/61 61 c5 99 0a c4 8d 0a 61 61 c5 99 0a/;
my $utf2 = pack 'H*', join '', qw/c4 8d 0a 61 61 c5 99 0a c4 8d 0a/;
dd $utf1, $utf2;    # raw octets, before decoding

# Decode the UTF-8 octets in place so both scalars hold character strings.
utf8::decode($utf1);
utf8::decode($utf2);
dd $utf1, $utf2;    # decoded characters

for my $string ( $utf1, $utf2 ) {
    my @lines = split /\n/, $string;
    for my $line (@lines) {
        if ( my ($word) = $line =~ /^(.+)$/ ) {

            # Deliberately take the first character twice: once straight
            # from $1 and once from the copy in $word. Comparing the two
            # results is the point of this script, so do not "simplify"
            # one of them away.
            my $one  = substr $1,    0, 1;
            my $wone = substr $word, 0, 1;
            dd { word => $word, 1 => $1, one => $one, wone => $wone };
        }
    }
    dd \@lines;
    dd;    # dump of an empty list, used as a separator between strings
}
__END__
(
"aa\xC5\x99\n\xC4\x8D\naa\xC5\x99\n",
"\xC4\x8D\naa\xC5\x99\n\xC4\x8D\n",
)
(
"aa\x{159}\n\x{10D}\naa\x{159}\n",
"\x{10D}\naa\x{159}\n\x{10D}\n",
)
{ 1 => "aa\x{159}", one => "a", wone => "a", word => "aa\x{159}" }
{ 1 => "\x{10D}", one => "Ä", wone => "\x{10D}", word => "\x{10D}" }
{ 1 => "aa\x{159}", one => "a", wone => "a", word => "aa\x{159}" }
["aa\x{159}", "\x{10D}", "aa\x{159}"]
()
{ 1 => "\x{10D}", one => "Ä", wone => "\x{10D}", word => "\x{10D}" }
{ 1 => "aa\x{159}", one => "a", wone => "a", word => "aa\x{159}" }
{ 1 => "\x{10D}", one => "Ä", wone => "\x{10D}", word => "\x{10D}" }
["\x{10D}", "aa\x{159}", "\x{10D}"]
()
I've got perl 5.014001, and I get these warnings from Data::Dump:
Malformed UTF-8 character (unexpected end of string) and Wide character in print. It appears the UTF-8 flag gets turned off when substr-ing on $1 in some cases, and it appears this bug has surfaced before: https://rt.perl.org/rt3/Public/Search/Simple.html?q=%241%20utf
But I'm no expert
|