# This string has a mixture of ASCII, UTF-8, 2 byte wide, and 4 byte # wide characters my $crazy = "Hello\x{26c4}".encode("utf-8","\x{26f0}"). "\x{10102}\x{2fa1b}"; # Now the string only has ASCII and UTF-8 characters my $sane = safeString($crazy); # testString($crazy) returns 7 # testString($sane) returns 3 # length($sane) returns 19 # trueLength($sane) returns 9 my $snowman = safeSubstr($crazy,5,1); ######################################## # safeString($string) # return a safe version of the string sub safeString { my ($string) = @_; return "" unless defined($string); my $t = testString($string); return $string if $t <= 3; return encode("utf-8",$string) if $t <= 5; # The string has both UTF-8 and wide characters so it needs # tender-loving care my @s = unpack('C*',$string); my @r; for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { push @r, $s[$i]; $i++; } elsif ($s[$i] > 255) { # encode a wide character push @r,unpack("C*",encode("utf-8",chr($s[$i]))); $i++; } else { # copy all the utf-8 bytes $n = _charBytes($i,@s) - 1; map { push @r, $s[$i+$_] } 0..$n; $i += $n + 1; } } return pack("C*",@r); } ######################################## # safeSubstr($string,$pos,$n) # return a safe substring (treats utf-8 sequences as a single # character) sub safeSubstr { my ($string,$pos,$n) = @_; $s = safeString($string); my $p = 0; my $rPos = 0; my $rEnd = -1; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $i++; } elsif ($s[$i] > 255) { $i++; } else { $i += _charBytes($i,@s); } $p++; $rPos = $i if $p == $pos; $rEnd = $i-1 if $p == $pos + $n; } $rEnd = scalar(@s) - 1 if $rEnd < 0; return "" if $rPos > $rEnd; my @r; map { push @r, $s[$_] } $rPos..$rEnd; return pack("C*",@r); } ######################################## # testString($string) # returns information about the characters in the string # # The 1, 2, and 4 bits of the result are for ASCII, UTF-8, and # wide characters respectively. If multiple bits are set, # characters of each type appear in the string. If the result is: # <= 1 simple ASCII string # <= 3 simple UTF-8 string # >3 && <= 5 mixed ASCII & wide characters # >= 6 mixed UTF-8 & wide characters sub testString { my ($s) = @_; return undef unless defined($s); my $r = 0; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $r |= 1; $i++; } elsif ($s[$i] > 255) { $r |= 4; $i++; } else { $r |= 2; $i += _charBytes($i,@s); } } return $r; } ######################################## # trueLength($string) # returns the number of UTF-8 characters in a string sub trueLength { my ($s) = @_; return unless defined($s); my $len = 0; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $i++; } elsif ($s[$i] > 255) { $i++; } else { $i += _charBytes($i,@s); } $len++; } return $len; } ######################################## # String support routines sub _charBytes { my $n = shift(@_); my $len = scalar(@_); if ($_[$n] < 128) { return 1; } elsif ($_[$n] > 65535) { return 4; } elsif ($_[$n] > 255) { return 2; } elsif (($_[$n] & 0xFC) == 0xFC) { return min(6,$len); } elsif (($_[$n] & 0xF8) == 0xF8) { return min(5,$len); } elsif (($_[$n] & 0xF0) == 0xF0) { return min(4,$len); } elsif (($_[$n] & 0xE0) == 0xE0) { return min(3,$len); } elsif (($_[$n] & 0xC0) == 0xC0) { return min(2,$len); } else { return 1; } }