http://www.perlmonks.org?node_id=1198023

Dealing with data that comes from webpages can be really complicated. There is likely to be a combination of ASCII, UTF-8, and wide characters in the data returned and you cannot depend on the website to tell you what type of content is being returned. The routines safeString, safeSubstr, testString, and trueLength can be used to easily manipulate these strings. Pass any string to safeString and you will never get a wide character warning from print. Use safeSubstr to extract complete UTF-8 characters sequences from a string. Use testString to tell you what's really in the string. Use trueLength to find out how many characters wide the output will be.
# This string has a mixture of ASCII, UTF-8, 2 byte wide, and 4 byte # wide characters my $crazy = "Hello\x{26c4}".encode("utf-8","\x{26f0}"). "\x{10102}\x{2fa1b}"; # Now the string only has ASCII and UTF-8 characters my $sane = safeString($crazy); # testString($crazy) returns 7 # testString($sane) returns 3 # length($sane) returns 19 # trueLength($sane) returns 9 my $snowman = safeSubstr($crazy,5,1); ######################################## # safeString($string) # return a safe version of the string sub safeString { my ($string) = @_; return "" unless defined($string); my $t = testString($string); return $string if $t <= 3; return encode("utf-8",$string) if $t <= 5; # The string has both UTF-8 and wide characters so it needs # tender-loving care my @s = unpack('C*',$string); my @r; for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { push @r, $s[$i]; $i++; } elsif ($s[$i] > 255) { # encode a wide character push @r,unpack("C*",encode("utf-8",chr($s[$i]))); $i++; } else { # copy all the utf-8 bytes $n = _charBytes($i,@s) - 1; map { push @r, $s[$i+$_] } 0..$n; $i += $n + 1; } } return pack("C*",@r); } ######################################## # safeSubstr($string,$pos,$n) # return a safe substring (treats utf-8 sequences as a single # character) sub safeSubstr { my ($string,$pos,$n) = @_; $s = safeString($string); my $p = 0; my $rPos = 0; my $rEnd = -1; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $i++; } elsif ($s[$i] > 255) { $i++; } else { $i += _charBytes($i,@s); } $p++; $rPos = $i if $p == $pos; $rEnd = $i-1 if $p == $pos + $n; } $rEnd = scalar(@s) - 1 if $rEnd < 0; return "" if $rPos > $rEnd; my @r; map { push @r, $s[$_] } $rPos..$rEnd; return pack("C*",@r); } ######################################## # testString($string) # returns information about the characters in the string # # The 1, 2, and 4 bits of the result are for ASCII, UTF-8, and # wide characters respectively. If multiple bits are set, # characters of each type appear in the string. If the result is: # <= 1 simple ASCII string # <= 3 simple UTF-8 string # >3 && <= 5 mixed ASCII & wide characters # >= 6 mixed UTF-8 & wide characters sub testString { my ($s) = @_; return undef unless defined($s); my $r = 0; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $r |= 1; $i++; } elsif ($s[$i] > 255) { $r |= 4; $i++; } else { $r |= 2; $i += _charBytes($i,@s); } } return $r; } ######################################## # trueLength($string) # returns the number of UTF-8 characters in a string sub trueLength { my ($s) = @_; return unless defined($s); my $len = 0; my @s = unpack('C*',$s); for (my $i = 0; $i < scalar(@s);) { if ($s[$i] < 128) { $i++; } elsif ($s[$i] > 255) { $i++; } else { $i += _charBytes($i,@s); } $len++; } return $len; } ######################################## # String support routines sub _charBytes { my $n = shift(@_); my $len = scalar(@_); if ($_[$n] < 128) { return 1; } elsif ($_[$n] > 65535) { return 4; } elsif ($_[$n] > 255) { return 2; } elsif (($_[$n] & 0xFC) == 0xFC) { return min(6,$len); } elsif (($_[$n] & 0xF8) == 0xF8) { return min(5,$len); } elsif (($_[$n] & 0xF0) == 0xF0) { return min(4,$len); } elsif (($_[$n] & 0xE0) == 0xE0) { return min(3,$len); } elsif (($_[$n] & 0xC0) == 0xC0) { return min(2,$len); } else { return 1; } }