# Build a sample string that mixes ASCII, raw UTF-8 bytes, a 2 byte
# wide character, and two 4 byte wide characters
my $crazy = join("",
                 "Hello\x{26c4}",
                 encode("utf-8","\x{26f0}"),
                 "\x{10102}\x{2fa1b}");

# After this call only ASCII and UTF-8 byte sequences should remain
my $sane = safeString($crazy);

# Expected results:
#   testString($crazy) returns 7
#   testString($sane) returns 3
#
#   length($sane) returns 19
#   trueLength($sane) returns 9

# Character 5 (counting from 0) of the sample is U+26C4
my $snowman = safeSubstr($crazy,5,1);

########################################
# safeString($string)
#   return a safe version of the string
#
# "Safe" means the result holds only ASCII and UTF-8 byte sequences;
# every wide character (value > 255) is replaced by its UTF-8
# encoding. Returns "" when $string is undefined. A string without
# wide characters is returned unchanged.
sub safeString
{
  my ($string) = @_;
  return "" unless defined($string);
  my $t = testString($string);
  # no wide characters at all: nothing to convert
  return $string if $t <= 3;
  # wide characters but no UTF-8 bytes: a plain encode is enough
  return encode("utf-8",$string) if $t <= 5;

  # The string has both UTF-8 and wide characters so it needs
  # tender-loving care. 'W*' is used rather than 'C*' because 'C'
  # wraps character values above 255 modulo 256, which would hide
  # the wide characters this loop has to encode.
  my @s = unpack('W*',$string);
  my @r;
  for (my $i = 0; $i < scalar(@s);)
  {
    if ($s[$i] < 128)
    {
      push @r, $s[$i];
      $i++;
    } elsif ($s[$i] > 255) {
      # encode a wide character
      push @r, unpack("C*",encode("utf-8",chr($s[$i])));
      $i++;
    } else {
      # copy all the utf-8 bytes, but never read past the end of the
      # data and never treat a wide character as a raw byte
      my $last = $i + _charBytes($i,@s) - 1;
      $last = $#s if $last > $#s;
      my $j = $i;
      $j++ while $j < $last && $s[$j+1] <= 255;
      push @r, @s[$i..$j];
      $i = $j + 1;
    }
  }
  return pack("C*",@r);
}

########################################
# safeSubstr($string,$pos,$n)
#   return a safe substring (treats utf-8 sequences as a single
#   character)
#
# $pos is the 0-based index of the first character wanted and $n the
# number of characters. The string is first passed through
# safeString(). An undefined $pos is treated as 0, a negative $pos is
# clamped to 0, and an undefined $n means "up to the end of the
# string". Returns "" when $n is 0 (or the range is empty) and when
# $pos lies at or beyond the end of the string.
sub safeSubstr
{
  my ($string,$pos,$n) = @_;
  my $s = safeString($string);
  # 'W*' keeps every character value intact ('C*' would wrap values
  # above 255 modulo 256)
  my @s = unpack('W*',$s);
  my $count = scalar(@s);

  $pos = 0 unless defined($pos);
  # index of the first character NOT wanted; undef = no limit
  my $end = defined($n) ? $pos + $n : undef;
  $pos = 0 if $pos < 0;
  return "" if defined($end) && $end <= $pos;

  my $from;     # offset in @s where character $pos starts
  my $p = 0;    # index of the character starting at offset $i
  my $i = 0;
  while ($i < $count && (!defined($end) || $p < $end))
  {
    $from = $i if $p == $pos;
    if ($s[$i] < 128 || $s[$i] > 255)
    {
      $i++;
    } else {
      my $w = _charBytes($i,@s);
      # always advance, and never step past the end of the data
      $i += $w > 0 ? $w : 1;
      $i = $count if $i > $count;
    }
    $p++;
  }

  # $pos was never reached: it is beyond the last character
  return "" unless defined($from);
  return pack('W*',@s[$from..$i-1]);
}

########################################
# testString($string)
#   returns information about the characters in the string
#
# The 1, 2, and 4 bits of the result are for ASCII, UTF-8, and
# wide characters respectively. If multiple bits are set,
# characters of each type appear in the string. If the result is:
#   0           empty string
#   1           simple ASCII string
#   2 or 3      UTF-8 string (3 when ASCII is present too)
#   4 or 5      wide characters (5 when ASCII is present too)
#   6 or 7      mixed UTF-8 & wide characters
# Returns undef when $string is undefined.
sub testString
{
  my ($s) = @_;
  return undef unless defined($s);
  my $r = 0;
  # 'W*' yields one value per character and keeps values above 255;
  # 'C*' would wrap them modulo 256 so the wide bit could never be set
  my @s = unpack('W*',$s);
  for (my $i = 0; $i < scalar(@s);)
  {
    if ($s[$i] < 128)
    {
      $r |= 1;
      $i++;
    } elsif ($s[$i] > 255) {
      $r |= 4;
      $i++;
    } else {
      $r |= 2;
      $i += _charBytes($i,@s);
    }
    # every kind has been seen; the rest cannot change the answer
    last if $r == 7;
  }
  return $r;
}

########################################
# trueLength($string)
#   returns the number of UTF-8 characters in a string
#
# ASCII characters, wide characters (value > 255), and whole UTF-8
# byte sequences each count as one. Returns nothing (undef in scalar
# context) when $string is undefined.
sub trueLength
{
  my ($s) = @_;
  return unless defined($s);

  my $len = 0;
  # 'W*' keeps wide character values intact; with 'C*' they wrap
  # modulo 256 and can be mistaken for UTF-8 lead bytes
  my @s = unpack('W*',$s);
  for (my $i = 0; $i < scalar(@s);)
  {
    if ($s[$i] < 128 || $s[$i] > 255)
    {
      # ASCII or wide character: a single value
      $i++;
    } else {
      # UTF-8 sequence: skip all of its bytes at once
      $i += _charBytes($i,@s);
    }
    $len++;
  }
  return $len;
}

########################################
# String support routines
#
# _charBytes($n,@values)
#   returns how many values the character starting at index $n of
#   @values occupies
#
# A value below 128 gives 1. Wide values give 2 (256..65535) or 4
# (above 65535). Any other value is treated as a UTF-8 lead byte
# and the sequence length is taken from its high bits, limited to
# the number of values remaining from $n so that a truncated
# sequence never reaches past the end of the data. A byte that is
# not a lead byte (0x80..0xBF) gives 1.
sub _charBytes
{
  my $n = shift(@_);
  my $len = scalar(@_);

  # nothing at that index: report 1 so callers still advance
  return 1 unless defined($_[$n]);

  my $v = $_[$n];
  return 1 if $v < 128;
  return 4 if $v > 65535;
  return 2 if $v > 255;

  # sequence length announced by the lead byte
  my $want = ($v & 0xFC) == 0xFC ? 6
           : ($v & 0xF8) == 0xF8 ? 5
           : ($v & 0xF0) == 0xF0 ? 4
           : ($v & 0xE0) == 0xE0 ? 3
           : ($v & 0xC0) == 0xC0 ? 2
           :                       1;

  # clamp to what is actually left from index $n onwards
  my $left = $len - $n;
  return $want < $left ? $want : $left;
}