#!/usr/bin/perl
# Demonstration: several ways of turning a Latin-1 byte string into UTF-8,
# and how Perl compares the results.
#
# Two families of strings are built from the code points 1..255:
#   * character strings carrying the UTF-8 flag ($u, $i, $f), and
#   * byte strings holding the UTF-8 encoding as plain octets ($r, $enc).
# Members of one family compare equal to each other.  A character string
# and a byte string can be stored as the very same octets and still
# compare unequal, because "eq" compares characters, not storage.
use strict;
use warnings;
require utf8;    # harmless; the utf8:: functions used below are built in

# Encode every character above 0x7F in $bytes as a two-octet UTF-8
# sequence and return the result; the argument is left untouched.
# Only correct for code points up to 0xFF (i.e. Latin-1 input), since
# exactly two output octets are produced per non-ASCII character.
sub _encode_with_regex {
    my ($bytes) = @_;
    my $encoded = $bytes;
    $encoded =~ s{ ([^\0-\x7F]) }{
        my $code = ord $1;
        sprintf '%c%c', 0xC0 | ( $code >> 6 ), 0x80 | ( $code & 0x3F );
    }gex;
    return $encoded;
}

# Return the octets used to store $string as a space-separated list of
# decimal numbers.  Since Perl 5.10, unpack "C*" walks a UTF-8 flagged
# string character by character, so a flagged string is first encoded
# (on a private copy) to expose its UTF-8 octets.
# NOTE(review): assumes an ASCII platform, where the internal encoding
# of flagged strings is UTF-8 -- not verified for EBCDIC builds.
sub _octets {
    my ($string) = @_;
    utf8::encode($string) if utf8::is_utf8($string);
    return join ' ', unpack 'C*', $string;
}

my $s = pack 'C*', 1 .. 255;    # Byte string to convert
my $u = pack 'U*', 1 .. 255;    # Character string (UTF-8 flagged)
my $e = substr $u, 0, 0;        # Empty UTF-8 flagged string

# Convert using a regex.
my $r = _encode_with_regex($s);

# Convert by implicit upgrade: concatenating with a flagged string.
my $i = $s . $e;

# Upgrade via the utf8:: function.
my $f = $s;
utf8::upgrade($f);

# Encode to UTF-8 octets; the result is a plain byte string.
# (Deliberately not named $b, which is reserved for sort blocks.)
my $enc = $s;
utf8::encode($enc);

if ( $r eq $enc ) {
    print "The regex and utf8::encode() match.\n";
}
if ( $u eq $i && $i eq $f ) {
    print "The 3 Unicode strings match.\n";
}
if ( _octets($r) eq _octets($i) ) {
    print "The byte- and unicode-strings have the same bytes.\n";
}
if ( $r ne $i ) {
    print "The byte- and unicode-strings are not equal.\n";
}
print '$s contains ', length($s), " bytes.\n";
print '$i contains ', length($i), " characters.\n";
print '$r contains ', length($r), " bytes.\n";

# Expected output:
#
#   The regex and utf8::encode() match.
#   The 3 Unicode strings match.
#   The byte- and unicode-strings have the same bytes.
#   The byte- and unicode-strings are not equal.
#   $s contains 255 bytes.
#   $i contains 255 characters.
#   $r contains 383 bytes.