#!/usr/bin/perl

# Demo: take five different UTF-8 byte sequences that all spell
# "i" + diaeresis + acute in some form, decode them, and show what each
# of the four Unicode normalization forms (NFD, NFC, NFKD, NFKC) turns
# them into.  Every value is shown both as a Data::Peek dump and as the
# list of Unicode character names it is made of.
#
# The output of a run is kept below the __END__ marker for reference.

use 5.18.2;
use warnings;

use Data::Peek;
use Unicode::Normalize qw( normalize );
use Encode qw( encode decode );
use charnames qw(:full);

# dp ($tag, $dta)
#
# Print one report row for $dta, labelled with $tag:
#   - the label, left-justified in 6 columns and followed by ": ";
#   - the DPeek () dump of $dta, with its first whitespace-delimited
#     token padded to 26 columns and the whole dump padded to 52;
#   - if $dta carries perl's internal UTF8 flag, the names of its
#     characters joined with " + ".
# The row is always terminated with a newline.  Returns nothing.
sub dp {
    my ($tag, $dta) = @_;

    my $dump = DPeek ($dta);

    # Left-justify the leading token so the optional "[UTF8 ...]" part
    # lines up across rows.  sprintf never produces a negative pad, so a
    # token longer than 26 characters is simply left unpadded.
    my $aligned = $dump =~ s{^(\S+)}{ sprintf "%-26s", $1 }er;
    printf "%-6s: %-52s", $tag, $aligned;

    if (utf8::is_utf8 ($dta)) {
        my @names = map {
            my $cp = ord;
            # viacode () yields undef for code points without a name;
            # fall back to U+XXXX so join () never sees undef.
            charnames::viacode ($cp) // sprintf ("U+%04X", $cp);
        } split // => $dta;
        print join " + " => @names;
    }

    say "";
    return;
}

$| = 1;    # unbuffered STDOUT

# Raw UTF-8 byte strings under test.
my @samples = (
    "\xe1\xb8\xaf",            # U+1E2F
    "\xc3\xaf\xcc\x81",        # U+00EF U+0301
    "\xc3\xad\xcc\x88",        # U+00ED U+0308
    "\x69\xcc\x81\xcc\x88",    # i U+0301 U+0308
    "\x69\xcc\x88\xcc\x81",    # i U+0308 U+0301
);

foreach my $bytes (@samples) {
    my $u = decode ("utf-8", $bytes);

    dp ("Bytes", $bytes);
    dp ("UTF-8", $u);
    dp ("NF$_", normalize ($_, $u)) for qw( D C KD KC );
    say "";
}

__END__

####

Bytes : PV("\341\270\257"\0)
UTF-8 : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE

Bytes : PV("\303\257\314\201"\0)
UTF-8 : PV("\303\257\314\201"\0)   [UTF8 "\x{ef}\x{301}"]   LATIN SMALL LETTER I WITH DIAERESIS + COMBINING ACUTE ACCENT
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE

Bytes : PV("\303\255\314\210"\0)
UTF-8 : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFD   : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFC   : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFKD  : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFKC  : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS

Bytes : PV("i\314\201\314\210"\0)
UTF-8 : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFD   : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFC   : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFKD  : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFKC  : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS

Bytes : PV("i\314\210\314\201"\0)
UTF-8 : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE