Yes
$_table{'Ö'} is just an example. There is no
use utf8; in the module: neither identifiers nor Unicode string literals are built into it. The hash is loaded by this function:
# Lookup table filled by load_map(): literal character => escape text.
my %_table = ();

# Load the character map into %_table.
#
# Each line of the map file has the form "<escape> <character>": the text
# before the first space is stored as the value, the text after it is the key.
# Lines without a separator (or with nothing after it) are skipped; the
# index()/substr() arithmetic used previously turned such a line into a bogus
# entry (whole line => line minus its last character).
#
# Arguments:
#   $path - optional map file name; defaults to 'tab.bin'.
# Returns the result of close() on the map file (true on success).
# Dies if the file cannot be opened.
sub load_map
{
    my ($path) = @_;
    $path = 'tab.bin' unless defined $path;

    # Lexical handle instead of the global bareword UTF8; strict UTF-8 decoding.
    open( my $map_fh, '<:encoding(UTF-8)', $path )
        or die "can't open $path : $!";

    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <$map_fh> ) {
        chomp $line;

        # Limit of 2: only the first space separates escape from character.
        my ( $esc, $bin ) = split / /, $line, 2;
        next unless defined $bin && length $bin;

        $_table{$bin} = $esc;
    }

    return close($map_fh);
}
A Dump of a (short) received string looks like this:
SV = PVMG(0x12fca50) at 0x134f698
REFCNT = 4
FLAGS = (PADMY,POK,pPOK,UTF8)
IV = 0
NV = 0
PV = 0x1361200 " <entry>R\303\226CHLING ENGINEERING PLASTICS (UK) LIMITED</entry>"\0 [UTF8 " <entry>R\x{d6}CHLING ENGINEERING PLASTICS (UK) LIMITED</entry>"]
CUR = 85
LEN = 88
Going to think about that SSCCE.
Thanks
UPDATE:
Here is the SSCCE:
use strict;
use warnings;
use utf8;    # used only for this SSCCE, so the 'RÖCHLING' literal assigned to $SGML is read as characters
use Devel::Peek qw(Dump);
use Encode qw(encode_utf8);

# Encode everything printed to STDOUT as (strict) UTF-8.
binmode( STDOUT, ':encoding(UTF-8)' );
# Lookup table filled by load_map(): literal character => escape text.
my %_table = ();
# -----------------------------------
# Fill %_table from the DATA handle.
#
# Each line has the form "<escape> <character>": the text before the first
# space becomes the value, the text after it becomes the key.  Lines without a
# separator (or with nothing after it) are skipped; the index()/substr()
# arithmetic used previously turned such a line into a bogus entry
# (whole line => line minus its last character).
#
# Takes no arguments and returns nothing useful.
sub load_map
{
    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <DATA> ) {
        chomp $line;

        # Limit of 2: only the first space separates escape from character.
        my ( $esc, $bin ) = split / /, $line, 2;
        next unless defined $bin && length $bin;

        $_table{$bin} = $esc;
    }
    return;
}
# -----------------------------------
# Map one character to its replacement text.
#
# Arguments:
#   $char - a single character (as captured by the caller's substitution).
# Returns:
#   - $char unchanged when it is ASCII or whitespace,
#   - the %_table entry for $char when one exists,
#   - '?' for any other non-ASCII character.
sub _mapchar
{
    my ($char) = @_;

    # ASCII is exactly the set of characters that encode to one UTF-8 byte,
    # so there is no need to encode the character just to count its bytes.
    return $char unless defined $char && $char =~ /[^\x00-\x7F]/;

    # Whitespace is passed through untouched.  U+0085 and U+00A0 are listed
    # explicitly because \s only matches them when the string carries Perl's
    # internal UTF8 flag; without them the result for the same character
    # would depend on how the string happens to be stored.
    return $char if $char =~ /[\s\x{85}\x{A0}]/;

    return exists $_table{$char} ? $_table{$char} : '?';
}
# -----------------------------------
# Replace, in place, every non-ASCII character of the referenced string with
# whatever _mapchar() returns for it.
#
# Arguments:
#   $sgml_r - reference to the string to rewrite.
# The return value is not meaningful.
sub escapeUTF8
{
    my ($sgml_r) = @_;

    # Diagnostic output (Devel::Peek) of the string before it is rewritten.
    Dump $$sgml_r;

    # Match only non-ASCII characters: _mapchar() returns ASCII unchanged, so
    # calling it for every character only cost time.  The resulting text is
    # the same as with the former /(.)/ pattern.
    $$sgml_r =~ s/([^\x00-\x7F])/_mapchar($1)/eg;

    return;
}
# Driver: load the map, then print $SGML before ("1:") and after ("2:")
# it has been escaped in place.
load_map();

my $SGML = 'RÖCHLING';
printf "%s: %s\n", 1, $SGML;

escapeUTF8( \$SGML );
printf "%s: %s\n", 2, $SGML;
__DATA__
$ $
Ö Ö
» »
~
It works, but the regex substitution still runs on every character.