#!/usr/bin/perl
# usage:
# fix_surrogates.pl < infile > outfile
# Hi Surrogate: D800-DBFF
# Lo Surrogate: DC00-DFFF
use strict;
use warnings;
binmode STDIN; # Disable :crlf
binmode STDOUT; # Disable :crlf
my $read_size = 16*1024;
my $valid_pat = qr/ .[^\xD8-\xDF]
| .[\xD8-\xDB].[\xDC-\xDF]
/xs;
my $invalid_pat = qr/ .[\xDC-\xDF]
| .[\xD8-\xDB](?=.[^\xDC-\xDF])
/xs;
my $ibuf = '';
my $obuf = '';
for (;;) {
my $rv = read(STDIN, $ibuf, $read_size, length($ibuf));
die("$!\n") if !defined($rv);
last if !$rv;
for ($ibuf) {
/\G ($valid_pat+) /xgc && do { $obuf .= $1; };
/\G $invalid_pat /xgc && do { $obuf .= "\xFD\xFF"; redo };
}
print($obuf);
$ibuf = substr($ibuf, pos($ibuf)||0);
$obuf = '';
}
$ibuf =~ s/..?/\xFD\xFF/sg;
print($ibuf);
Update: Tested. Fixed character class that wasn't negated as it should have been.
>type testdata.pl
binmode STDOUT;
my $hi = "\xF4\xDB";
my $lo = "\xE2\xDE";
print "a\0" . $hi . $lo . "b\0" . "\n\0",
"c\0" . $hi . "c\0" . "d\0" . "\n\0",
"e\0" . $lo . "f\0" . "g\0" . "\n\0";
>perl testdata.pl | perl fix_surrogates.pl | perl -0777 -pe"BEGIN { bi
+nmode STDIN, ':encoding(UTF-16le)'; binmode STDOUT, ':encoding(US-ASC
+II)' }"
"\x{10d2e2}" does not map to ascii, <> chunk 1.
"\x{fffd}" does not map to ascii, <> chunk 1.
"\x{fffd}" does not map to ascii, <> chunk 1.
a\x{10d2e2}b
c\x{fffd}cd
e\x{fffd}fg
|