#!/usr/bin/perl

# usage:
#    fix_surrogates.pl < infile > outfile
#
# Filters a UTF-16LE byte stream, replacing every unpaired surrogate code
# unit with U+FFFD (REPLACEMENT CHARACTER, "\xFD\xFF" in UTF-16LE).
# Correctly paired surrogates and all other code units are copied through
# unchanged.  A stray odd byte at the very end of the input is also replaced.
#
#    Hi Surrogate: D800-DBFF
#    Lo Surrogate: DC00-DFFF
#
# Exits non-zero (via die) if reading the input or writing the output fails.

use strict;
use warnings;

use IO::Handle;    # STDOUT->flush

# Number of bytes requested from the input per read() call.
my $read_size = 16 * 1024;

# U+FFFD encoded as UTF-16LE.
my $replacement = "\xFD\xFF";

# One well-formed piece of UTF-16LE.  The stream is little-endian, so the
# SECOND byte of each 2-byte code unit is the one that identifies a surrogate.
my $valid_pat = qr/
    . [^\xD8-\xDF]                       # code unit that is not a surrogate
  |
    . [\xD8-\xDB] . [\xDC-\xDF]          # high surrogate, then low surrogate
/xs;

# One ill-formed code unit.  The lookahead needs the whole following code
# unit to be present, so a high surrogate sitting at the end of the buffer
# matches neither pattern and is left pending until more input arrives.
my $invalid_pat = qr/
    . [\xDC-\xDF]                        # low surrogate with no high before it
  |
    . [\xD8-\xDB] (?= . [^\xDC-\xDF] )   # high surrogate, no low after it
/xs;

# fix_surrogates_stream($in, $out)
#
# Copies UTF-16LE bytes from filehandle $in to filehandle $out, substituting
# U+FFFD for each unpaired surrogate.  Both handles are expected to be in
# binary mode already.  Input is consumed in chunks of $read_size bytes, so
# memory use does not grow with the size of the input.
#
# Returns nothing.  Dies if read() or print fails.
sub fix_surrogates_stream {
    my ($in, $out) = @_;

    # Bytes read but not yet classified (at most a dangling high surrogate
    # and/or a partial code unit once a chunk has been processed).
    my $pending = '';

    for (;;) {
        my $got = read($in, $pending, $read_size, length($pending));
        die("$!\n") if !defined($got);
        last if !$got;

        # Walk the buffer with \G: take the longest run of valid text, then
        # at most one invalid unit, and repeat until neither pattern matches.
        # /c keeps pos() where it is when a match attempt fails.
        my $fixed = '';
        for (;;) {
            $fixed .= $1 if $pending =~ /\G ( $valid_pat+ ) /xgc;
            last unless $pending =~ /\G $invalid_pat /xgc;
            $fixed .= $replacement;
        }

        print {$out} $fixed or die("write failed: $!\n");

        # Keep only the unclassified tail; pos() is undef if nothing matched.
        $pending = substr($pending, pos($pending) || 0);
    }

    # End of input: whatever is left can never be completed, so every
    # remaining code unit (or lone trailing byte) becomes U+FFFD.
    $pending =~ s/..?/$replacement/sg;
    print {$out} $pending or die("write failed: $!\n");

    return;
}

# Command-line entry point: filter STDIN to STDOUT.
sub _run_as_filter {
    binmode STDIN;     # Disable :crlf
    binmode STDOUT;    # Disable :crlf

    fix_surrogates_stream(\*STDIN, \*STDOUT);

    # Surface buffered write errors (full disk, closed pipe) instead of
    # exiting successfully with truncated output.
    STDOUT->flush or die("write failed: $!\n");

    return;
}

# Run the filter only when executed as a script, so the file can also be
# loaded with require/do without touching STDIN.
_run_as_filter() unless caller;

1;