#!/usr/bin/perl
# Demo: normalise "smart quotes" that arrive in different forms -- raw
# Windows-1252 (cp1252) code points, Unicode code points, and literal
# Unicode characters (HTML entities are handled too) -- into HTML entities.
use strict;
use warnings;
use utf8;    # the sample string below contains literal non-ASCII characters;
             # without this they are read as raw UTF-8 bytes, some of which
             # fall in \x80-\x9f and would be mangled by the cp1252 fix-up.

use HTML::Entities;

my $lookup = get_cp1252_lookup();

my $str = join('',
    chr(0x93),   'double', chr(0x94),      # cp1252 code points
    chr(0x201C), 'double', chr(0x201D),    # Unicode code points
    '‘single’'                             # literal Unicode characters
);

# "replaces HTML entities...
# with the corresponding Unicode character"
decode_entities($str);

# Replace x80-x9f with the Unicode equivalent.  Keys in $lookup are
# lowercase hex without a "0x" prefix, which is what sprintf('%x') yields.
# A code point that is absent from the table is left unchanged rather than
# being replaced by undef (which would warn and silently drop the character).
$str =~ s{([\x80-\x9f])}{
    my $key = sprintf('%x', ord $1);
    exists $lookup->{$key} ? $lookup->{$key} : $1;
}eg;

# "replaces unsafe characters...
# with their entity representation"
encode_entities($str);

print "$str\n";

# get_cp1252_lookup()
#
# Reads the tab-separated mapping file 'cp1252_to_unicode.txt' from the
# current directory (columns: cp1252 code, Unicode code, "#NAME").
#
# Returns a hash reference mapping the cp1252 code, as lowercase hex
# without the "0x" prefix (e.g. '93', '9f'), to the corresponding Unicode
# character.  Rows whose Unicode column is not a hex code (e.g. the
# whitespace-only column used for undefined code points) map to the empty
# string.  Blank lines, comment lines and lines with fewer than two columns
# are skipped.
#
# Dies if the mapping file cannot be opened.
sub get_cp1252_lookup {
    open my $fh, '<', 'cp1252_to_unicode.txt'
        or die "can't open input: $!";

    my $lookup = {};
    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ /\A\s*(?:#|\z)/;    # blank or comment-only line

        my ($cp1252, $utf8_str) = split /\t/, $line;
        next unless defined $utf8_str;

        # Normalise case so '0x9F' in the file matches sprintf('%x') => '9f'.
        (my $key = lc $cp1252) =~ s/0x//;

        $lookup->{$key}
            = $utf8_str =~ /\A\s*(0x[0-9A-Fa-f]+)\s*\z/ ? chr(hex $1) : '';
    }
    close $fh;

    return $lookup;
}

__END__
output (entity-encoded; a browser renders it as “double”“double”‘single’):
&ldquo;double&rdquo;&ldquo;double&rdquo;&lsquo;single&rsquo;

extract from cp1252_to_unicode.txt (columns are tab-separated):
0x91	0x2018	#LEFT SINGLE QUOTATION MARK
0x92	0x2019	#RIGHT SINGLE QUOTATION MARK
0x93	0x201C	#LEFT DOUBLE QUOTATION MARK
0x94	0x201D	#RIGHT DOUBLE QUOTATION MARK