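Some texts contain unusual SGML entities such as &rsqb; and &bsol; (which stand for "]" and "\"). I found out that these are
ISOnum entities. Here is a hash that maps the entity names to the corresponding Unicode characters, extracted from the table linked above. Each name appears twice, with and without the trailing ";", so the hash is ready to be passed to the
HTML::Entities::_decode_entities function.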
# Hash reference mapping ISOnum entity names to the characters they denote.
# Each name is registered under two keys, the bare name and the name followed
# by ';', so both spellings resolve to the same character (76 names, 152 keys).
# The leading '+' makes Perl parse the braces as an anonymous hash constructor
# rather than as a bare block.
+{
    map {
        my ( $name, $code_point ) = @{$_};
        my $char = chr $code_point;
        ( $name => $char, $name . ';' => $char );
    } (
        # Names are quoted on purpose: several of them (lt, gt, not, times)
        # are also Perl operators or built-ins.
        [ 'amp',    0x26 ],
        [ 'apos',   0x27 ],
        [ 'ast',    0x2A ],
        [ 'brvbar', 0xA6 ],
        [ 'bsol',   0x5C ],
        [ 'cent',   0xA2 ],
        [ 'colon',  0x3A ],
        [ 'comma',  0x2C ],
        [ 'commat', 0x40 ],
        [ 'copy',   0xA9 ],
        [ 'curren', 0xA4 ],
        [ 'darr',   0x2193 ],
        [ 'deg',    0xB0 ],
        [ 'divide', 0xF7 ],
        [ 'dollar', 0x24 ],
        [ 'equals', 0x3D ],
        [ 'excl',   0x21 ],
        [ 'frac12', 0xBD ],
        [ 'frac14', 0xBC ],
        [ 'frac18', 0x215B ],
        [ 'frac34', 0xBE ],
        [ 'frac38', 0x215C ],
        [ 'frac58', 0x215D ],
        [ 'frac78', 0x215E ],
        [ 'gt',     0x3E ],
        [ 'half',   0xBD ],      # same character as frac12
        [ 'horbar', 0x2015 ],
        [ 'hyphen', 0x2010 ],
        [ 'iexcl',  0xA1 ],
        [ 'iquest', 0xBF ],
        [ 'laquo',  0xAB ],
        [ 'larr',   0x2190 ],
        [ 'lcub',   0x7B ],
        [ 'ldquo',  0x201C ],
        [ 'lowbar', 0x5F ],
        [ 'lpar',   0x28 ],
        [ 'lsqb',   0x5B ],
        [ 'lsquo',  0x2018 ],
        [ 'lt',     0x3C ],
        [ 'micro',  0xB5 ],
        [ 'middot', 0xB7 ],
        [ 'nbsp',   0xA0 ],
        [ 'not',    0xAC ],
        [ 'num',    0x23 ],
        [ 'ohm',    0x2126 ],
        [ 'ordf',   0xAA ],
        [ 'ordm',   0xBA ],
        [ 'para',   0xB6 ],
        [ 'percnt', 0x25 ],
        [ 'period', 0x2E ],
        [ 'plus',   0x2B ],
        [ 'plusmn', 0xB1 ],
        [ 'pound',  0xA3 ],
        [ 'quest',  0x3F ],
        [ 'quot',   0x22 ],
        [ 'raquo',  0xBB ],
        [ 'rarr',   0x2192 ],
        [ 'rcub',   0x7D ],
        [ 'rdquo',  0x201D ],
        [ 'reg',    0xAE ],
        [ 'rpar',   0x29 ],
        [ 'rsqb',   0x5D ],
        [ 'rsquo',  0x2019 ],
        [ 'sect',   0xA7 ],
        [ 'semi',   0x3B ],
        [ 'shy',    0xAD ],
        [ 'sol',    0x2F ],
        [ 'sung',   0x266A ],
        [ 'sup1',   0xB9 ],
        [ 'sup2',   0xB2 ],
        [ 'sup3',   0xB3 ],
        [ 'times',  0xD7 ],
        [ 'trade',  0x2122 ],
        [ 'uarr',   0x2191 ],
        [ 'verbar', 0x7C ],
        [ 'yen',    0xA5 ],
    )
}