#!/usr/angebote/perlroot/bin/perl
#
# utf8-and-html-entities.pl
#
# Test script comparing two ways of removing characters that do not
# survive conversion out of UTF-8 (e.g. the right single quotation mark)
# before the text is passed through HTML::Strip.
#
# The string literals below are written in UTF-8. This file does not enable
# "use utf8", so Perl sees them as UTF-8 encoded byte strings; both strip
# functions are written to cope with that.
use strict;
use warnings;

# use strict;
# use IO::File;
# use Text::CSV_XS;
# use DBI;
# use Time::Local;
# use Time::HiRes;
# use Compress::Zlib;
# use LWP::UserAgent;
#use POSIX qw(locale_h);
use HTML::Strip;
use Test::More qw(no_plan);
use Data::Dumper;

#setlocale(LC_CTYPE, "de_DE.ISO8859-1");
require "../../perl/agentFunc.pl";

# Each entry is [ input, expected result of HTML2Text( strip( input ) ) ].
#
# NOTE(review): deleting the quotation mark from between two spaces leaves
# two consecutive spaces, and nothing in this file collapses whitespace, so
# the expected values for the last two cases are written with two spaces.
# This assumes HTML::Strip->parse leaves runs of spaces in plain text
# untouched -- confirm against the installed HTML::Strip version.
my $stringsBeforeAfter = [
    [ 'blah', 'blah' ],
    [ 'Ü --', 'Ü --' ],
    [ "blah -- ’ -- blah", "blah --  -- blah" ],
    [ "Ü -- ’ -- blah",    "Ü --  -- blah" ],
];

# The functions under test, as [ test name, code reference ]. Every input
# is run through each function in turn, so the order of the emitted tests
# is: all cases for stripUtf8Entities, then all for stripUtf8EntitiesBetter.
my @strippers = (
    [ "stripUtf8Entities",       \&stripUtf8Entities ],
    [ "stripUtf8EntitiesBetter", \&stripUtf8EntitiesBetter ],
);

foreach my $stripper (@strippers) {
    my ( $testName, $stripFunction ) = @$stripper;

    foreach my $beforeAfter (@$stringsBeforeAfter) {
        my ( $before, $after ) = @$beforeAfter;
        my $transformed = HTML2Text( $stripFunction->($before) );
        is( $transformed, $after, $testName );
    }
}

# HTML2Text( $text )
#
# Runs $text through a freshly constructed HTML::Strip object and returns
# whatever its parse() method returns. A new object is created on every
# call, so no parser state is shared between calls.
sub HTML2Text {
    my ($changeText) = @_;

    my $htmlStripObject = HTML::Strip->new();
    $changeText = $htmlStripObject->parse($changeText);

    return $changeText;
}

# stripUtf8Entities( $string )
#
# Removes every occurrence of each entry of a fixed list from $string and
# returns the result. The list currently holds a single character (the
# right single quotation mark), so any other problematic character passes
# through unchanged. An undefined argument is treated as the empty string.
sub stripUtf8Entities {
    my $string = shift;
    $string = "" unless defined $string;    # keep "0" intact, unlike `shift || ""`

    my $utf8Entities = ["’"];
    foreach my $utf8Entity (@$utf8Entities) {
        # \Q...\E: list entries are literal text, never regex syntax.
        $string =~ s/\Q$utf8Entity\E//g;
    }

    return $string;
}

# stripUtf8EntitiesBetter( $string )
#
# General version of stripUtf8Entities: removes every character whose code
# point is above U+00FF (i.e. every character that has no ISO-8859-1
# representation) instead of working from a hand-maintained list.
#
# Accepts either form of input and returns the same form:
#   * a byte string holding well-formed UTF-8 is decoded, filtered and
#     re-encoded to UTF-8 bytes;
#   * anything else (an already decoded character string, or bytes that
#     are not valid UTF-8) is filtered as it stands.
#
# Caveat: a decoded string made up solely of Latin-1 characters that also
# happens to be well-formed UTF-8 when viewed as bytes is indistinguishable
# from UTF-8 bytes and is treated as such.
#
# An undefined argument is treated as the empty string.
sub stripUtf8EntitiesBetter {
    my $string = shift;
    $string = "" unless defined $string;

    # utf8::decode works in place and returns true only when the string
    # was well-formed UTF-8; it is available without "use utf8".
    my $wasUtf8Bytes = utf8::decode($string);

    # Drop everything outside the Latin-1 range.
    $string =~ s/[^\x{00}-\x{FF}]//g;

    # Hand back bytes if bytes were handed in.
    utf8::encode($string) if $wasUtf8Bytes;

    return $string;
}