#!/usr/bin/perl
use strict;
use warnings;
use CGI;
use Encode qw( decode );
use HTML::Entities qw( encode_entities );
# Debugging aid: Dump() comes from Devel::Peek and prints the internal
# representation of a scalar, so the bytes received can be inspected.
use Devel::Peek;

my $cgi = CGI->new();

# Value of the "key" form field as handed over by CGI.pm. It may be
# undefined, which is why every later use is guarded with defined().
my $val = $cgi->param('key');
Dump($val);

# Interpret the submitted bytes as iso-8859-15 text.
if (defined($val)) {
    $val = decode('iso-8859-15', $val);
}

# Emit the HTTP header, then make STDOUT encode everything that follows
# as iso-8859-15, matching the charset announced in the header.
print $cgi->header('text/html; charset=iso-8859-15');
binmode STDOUT, ':encoding(iso-8859-15)';

# Pre-fill the text input with the submitted value (HTML-escaped), or emit
# no value attribute at all when nothing was submitted.
my $value_attr = '';
if (defined($val)) {
    $value_attr = ' value="' . encode_entities($val, '<>&"') . '"';
}

# The page: a minimal form that posts back to this same script.
print join '',
    qq{<title>Test</title>\n},
    qq{<form method="POST">\n},
    qq{<input type="text" name="key"$value_attr>\n},
    qq{<input type="submit">\n},
    qq{</form>\n};
Let's make sure it works:
$ perl -e'print <<"__EOI__";
POST /zzz.cgi HTTP/1.0
Host: www.example.com
Content-Length: 11
key=Ch\xE2teau
__EOI__
' | nc www.example.com 80 | od -c
00000 H T T P / 1 . 1 2 0 0 O K \r
00020 \n D a t e : W e d , 2 8 A
00040 p r 2 0 1 0 2 2 : 1 0 : 1 4
00060 G M T \r \n S e r v e r : A p
00100 a c h e \r \n V a r y : A c c e
00120 p t - E n c o d i n g \r \n C o n
00140 t e n t - L e n g t h : 1 1 8
00160 \r \n C o n n e c t i o n : c l
00200 o s e \r \n C o n t e n t - T y p
00220 e : t e x t / h t m l ; c h
00240 a r s e t = i s o - 8 8 5 9 - 1
00260 5 \r \n \r \n < t i t l e > T e s t
00300 < / t i t l e > \n < f o r m m
00320 e t h o d = " P O S T " > \n < i
00340 n p u t t y p e = " t e x t "
00360 n a m e = " k e y " v a l u
00400 e = " C h 342 t e a u " > \n < i n
00420 p u t t y p e = " s u b m i t
00440 " > \n < / f o r m > \n
00453
Yup. Now let's test WWW::Mechanize.
use strict;
use warnings;
use open ':std', ':locale';
use charnames ':full';
use Encode qw( encode );
use WWW::Mechanize qw( );
use Devel::Peek qw( Dump );

# Location of the test form and the name of the field being exercised.
my $form_url   = 'http://www.server.com/zzz.cgi';
my $field_name = 'key';

# Spell the non-ASCII character by name so the value does not depend on
# the encoding this script file happens to be saved in.
my $val = "Ch\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}teau";

# autocheck is switched on; see the WWW::Mechanize documentation for how
# failed requests are then reported.
my $mech = WWW::Mechanize->new( autocheck => 1 );

# Fetch the form, fill in the field and post it back.
$mech->get($form_url);
$mech->field($field_name, $val);
$mech->submit();

# Inspect the field's value after the submit. Dump() is given the return
# value directly so its internals (flags, bytes) are shown as returned.
#print($mech->value($field_name), "\n");
Dump($mech->value($field_name));
Hmm, I get:
SV = PV(0x1167c20) at 0x11d05c0
REFCNT = 1
FLAGS = (TEMP,POK,pPOK,UTF8)
PV = 0x11572c0 "Ch\303\203\302\242teau"\0 [UTF8 "Ch\x{c3}\x{a2}teau"]
CUR = 10
LEN = 16
But I expect:
SV = PV(0x1167c20) at 0x11d05c0
REFCNT = 1
FLAGS = (TEMP,POK,pPOK,UTF8)
PV = 0x115bcf0 "Ch\303\242teau"\0 [UTF8 "Ch\x{e2}teau"]
CUR = 8
LEN = 16
or the equivalent
SV = PV(0x1167c20) at 0x11d05c0
REFCNT = 1
FLAGS = (TEMP,POK,pPOK)
PV = 0x115bcf0 "Ch\342teau"\0
CUR = 7
LEN = 16
Some debugging shows the server side is receiving the following:
"Ch\303\242teau"
That's the UTF-8 encoding of the value, so the problem lies in getting the right data to the server. OK, fine: maybe WWW::Mechanize naively sends the string's internal storage buffer instead of encoding it for the page. The workaround would be to encode the inputs yourself, as follows:
# Replaces the plain call:  $mech->field('key', $val);
# Encode the text to iso-8859-15 bytes by hand before filling in the field.
my $val_bytes = encode('iso-8859-15', $val);
$mech->field('key', $val_bytes);
$mech->submit();
But even with the change, the client side script is still sending the following to the server:
"Ch\303\242teau"
That's the UTF-8 encoding of the result of encode('iso-8859-15', $val). Does WWW::Mechanize assume the server expects UTF-8 rather than the page's encoding?
That's all I have time for right now.
- WWW-Mechanize-1.62
- libwww-perl-5.834
|