Beefy Boxes and Bandwidth Generously Provided by pair Networks
Problems? Is your data what you think it is?
 
PerlMonks  

Mime::Parser utf-8 issue

by mhearse (Hermit)
on Jun 11, 2009 at 22:55 UTC ( #770782=perlquestion: print w/ replies, xml ) Need Help??
mhearse has asked for the wisdom of the Perl Monks concerning the following question:

I have the following script which stores email to a database. I'm having a problem with message subjects/bodies which contain utf-8 encoded data. The message is split up into parts by Mime::Parser. The parts are stored in /tmp/msg-*.

The utf-8 subjects show up like this in the mysql table: UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06Laj55qE5Li7?==?UTF-8?B?6aKY?=

I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.

My program is able to store the utf-8 message body correctly.

Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.

Can someone offer guidance?

MAIN PROGRAM #!/usr/bin/perl use strict; use DBI; use File::Type; use Date::Parse; use MIME::Parser; use Mail::POP3Client; use Getopt::Std; use Data::Dumper; use POSIX qw(strftime); use Compress::Zlib qw(compress); binmode(STDOUT, ":utf8"); $|++; my %opts; getopts('d', \%opts); my $dbh = DBI->connect( 'dbi:mysql:mail_archive', 'username', 'password', { AutoCommit => 1 } ) or die $DBI::errstr; $dbh->{mysql_enable_utf8} = 1; $dbh->do("set character set utf8"); $dbh->do("set names utf8"); my %query; $query{max_pack} = $dbh->prepare(<<EOQ); set max_allowed_packet=16776192 EOQ $query{last_id} = $dbh->prepare(<<EOQ); select last_insert_id() EOQ $query{body_md5_match} = $dbh->prepare(<<EOQ); select body_id from MA_body where check_sum = md5(?) and body = ? EOQ $query{attach_md5_match} = $dbh->prepare(<<EOQ); select attach_id from MA_attach where check_sum = md5(?) and attach = +? EOQ $query{ins_header} = $dbh->prepare(<<EOQ); insert into MA_hdr ( subject, msgid, body_id, full_header ) values (?,?,?,?) EOQ $query{ins_addr} = $dbh->prepare(<<EOQ); insert into MA_addr ( email_header_id, hdr_id, header_type, email_id ) values (?,?,?,?) EOQ $query{lkup_email} = $dbh->prepare(<<EOQ); select email_id from MA_email where email_addr = ? EOQ $query{ins_email} = $dbh->prepare(<<EOQ); insert ignore into MA_email (email_addr) values (?) EOQ $query{ins_body} = $dbh->prepare(<<EOQ); insert into MA_body ( body, check_sum ) values (?, md5(?) ) EOQ $query{ins_attach} = $dbh->prepare(<<EOQ); insert into MA_attach ( attach_name, mime_type, check_sum, attach ) values (?, ?, md5(?), ?) EOQ $query{ins_attach_addr} = $dbh->prepare(<<EOQ); insert into MA_attach_addr ( hdr_id, attach_id ) values (?,?) EOQ $query{max_pack}->execute(); my $ft = File::Type->new(); POP3_CONNECTION: my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net"); $pop->User("mail_archive_mirror"); $pop->Pass("PTWH7EJU"); ### Loop forever. while (1) { if (! $pop->Connect()) { sleep 2; goto POP3_CONNECTION; } ### Now we iterate over each message present on the server. for (my $num = 1; $num <= $pop->Count(); $num++) { my $message = $pop->Retrieve($num); ### Cut message into parts. my $parts = parse_message($message); ### Store parts to sql tables. create_record($parts); ### Delete the message. $pop->Delete($num); } sleep 1; } ################################################################# sub debug { ################################################################# print @_, "\n" if $opts{d}; } ################################################################# sub create_record { ################################################################# my $parts = shift(); my $header_id; my $body_id; my $header_cntr = 1; ### Check to see if we have an existing body record. $query{body_md5_match}->execute($parts->{body}, $parts->{body}); if ($query{body_md5_match}->rows() == 1) { ($body_id) = $query{body_md5_match}->fetchrow_array(); } if (!$body_id) { ### We need to add a body record, becuase one doesn't exist. $query{ins_body}->execute( $parts->{body}, $parts->{body}, ); $query{last_id}->execute(); ($body_id) = $query{last_id}->fetchrow_array(); } ### Insert the header record for the message. $query{ins_header}->execute( $parts->{subject}, $parts->{'Message-ID'}, $body_id, $parts->{full_header}, ); $query{last_id}->execute(); ($header_id) = $query{last_id}->fetchrow_array(); ### Insert addr records for the header parts. for my $addr qw(from to cc) { for my $email (@{$parts->{"distinct_$addr"}}) { $query{ins_email}->execute($email); $query{lkup_email}->execute($email); my ($email_id) = $query{lkup_email}->fetchrow_array(); $query{ins_addr}->execute( $header_cntr++, $header_id, $addr, $email_id, ); } } ### Add attachments records. if (-d $parts->{output_dir}) { opendir DIR, $parts->{output_dir}; my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r +eaddir DIR; closedir DIR; chdir $parts->{output_dir}; ATTACH_LOOP: for my $attachment (@attachments) { open FILE, $attachment; my $contents = do { local $/; <FILE> }; close FILE; my $mt = $ft->mime_type($contents); my $contents_gz = compress($contents, 9); ### We skip large attachments. $query{attach_md5_match}->execute($contents_gz, $contents_ +gz) or next ATTACH_LOOP; my ($attach_id) = $query{attach_md5_match}->fetchrow_array +(); debug("# attach_id attach_md5_match: $attach_id"); if (!$attach_id) { $query{ins_attach}->execute( $attachment, $mt, $contents_gz, $contents_gz, ); $query{last_id}->execute(); ($attach_id) = $query{last_id}->fetchrow_array(); debug("# new attachment inserted into MA_attach: $atta +ch_id"); } ### Insert attach_addr record. $query{ins_attach_addr}->execute( $header_id, $attach_id, ); } } } ################################################################# sub parse_message { ################################################################# my $message = shift; my $parser = MIME::Parser->new() or return 0; $parser->ignore_errors(1) or return 0; $parser->extract_uuencode(1) or return 0; $parser->output_under('/tmp'); my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $parts = { map { my $val = $header->get($_); chomp $val if $val; $_ => $val; } qw(subject date to from cc Message-ID) }; $parts->{full_header} = $header->as_string(); ### Remove Trash from mail address fields. for my $var (qw(from to cc)) { $parts->{$var} = lc $parts->{$var}; # my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\. +[a-z0-9]*/g; my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\. +[a-z0-9\._-]*/g; my %seen; ### Get a uniqe list. for my $elmt (@matches) { $seen{$elmt}++; } @matches = keys %seen; ### Separate values with comma if (@matches) { $parts->{$var} = join ',', @matches; # $parts->{$var} =~ s/.$//; if ($var eq "from" || $var eq "to" || $var eq "cc") { $parts->{"distinct_$var"} = \@matches; } } } ### Gives the name of the output directory tree. $parts->{output_dir} = $parser->output_dir(); ### Put body together. if (-d $parts->{output_dir}) { $parts->{body} = ""; opendir DIR, $parts->{output_dir}; my @body_parts = grep { /^msg/ } readdir DIR; sort @body_parts; closedir DIR; chdir $parts->{output_dir}; for my $part (@body_parts) { open FILE, $part; my $contents = do { local $/; <FILE> }; close FILE; $parts->{body} .= $contents; } } ### $parts->{body} = join "", @{$entity->body()}; debug("# Email parts"); debug(Dumper($parts)); return $parts; } __END__

Comment on Mime::Parser utf-8 issue
Select or Download Code
Re: Mime::Parser utf-8 issue
by blahblahblah (Priest) on Jun 12, 2009 at 03:30 UTC
    I don't think that MIME::Parser touches the encoded headers. You should be able to decode them with something like the following:
    use Encode; my $val = $header->get($_); $val = Encode::decode('MIME-Header', $val); $val = Encode::encode('utf8', $val);
    (I'm sure you could even go further and combine my two lines into one, using Encode::from_to or something like that.)

    Joe

    P.S. Here's an obscure tip that you'll probably never need to worry about: the "Remove Trash..." block of your code should technically come after the decoding that I described above, just in case there is a comma in the encoded data which would be significant to your splitting of the From/To/Cc headers.

      Thanks for you reply. It works great. I have only one small problem. When running the proceeding code, I end up with some trash at the beginning of the subject. Such as: &#1514;\xB7\xA2. Any suggestions?
      my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $utf8 = decode('MIME-Header', $header); $header = encode('MIME-Header', $utf8);
        Hmmm... I'm not sure. This code works for me, with the header value from your original post.
        use Encode; my $header = '=?UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06La +j55qE5Li7?==?UTF-8?B?6aKY?='; my $utf8 = decode('MIME-Header', $header); print "uft8: $utf8\n";
        output is:
        uft8: 从博客文章中查找您感兴趣的主题

        Of course, I can't read any Chinese, so I have no idea if those are the right characters or just gibberish.

Re: Mime::Parser utf-8 issue
by runrig (Abbot) on Jan 22, 2013 at 22:51 UTC
    All you need is:
    my $entity = $parser->parse_data($message); my $header = $entity->head(); my $subject = Encode::decode('MIME-Header', $header->get('subject'));
    I'm not sure why it took nearly 300 lines of code to ask the question...

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://770782]
Approved by ww
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chanting in the Monastery: (5)
As of 2015-07-05 14:26 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (67 votes), past polls