Beefy Boxes and Bandwidth Generously Provided by pair Networks
Syntactic Confectionery Delight
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
I have the following script which stores email to a database. I'm having a problem with message subjects/bodies which contain utf-8 encoded data. The message is split up into parts by Mime::Parser. The parts are stored in /tmp/msg-*.

The utf-8 subjects show up like this in the mysql table: UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06Laj55qE5Li7?==?UTF-8?B?6aKY?=

I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.

My program is able to store the utf-8 message body correctly.

Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.

Can someone offer guidance?

MAIN PROGRAM #!/usr/bin/perl use strict; use DBI; use File::Type; use Date::Parse; use MIME::Parser; use Mail::POP3Client; use Getopt::Std; use Data::Dumper; use POSIX qw(strftime); use Compress::Zlib qw(compress); binmode(STDOUT, ":utf8"); $|++; my %opts; getopts('d', \%opts); my $dbh = DBI->connect( 'dbi:mysql:mail_archive', 'username', 'password', { AutoCommit => 1 } ) or die $DBI::errstr; $dbh->{mysql_enable_utf8} = 1; $dbh->do("set character set utf8"); $dbh->do("set names utf8"); my %query; $query{max_pack} = $dbh->prepare(<<EOQ); set max_allowed_packet=16776192 EOQ $query{last_id} = $dbh->prepare(<<EOQ); select last_insert_id() EOQ $query{body_md5_match} = $dbh->prepare(<<EOQ); select body_id from MA_body where check_sum = md5(?) and body = ? EOQ $query{attach_md5_match} = $dbh->prepare(<<EOQ); select attach_id from MA_attach where check_sum = md5(?) and attach = +? EOQ $query{ins_header} = $dbh->prepare(<<EOQ); insert into MA_hdr ( subject, msgid, body_id, full_header ) values (?,?,?,?) EOQ $query{ins_addr} = $dbh->prepare(<<EOQ); insert into MA_addr ( email_header_id, hdr_id, header_type, email_id ) values (?,?,?,?) EOQ $query{lkup_email} = $dbh->prepare(<<EOQ); select email_id from MA_email where email_addr = ? EOQ $query{ins_email} = $dbh->prepare(<<EOQ); insert ignore into MA_email (email_addr) values (?) EOQ $query{ins_body} = $dbh->prepare(<<EOQ); insert into MA_body ( body, check_sum ) values (?, md5(?) ) EOQ $query{ins_attach} = $dbh->prepare(<<EOQ); insert into MA_attach ( attach_name, mime_type, check_sum, attach ) values (?, ?, md5(?), ?) EOQ $query{ins_attach_addr} = $dbh->prepare(<<EOQ); insert into MA_attach_addr ( hdr_id, attach_id ) values (?,?) EOQ $query{max_pack}->execute(); my $ft = File::Type->new(); POP3_CONNECTION: my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net"); $pop->User("mail_archive_mirror"); $pop->Pass("PTWH7EJU"); ### Loop forever. while (1) { if (! $pop->Connect()) { sleep 2; goto POP3_CONNECTION; } ### Now we iterate over each message present on the server. for (my $num = 1; $num <= $pop->Count(); $num++) { my $message = $pop->Retrieve($num); ### Cut message into parts. my $parts = parse_message($message); ### Store parts to sql tables. create_record($parts); ### Delete the message. $pop->Delete($num); } sleep 1; } ################################################################# sub debug { ################################################################# print @_, "\n" if $opts{d}; } ################################################################# sub create_record { ################################################################# my $parts = shift(); my $header_id; my $body_id; my $header_cntr = 1; ### Check to see if we have an existing body record. $query{body_md5_match}->execute($parts->{body}, $parts->{body}); if ($query{body_md5_match}->rows() == 1) { ($body_id) = $query{body_md5_match}->fetchrow_array(); } if (!$body_id) { ### We need to add a body record, becuase one doesn't exist. $query{ins_body}->execute( $parts->{body}, $parts->{body}, ); $query{last_id}->execute(); ($body_id) = $query{last_id}->fetchrow_array(); } ### Insert the header record for the message. $query{ins_header}->execute( $parts->{subject}, $parts->{'Message-ID'}, $body_id, $parts->{full_header}, ); $query{last_id}->execute(); ($header_id) = $query{last_id}->fetchrow_array(); ### Insert addr records for the header parts. for my $addr qw(from to cc) { for my $email (@{$parts->{"distinct_$addr"}}) { $query{ins_email}->execute($email); $query{lkup_email}->execute($email); my ($email_id) = $query{lkup_email}->fetchrow_array(); $query{ins_addr}->execute( $header_cntr++, $header_id, $addr, $email_id, ); } } ### Add attachments records. if (-d $parts->{output_dir}) { opendir DIR, $parts->{output_dir}; my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r +eaddir DIR; closedir DIR; chdir $parts->{output_dir}; ATTACH_LOOP: for my $attachment (@attachments) { open FILE, $attachment; my $contents = do { local $/; <FILE> }; close FILE; my $mt = $ft->mime_type($contents); my $contents_gz = compress($contents, 9); ### We skip large attachments. $query{attach_md5_match}->execute($contents_gz, $contents_ +gz) or next ATTACH_LOOP; my ($attach_id) = $query{attach_md5_match}->fetchrow_array +(); debug("# attach_id attach_md5_match: $attach_id"); if (!$attach_id) { $query{ins_attach}->execute( $attachment, $mt, $contents_gz, $contents_gz, ); $query{last_id}->execute(); ($attach_id) = $query{last_id}->fetchrow_array(); debug("# new attachment inserted into MA_attach: $atta +ch_id"); } ### Insert attach_addr record. $query{ins_attach_addr}->execute( $header_id, $attach_id, ); } } } ################################################################# sub parse_message { ################################################################# my $message = shift; my $parser = MIME::Parser->new() or return 0; $parser->ignore_errors(1) or return 0; $parser->extract_uuencode(1) or return 0; $parser->output_under('/tmp'); my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $parts = { map { my $val = $header->get($_); chomp $val if $val; $_ => $val; } qw(subject date to from cc Message-ID) }; $parts->{full_header} = $header->as_string(); ### Remove Trash from mail address fields. for my $var (qw(from to cc)) { $parts->{$var} = lc $parts->{$var}; # my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\. +[a-z0-9]*/g; my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\. +[a-z0-9\._-]*/g; my %seen; ### Get a uniqe list. for my $elmt (@matches) { $seen{$elmt}++; } @matches = keys %seen; ### Separate values with comma if (@matches) { $parts->{$var} = join ',', @matches; # $parts->{$var} =~ s/.$//; if ($var eq "from" || $var eq "to" || $var eq "cc") { $parts->{"distinct_$var"} = \@matches; } } } ### Gives the name of the output directory tree. $parts->{output_dir} = $parser->output_dir(); ### Put body together. if (-d $parts->{output_dir}) { $parts->{body} = ""; opendir DIR, $parts->{output_dir}; my @body_parts = grep { /^msg/ } readdir DIR; sort @body_parts; closedir DIR; chdir $parts->{output_dir}; for my $part (@body_parts) { open FILE, $part; my $contents = do { local $/; <FILE> }; close FILE; $parts->{body} .= $contents; } } ### $parts->{body} = join "", @{$entity->body()}; debug("# Email parts"); debug(Dumper($parts)); return $parts; } __END__

In reply to Mime::Parser utf-8 issue by mhearse

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others meditating upon the Monastery: (10)
    As of 2015-07-02 05:00 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









      Results (27 votes), past polls