Beefy Boxes and Bandwidth Generously Provided by pair Networks
Think about Loose Coupling
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
I have the following script which stores email to a database. I'm having a problem with message subjects/bodies which contain utf-8 encoded data. The message is split up into parts by Mime::Parser. The parts are stored in /tmp/msg-*.

The utf-8 subjects show up like this in the mysql table: UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06Laj55qE5Li7?==?UTF-8?B?6aKY?=

I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.

My program is able to store the utf-8 message body correctly.

Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.

Can someone offer guidance?

MAIN PROGRAM #!/usr/bin/perl use strict; use DBI; use File::Type; use Date::Parse; use MIME::Parser; use Mail::POP3Client; use Getopt::Std; use Data::Dumper; use POSIX qw(strftime); use Compress::Zlib qw(compress); binmode(STDOUT, ":utf8"); $|++; my %opts; getopts('d', \%opts); my $dbh = DBI->connect( 'dbi:mysql:mail_archive', 'username', 'password', { AutoCommit => 1 } ) or die $DBI::errstr; $dbh->{mysql_enable_utf8} = 1; $dbh->do("set character set utf8"); $dbh->do("set names utf8"); my %query; $query{max_pack} = $dbh->prepare(<<EOQ); set max_allowed_packet=16776192 EOQ $query{last_id} = $dbh->prepare(<<EOQ); select last_insert_id() EOQ $query{body_md5_match} = $dbh->prepare(<<EOQ); select body_id from MA_body where check_sum = md5(?) and body = ? EOQ $query{attach_md5_match} = $dbh->prepare(<<EOQ); select attach_id from MA_attach where check_sum = md5(?) and attach = +? EOQ $query{ins_header} = $dbh->prepare(<<EOQ); insert into MA_hdr ( subject, msgid, body_id, full_header ) values (?,?,?,?) EOQ $query{ins_addr} = $dbh->prepare(<<EOQ); insert into MA_addr ( email_header_id, hdr_id, header_type, email_id ) values (?,?,?,?) EOQ $query{lkup_email} = $dbh->prepare(<<EOQ); select email_id from MA_email where email_addr = ? EOQ $query{ins_email} = $dbh->prepare(<<EOQ); insert ignore into MA_email (email_addr) values (?) EOQ $query{ins_body} = $dbh->prepare(<<EOQ); insert into MA_body ( body, check_sum ) values (?, md5(?) ) EOQ $query{ins_attach} = $dbh->prepare(<<EOQ); insert into MA_attach ( attach_name, mime_type, check_sum, attach ) values (?, ?, md5(?), ?) EOQ $query{ins_attach_addr} = $dbh->prepare(<<EOQ); insert into MA_attach_addr ( hdr_id, attach_id ) values (?,?) EOQ $query{max_pack}->execute(); my $ft = File::Type->new(); POP3_CONNECTION: my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net"); $pop->User("mail_archive_mirror"); $pop->Pass("PTWH7EJU"); ### Loop forever. while (1) { if (! $pop->Connect()) { sleep 2; goto POP3_CONNECTION; } ### Now we iterate over each message present on the server. for (my $num = 1; $num <= $pop->Count(); $num++) { my $message = $pop->Retrieve($num); ### Cut message into parts. my $parts = parse_message($message); ### Store parts to sql tables. create_record($parts); ### Delete the message. $pop->Delete($num); } sleep 1; } ################################################################# sub debug { ################################################################# print @_, "\n" if $opts{d}; } ################################################################# sub create_record { ################################################################# my $parts = shift(); my $header_id; my $body_id; my $header_cntr = 1; ### Check to see if we have an existing body record. $query{body_md5_match}->execute($parts->{body}, $parts->{body}); if ($query{body_md5_match}->rows() == 1) { ($body_id) = $query{body_md5_match}->fetchrow_array(); } if (!$body_id) { ### We need to add a body record, becuase one doesn't exist. $query{ins_body}->execute( $parts->{body}, $parts->{body}, ); $query{last_id}->execute(); ($body_id) = $query{last_id}->fetchrow_array(); } ### Insert the header record for the message. $query{ins_header}->execute( $parts->{subject}, $parts->{'Message-ID'}, $body_id, $parts->{full_header}, ); $query{last_id}->execute(); ($header_id) = $query{last_id}->fetchrow_array(); ### Insert addr records for the header parts. for my $addr qw(from to cc) { for my $email (@{$parts->{"distinct_$addr"}}) { $query{ins_email}->execute($email); $query{lkup_email}->execute($email); my ($email_id) = $query{lkup_email}->fetchrow_array(); $query{ins_addr}->execute( $header_cntr++, $header_id, $addr, $email_id, ); } } ### Add attachments records. if (-d $parts->{output_dir}) { opendir DIR, $parts->{output_dir}; my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r +eaddir DIR; closedir DIR; chdir $parts->{output_dir}; ATTACH_LOOP: for my $attachment (@attachments) { open FILE, $attachment; my $contents = do { local $/; <FILE> }; close FILE; my $mt = $ft->mime_type($contents); my $contents_gz = compress($contents, 9); ### We skip large attachments. $query{attach_md5_match}->execute($contents_gz, $contents_ +gz) or next ATTACH_LOOP; my ($attach_id) = $query{attach_md5_match}->fetchrow_array +(); debug("# attach_id attach_md5_match: $attach_id"); if (!$attach_id) { $query{ins_attach}->execute( $attachment, $mt, $contents_gz, $contents_gz, ); $query{last_id}->execute(); ($attach_id) = $query{last_id}->fetchrow_array(); debug("# new attachment inserted into MA_attach: $atta +ch_id"); } ### Insert attach_addr record. $query{ins_attach_addr}->execute( $header_id, $attach_id, ); } } } ################################################################# sub parse_message { ################################################################# my $message = shift; my $parser = MIME::Parser->new() or return 0; $parser->ignore_errors(1) or return 0; $parser->extract_uuencode(1) or return 0; $parser->output_under('/tmp'); my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $parts = { map { my $val = $header->get($_); chomp $val if $val; $_ => $val; } qw(subject date to from cc Message-ID) }; $parts->{full_header} = $header->as_string(); ### Remove Trash from mail address fields. for my $var (qw(from to cc)) { $parts->{$var} = lc $parts->{$var}; # my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\. +[a-z0-9]*/g; my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\. +[a-z0-9\._-]*/g; my %seen; ### Get a uniqe list. for my $elmt (@matches) { $seen{$elmt}++; } @matches = keys %seen; ### Separate values with comma if (@matches) { $parts->{$var} = join ',', @matches; # $parts->{$var} =~ s/.$//; if ($var eq "from" || $var eq "to" || $var eq "cc") { $parts->{"distinct_$var"} = \@matches; } } } ### Gives the name of the output directory tree. $parts->{output_dir} = $parser->output_dir(); ### Put body together. if (-d $parts->{output_dir}) { $parts->{body} = ""; opendir DIR, $parts->{output_dir}; my @body_parts = grep { /^msg/ } readdir DIR; sort @body_parts; closedir DIR; chdir $parts->{output_dir}; for my $part (@body_parts) { open FILE, $part; my $contents = do { local $/; <FILE> }; close FILE; $parts->{body} .= $contents; } } ### $parts->{body} = join "", @{$entity->body()}; debug("# Email parts"); debug(Dumper($parts)); return $parts; } __END__

In reply to Mime::Parser utf-8 issue by mhearse

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others romping around the Monastery: (5)
    As of 2014-07-26 06:41 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      My favorite superfluous repetitious redundant duplicative phrase is:









      Results (175 votes), past polls