http://www.perlmonks.org?node_id=683392
Category: Win32
Author/Contact Info

Dr James Freeman
airmedical guess what goes here? gmail DOT com

Description:

Have you ever wanted to:

  1. Migrate from Outlook to another mail program like Thunderbird, Evolution, Netscape, Eudora, or even (al)Pine on *nix?
  2. Convert an Outlook/Eschange pst file into a standard Berkely unix mbox format?
  3. Extract the email headers from Microsoft Outlook?
  4. Extract the contents of your spam folder for analysis?
  5. Extract all the attachments from you email into a single folder?
  6. Understand Win32::OLE and how to use it to automate Windows stuff?
  7. Use Win32::OLE to recurse a tree structure?

BUT you have not been able to find the right tool/code/example/snippet?

Well this script does that using Win32::OLE and the Outlook and MAPI interfaces. You may need Outlook Redemption to bypass all the "security" related stuff that M$ did to break MAPI, although it worked fine for me using Outlook 2000 (SR-1 pre the patch from hell) and Windows XP. If it "breaks" with endless "Do you want to do that messages" let me know. Import tested into Windows ports of Thunderbird and Evolution. Optionally non RFC 1521 compliant message rebuild to deal with the, uhm, "features" of various mail programs.

use strict;
use warnings;
use MIME::Lite;
use HTML::Parser;
use Text::Wrap;
use File::Spec;
use Getopt::Long;
use Win32::OLE;
BEGIN { Win32::OLE->Initialize(Win32::OLE::COINIT_OLEINITIALIZE); } # 
+MAPI bug
use Win32::OLE::Const 'Microsoft Outlook'; 
use Win32::OLE::Const 'Microsoft CDO.*Library';
use Win32::OLE::Variant;
$|++;   
my $ROOT_FOLDER  = "Personal Folders";
my $TEXT_ONLY    = 0;                 
my $SAVEDIR      = "C:\\mbox\\"; 
my $ATTACH_PATH  = $SAVEDIR."attachments\\";   
my $DEBUG        = 0;
my $DEFAULT_EMAIL= 'nobody@nowhere.com';  # common to use "-" when mis
+sing
my $NO_SUBJECT   = 'No subject';          # common to use "-" when mis
+sing
my  @LOGON_PARAMS = undef;                # should work if not try unc
+ommenting

#my $SERVERNAME   = "computername";       # local computer name
#my $USERNAME     = getlogin();           # should be able to find "us
+ername"
#   @LOGON_PARAMS = ( {'ProfileInfo' => "$SERVERNAME\n$USERNAME" }, un
+def );

my ($DUMP, $NO_FOLDERS, $NO_SPACES, $THUNDERBIRD, $HELP, $ITEMS);

GetOptions(
    "root|r=s"          => \$ROOT_FOLDER,
    "attach|a=s"        => \$ATTACH_PATH,
    "savedir|s=s"       => \$SAVEDIR,
    "dump"              => \$DUMP,
    "nofolders|n"       => \$NO_FOLDERS,
    "nospaces"          => \$NO_SPACES,
    "text|t"            => \$TEXT_ONLY,
    "thunderbird"       => \$THUNDERBIRD,
    "debug|d=i"         => \$DEBUG,
    "help|h|?"          => \$HELP,
);

if ( $HELP or $ARGV[0] and $ARGV[0] =~ m/[h\?]/ ) {
    print "
Usage $0
    --root=<str>    # Root folder to extract. Defualt 'Personal Folder
+s'    (r)
    --savedir=<str> # Path to directory to save folders into.         
+      (s)
                    # Automagically created if it does not exist.
                    # Defaults to $SAVEDIR
    --attach=<str>  # Path to directory to save attachments into.     
+      (a)
                    # Automagically created if it does not exist.
                    # Defaults to $ATTACH_PATH
    --nofolders     # Flattens folder stucture, all files in root fold
+er    (n)
                    # with names like root.folder.subfolder
    --nospaces      # Remove spaces from filenames and paths
    --dump          # Dumps entire parse to STDOUT for redirection
    --text          # Override default MIME output. In text mode body 
+text  (t)
                    # and textified HTML are saved. Attachments extrac
+ted 
                    # and saved in attach path. Links to files are als
+o
                    # added at the end of the email so you can find th
+em.
    --thunderbird   # RFC 1521 calls for multipart alternative email t
+o be
                    # sent with the text part before the RTF before th
+e 
                    # HTML, however Thunderbird requires HTML first fo
+r
                    # it to behave as expected. Thunderbird avoids the
+ issue
                    # when it imports email with text and HTML by simp
+ly not
                    # importing the text at all! 
    --debug=N       # Levels 0=off; 1=folders&count; 2=+From; 3=+heade
+r     (d)
    --help          # Help but you must already know that             
+      (h)
    /?              # Windows style help request

";
system('pause');
print qq!
INSTRUCTIONS

First you need to get a .pst file you can open in Outlook. Maybe it's 
already there, but if you have got one from Exchange you need to point
Outlook at it. Make sure you have the folder view on View|Folder List.
Right click Personal Folders and select Open Personal Folders File (*.
+pst).
Browse to the pst, select it and click OK

Now run the script. Be aware this script is no speed demon and will ta
+ke 
about 1 minute to process every 20 MB of pst (on my old laptop). Close
+ down 
your destination mail reader so it can find the new folders when it re
+starts. 

Transfer everything in $SAVEDIR except for $ATTACH_PATH into (Win32):

Evolution:   
C:\\Documents and Settings\\%USER%\\.evolution\\mail\\local\\
Thunderbird
C:\\Documents and Settings\\%USER%\\Aplication Data\\Thunderbird
    \\Profiles\\<%SALT%.%USER%>\\Local Folders\\

When you open your mail agent the next time it will index all the new 
+mail.

Note in the process of exporting all the attachments are saved to file
+. These
files are found in $ATTACH_PATH.

PST files all have a root called "Personal Folders" that can't be chen
+ged
from within Outlook. To avoid name clashes the first one will be in 1.
+sbd,
the next in 2.sbd, etc.
!; 
  exit
}

print "Processing root folder: $ROOT_FOLDER\n" if $DEBUG;

do { mkdir $SAVEDIR or die "Can't mkdir $SAVEDIR $!\n" }
    unless -d $SAVEDIR;
do { mkdir $ATTACH_PATH or die "Can't mkdir $ATTACH_PATH $!\n" }
    unless -d $ATTACH_PATH;
 
my $OUTLOOK = Win32::OLE->new('Outlook.Application') 
    or die "Could not instatiate Outlook: " . Win32::OLE::LastError();
my $NS = $OUTLOOK->GetNamespace("MAPI")
    or die "Could not get namespace: " . Win32::OLE::LastError();
# Note that we are accessing Outlook via two similar but different OO 
+models
# We have to use MAPI to get at the headers via the Fields method
my $MAPI = Win32::OLE->new('MAPI.Session') 
    or die "Could not instantiate MAPI Session: " . Win32::OLE::LastEr
+ror();
$MAPI->Logon(@LOGON_PARAMS);  # returns undef regardless of success
die "Logon Failed: " . Win32::OLE::LastError() if Win32::OLE::LastErro
+r();

for my $parent ( 1..$NS->Folders->Count ) {
    get_fh($parent); # required for mail programs to find parent.sbd. 
+normally
                     # used to return a file handle but make an empty 
+file
    recurse_folders( $NS->Folders($parent), $parent );
}

sub recurse_folders {
    my ($folder, $parent) = @_;
    return if $folder->DefaultItemType != olMailItem;   # not a mail f
+older
    my $fh = get_fh("$parent/".$folder->Name);
    if ( ("$parent/".$folder->Name) =~ m/$ROOT_FOLDER/i ) {
        printf "\n\nFolder: %s\nTotal entries: %s\n", 
            "$parent/".$folder->Name, $folder->Items->Count if $DEBUG;
        my $headers = get_headers($folder);
        my $fake_header = 0;
        for my $i (1..$folder->Items->Count) {
            printf "Processing %d\r", ++$ITEMS unless $DEBUG;
            my $message = $folder->Items->Item($i);
            next unless $message->Class == olMail;         # mail item
            my $header  = $headers->{$message->EntryID};
            do{$fake_header++} if $header =~ m/X-Fake-Header: True/;
            next if $DEBUG and $DEBUG < 4 ;
            (my $html = $message->HTMLBody || '' ) =~ s/\r//g;
            (my $body = $message->Body     || '' ) =~ s/\r//g;
            my $attach = $message->Attachments ? get_attachments($mess
+age) : '';
            my $str = mbox_format($header,$body,$html,$attach);
            $str =~ tr/\11\12\40-\176//cd;  # ascify aka remove all cr
+ap chars
            print $fh $str;
        }
        print "Fake headers: $fake_header\n" if $DEBUG;
    }
    # recursive tree walk
    if ($folder->Folders and $folder->Folders->Count) {
        for my $i (1..$folder->Folders->Count) {
            recurse_folders($folder->Folders($i),"$parent/".$folder->N
+ame);
        }
    }
}

sub get_headers {
    my $folder = shift;
    my %headers;
    # ***here we change from using Outlook object to using MAPI object
+***
    $folder = $MAPI->GetFolder($folder->EntryID, $folder->StoreID) 
        or die Win32::OLE::LastError();
    my $items = $folder->Messages;
    print "Got ", $items->Count, " MAPI message(s)\n" if $DEBUG;
    for my $i (1..$items->Count) {
        my $message = $items->item($i);
        next unless $message;
        my $entry_id = $message->id;
        my $header = $message->Fields->Item(CdoPR_TRANSPORT_MESSAGE_HE
+ADERS);
        $header = $header ? $header->Value : missing_header($message);
        $header =~ s/\r//g;
        # create the From <someone> <asctime> line that starts an mbox
+ record
        my ($from) = $header =~ m/(?:From:|Return-path).*?\b([\w\.\-\%
+\+]+\@[\w\.\-]+)/i;
        $from ||= $DEFAULT_EMAIL;
        my ($date) = $header =~ m/Date:\s+(.+)/i;
        $date ||= get_date($message);
        $date = asctime_format($date);
        $headers{$entry_id} = "From $from $date\n$header";
        print "From $from $date\n" if $DEBUG > 1;
        print $header if $DEBUG > 2;
    }
    return \%headers
}

# Fake header a header
sub missing_header {
    my $message   = shift;
    my $date      = get_date($message);
    # we need eval as these calls explode it there is no data to get
    my $to        = eval{$message->Fields->Item(CdoPR_DISPLAY_TO)->Val
+ue;}            || $DEFAULT_EMAIL;
    my $from_email= eval{$message->Fields->Item(CdoPR_SENDER_EMAIL_ADD
+RESS)->Value;}  || $DEFAULT_EMAIL;
    my $from_name = eval{$message->Fields->Item(CdoPR_SENDER_NAME)->Va
+lue;}           || '';
    my $subject   = eval{$message->Fields->Item(CdoPR_SUBJECT)->Value;
+}               || $NO_SUBJECT;
    $from_name   = qq|"$from_name" | if $from_name;
    my $boundary = get_boundary();
    return <<HEADER;
Return-path: <$from_email>
From: $from_name<$from_email>
To: <$to>
Subject: $subject
Date: $date
MIME-Version: 1.0
Content-Type: multipart/mixed;
    boundary="$boundary"
X-Fake-Header: True

HEADER
}

# Standard mail form is 'Tue,  1 Jan 1980 00:00:01' ie not asctime
sub get_date {
    my $message = shift;
    my $str = $message->TimeLastModified->Date('ddd MMM d yyyy');
    my($wd,$m,$d,$y) = split " ", $str;
    my $t = $message->TimeLastModified->Time('hh:mm:ss');
    return sprintf "%3s, %2d %3s %4d %8s", $wd,$d,$m,$y,$t;
}

sub asctime_format { 
    sprintf "%3s %3s %2d %8s %4d",(split/[,\s]+/,$_[0])[0,2,1,4,3];
}

sub get_attachments {
    my $message = shift;
    my @attach;
    # need MAPI again to get CID via Fields method using "undocumented
+"
    # definitely unexported constant PR_ATTACH_CONTENT_LOCATION (0x371
+2001E)
    my $mapi_message = $MAPI->GetMessage($message->EntryID) 
        or die Win32::OLE::LastError();
    for my $i ( 1..$message->Attachments->Count ) {
        my $attachment = $message->Attachments->Item($i);
        my $cid = $mapi_message->Attachments->Item($i)->Fields->Item(0
+x3712001E) || '';
        $cid = $cid->Value if $cid; 
        my $filename = get_filename($attachment->Filename);
        $filename =~ s/[^\w\.\-\/ ]+/_/g;
        $filename =~ s/\s+/_/g if $NO_SPACES;
        my $filepath = File::Spec->catfile($ATTACH_PATH, $filename);
        push @attach, [ $filepath, $attachment->Filename, $cid ];
        $attachment->SaveAsFile($filepath) unless $DEBUG;
    }
    return \@attach;
}

sub attachment_path_map { 
    my $attach = shift;
    return '' unless @$attach;
    return "\n\n". (join '', map{"Attachment: $_->[0]\n"} @$attach) 
}

sub mbox_format {
    my ($header,$body,$html,$attach) = @_;
    my (undef,$boundary) = $header =~ m/\bBoundary\s*=\s*(['"]?)([^\1\
+n]+)\1/i;
    if ($TEXT_ONLY) {
        my $text = html2text($html);
        $body .= "\n\n<HTML>\n\n$text\n" if $text;
        $body .= attachment_path_map($attach);
        # (\n\s+.*)* removes extended lines below Content-Type:
        $header =~ s!Content-Type:\s.*(\n\s+.*)*!Content-Type: text/pl
+ain!i;
    }    
    elsif ($boundary) {
        my $msg = MIME::Lite->new( Type =>'multipart/mixed' );
        # although the RFC says run the parts from low->high definitio
+n
        # (ie TEXT->HTML) Thunderbird does not get it so as user may w
+ant 
        # preferential HTML you have to give Thunderbird the HTML firs
+t
        $msg->attach( Type => 'TEXT', 
                      Data => $body ) if $body and not $THUNDERBIRD; 
        $msg->attach( Type => 'HTML', 
                      Data => $html ) if $html;
        $msg->attach( Type => 'TEXT', 
                      Data => $body ) if $body and $THUNDERBIRD;
        for my $file (@$attach) {
            if ( $file->[2] ) {  # found a cid so inline it
                $msg->attach( Type => 'AUTO', 
                              Path => $file->[0], 
                              Id => $file->[2],
                              Filename => $file->[1],
                              Disposition => 'inline' )
            } else {
                $msg->attach( Type => 'AUTO', 
                              Path => $file->[0], 
                              Filename => $file->[1],
                              Disposition => 'attachment' )
            }
        }
        $msg->attr('content-type.boundary', $boundary );
        my $email =  $msg->as_string;
       (undef, my $mime) = split /\n\n/,$email,2;
        $body = $mime;
      
    }
    elsif ( $header =~ m!Content-Type:\s*text/html!i ) {
        $body = $html if $html;
    } 
    return "$header$body\n\n";
}

sub get_fh {
    my $folder = shift;
    $folder =~ s/[^\w\.\-\/ ]+/_/g;
    $folder =~ s!^/!!;
    $folder =~ s/\s+/_/g if $NO_SPACES;
    print "Folder: $folder\n" if $DEBUG;
    my $filepath;
    if ($NO_FOLDERS) { 
        $folder =~ s!/!.!g;
        $filepath = File::Spec->catfile( $SAVEDIR, $folder );
    }
    else {
        my @path = split "/", $folder;
        my $file = pop @path;
        @path = map{"$_.sbd"}@path;
        my $dir = File::Spec->catdir( $SAVEDIR, @path);
        safe_mkdir( $SAVEDIR, @path ) unless -d $dir;
        $filepath = File::Spec->catfile( $SAVEDIR, @path, $file );    
+    
    }

    print "Folder path: $filepath\n" if $DEBUG;
    return *STDOUT if $DEBUG or $DUMP;
    open my $fh, ">", $filepath or die "Can't write $filepath $!\n";
    return $fh;
}

sub safe_mkdir {
     my @dirs = ();
     while ( my $dir = shift ) {
        my $mkdir = File::Spec->catdir(@dirs,$dir);
        print "Doing mkdir $mkdir\n" if $DEBUG;
        push @dirs, $dir;
        next if -d $mkdir;
        mkdir $mkdir or die "Can't create directory $mkdir $!\n";
     }
}

# simple closure counter unique filename generator
{
    my $c;
    sub get_filename { 
        my $filename = shift;
        $c ||= time(); 
        $filename =~ s/[^\w\.\-]/_/g;
        sprintf "MB%08d_%s", $c++, $filename 
    }
}
# simple closure counter unique boundary generator
{
    my $c;
    sub get_boundary {
        $c ||= time(); 
        sprintf "----------_tachyon_%d", $c++; 
    }
}

sub html2text {
    my $html = shift;
    my %inside;
    my $text = '';
    my $tag = sub { $inside{$_[0]} += $_[1]; $text .= " " };
    my $txt = sub { $text .= $_[0] unless $inside{script} or $inside{s
+tyle} };
    HTML::Parser->new(  api_version => 3,
                        handlers => [ start => [$tag, "tagname, '+1'"]
+,
                                      end   => [$tag, "tagname, '-1'"]
+,
                                      text  => [$txt, "dtext"] ],
                        marked_sections => 1,
    )->parse($html);
    $text =~ tr/\11\12\40-\176//cd; # remove wide non ascii chars
    $text = Text::Wrap::fill('', '', $text);
    $text =~ s/^\s+//;
    return $text;
}

sub ole_dump {
    my $obj = shift;
    printf "%s %s\n", $_, $obj->$_ for sort keys %$obj;
}