#!/usr/bin/perl # Simple program to remove duplicate email messages # from an mbox file. This program only looks at the content # of the message for uniqueness, not entire message with the headers. # There is no file locking, use this program on a backup # of your mbox file. # Enjoy. use strict; use warnings; use Digest::MD5 qw(md5_hex); #grab file names from the program parameters. #and do some error checking. my ($from, $to) = @_; die "usage: $0 from to" unless (defined $from && defined $to); my (%uniq, $msg); my ($head, $body); my $i = 0; $|++; open (my $fh, "<$from") || die "cannot open $from: $!"; while(<$fh>) { #emails in mbox files always begin with ^From #when /^From / is matched, process the previous message #then start on this message if(m/^From /) { next if ($msg eq ""); #increment the counter for a status report $i++; #print a status report if necessary. #I like to do it this way print '.' if(($i % 50) == 0); print " $i\n" if(($i % 1000) == 0); #since evolution can give different headers on the same message, #only hash the body of the message, and use that to compare to other #emails. The entire message will be stored in the hash though. ($head, $body) = split /\n\n/, $msg; #standard perl technique for removing duplicates, using hashes and #md5 files. $uniq{md5_hex($body)} = $msg; #done processing the previous message, start the next message $msg = $_; } else { #current line didn't match /^From / so this line is part of the #middle of the current message. Just tack it on to the end. $msg .= $_; } } #print the results to a file. open (my $th, ">$to") || die "cannot open $to: $!"; while(my ($k, $v) = each %uniq) { print $th $v; }