#!/usr/bin/perl

# Simple program to remove duplicate email messages
# from an mbox file. This program only looks at the content
# of the message for uniqueness, not entire message with the headers.
# There is no file locking, use this program on a backup
# of your mbox file.
# Enjoy.
#
# Usage: <this script> from to
#   from - mbox file to read
#   to   - file that receives the de-duplicated messages
#
# The first message seen with a given body is the one that is kept, and
# the kept messages are written in the order they appear in the input.
# All unique messages are held in memory until the input has been read
# completely; the output file is only opened after that.

use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

# Grab the file names from the command line. At file scope @_ is empty,
# so the program parameters have to be taken from @ARGV.
my ($from, $to) = @ARGV;
die "usage: $0 from to" unless (defined $from && defined $to);

my %seen;        # md5 of a message body => number of times it was seen
my @kept;        # unique messages, in input order
my $msg = '';    # text of the message currently being collected
my $i   = 0;     # messages processed so far, for the status report

# Unbuffer STDOUT so the progress dots show up immediately.
$|++;

# Store one complete message unless a message with the same body has
# already been stored. An empty string (nothing collected yet) is ignored.
sub record_message {
    my ($message) = @_;
    return if $message eq '';

    # Print a status report if necessary: a dot every 50 messages and
    # the running total every 1000.
    $i++;
    print '.'     if ($i % 50) == 0;
    print " $i\n" if ($i % 1000) == 0;

    # Since evolution can give different headers on the same message,
    # only the body is hashed. The headers end at the first blank line;
    # the limit of 2 keeps the rest of the message together as the body
    # instead of cutting it off at its own first blank line.
    my (undef, $body) = split /\n\n/, $message, 2;
    $body = '' unless defined $body;    # message without any blank line

    # Standard perl technique for removing duplicates with a hash: only
    # the first message that produces a given digest is kept.
    return if $seen{ md5_hex($body) }++;
    push @kept, $message;
    return;
}

open(my $fh, '<', $from) or die "cannot open $from: $!";
while (my $line = <$fh>) {
    # Emails in mbox files always begin with ^From. When it is matched,
    # the previous message is complete: store it, then start collecting
    # the new message with this very line.
    if ($line =~ m/^From /) {
        record_message($msg);
        $msg = $line;
    }
    else {
        # Not a separator, so this line is part of the current message.
        $msg .= $line;
    }
}
close($fh);

# No "From " line follows the last message, so it has to be stored here.
record_message($msg);

# Print the results to a file. Write errors may only surface when the
# buffered handle is closed, so close is checked as well.
open(my $th, '>', $to) or die "cannot open $to: $!";
print {$th} @kept or die "cannot write to $to: $!";
close($th) or die "cannot close $to: $!";
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: Remove Duplicates from a mbox file
by Anonymous Monk on Sep 23, 2003 at 22:36 UTC | |
by Anonymous Monk on Mar 24, 2010 at 02:44 UTC | |
by coolmichael (Deacon) on Sep 24, 2003 at 05:41 UTC | |
by Anonymous Monk on Oct 11, 2007 at 03:20 UTC | |
by Anonymous Monk on Oct 21, 2009 at 13:40 UTC |
Back to
Craft