Beefy Boxes and Bandwidth Generously Provided by pair Networks
P is for Practical
 
PerlMonks  

parsembox

by vxp (Pilgrim)
on Jul 28, 2002 at 21:51 UTC ( #185851=sourcecode: print w/ replies, xml ) Need Help??

Category: Text Processing
Author/Contact Info vpolyakov@katrillion.com
Description: This is a small script that parses an mbox file and extracts email addresses from it (I use it at work to parse a bounce file and collect the bounced addresses for various purposes). Feel free to modify it, use it, whatever. (Credit info: this was actually not written by me, but by the previous network admin.)
#!/usr/bin/perl -w
use strict;

# Usage: parsembox <mbox-file>
#
# Reads the mbox file named on the command line, appends each extracted
# address to "addr.list" and appends every message for which no address
# was found to "nomatch.mbox" (both in the current directory).

my $file = shift;
die "Usage: $0 mbox-file\n" unless defined($file);

my $msg = undef;        # text of the message currently being accumulated
my $count = 0;          # number of "From " separator lines seen
my $count_match = 0;    # number of addresses logged via print_addr
my $blank = 1;          # true if the previous line was empty; starts true so
                        # the very first line may be a message separator
my $matched = 0;        # true once an address was logged for the current message
my $addr_flag = undef;  # when defined, names how to pull the address out of
                        # the next non-blank line ("std", "to", "postfix", "space")

# Three-argument open keeps characters in $file from being read as an open
# mode; the messages now report the OS error ($!), which the original
# misspelled as "!$".
open(MBOX, '<', $file)
        or die "Couldn't open mbox: $!\n";
open(ADDR_LOG, '>>', 'addr.list')
        or die "Couldn't open list: $!\n";
open(NOMATCH, '>>', 'nomatch.mbox')
        or die "Couldn't open nomatch: $!\n";
# print_addr($addr)
#
# Records one extracted address: appends it to the ADDR_LOG handle, marks
# the current message as matched, bumps the match counter and clears the
# pending extraction mode in $addr_flag.
#
# If $addr is undefined or empty (the caller's capture found nothing), no
# line is written and the message is not counted as matched; only
# $addr_flag is cleared so that scanning of the message continues. This
# mirrors what the main loop's "to" branch does itself when its capture
# fails.
sub print_addr
{
        my $addr = shift;

        if (!defined($addr) || $addr eq '')
        {
                $addr_flag = undef;
                return;
        }

        # Debugging aid kept from the original: presumably meant to expose a
        # match that was evaluated in scalar context (yielding 1) -- TODO confirm.
        print "$addr_flag\n" if ($addr eq '1' && defined($addr_flag));

        print ADDR_LOG "$addr\n";
        $addr_flag = undef;
        $matched = 1;
        $count_match++;
        return;
}

# Main scan. Each line of the mbox is either a "From " separator (only
# recognised directly after a blank line or at the start of the file) or
# part of the current message. Within a message, lines are tested against
# a list of known bounce phrasings until one address has been logged:
#   - some phrasings carry the address on the same line and call
#     print_addr directly;
#   - others announce that the address follows, and set $addr_flag to the
#     extraction mode to apply to the next non-blank line.
# All patterns are written on a single line; the copy of this script that
# was posted had display line-wrap markers (newline plus "+") inside the
# regex and string literals, which altered or broke them.
while (<MBOX>)
{
        if ($blank && /\AFrom .*\d{4}/)
        {
                # New message: flush the previous one if nothing was extracted.
                $count++;
                print NOMATCH $msg if (!$matched && defined($msg));
                $msg = $_;
                $blank = 0;
                $matched = 0;
        } else {
                $msg .= $_;
                # \Z matches before the trailing newline, so this is true for
                # an empty line.
                $blank = m#\A\Z# ? 1 : 0;
                if (!$blank && !$matched)
                {
                        if (!defined($addr_flag))
                        {
                                if (/^\s-+ The following addresses had permanent fatal errors -+$/)
                                {
                                        $addr_flag = "std";
                                } elsif (/not accepting mail with attachments or embedded images:?$/) {
                                        my ($addr) = /Your mail to (.*) could not/;
                                        print_addr("$addr\@aol.com");
                                } elsif (/permanent error; I've given up\. Sorry it didn't work out\.$/) {
                                        $addr_flag = "std";
                                } elsif (/undeliverable to the following:$/) {
                                        $addr_flag = "postfix";
                                } elsif (/Final-Recipient:/) {
                                        my ($addr) = /822;(.*)/;
                                        if (defined($addr))
                                        {
                                                $addr =~ s/^\s+//;
                                                # Prefer the part inside <...> when present;
                                                # matched against $addr directly so $_ is
                                                # left untouched.
                                                if ($addr =~ /<(.*)>/)
                                                {
                                                        $addr = $1;
                                                }
                                                print_addr($addr);
                                        }
                                } elsif (/Receiver not found:/) {
                                        my ($addr) = /Receiver not found:(.*)/;
                                        $addr =~ s/^\s+//;
                                        print_addr("$addr\@compuserve.com");
                                } elsif (/delete existing messages and then empty their trash/) {
                                        $addr_flag = "std";
                                } elsif (/^was not delivered to:$/) {
                                        $addr_flag = "space";
                                } elsif (/^Your message$/) {
                                        $addr_flag = "to";
                                } elsif (/^recipients\. The following address\(es\) failed:$/) {
                                        $addr_flag = "space";
                                } elsif (/^Delivery to the following recipients failed\.$/) {
                                        $addr_flag = "space";
                                } elsif (/Here is your List of Failed Recipients/) {
                                        $addr_flag = "std";
                                } elsif (/The user\(s\) account is temporarily over quota/) {
                                        $addr_flag = "std";
                                } elsif (/-+Transcript of session follows -+/) {
                                        $addr_flag = "space";
                                } elsif (/Reason: Not in authentication system/) {
                                        my ($addr) = /to '(.*)'/;
                                        print_addr($addr);
                                } elsif (/Reason: User .* is not found in the cc:Mail Directory/) {
                                        my ($addr) = /User "(.*)"/;
                                        print_addr($addr);
                                } elsif (/^User unknown: /) {
                                        my ($addr) = /^User unknown: (.*)/;
                                        print_addr($addr);
                                } elsif (/User mailbox exceeds allowed size/) {
                                        my ($addr) = /allowed size: (.*)/;
                                        print_addr($addr);
                                }
                        } else {
                                # A previous line announced that the address follows;
                                # $addr_flag says how to extract it from this line.
                                if ($addr_flag eq "std")
                                {
                                        my ($addr) = /<(.*)>/;
                                        print_addr($addr);
                                } elsif ($addr_flag eq "to") {
                                        my ($addr) = /\sTo:\s*(.*)/;
                                        if (defined($addr))
                                        {
                                                print_addr($addr);
                                        } else {
                                                $addr_flag = undef;
                                        }
                                } elsif ($addr_flag eq "postfix") {
                                        my ($addr) = /\s(.*) \(user not found\)/;
                                        print_addr($addr);
                                } elsif ($addr_flag eq "space") {
                                        # Non-greedy capture anchored at end of line so the
                                        # optional trailing colon is really dropped; with
                                        # the original greedy (.*) the ":?" never took effect.
                                        my ($addr) = /^\s*(.*?):?\s*$/;
                                        print_addr($addr);
                                } elsif ($addr_flag eq "wrap-std") {
                                        # NOTE(review): nothing in this script sets the
                                        # "wrap-*" modes; kept as in the original.
                                        $addr_flag = "std";
                                } elsif ($addr_flag eq "wrap-to") {
                                        $addr_flag = "to";
                                } elsif ($addr_flag eq "wrap-space") {
                                        $addr_flag = "space";
                                }
                        }
                }
        }
}

# The loop only flushes an unmatched message when the next "From " line
# arrives, so the final message has to be flushed here.
print NOMATCH $msg if (!$matched && defined($msg));

print "Total: $count\n";
print "Match: $count_match\n";
print "Miss : " . ($count - $count_match) . "\n";

close(ADDR_LOG);
close(MBOX);
close(NOMATCH);

Edit by dws to add <code> tags

Comment on parsembox
Download Code
Re: parsembox
by vxp (Pilgrim) on Jul 28, 2002 at 21:55 UTC
    I'm sorry, it appears that there is no indentation or formatting whatsoever. However, when I click on "edit your code", it is formatted there, _with_ indentation... :-)
      you must write <code> </code> tags around your code
Re: parsembox
by ehdonhon (Curate) on Jul 29, 2002 at 00:14 UTC

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://185851]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others examining the Monastery: (9)
As of 2015-07-06 08:55 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (70 votes), past polls