I needed to extract contacts from a text file that were listed in mailing lable format (originally from a word doc) like this:
Mr. and Mrs. John Doe
3528 East Main St
Centerville, Iowa
85213
CRAIG and LORI SHOEMAKER
2939 LEATHER WAY
COBBLER, MS 85204
543-216-7890
Zip and phone are optional. Zip is either on line with city and state, or on next line.
Capitalization is changed to First letters only capitalized with the help of my "Capitalize Title" Regex. A couple more regexes ensure certain small words are left lower case unless at the beginning or end of a string. Here is the perl code I used:
#!/usr/bin/perl -w
use strict;
use Fcntl qw(:flock :seek);
print qq(Adress Extractor V 0.0.1
by Timothy Dutton
(C) 2007
This program will extract addresses from a plain text file
and saves a new file in tab-delimited database format.
-Press Enter to Continue-\n);
my $input = <>;
print qq(Addresses can be extracted in the following formats:
John and Jane Doe
123 Anywhere St.
Smalltown, AZ 12345
John and Jane Doe
123 Anywhere St.
Smalltown, AZ 12345
123-234-3456
John and Jane Doe
123 Anywhere St.
Smalltown, AZ
12345
(123)123-1234
);
my $filename;
my $outname;
my $file;
my $outfile;
my $found;
my $count;
# main program loop
while (1) {
print "Enter file name: (Enter x to exit)\n";
$input = <>;
chomp($input);
$filename = $input || '';
$outname = '';
if ($filename =~ /^(.*)(\..*)/) {
my $name = $1 || '';
my $ext = $2 || '';
$outname = $name . '_out' . $ext;
}
$file = &getFile($filename);
$outfile = '';
$found = 1;
$count = 0;
while ($found) {
# look for addresses
if ($file =~ /\s*([a-z\. ]+)\s*\n\s*([a-z0-9\. #]+)\s*\n\s*([a-z \
+.]+)\s*,\s*([a-z \.]+)\s*\n?(?:\s*(\d{1,15}(?:-\d{1,4})?)\s*\n)?(?:\s
+*(\+?(?:1\s*[-\/\.]?)?(?:\((?:\d{3})\)|(?:\d{3}))\s*[-\/\.]?\s*(?:\d{
+3})\s*[-\/\.]?\s*(?:\d{4})(?:(?:[ \t]*[xX]|[eE][xX][tT])\.?[ \t]*(?:\
+d+))*))?/i) {
$file =~ s/\s*([a-z\. ]+)\s*\n\s*([a-z0-9\. #]+)\s*\n\s*([a-z \.
+]+)\s*,\s*([a-z \.]+)\s*\n?(?:\s*(\d{1,15}(?:-\d{1,4})?)\s*\n)?(?:\s*
+(\+?(?:1\s*[-\/\.]?)?(?:\((?:\d{3})\)|(?:\d{3}))\s*[-\/\.]?\s*(?:\d{3
+})\s*[-\/\.]?\s*(?:\d{4})(?:(?:[ \t]*[xX]|[eE][xX][tT])\.?[ \t]*(?:\d
++))*))?//i;
my @address = ($1,$2,$3,$4,$5,$6);
my @no_uc = qw(a an the and but or nor as at by for in of on to
+from into onto with as by);
foreach my $i (0 .. 5) {
$address[$i] = '' unless $address[$i];
# make string all lower case;
$address[$i] =~ tr/[A-Z]/[a-z]/;
# capitalize first letters of words
$address[$i] =~ s/\b(\w+)\b/ucfirst($1)/ge;
# DE-capitalize the @no_uc words, unless they are the first or
+ last
foreach my $word (@no_uc) {
# don't lower case if at beginning or end of string
# so make sure there is white space on both sides of word
$address[$i] =~ s/(\s)($word)(\s)/$1 . lc($2) . $3/ige;
}
print "Processing ... $address[$i]\n";
}
# add the address to the outfile
$outfile .= join("\t",@address) . "\n";
$count++
# no addresses found
} else {
$found = 0;
}
}
#write the file
&saveFile($outname,$outfile);
print qq($count addresses were saved to $outname.\n\n);
}
# end main program loop
exit;
#############
sub getFile {
my $filename = shift;
open(IN,"$filename") or &cry("Can't open $filename: $!","Error at "
+. __LINE__ . ": Can't open $filename: $!");
flock(IN, LOCK_SH); # shared lock
seek(IN, 0, SEEK_SET); # rewind to beginning
my(@file) = <IN>;
close(IN);
if (wantarray) {
return @file;
} else {
my $file = join("",@file);
return $file;
}
}
##############
sub saveFile {
my $filename = shift;
my $data = shift;
my $error = '';
# open file for append or overwrite
#open(OUT, ">>$filename"); #open file to append
open(OUT, ">$filename") or $error = "Cannot Save $filename\n"; #open
+/create file to overwrite
flock(OUT, LOCK_EX); # set exclusive lock
print OUT $data; #append the page enter time to file
close(OUT); # close file
return $error;
}
|