#!/usr/bin/perl -w use strict; use Fcntl qw(:flock :seek); print qq(Adress Extractor V 0.0.1 by Timothy Dutton (C) 2007 This program will extract addresses from a plain text file and saves a new file in tab-delimited database format. -Press Enter to Continue-\n); my $input = <>; print qq(Addresses can be extracted in the following formats: John and Jane Doe 123 Anywhere St. Smalltown, AZ 12345 John and Jane Doe 123 Anywhere St. Smalltown, AZ 12345 123-234-3456 John and Jane Doe 123 Anywhere St. Smalltown, AZ 12345 (123)123-1234 ); my $filename; my $outname; my $file; my $outfile; my $found; my $count; # main program loop while (1) { print "Enter file name: (Enter x to exit)\n"; $input = <>; chomp($input); $filename = $input || ''; $outname = ''; if ($filename =~ /^(.*)(\..*)/) { my $name = $1 || ''; my $ext = $2 || ''; $outname = $name . '_out' . $ext; } $file = &getFile($filename); $outfile = ''; $found = 1; $count = 0; while ($found) { # look for addresses if ($file =~ /\s*([a-z\. ]+)\s*\n\s*([a-z0-9\. #]+)\s*\n\s*([a-z \.]+)\s*,\s*([a-z \.]+)\s*\n?(?:\s*(\d{1,15}(?:-\d{1,4})?)\s*\n)?(?:\s*(\+?(?:1\s*[-\/\.]?)?(?:\((?:\d{3})\)|(?:\d{3}))\s*[-\/\.]?\s*(?:\d{3})\s*[-\/\.]?\s*(?:\d{4})(?:(?:[ \t]*[xX]|[eE][xX][tT])\.?[ \t]*(?:\d+))*))?/i) { $file =~ s/\s*([a-z\. ]+)\s*\n\s*([a-z0-9\. #]+)\s*\n\s*([a-z \.]+)\s*,\s*([a-z \.]+)\s*\n?(?:\s*(\d{1,15}(?:-\d{1,4})?)\s*\n)?(?:\s*(\+?(?:1\s*[-\/\.]?)?(?:\((?:\d{3})\)|(?:\d{3}))\s*[-\/\.]?\s*(?:\d{3})\s*[-\/\.]?\s*(?:\d{4})(?:(?:[ \t]*[xX]|[eE][xX][tT])\.?[ \t]*(?:\d+))*))?//i; my @address = ($1,$2,$3,$4,$5,$6); my @no_uc = qw(a an the and but or nor as at by for in of on to from into onto with as by); foreach my $i (0 .. 5) { $address[$i] = '' unless $address[$i]; # make string all lower case; $address[$i] =~ tr/[A-Z]/[a-z]/; # capitalize first letters of words $address[$i] =~ s/\b(\w+)\b/ucfirst($1)/ge; # DE-capitalize the @no_uc words, unless they are the first or last foreach my $word (@no_uc) { # don't lower case if at beginning or end of string # so make sure there is white space on both sides of word $address[$i] =~ s/(\s)($word)(\s)/$1 . lc($2) . $3/ige; } print "Processing ... $address[$i]\n"; } # add the address to the outfile $outfile .= join("\t",@address) . "\n"; $count++ # no addresses found } else { $found = 0; } } #write the file &saveFile($outname,$outfile); print qq($count addresses were saved to $outname.\n\n); } # end main program loop exit; ############# sub getFile { my $filename = shift; open(IN,"$filename") or &cry("Can't open $filename: $!","Error at " . __LINE__ . ": Can't open $filename: $!"); flock(IN, LOCK_SH); # shared lock seek(IN, 0, SEEK_SET); # rewind to beginning my(@file) = ; close(IN); if (wantarray) { return @file; } else { my $file = join("",@file); return $file; } } ############## sub saveFile { my $filename = shift; my $data = shift; my $error = ''; # open file for append or overwrite #open(OUT, ">>$filename"); #open file to append open(OUT, ">$filename") or $error = "Cannot Save $filename\n"; #open/create file to overwrite flock(OUT, LOCK_EX); # set exclusive lock print OUT $data; #append the page enter time to file close(OUT); # close file return $error; }