Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

Re: Parse mailing addresses with a regex

by diotalevi (Canon)
on Jun 23, 2003 at 14:27 UTC ( [id://268175]=note: print w/replies, xml ) Need Help??


in reply to Parse mailing addresses with a regex

Once upon a time I tried to parse addresses and came up with some fugly code. You'd first want to check out Lingua::EN::AddressParse and see if you can use it as-is or modify it to suit your needs. If all else fails, here's the code I was using. Keep in mind this is two files: unabbrev.pm takes the standard US Postal Service abbreviations and expands them, and parse_address.pl probably reads one address per line. You will definitely need to modify this to use it.

Also, this is not how you should program. The prototypes should go, as should the direct access of @_, etc. Don't use this as a style guide. Please. This is not how I write Perl anymore, for production or otherwise. Ugly, ugly, ugly.

# ----------------------------------------------------------------------
# unabbrev.pm
#
# Expands postal-style abbreviations (and a handful of observed
# misspellings) in lower-case address text.  Matching is exact and
# case-sensitive: a word is expanded only when the whole word is a known
# abbreviation.
# ----------------------------------------------------------------------
use strict;
use warnings;

# Each rule is [ expansion => abbreviations... ].
my @ABBREVIATION_RULES = (
    [ 'east'          => qw(e) ],
    [ 'west'          => qw(w) ],
    [ 'north'         => qw(n no nth nrth) ],
    [ 'south'         => qw(s so) ],
    [ 'north east'    => qw(ne) ],
    [ 'north west'    => qw(nw) ],
    [ 'south east'    => qw(se) ],
    [ 'south west'    => qw(sw) ],
    [ 'avenue south'  => qw(avs aves) ],
    [ 'beach road'    => qw(beachrd) ],
    [ 'cedar'         => qw(ccedar) ],
    [ 'addition'      => qw(adn add'n) ],
    [ 'apache'        => qw(appache) ],
    [ 'apartment'     => qw(apt) ],
    [ 'apartments'    => qw(apts) ],
    [ 'avenue'        => qw(av ave) ],
    [ 'beach'         => qw(bch bchch beac) ],
    [ 'box'           => qw(bx b0x pobox po) ],
    [ 'boulevard'     => qw(blvd) ],
    [ 'burg'          => qw(brg) ],
    [ 'building'      => qw(bldg) ],
    [ 'center'        => qw(cen ctr) ],
    [ 'central'       => qw(centeral cental) ],
    [ 'character'     => qw(char) ],
    [ 'chase'         => qw(chas) ],
    [ 'chesapeake'    => qw(ches) ],
    [ 'chicago'       => qw(chig) ],
    [ 'circle'        => qw(cir) ],
    [ 'county'        => qw(cty co cnty) ],
    [ 'court'         => qw(ct crt cour) ],
    [ 'curve'         => qw(cr) ],
    [ 'creek'         => qw(crk) ],
    [ 'curl'          => qw(crl) ],
    [ 'crystal'       => qw(crystaln crytl) ],
    [ 'district'      => qw(dist) ],
    [ 'drive'         => qw(drv drve dr) ],
    [ 'estate'        => qw(est) ],
    [ 'forest'        => qw(fst) ],
    [ 'fort'          => qw(ft) ],
    [ 'government'    => qw(govt govern gov't) ],
    [ 'grove'         => qw(grv grov) ],
    [ 'highland'      => qw(hgld) ],
    [ 'highlands'     => qw(hglds) ],
    [ 'heights'       => qw(hgt hht height ht hghtss hghts) ],
    [ 'highway'       => qw(hy hyw hwy) ],
    [ 'island'        => qw(isl) ],
    [ 'junction'      => qw(jct jction jctn junctn juncton) ],
    [ 'junctions'     => qw(jctns jcts) ],
    [ 'loop'          => qw(l00p) ],
    [ 'lake'          => qw(lk lak) ],
    [ 'lakes'         => qw(lks) ],
    [ 'lil'           => qw(li'l) ],
    [ 'lane'          => qw(la lanes ln) ],
    [ 'mill'          => qw(ml) ],
    [ 'mills'         => qw(mls) ],
    [ 'market'        => qw(mkt) ],
    [ 'mount'         => qw(mt mnt) ],
    [ 'minneapolis'   => qw(mpls) ],
    [ 'mountain'      => qw(mtn mntain mntn) ],
    [ 'mountains'     => qw(mntns mtns) ],
    [ 'northbrook'    => qw(nrthbrk) ],
    [ 'unorganized'   => qw(unorg unorgized) ],
    [ 'penthouse'     => qw(ph) ],
    [ 'park'          => qw(pk prk) ],
    [ 'parkway'       => qw(pkwy parkwy pkway pky) ],
    [ 'place'         => qw(pl) ],
    [ 'plaza'         => qw(plaz) ],
    [ 'precinct'      => qw(prct) ],
    [ 'president'     => qw(pres) ],
    [ 'point'         => qw(pt) ],
    [ 'points'        => qw(pts) ],
    [ 'quarter'       => qw(qtr) ],
    [ 'quarters'      => qw(qtrs) ],
    [ 'route'         => qw(r rt rte rr rural) ],
    [ 'road'          => qw(rd) ],
    [ 'ridge'         => qw(rdg) ],
    [ 'resort'        => qw(resor) ],
    [ 'river'         => qw(ri rv riv rvr) ],
    [ 'rest'          => qw(rs rst) ],
    [ 'riverview'     => qw(rverview) ],
    [ 'shore'         => qw(shr shoar) ],
    [ 'shores'        => qw(shoars shrs) ],
    [ 'springs'       => qw(spgs spngs sprngs) ],
    [ 'street'        => qw(st str) ],
    [ 'service'       => qw(svc) ],
    [ 'terrace'       => qw(terr) ],
    [ 'township'      => qw(twp) ],
    [ 'trail'         => qw(tr trl trails trls) ],
    [ 'trailer'       => qw(trlr) ],
    [ 'vacation'      => qw(vac) ],
);

# abbreviation => expansion, built once from the rule list above.
my %EXPANSION_FOR;
for my $rule (@ABBREVIATION_RULES) {
    my ($expansion, @abbreviations) = @{$rule};
    for my $abbreviation (@abbreviations) {
        # Should an abbreviation ever be listed twice, the first rule wins.
        $EXPANSION_FOR{$abbreviation} = $expansion
            unless exists $EXPANSION_FOR{$abbreviation};
    }
}

# uword($text)
#
# Splits $text on whitespace, expands every word with unabrev() and
# returns the words re-joined with single spaces.  An undefined $text
# yields the empty string.
sub uword {
    my ($text) = @_;
    return '' unless defined $text;
    return join ' ', map { unabrev($_) } split ' ', $text;
}

# unabrev($word)
#
# Returns the expansion of a single word, or the word unchanged when it
# is not a known abbreviation.  Words with no word characters (and undef)
# are returned as-is.
#
# Every rule is a whole-word match.  The "court" and "beach" rules must
# not be prefix matches: a prefix match turns an already expanded "court"
# into "courtt" and "ctr" into "courtr" instead of "center".
sub unabrev {
    my ($word) = @_;
    return $word unless defined $word && $word =~ /\w/;

    return $EXPANSION_FOR{$word} if exists $EXPANSION_FOR{$word};

    # A box number glued onto the abbreviation: "bx12" -> "box 12".
    return "box $1" if $word =~ /^(?:bx|b0x)(.+)$/;

    return $word;
}

1;

# ----------------------------------------------------------------------
# address_parse.pl
#
# Reads "id<TAB>address" lines from the files named on the command line
# (or STDIN) and prints one tab-separated record per line:
#
#   id  house  odd  fraction  street  direction  type  unit
#
# Missing fields are printed as \N.  The classifiers below only recognise
# lower-case, fully spelled-out words ("north", "street", "apartment").
# ----------------------------------------------------------------------
#!/usr/bin/perl
use strict;
use warnings;

# parse_address($address)
#
# Breaks one address string into its components and returns a hash
# reference that may contain the keys house, odd, fraction, street,
# direction, type and unit.  A key is absent when that component was not
# found.  An undefined or empty address returns an empty hash.
sub parse_address {
    my ($address) = @_;
    my %record;

    # split ' ' (unlike split /\s+/) discards leading whitespace, so the
    # house number is still the first word of an indented address.
    my @words = defined $address ? split(' ', $address) : ();

    # Leading all-digit word: the house number, plus an odd/even flag.
    if (@words && $words[0] =~ /^\d+$/) {
        $record{house} = shift @words;
        $record{odd}   = $record{house} % 2 == 0 ? 'e' : 'o';
    }

    # Optional "1/2" directly after the house number.
    if (@words && $words[0] =~ m{^1/2$}) {
        $record{fraction} = shift @words;
    }

    # Unit: the last unit keyword and everything that follows it.
    UNIT:
    for my $index (reverse 0 .. $#words) {
        next UNIT unless is_unit($words[$index]);
        $record{unit} = join ' ', @words[$index .. $#words];
        $#words = $index - 1;
        last UNIT;
    }

    # No unit keyword: a final word ending in a digit is taken as the unit.
    if (!defined $record{unit} && @words && $words[-1] =~ /\d+$/) {
        $record{unit} = pop @words;
    }

    # Direction: prefer a trailing "east"/"west" (optionally preceded by
    # "north"/"south") or a trailing "north"/"south"; otherwise collect
    # every direction word wherever it appears.
    if (@words && is_ew($words[-1])) {
        $record{direction} = pop @words;
        if (@words && is_ns($words[-1])) {
            $record{direction} = pop(@words) . ' ' . $record{direction};
        }
    }
    elsif (@words && is_ns($words[-1])) {
        $record{direction} = pop @words;
    }
    else {
        my @directions = grep { is_ns($_) || is_ew($_) } @words;
        if (@directions) {
            $record{direction} = join ' ', @directions;
            @words = grep { !(is_ns($_) || is_ew($_)) } @words;
        }
    }

    # Drop words that contain no word character at all.
    @words = grep { /\w/ } @words;

    # Street type: the last word that looks like one.
    TYPE:
    for my $index (reverse 0 .. $#words) {
        next TYPE unless is_type($words[$index]);
        $record{type} = splice @words, $index, 1;
        last TYPE;
    }

    # A split-up "p o" / "p 0" becomes a "po" prefix on the unit.
    for my $index (0 .. $#words - 1) {
        next unless $words[$index] eq 'p' && $words[$index + 1] =~ /^[o0]$/;
        @words[$index, $index + 1] = ('', '');
        $record{unit} = exists $record{unit} ? "po $record{unit}" : 'po';
    }
    @words = grep { length } @words;

    # Whatever is left is the street name.  Testing @words rather than the
    # joined string keeps a street named "0" and leaves the key absent
    # when nothing remains.
    $record{street} = join ' ', @words if @words;

    return \%record;
}

# format_record($id, $record)
#
# Returns the tab-separated output line (without a trailing newline) for
# $id and a hash reference from parse_address().  Undefined values are
# written as \N; runs of spaces are collapsed and spaces next to a tab
# are removed.
sub format_record {
    my ($id, $record) = @_;

    my @fields = qw(house odd fraction street direction type unit);
    my $line   = join "\t",
        map { defined $_ ? $_ : '\\N' } $id, @{$record}{@fields};

    $line =~ s/ +/ /g;
    $line =~ s/ +\t/\t/g;
    $line =~ s/\t +/\t/g;

    return $line;
}

# main()
#
# Reads input with <>, parses each line and prints the formatted record.
# Only the first two tab-separated columns of a line are used.
sub main {
    while (my $input = <>) {
        chomp $input;
        my ($id, $address) = split /\t/, $input;
        print format_record($id, parse_address($address)), "\n";
    }
    return;
}

# True (1) when the whole word is "north" or "south", otherwise 0.
sub is_ns {
    my ($word) = @_;
    return defined $word && $word =~ /^(?:north|south)$/ ? 1 : 0;
}

# True (1) when the whole word is "east" or "west", otherwise 0.
sub is_ew {
    my ($word) = @_;
    return defined $word && $word =~ /^(?:east|west)$/ ? 1 : 0;
}

# True (1) when the word contains a unit keyword, otherwise 0.  The match
# is not anchored, so "apartments" also counts.
sub is_unit {
    my ($word) = @_;
    return 0 unless defined $word;
    return $word =~ /(?:box|apartment|lot|suite|campus|lower|upper|floor|gymnasium|hall|building)/
        ? 1 : 0;
}

# True (1) when the word contains a street-type keyword, otherwise 0.
# The match is not anchored, so any word containing a keyword counts
# (e.g. "parkview" contains "park").  The misspelled alternatives are
# kept deliberately.
sub is_type {
    my ($word) = @_;
    return 0 unless defined $word;
    return $word =~ /(?:alley|avenue|aveue|avneu|bay|boulevard|circle|court|courtt|courttt|cove|crest|curve|dale|drive|grove|highway|hill|knoll|lane|mall|orchard|park|parkway|pass|pines|place|plaza|raod|ridge|road|route|square|state|street|summit|ter|terrace|trail|walk|way)/
        ? 1 : 0;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. by tests) without reading any input.  Kept at the end of the file
# so everything above is set up first.
main() unless caller;

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://268175]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others chanting in the Monastery: (3)
As of 2024-04-18 01:21 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found