Here I start by checking whether the string contains a valid month name. If it does, I extract the dates from the string; otherwise I skip that line.
Note that I had to add a space after the first '...' because '...September' is not a valid month name.
Updated to return valid dates
#!/usr/bin/perl
# filename: extract_dates.pl
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Indent = 1;
# Driver: feed every line of the DATA section to findMonth() and collect
# whatever it returns (one hash reference per line that holds a date,
# nothing for the other lines).
my @dates;
while ( my $line = <DATA> ) {
    push @dates, findMonth($line);
}

# Dump the collected results -- or do something else with your dates.
print Dumper( \@dates );
# findMonth($line)
#
# Looks for a month name in one line of text and hands the line to
# extractDate() for each candidate until a date is recognised.
#
#   $line - the text to search.  It is split on single spaces, so a month is
#           only recognised when it stands alone as a word (full English
#           name or three-letter abbreviation, in any letter case).
#
# Returns whatever extractDate() returns for the first candidate word that
# yields a date, or nothing (empty list / undef) when no candidate does.
sub findMonth {
    my ($line) = @_;
    return unless defined $line;

    my @words = split / /, $line;

    # "may" is both the full name and the abbreviation.
    my %is_month = map { $_ => 1 } qw(
        january february march     april   may      june
        july    august   september october november december
        jan feb mar apr jun jul aug sep oct nov dec
    );

    # extractDate() receives the words re-joined with single spaces.
    my $text = "@words";

    for my $word (@words) {
        next unless exists $is_month{ lc $word };

        # A month-like word is not always part of a date ("May I ..."), so
        # keep scanning when this candidate does not lead to one.
        my $found = extractDate( { MONTH => $word, STRING => $text } );
        return $found if $found;
    }

    return;
}
# extractDate(\%args)
#
# Searches a line for a date built around a known month word and passes the
# matched text on to makeValidDate().
#
#   $args->{MONTH}  - the month word exactly as it appears in the line
#   $args->{STRING} - the line to search
#
# Recognised shapes include "April 15, 1994", "September 21-23, 1994",
# "11 February 1994", "16th September 1994" and "March 4, 5 and 6, 1994".
#
# Returns the result of makeValidDate() on a match, otherwise nothing
# (empty list / undef).
sub extractDate {
    my $self = shift;

    my $month  = $self->{MONTH};
    my $string = $self->{STRING};
    return unless defined $month && defined $string;

    # The word boundaries stop the tail of a longer number from being taken
    # as a day ("1994 September 21" must not yield day 4) and stop the month
    # from matching inside another word.  The month is quoted so that it is
    # always matched literally.
    my ($date) = $string =~ m{
        (
            (?:
                \b [123]?\d                 # optional leading day: "16"
                (?: st | nd | rd | th )?    # optional ordinal suffix
                \s+
            )?
            \b \Q$month\E                   # the month word itself
            \s \d{1,4}                      # a day or a year after the month
            (?:
                (?: - [123]?\d )?           # day range: "21-23"
                (?: ,\s [123]?\d \s )*      # further listed days: ", 5 "
                (?: and \s \d+ )?           # last listed day: "and 6"
                (?: ,\s \d{4} )?            # trailing year: ", 1994"
            )?
        )
    }x;

    return unless defined $date;
    return makeValidDate( { STRING => $string, DATE => $date } );
}
# makeValidDate(\%args)
#
# Turns the date text found by extractDate() into formatted calendar dates.
#
#   $args->{STRING} - the line the date was found in
#   $args->{DATE}   - the matched date text, e.g. "March 4, 5 and 6, 1994"
#                     or "September 21-23, 1994"
#
# Returns a hash reference.  When at least one day could be formatted it
# holds:
#   dates  - array reference with one UnixDate "%D" string per day
#   string - STRING without its trailing newline
# Otherwise the hash is empty.
sub makeValidDate {
    use Date::Manip;    # supplies ParseDate() and UnixDate()

    my $self = shift;
    my ($string) = $self->{STRING} =~ /^(.+)$/;

    # Work on a copy so the caller's hash is left untouched.
    my $text = $self->{DATE};

    # Drop ordinal suffixes only where they follow a digit ("16th" -> "16").
    # Removing "st"/"nd"/"rd"/"th" anywhere would turn "August" into "Augu".
    $text =~ s/(?<=\d)(?:st|nd|rd|th)\b//gi;

    # Split on everything that is neither a word character nor a dash, so
    # that a day range such as "21-23" stays together as one token.
    my $date = {};
    for my $token ( split /[^\w-]+/, $text ) {
        if ( $token =~ /^\d{1,2}$/ ) {
            push @{ $date->{days} }, $token;
        }
        elsif ( $token =~ /^\d{4}$/ ) {
            $date->{year} = $token;
        }
        elsif ( $token =~ /^(\d+)-(\d+)$/ ) {
            push @{ $date->{days} }, $1 .. $2;    # expand the range
        }
        elsif ( $token =~ /^[[:alpha:]]+$/ && lc $token ne 'and' ) {
            $date->{month} = lc $token;
        }

        # Any other token (empty field, 3-digit number, ...) is ignored
        # rather than being mistaken for the month.
    }

    my $out_date = {};
    for my $day ( @{ $date->{days} || [] } ) {

        # Leave out parts that were not found (typically the year) instead
        # of concatenating undef.
        my $spec = join ' ', grep { defined } $day, $date->{month},
          $date->{year};

        my $formatted = UnixDate( ParseDate($spec), '%D' );
        next unless defined $formatted;    # skip dates that cannot be formatted

        push @{ $out_date->{dates} }, $formatted;
        $out_date->{string} = $string;
    }

    return $out_date;
}
__DATA__
... the next social club meeting is on April 15, 1994...
... September 21-23, 1994 we will be hosting visitors...
... submissions should be made by 11 February 1994...
... Mail sent 7 Feb 1994...
... On the 16th September 1994 Mr X will be giving a talk on ...
... unconfirmed conference dates are March 4, 5 and 6, 1994...
$VAR1 = [
{
'dates' => [
'04/15/94'
],
'string' => '... the next social club meeting is on April 15, 1994
+...'
},
{
'dates' => [
'09/21/94',
'09/22/94',
'09/23/94'
],
'string' => '... September 21-23, 1994 we will be hosting visitors
+...'
},
{
'dates' => [
'02/11/94'
],
'string' => '... submissions should be made by 11 February 1994...
+'
},
{
'dates' => [
'02/07/94'
],
'string' => '... Mail sent 7 Feb 1994...'
},
{
'dates' => [
'09/16/94'
],
'string' => '... On the 16th September 1994 Mr X will be giving a
+talk on ...'
},
{
'dates' => [
'03/04/94',
'03/05/94',
'03/06/94'
],
'string' => '... unconfirmed conference dates are March 4, 5 and 6
+, 1994...'
}
];
|