In order to avoid failure with embedded newlines (or your other record-separator of choice), I use this:
# Read $filename record by record, using the separator returned by
# $self->record_delimiter, and push the result of ReadRecord() for each
# logical record onto @{$ar_returnvalue}.
#
# A physical record holding an odd number of doublequotes (") ends inside a
# quoted field, i.e. the separator that terminated it was part of the field's
# data. Such a record is extended with the following physical records until
# one of them also holds an odd number of doublequotes, which closes the field.
{
    # local (rather than a manual save/restore) guarantees that $/ is put
    # back even when confess() or ReadRecord() dies part-way through.
    local $/ = $self->record_delimiter;

    open(my $delim_fh, '<', $filename)
        or Carp::confess("Cannot open file [$filename]: $!");

    while (defined(my $record = <$delim_fh>)) {
        chomp $record;

        # tr/"// in scalar context counts the doublequotes without copying
        # or splitting the string.
        if (($record =~ tr/"//) % 2 == 1) {
            # Keep reading and appending until another physical record with
            # an odd number of doublequotes closes the open field.
            # NOTE(review): if the file ends first, the incomplete record is
            # still handed to ReadRecord() as-is.
            while (defined(my $continuation = <$delim_fh>)) {
                chomp $continuation;

                # Re-insert the separator that chomp removed from the
                # previous chunk: it is part of the quoted field's data.
                $record .= $/ . $continuation;

                last if ($continuation =~ tr/"//) % 2 == 1;
            }
        }

        push(@{$ar_returnvalue}, ReadRecord($self, $record));
    }

    close($delim_fh);
}
And ReadRecord uses a regex to consume the string field by field:
# Consume $inputstring one field at a time. Each pass matches a single field
# (quoted or unquoted) at the start of the string, places its decoded value in
# $field_value, and replaces $inputstring with whatever follows the field's
# trailing delimiter.
# NOTE(review): nothing in the code shown here stores $field_value before the
# next pass overwrites it -- presumably that happens in code not included in
# this excerpt; confirm.
my $field_value;
my $delimiter = $self->field_delimiter;

# Escape the delimiter so that characters such as ^ ] \ - are taken literally
# when it is interpolated into the character classes below.
my $delimiter_class = quotemeta $delimiter;

# Test the length, not the truth, of the remaining text: a remaining string
# of "0" is false in Perl and would otherwise end the loop one field early.
while (defined $inputstring && length $inputstring) {
    undef $field_value;
    if ($inputstring =~ /^"/) {
        # Quoted field: any mix of non-quote characters and doubled quotes,
        # possibly none at all (an empty "" field), then the closing quote
        # followed by a delimiter or the end of the string.
        if ($inputstring =~ /^"((?:[^"]|"")*)"(?:[$delimiter_class]|$)/p) {
            ($field_value, $inputstring) = ($1, ${^POSTMATCH});
            # Unescape escaped quotes
            $field_value =~ s/""/"/g;
        } else {
            Carp::confess("Parsing error with remaining data [$inputstring]");
        }
    } else {
        # Unquoted field: everything up to the next delimiter or the end of
        # the string; a doublequote is not allowed inside it.
        if ($inputstring =~ /^([^$delimiter_class"]*)(?:[$delimiter_class]|$)/p) {
            ($field_value, $inputstring) = ($1, ${^POSTMATCH});
        } else {
            # Without this branch nothing is consumed and the loop would
            # never terminate (e.g. on a stray quote in an unquoted field).
            Carp::confess("Parsing error with remaining data [$inputstring]");
        }
    }
}
This conforms to RFC 4180 :)