Well...
I have a lot of gzipped csv files with the data from a legacy system I have to import. I don't want to uncompress them on disk and I don't want to read them entirely in memory. Also, I have to support multiline rows in the csv files (enclosed by a string delimiter).
After searching for existing modules, I couldn't find one that could read from a filehandle that is a pipe from gunzip, nor one that supports multiline rows... So I ended up with the following code:
This is the parser...
package CSVParse;
use strict;
use warnings;
# Constructor.
#
#   my $csv = CSVParse->new($fh);
#
# $fh is the filehandle to parse; it is stored unchanged in $self->{fh}.
# The parser settings are initialised to fixed defaults:
#   string_delim - '"'   opens/closes a quoted section
#   escape_char  - '\'   makes the next character literal
#   field_delim  - ','   separates columns
#   reg_delim    - "\n"  separates records (rows)
#
# Returns the new parser object.
sub new {
    my ($proto, $fh) = @_;

    # Bless into the invoking class (or the class of the invoking object)
    # instead of a hard-coded name, so that subclasses get instances of
    # their own class.
    my $class = ref($proto) || $proto;

    my $self = bless {
        string_delim => '"',
        escape_char  => "\\",
        field_delim  => ',',
        reg_delim    => "\n",
    }, $class;

    $self->{fh} = $fh;
    return $self;
}
# Reads a single column from $self->{fh}, one character at a time.
# We assume the handle is positioned at the start of a column!
#
# Parsing rules (all delimiters are compared against a single character):
#   * Outside quotes, escape_char makes the following character literal.
#   * string_delim opens a quoted section; every character up to the next
#     string_delim is taken literally, including field and record
#     delimiters (this is what allows multiline values). The escape
#     character is NOT interpreted inside a quoted section.
#   * An unquoted, unescaped field_delim or reg_delim ends the column.
#
# Side effects:
#   $self->{last_char_read} - last character read; when the column ended on
#                             a delimiter, this is that delimiter.
#   $self->{EOF}            - set to 1 when read() returns a false value
#                             (end of input or a read error).
#
# Returns the column text, or undef if no data character was read.
sub fetch_column {
    my $self = shift;

    my $context  = 'raw_data';
    my @contexts = ();          # contexts to return to when the current one ends
    my $data;

    while (1) {
        # Single-character buffer.
        my $buf;
        read($self->{fh}, $buf, 1) or do {
            $self->{EOF} = 1;
            last;
        };
        $self->{last_char_read} = $buf;

        if ($context eq 'escape') {
            # The escaped character is always data; then resume the
            # context that was active before the escape.
            $data = '' unless defined $data;
            $data .= $buf;
            $context = pop @contexts;
        }
        elsif ($context eq 'string') {
            if ($buf eq $self->{string_delim}) {
                # Closing quote: back to the enclosing context.
                $context = pop @contexts;
            }
            else {
                $data = '' unless defined $data;
                $data .= $buf;
            }
        }
        else {
            if ($buf eq $self->{escape_char}) {
                push @contexts, $context;
                $context = 'escape';
            }
            elsif ($buf eq $self->{string_delim}) {
                push @contexts, $context;
                $context = 'string';
            }
            elsif ($buf eq $self->{field_delim}
                || $buf eq $self->{reg_delim})
            {
                # End of column. The delimiter stays consumed and is
                # reported through last_char_read, which is what fetch_row
                # inspects. It must not be pushed back: the handle may be a
                # pipe (not seekable), and re-reading the delimiter would
                # yield empty columns forever. The former
                # seek(FH, 0, tell(FH) - 1) call also had POSITION and
                # WHENCE swapped, so on seekable handles it could jump to
                # the start or the end of the file.
                last;
            }
            else {
                $data = '' unless defined $data;
                $data .= $buf;
            }
        }
    }

    return $data;
}
# Reads the next row (record) by calling fetch_column() until a column ends
# on the record delimiter or the input is exhausted.
# We assume the handle starts at a valid position (the start of a row).
#
# Returns a reference to an array with the row's columns (a column for
# which no data was read is undef), or nothing/undef once there are no more
# rows.
sub fetch_row {
    my $self = shift;

    return if $self->{EOF};

    my @cols;
    while (1) {
        my $col = $self->fetch_column();

        if ($self->{EOF}) {
            # Input ended without a final record delimiter: keep whatever
            # was read for the last column instead of dropping it. An
            # undef column is only kept when it follows earlier columns
            # (i.e. the input ended right after a field delimiter).
            push @cols, $col if defined $col || @cols;
            last;
        }

        push @cols, $col;

        # fetch_column() consumed the delimiter that ended the column; a
        # record delimiter means the row is complete, a field delimiter
        # means another column follows.
        last if $self->{last_char_read} eq $self->{reg_delim};
    }

    # Nothing at all was read (EOF right after the previous row's record
    # delimiter): report end of data rather than a bogus empty row.
    return unless @cols;
    return \@cols;
}
# Drains the parser: calls fetch_row() repeatedly until it yields undef and
# returns all collected rows, in reading order, as a list of array
# references. Note that every row is held in memory at once.
sub parse_file {
    my ($parser) = @_;

    my @collected;
    while (defined(my $row = $parser->fetch_row())) {
        push @collected, $row;
    }

    return @collected;
}
1;
And this is a sample code...
# Stream the compressed CSV through gunzip so it is never uncompressed on
# disk. The list form of the pipe open runs gunzip directly (no shell), and
# "or" (not "||") is needed so that die() is tied to open() rather than to
# the last argument.
open(my $tabelaclientes, '-|', 'gunzip', '-c', 'somefile.csv.gz')
    or die "Cannot run gunzip on somefile.csv.gz: $!";
my $csv = CSVParse->new($tabelaclientes);

# fetch_row() returns undef once there are no more rows.
while (defined(my $row = $csv->fetch_row())) {
    # Columns for which no data was read come back as undef; skip them here
    # and print them as empty fields below.
    for (@$row) {
        utf8::decode($_) if defined $_;
    }
    print join(",", map { defined $_ ? $_ : '' } @$row) . "\n";
}

# close() on a pipe also reports a non-zero exit status from the child, so
# a failed decompression does not go unnoticed.
close($tabelaclientes)
    or die($! ? "Error closing gunzip pipe: $!"
              : "gunzip exited with status " . ($? >> 8));