What's maybe controversial with my implementation is the use of a tied array as object. I know lots of people consider this evil. Basically, the data of the CSV file is stored in an 2D array with the column and rownames attached. The rational behind this is, most of the time you just want the data. Now you don't have to do something like $obj->get_data. Second, it's very close to the R syntax.
Update: As requested, the main code, without any POD.
package Text::CSV::R;
require 5.005;
use strict;
use warnings;
require Exporter;
use Text::CSV;
use Text::CSV::R::Matrix;
use Carp;
use Scalar::Util qw(reftype);
our @ISA = qw(Exporter);
our %EXPORT_TAGS = (
'all' => [
qw(
read_csv read_csv2 read_table read_delim rownames colnames
)
]
);
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our $VERSION = '0.01';
our $DEFAULT_OPTS = {
header => undef,
skip => 0,
nrow => -1,
sep_char => q{,},
quote_char => q{"},
allow_whitespace => 0,
binary => 1,
blank_lines_skip => 1,
};
# A mapping of the R options to the Text:CSV options. If there is no
# Text::CSV equivalent, the same option name is used (R options are
# not passed to Text::CSV).
our $R_OPT_MAP = {
sep => 'sep_char',
quote => 'quote_char',
skip => 'skip',
nrow => 'nrow',
header => 'header',
encoding => 'encoding',
row_names => 'row_names',
strip_white => 'allow_whitespace',
blank_lines_skip => 'blank_lines_skip',
};
sub colnames {
my ( $tied_ref, $values ) = @_;
my $tied_obj = tied @{$tied_ref};
if ( defined $values ) { $tied_obj->{colnames} = $values; }
return $tied_obj->{colnames};
}
sub rownames {
my ( $tied_ref, $values ) = @_;
my $tied_obj = tied @{$tied_ref};
if ( defined $values && reftype $values eq 'ARRAY' ) {
if (scalar @{$values} != scalar @{$tied_obj->{ARRAY}}) {
croak 'Invalid rownames length';
}
$tied_obj->{rownames} = $values;
}
return $tied_obj->{rownames};
}
# merge the global default options, function defaults and user options
sub _merge_options {
my ( $t_opt, $u_opt ) = @_;
my %ret = %{$DEFAULT_OPTS};
@ret{ keys %{$t_opt} } = values %{$t_opt};
@ret{ keys %{$u_opt} } = values %{$u_opt};
for my $k ( keys %{$R_OPT_MAP} ) {
if ( defined $ret{$k} ) {
$ret{ $R_OPT_MAP->{$k} } = $ret{$k};
}
}
return \%ret;
}
sub read_table {
my ( $file, %u_opt ) = @_;
return _read( $file, _merge_options( {}, \%u_opt ) );
}
sub read_csv {
my ( $file, %u_opt ) = @_;
my $t_opt = { sep_char => q{,}, header => 1, };
return _read( $file, _merge_options( $t_opt, \%u_opt ) );
}
sub read_csv2 {
my ( $file, %u_opt ) = @_;
my $t_opt = { sep_char => q{;}, header => 1, };
return _read( $file, _merge_options( $t_opt, \%u_opt ) );
}
sub read_delim {
my ( $file, %u_opt ) = @_;
my $t_opt = { sep_char => "\t", header => 1, };
return _read( $file, _merge_options( $t_opt, %u_opt ) );
}
# check if $file is a filehandle, if not open file with correct encodi
+ng.
# Then let _parse_fh do the work.
sub _read {
my ( $file, $opts ) = @_;
my $data_ref;
if (reftype \$file eq 'SCALAR') {
my $encoding = q{};
if (defined $opts->{encoding} && length $opts->{encoding} > 0)
+ {
$encoding = ':encoding(' . $opts->{encoding} . ')';
}
open my $IN, '<' . $encoding, $file or croak "Cannot open $fil
+e for reading: $!";
$data_ref = _parse_fh( $IN, $opts );
close $IN or croak "Cannot close $file: $!";
} else {
$data_ref = _parse_fh( $file, $opts );
}
return $data_ref;
}
# parsing of the file in a 2d array, store column and row names.
sub _parse_fh {
my ( $IN, $opts ) = @_;
my @data;
my $obj = tie @data, 'Text::CSV::R::Matrix';
my %text_csv_opts = %{$opts};
delete @text_csv_opts{ keys %{$R_OPT_MAP} };
my $csv = Text::CSV->new( \%text_csv_opts )
or croak q{Cannot use CSV: } . Text::CSV->error_diag();
# skip lines
my $line_number = 0;
while ( $line_number < $opts->{skip} && <$IN> ) {
$line_number++;
}
$line_number = 0;
my $max_cols = 0;
LINE:
while ( my $line = <$IN> ) {
chomp $line;
# blank_lines_skip option
if ( !length($line) && defined $opts->{'blank_lines_skip'}
&& $opts->{'blank_lines_skip'} )
{
next LINE;
}
my $status = $csv->parse($line)
or croak q{Cannot parse CSV: } . $csv->error_input();
push @data, [ $csv->fields() ];
if ( scalar( @{ $data[-1] } ) > $max_cols ) {
$max_cols = scalar @{ $data[-1] };
}
$line_number++;
# nrow option. Store one more because file might contain heade
+r.
last LINE
if ( defined $opts->{nrow}
&& $opts->{nrow} >= 0
&& $line_number > $opts->{nrow} );
}
my $auto_col_row = scalar @{$data[0]} == $max_cols - 1 ? 1 : 0;
# read column names
if ( $auto_col_row || ( defined $opts->{header} && $opts->{header}
+ ) ) {
colnames( \@data, shift @data );
}
else {
colnames( \@data, [ map { 'V' . $_ } 1 .. $max_cols ] );
if ( defined $opts->{nrow} && scalar(@data) > $opts->{nrow} )
+{
pop @data;
}
}
# read row names
my @rownames;
if ( $auto_col_row ) {
for my $row (@data) {
push @rownames, $row->[0];
shift @{$row};
}
} elsif ( defined $opts->{row_names} && reftype \$opts->{row_names
+} eq 'SCALAR' ) {
for my $row (@data) {
push @rownames, $row->[$opts->{row_names}];
splice @{$row}, $opts->{row_names}, 1;
}
} else {
@rownames = 1 .. scalar @data;
}
rownames(\@data, \@rownames);
return \@data;
}
1;