Thanks. I will try this out.
And here is some simple code I have so far. It just loops through the column and collects characteristics of the column into attributes that might be useful. For example, one thing it does is determine if the row is sorted. If it is, and the header is not in the proper sort order, it sets a "has_sort_mismatch" flag. One thing the code accounts for is that the header might be multiple rows anywhere in the first 5 rows of the spreadsheet. That makes things slightly trickier.
package Data::Column ;
use Moose;
use Scalar::Util qw(looks_like_number);
use Log::Log4perl::Shortcuts qw (:all);
use namespace::autoclean;
has 'first_cell' => (is => 'ro', isa => 'Defined', lazy =>
+1, default => 0, writer => '_set_first_cell' );
has 'blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_blank_count' );
has 'mixed_case_count' => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_mixed_case_count' );
has 'upper_case_count' => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_upper_case_count' );
has 'lower_case_count' => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_lower_case_count' );
has 'preceding_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_preceding_blank_count' );
has 'following_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_following_blank_count' );
has 'unique_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_unique_count' );
has 'strings_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_strings_count' );
has 'min_length' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_min_length' );
has 'max_length' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_max_length' );
has 'numbers_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_numbers_count' );
has 'percent_blank' => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_blank' );
has 'percent_strings' => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_strings' );
has 'percent_numbers' => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_numbers' );
has 'non_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_non_blank_count' );
has 'spaces_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_spaces_count' );
has 'is_all_strings' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_is_all_strings' );
has 'has_sort_mismatch' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_has_sort_mismatch' );
has 'is_all_numbers' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_is_all_numbers' );
has 'is_pure' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_is_pure' );
has 'is_sorted' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_is_sorted' );
has 'is_mixed' => (is => 'ro', isa => 'Bool', lazy => 1,
+default => 0, writer => '_set_is_mixed' );
has 'first_five_rows' => (is => 'ro', isa => 'ArrayRef[Int]', la
+zy => 1, default => 0, writer => '_set_first_five_rows' );
has 'data' => (traits => ['Array'], is => 'ro', requi
+red => 1,
isa => 'ArrayRef', default => sub { []
+ },
handles => { count => 'count', is_empt
+y => 'is_empty', elements => 'elements', get_data => 'get' },
);
sub BUILD {
my $s = shift;
my ($blank_count, $string_count, $number_count, $strings_with_spaces
+) = (0) x 4;
my $last_cell;
my $count = 0;
my %uniques = ();
my $sort_order = '';
my $current_sort_order;
my $is_sorted = 1;
my $min_length = 0;
my $max_length = 0;
my $first_non_blank_data_cell = 0;
my @first_five_rows = ();
my $preceding_blank_row_count = 0;
my $following_blank_row_count = 0;
my $lower_case_count = 0;
my $upper_case_count = 0;
my $mixed_case_count = 0;
foreach my $cell ( $s->elements ) {
$count++;
my $is_string = 0;
if (!$cell) {
$blank_count++;
push (@first_five_rows, 0) if $count < 6;
next;
}
if (!$first_non_blank_data_cell && $count > 5) {
$first_non_blank_data_cell = $count;
}
push (@first_five_rows, 1) if $count < 6;
if (length $cell < $min_length || !$min_length) {
$min_length = length $cell;
}
if (length $cell > $max_length || !$max_length) {
$max_length = length $cell;
}
$uniques{$cell} = 1;
if ( !looks_like_number($cell) ) {
$string_count++;
$is_string = 1;
my $has_lower = $cell =~ /[a-z]/;
my $has_upper = $cell =~ /[A-Z]/;
$mixed_case_count++ if ($has_lower && $has_upper);
$lower_case_count++ if $has_lower && !$has_upper;
$upper_case_count++ if $has_upper && !$has_lower;
} elsif ($cell) {
$number_count++;
}
if ($cell =~ / /) {
$strings_with_spaces++;
}
if ($is_sorted && $count > 4 && $last_cell && ( $cell ne $last_cel
+l ) ) {
if ($is_string) {
if ($cell gt $last_cell) {
$current_sort_order = 'asc';
} else {
$current_sort_order = 'desc';
}
} else {
if ($cell > $last_cell) {
$current_sort_order = 'asc';
} else {
$current_sort_order = 'desc';
}
}
$sort_order = $current_sort_order if !$sort_order;
if ( $sort_order && $sort_order ne $current_sort_order) {
$is_sorted = 0;
}
} elsif ($count > 4) {
$last_cell = $cell;
}
}
my $non_blank_count = $s->count - $blank_count;
if ($non_blank_count == $string_count && $string_count) {
$s->_set_is_all_strings(1);
$s->_set_is_pure(1);
} elsif ($non_blank_count == $number_count && $number_count) {
$s->_set_is_all_numbers(1);
$s->_set_is_pure(1);
} else {
$s->_set_is_mixed(1);
}
my $first_non_blank_row = 0;
my $row_count = 0;
foreach my $row (@first_five_rows) {
if ($row) {
$first_non_blank_row = $row_count;
last;
}
$row_count++;
}
foreach my $row (@first_five_rows[$row_count + 1, 4]) {
if ($row && $following_blank_row_count) {
last;
}
if (!$row) {
$following_blank_row_count++;
}
}
my $sort_mismatch = 0;
my $first_cell = $s->get_data($first_non_blank_row);
my $first_cell_is_all_upper = 0;
my $first_cell_is_all_lower = 0;
my $first_cell_is_mixed = 0;
if (!looks_like_number($first_cell)) {
my $first_cell_has_lower = $first_cell =~ /[a-z]/;
my $first_cell_has_upper = $first_cell =~ /[A-Z]/;
my $first_cell_is_mixed if ($first_cell_has_lower && $first_cell_h
+as_upper);
if (!$first_cell_is_mixed) {
if ($first_cell =~ /[a-z]/) {
$first_cell_is_all_lower = 1;
} elsif ($first_cell =~ /[A-Z]/) {
$first_cell_is_all_upper = 1;
} else {
$first_cell_is_mixed = 1;
}
}
}
if ($is_sorted) {
my $first_cell_is_greater;
if (looks_like_number $first_cell) {
$first_cell_is_greater = $first_cell > $first_non_blank_data_cel
+l;
} else {
$first_cell_is_greater = $first_cell gt $first_non_blank_data_ce
+ll;
}
logd($first_cell_is_greater);
if ($first_cell_is_greater && $sort_order eq 'asc') {
logd('hi');
$sort_mismatch = 1;
}
if (!$first_cell_is_greater && $sort_order eq 'desc') {
$sort_mismatch = 1;
}
}
my $non_blank = $s->count - $blank_count;
$s->_set_non_blank_count($non_blank);
$s->_set_has_sort_mismatch($sort_mismatch);
$s->_set_mixed_case_count($mixed_case_count);
$s->_set_upper_case_count($upper_case_count);
$s->_set_lower_case_count($lower_case_count);
$s->_set_percent_blank($blank_count / $s->count * 100);
$s->_set_percent_strings($string_count / $non_blank * 100);
$s->_set_percent_numbers($number_count / $non_blank * 100);
$s->_set_spaces_count($strings_with_spaces);
$s->_set_blank_count($blank_count);
$s->_set_max_length($max_length);
$s->_set_min_length($min_length);
$s->_set_strings_count($string_count);
$s->_set_numbers_count($number_count);
$s->_set_unique_count(scalar keys %uniques);
$s->_set_preceding_blank_count($first_non_blank_row);
$s->_set_following_blank_count($following_blank_row_count);
$s->_set_is_sorted($is_sorted) if $s->count > 6;
$s->_set_first_five_rows(\@first_five_rows);
$s->_set_first_cell($first_cell);
}
sub has_blanks {
my $s = shift;
return $s->blank_count > 0;
}
|