Re^2: Useful heuristics for analyzing arrays of data to determine column header

in reply to Re: Useful heuristics for analyzing arrays of data to determine column header
in thread Useful heuristics for analyzing arrays of data to determine column header

Thanks. I will try this out.

And here is some simple code I have so far. It just loops through the column and collects characteristics of the column into attributes that might be useful. For example, one thing it does is determine if the row is sorted. If it is, and the header is not in the proper sort order, it sets a "has_sort_mismatch" flag. One thing the code accounts for is that the header might be multiple rows anywhere in the first 5 rows of the spreadsheet. That makes things slightly trickier.

package Data::Column ;

use Moose;
use Scalar::Util qw(looks_like_number);
use Log::Log4perl::Shortcuts qw (:all);
use namespace::autoclean;

has 'first_cell'            => (is => 'ro', isa => 'Defined', lazy => 
+1, default => 0, writer => '_set_first_cell' );
has 'blank_count'           => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_blank_count' );
has 'mixed_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_mixed_case_count' );
has 'upper_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_upper_case_count' );
has 'lower_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_lower_case_count' );
has 'preceding_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_preceding_blank_count' );
has 'following_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_following_blank_count' );
has 'unique_count'          => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_unique_count' );
has 'strings_count'         => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_strings_count' );
has 'min_length'            => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_min_length' );
has 'max_length'            => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_max_length' );
has 'numbers_count'         => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_numbers_count' );
has 'percent_blank'         => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_blank' );
has 'percent_strings'       => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_strings' );
has 'percent_numbers'       => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_numbers' );
has 'non_blank_count'       => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_non_blank_count' );
has 'spaces_count'          => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_spaces_count' );
has 'is_all_strings'        => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_all_strings' );
has 'has_sort_mismatch'     => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_has_sort_mismatch' );
has 'is_all_numbers'        => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_all_numbers' );
has 'is_pure'               => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_pure' );
has 'is_sorted'             => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_sorted' );
has 'is_mixed'              => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_mixed' );
has 'first_five_rows'       => (is => 'ro', isa => 'ArrayRef[Int]', la
+zy => 1, default => 0, writer => '_set_first_five_rows' );
has 'data'                  => (traits => ['Array'], is => 'ro', requi
+red => 1,
                                isa => 'ArrayRef', default => sub { []
+ },
                                handles => { count => 'count', is_empt
+y => 'is_empty', elements => 'elements', get_data => 'get' },
                              );

sub BUILD {
  my $s = shift;

  my ($blank_count, $string_count, $number_count, $strings_with_spaces
+) = (0) x 4;
  my $last_cell;
  my $count = 0;
  my %uniques = ();
  my $sort_order = '';
  my $current_sort_order;
  my $is_sorted = 1;
  my $min_length = 0;
  my $max_length = 0;
  my $first_non_blank_data_cell = 0;
  my @first_five_rows = ();
  my $preceding_blank_row_count = 0;
  my $following_blank_row_count = 0;
  my $lower_case_count = 0;
  my $upper_case_count = 0;
  my $mixed_case_count = 0;

  foreach my $cell ( $s->elements ) {
    $count++;
    my $is_string = 0;
    if (!$cell) {
      $blank_count++;
      push (@first_five_rows, 0) if $count < 6;
      next;
    }

    if (!$first_non_blank_data_cell && $count > 5) {
      $first_non_blank_data_cell = $count;
    }

    push (@first_five_rows, 1) if $count < 6;

    if (length $cell < $min_length || !$min_length) {
      $min_length = length $cell;
    }

    if (length $cell > $max_length || !$max_length) {
      $max_length = length $cell;
    }

    $uniques{$cell} = 1;

    if ( !looks_like_number($cell) ) {
      $string_count++;
      $is_string = 1;
      my $has_lower = $cell =~ /[a-z]/;
      my $has_upper = $cell =~ /[A-Z]/;
      $mixed_case_count++ if ($has_lower && $has_upper);
      $lower_case_count++ if $has_lower && !$has_upper;
      $upper_case_count++ if $has_upper && !$has_lower;
    } elsif ($cell) {
      $number_count++;
    }
    if ($cell =~ / /) {
      $strings_with_spaces++;
    }

    if ($is_sorted && $count > 4 && $last_cell && ( $cell ne $last_cel
+l ) ) {
      if ($is_string) {
        if ($cell gt $last_cell) {
          $current_sort_order = 'asc';
        } else {
          $current_sort_order = 'desc';
        }
      } else {
        if ($cell > $last_cell) {
          $current_sort_order = 'asc';
        } else {
          $current_sort_order = 'desc';
        }
      }

      $sort_order = $current_sort_order if !$sort_order;

      if ( $sort_order && $sort_order ne $current_sort_order) {
        $is_sorted = 0;
      }
    } elsif ($count > 4) {
      $last_cell = $cell;
    }
  }

  my $non_blank_count = $s->count - $blank_count;

  if ($non_blank_count == $string_count && $string_count) {
    $s->_set_is_all_strings(1);
    $s->_set_is_pure(1);
  } elsif ($non_blank_count == $number_count && $number_count) {
    $s->_set_is_all_numbers(1);
    $s->_set_is_pure(1);
  } else {
    $s->_set_is_mixed(1);
  }

  my $first_non_blank_row = 0;
  my $row_count = 0;
  foreach my $row (@first_five_rows) {
    if ($row) {
      $first_non_blank_row = $row_count;
      last;
    }
    $row_count++;
  }
  foreach my $row (@first_five_rows[$row_count + 1, 4]) {
    if ($row && $following_blank_row_count) {
      last;
    }
    if (!$row) {
      $following_blank_row_count++;
    }
  }

  my $sort_mismatch = 0;
  my $first_cell = $s->get_data($first_non_blank_row);
  my $first_cell_is_all_upper = 0;
  my $first_cell_is_all_lower = 0;
  my $first_cell_is_mixed = 0;
  if (!looks_like_number($first_cell)) {
    my $first_cell_has_lower = $first_cell =~ /[a-z]/;
    my $first_cell_has_upper = $first_cell =~ /[A-Z]/;
    my $first_cell_is_mixed if ($first_cell_has_lower && $first_cell_h
+as_upper);
    if (!$first_cell_is_mixed) {
      if ($first_cell =~ /[a-z]/) {
        $first_cell_is_all_lower = 1;
      } elsif ($first_cell =~ /[A-Z]/) {
        $first_cell_is_all_upper = 1;
      } else {
        $first_cell_is_mixed = 1;
      }
    }
  }

  if ($is_sorted) {
    my $first_cell_is_greater;
    if (looks_like_number $first_cell) {
      $first_cell_is_greater = $first_cell > $first_non_blank_data_cel
+l;
    } else {
      $first_cell_is_greater = $first_cell gt $first_non_blank_data_ce
+ll;
    }
    logd($first_cell_is_greater);
    if ($first_cell_is_greater && $sort_order eq 'asc') {
      logd('hi');
      $sort_mismatch = 1;
    }
    if (!$first_cell_is_greater && $sort_order eq 'desc') {
      $sort_mismatch = 1;
    }
  }

  my $non_blank = $s->count - $blank_count;
  $s->_set_non_blank_count($non_blank);
  $s->_set_has_sort_mismatch($sort_mismatch);
  $s->_set_mixed_case_count($mixed_case_count);
  $s->_set_upper_case_count($upper_case_count);
  $s->_set_lower_case_count($lower_case_count);
  $s->_set_percent_blank($blank_count / $s->count * 100);
  $s->_set_percent_strings($string_count / $non_blank * 100);
  $s->_set_percent_numbers($number_count / $non_blank * 100);
  $s->_set_spaces_count($strings_with_spaces);
  $s->_set_blank_count($blank_count);
  $s->_set_max_length($max_length);
  $s->_set_min_length($min_length);
  $s->_set_strings_count($string_count);
  $s->_set_numbers_count($number_count);
  $s->_set_unique_count(scalar keys %uniques);
  $s->_set_preceding_blank_count($first_non_blank_row);
  $s->_set_following_blank_count($following_blank_row_count);
  $s->_set_is_sorted($is_sorted) if $s->count > 6;
  $s->_set_first_five_rows(\@first_five_rows);
  $s->_set_first_cell($first_cell);
}

sub has_blanks {
  my $s = shift;

  return $s->blank_count > 0;
}
[download]

$PM = "Perl Monk's";
$MCF = "Most Clueless ~~Friar~~ ~~Abbot~~ ~~Bishop~~ ~~Pontiff~~ ~~Deacon~~ ~~Curate~~ ~~Priest~~ Vicar";
$nysus = $PM . ' ' . $MCF;
Click here if you love Perl Monks

Comment on Re^2: Useful heuristics for analyzing arrays of data to determine column header Download Code

In Section Seekers of Perl Wisdom