comment on

#!/usr/bin/perl
use strict;
use warnings FATAL => 'all';
no warnings 'uninitialized';
use Getopt::Long 'GetOptions';
use English '$PROGRAM_NAME';
use autouse 'Pod::Usage' => 'pod2usage';
our %CHANGES;

our $VERSION = ~0;

=head1 NAME

fix-tabs - Fixes some common problems in tab-delimited files

=head1 DESCRIPTION

This fixes some problems I encountered with tabular text files.

=over

=item * Removes empty trailing rows

=item * Adds missing trailing columns

=item * Warns if the header is missing columns

=item * Warns if the file isn't a text file

=item * Warns if there is only one column.

=item * Dos2unix line ending conversion

=item * Fixes Excel formatted numbers:

=over

=item * (...) parens around negative numbers

=item * Optional $ sign

=item * Optional commas

=back

=back

=head1 SYNOPSIS

fix-tabs [options] file1 file2 ...

  Options:
    --help Displays this message
    --man  Displays the manual
    --clip Removes things off the right edge of the table

=cut

GetOptions(
    help => sub { pod2usage( -verbose => 1 ) },
    man  => sub { pod2usage( -verbose => 2 ) },
    clip => \our ($CLIP_EDGES),
    )
    or pod2usage( -verbose => 0 );
if ( not scalar @ARGV ) {
    pod2usage( -verbose => 0 );
}

for my $file (@ARGV) {
    fix_file($file);
}
exit;

# To regenerate the regex on the following line, run this
# command. It'll be uglier than what's below but that's because I made
# the one below prettier. It's still equivalent and is the source for
# the below.
#
# perl -MRegexp::Common -le 'print qr/(?:$RE{num}{real}|$RE{num}{int}|
+$RE{num}{real}{-sep=>','}{-group=>3}|$RE{num}{int}{-sep=>','}{-group=
+>3})/'

my $NUMBER;

BEGIN {
    $NUMBER
        = qr/(?x-ism:(?:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])
(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])
(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)(?:[0123456789]+))|
(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]{1,3}
(?:(?:[,])[0123456789]{3})*)(?:(?:[.])(?:[0123456789]{0,}))?)
(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)
(?:[0123456789]{1,3}(?:,[0123456789]{3})*))))/x;
}

my $EXCEL_NUMBER;

BEGIN {
    $EXCEL_NUMBER = qr/(?xsm)
(?:
    \( \$? $NUMBER \)
  |
    \$? $NUMBER
)/;
}

sub fix_file {
    my ($file) = @_;

    -T $file or die "$file isn't a text file.\n";

    # Read the header line and get the # of columns I expect every
    # other line to have.
    open my $fh, '<', $file or die "Can't open $file: $!";
    my $header_line = <$fh>;

    $header_line =~ m/\t/mx
        or die "$file isn't a tab delimited text file.\n";

    my %field_names;

    {
        my @labels = split /\t/, $header_line;
        chomp $labels[-1];

        for my $index ( 0 .. $#labels ) {
            my $column_name = $labels[$index];
            if ( $column_name =~ s/\A\s+// ) {
                ++$CHANGES{header_whitespace};
            }
            if ( $column_name =~ s/\s+\z// ) {
                ++$CHANGES{header_whitespace};
            }

            $field_names{$column_name} = $index;
        }
    }
    my $expected_columns = scalar keys %field_names;

    # The input file will be copied here. This shouldn't be left
    # around after the program is finished.
    my $tmpfile = "$file.tmp";
    open my $out, '>', $tmpfile
        or die "Couldn't open $tmpfile for writing: $!";

    # If there are Windows line endings, that's automatically a fix.
    if ( $header_line =~ tr/\r//d ) {
        ++$CHANGES{windows_cr};
    }

    # Copy out the header line using the cleaned up headigns.
    print {$out} join( "\t",
        sort { $field_names{$a} <=> $field_names{$b} }
            keys %field_names )
        . "\n"
        or die "Couldn't write to $tmpfile: $!";

    while ( my $line = <$fh> ) {

        # Again, fixing Windows line endings.
        if ( $line =~ tr/\r//d ) {
            ++$CHANGES{windows_cr};
        }

        if ( not $line =~ m/\S/msx ) {

            # Just skip empty lines and cause the file to be
            # rewritten.
            ++$CHANGES{blank};
            next;
        }

        my @values = split /\t/, $line;
        chomp $values[-1];

        # Column # fixes. Either too many or too little.
        if ( scalar(@values) < $expected_columns ) {
            ++$CHANGES{col_count};
            push @values, ('') x ( $expected_columns - scalar @values 
+);
        }
        elsif ( scalar(@values) > $expected_columns ) {
            ++$CHANGES{col_count};
            if ( not $CLIP_EDGES ) {
                while ( $values[-1] eq ''
                    and scalar(@values) > $expected_columns )
                {
                    pop @values;
                }
            }
            else {
                splice @values, $expected_columns;
            }

            if ( scalar(@values) > $expected_columns ) {
                warn
                    "Too many columns in row $.. Expected $expected_co
+lumns, got @{[ scalar @values ]}.\n";
            }
        }

        print {$out} join( "\t", @values ) . "\n"
            or die "Couldn't write to $tmpfile: $!";
    }

    close $out
        or die "Couldn't flush $tmpfile: $!";

    if ( not keys %CHANGES ) {
        print "$file ok.\n";
        unlink $tmpfile
            or die "Couldn't remove unused $tmpfile: $!";
    }
    else {
        print "$file fixed.\n";

        # Report on several named things getting fixed. This just puts
        # nice names on the stuff.
        for my $change (
            [ windows_cr => 'Windows line endings' ],
            [ col_count  => 'Column count' ],
            [ blank      => 'Blank lines' ],
            [ fix_num    => 'Number formatting' ]
            )
        {
            my ( $field, $desc ) = @$change;
            my $fix = delete $CHANGES{$field};
            if ( not defined $fix ) {
                next;
            }

            print "$desc: $fix\n";
        }

        for ( sort grep { $CHANGES{$_} } keys %CHANGES ) {
            print "$_: $CHANGES{$_}\n";
        }

        my $backupfile = "$file.old";

        # Add a number to the .old to find a file name that isn't used
        # yet.
        while ( -e $backupfile ) {
            my ($id) = $backupfile =~ m/\.(\d+)$/msx;
            no warnings 'numeric';
            $id += 0;

            $backupfile =~ s/\d+$//msx;
            $backupfile .= ".$id";
        }
        rename $file, $backupfile
            or die "Couldn't rename $file to backupfile";
        rename $tmpfile, $file
            or die "Couldn't rename $tmpfile to $file";
    }

    return 1;
}

sub fix_excel_number {
    my $newnum = $_;
    my $oldnum = $_;

    $newnum =~ s/\A\s*($EXCEL_NUMBER)\s*\z/normalize_excel_number($1)/
+e;

    if ( $newnum ne $oldnum ) {
        $_ = $newnum;
        warn "Fix number $oldnum -> $newnum\n";
        ++$CHANGES{fix_num};
        return 1;
    }
    else {
        return 0;
    }
}

sub normalize_excel_number {
    my $num = shift @_;

    $num =~ tr/$,//d;
    $num =~ s/\A\((.+)\)\z/-$1/;
    return $num;
}
[download]

In reply to Fixup tabular text file by diotalevi

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


laziness, impatience, and hubris
	PerlMonks