Beefy Boxes and Bandwidth Generously Provided by pair Networks
laziness, impatience, and hubris
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/perl use strict; use warnings FATAL => 'all'; no warnings 'uninitialized'; use Getopt::Long 'GetOptions'; use English '$PROGRAM_NAME'; use autouse 'Pod::Usage' => 'pod2usage'; our %CHANGES; our $VERSION = ~0; =head1 NAME fix-tabs - Fixes some common problems in tab-delimited files =head1 DESCRIPTION This fixes some problems I encountered with tabular text files. =over =item * Removes empty trailing rows =item * Adds missing trailing columns =item * Warns if the header is missing columns =item * Warns if the file isn't a text file =item * Warns if there is only one column. =item * Dos2unix line ending conversion =item * Fixes Excel formatted numbers: =over =item * (...) parens around negative numbers =item * Optional $ sign =item * Optional commas =back =back =head1 SYNOPSIS fix-tabs [options] file1 file2 ... Options: --help Displays this message --man Displays the manual --clip Removes things off the right edge of the table =cut GetOptions( help => sub { pod2usage( -verbose => 1 ) }, man => sub { pod2usage( -verbose => 2 ) }, clip => \our ($CLIP_EDGES), ) or pod2usage( -verbose => 0 ); if ( not scalar @ARGV ) { pod2usage( -verbose => 0 ); } for my $file (@ARGV) { fix_file($file); } exit; # To regenerate the regex on the following line, run this # command. It'll be uglier than what's below but that's because I made # the one below prettier. It's still equivalent and is the source for # the below. # # perl -MRegexp::Common -le 'print qr/(?:$RE{num}{real}|$RE{num}{int}| +$RE{num}{real}{-sep=>','}{-group=>3}|$RE{num}{int}{-sep=>','}{-group= +>3})/' my $NUMBER; BEGIN { $NUMBER = qr/(?x-ism:(?:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.]) (?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E]) (?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)(?:[0123456789]+))| (?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]{1,3} (?:(?:[,])[0123456789]{3})*)(?:(?:[.])(?:[0123456789]{0,}))?) (?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?) (?:[0123456789]{1,3}(?:,[0123456789]{3})*))))/x; } my $EXCEL_NUMBER; BEGIN { $EXCEL_NUMBER = qr/(?xsm) (?: \( \$? $NUMBER \) | \$? $NUMBER )/; } sub fix_file { my ($file) = @_; -T $file or die "$file isn't a text file.\n"; # Read the header line and get the # of columns I expect every # other line to have. open my $fh, '<', $file or die "Can't open $file: $!"; my $header_line = <$fh>; $header_line =~ m/\t/mx or die "$file isn't a tab delimited text file.\n"; my %field_names; { my @labels = split /\t/, $header_line; chomp $labels[-1]; for my $index ( 0 .. $#labels ) { my $column_name = $labels[$index]; if ( $column_name =~ s/\A\s+// ) { ++$CHANGES{header_whitespace}; } if ( $column_name =~ s/\s+\z// ) { ++$CHANGES{header_whitespace}; } $field_names{$column_name} = $index; } } my $expected_columns = scalar keys %field_names; # The input file will be copied here. This shouldn't be left # around after the program is finished. my $tmpfile = "$file.tmp"; open my $out, '>', $tmpfile or die "Couldn't open $tmpfile for writing: $!"; # If there are Windows line endings, that's automatically a fix. if ( $header_line =~ tr/\r//d ) { ++$CHANGES{windows_cr}; } # Copy out the header line using the cleaned up headigns. print {$out} join( "\t", sort { $field_names{$a} <=> $field_names{$b} } keys %field_names ) . "\n" or die "Couldn't write to $tmpfile: $!"; while ( my $line = <$fh> ) { # Again, fixing Windows line endings. if ( $line =~ tr/\r//d ) { ++$CHANGES{windows_cr}; } if ( not $line =~ m/\S/msx ) { # Just skip empty lines and cause the file to be # rewritten. ++$CHANGES{blank}; next; } my @values = split /\t/, $line; chomp $values[-1]; # Column # fixes. Either too many or too little. if ( scalar(@values) < $expected_columns ) { ++$CHANGES{col_count}; push @values, ('') x ( $expected_columns - scalar @values +); } elsif ( scalar(@values) > $expected_columns ) { ++$CHANGES{col_count}; if ( not $CLIP_EDGES ) { while ( $values[-1] eq '' and scalar(@values) > $expected_columns ) { pop @values; } } else { splice @values, $expected_columns; } if ( scalar(@values) > $expected_columns ) { warn "Too many columns in row $.. Expected $expected_co +lumns, got @{[ scalar @values ]}.\n"; } } print {$out} join( "\t", @values ) . "\n" or die "Couldn't write to $tmpfile: $!"; } close $out or die "Couldn't flush $tmpfile: $!"; if ( not keys %CHANGES ) { print "$file ok.\n"; unlink $tmpfile or die "Couldn't remove unused $tmpfile: $!"; } else { print "$file fixed.\n"; # Report on several named things getting fixed. This just puts # nice names on the stuff. for my $change ( [ windows_cr => 'Windows line endings' ], [ col_count => 'Column count' ], [ blank => 'Blank lines' ], [ fix_num => 'Number formatting' ] ) { my ( $field, $desc ) = @$change; my $fix = delete $CHANGES{$field}; if ( not defined $fix ) { next; } print "$desc: $fix\n"; } for ( sort grep { $CHANGES{$_} } keys %CHANGES ) { print "$_: $CHANGES{$_}\n"; } my $backupfile = "$file.old"; # Add a number to the .old to find a file name that isn't used # yet. while ( -e $backupfile ) { my ($id) = $backupfile =~ m/\.(\d+)$/msx; no warnings 'numeric'; $id += 0; $backupfile =~ s/\d+$//msx; $backupfile .= ".$id"; } rename $file, $backupfile or die "Couldn't rename $file to backupfile"; rename $tmpfile, $file or die "Couldn't rename $tmpfile to $file"; } return 1; } sub fix_excel_number { my $newnum = $_; my $oldnum = $_; $newnum =~ s/\A\s*($EXCEL_NUMBER)\s*\z/normalize_excel_number($1)/ +e; if ( $newnum ne $oldnum ) { $_ = $newnum; warn "Fix number $oldnum -> $newnum\n"; ++$CHANGES{fix_num}; return 1; } else { return 0; } } sub normalize_excel_number { my $num = shift @_; $num =~ tr/$,//d; $num =~ s/\A\((.+)\)\z/-$1/; return $num; }

In reply to Fixup tabular text file by diotalevi

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others contemplating the Monastery: (6)
As of 2024-04-16 09:44 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found