Beefy Boxes and Bandwidth Generously Provided by pair Networks
Welcome to the Monastery
 
PerlMonks  

change/limit already existing file names to [-_.0-9a-zA-Z]+

by parv (Priest)
on Jan 09, 2003 at 09:19 UTC ( #225491=sourcecode: print w/replies, xml ) Need Help??
Category: file management, utility scripts
Author/Contact Info parv
Description:

sanefilename.perl changes characters in file names which are not composed of '[-_.a-zA-Z0-9]' characters. and...

  • all the characters not matching '[-_.a-zA-Z0-9]' are converted to '-'.
  • '-_' or '_-' sequence is changed to single '-'.
  • any sequence of '.-', '._', '-.', '_.' is changed to single '.'.
  • multiple occurrences of [-_] are changed to one.

in case of surprise(s), refer to the source code. it is also avilable from...
http://www103.pair.com/parv/comp/src/perl/sanename.perl

#! /usr/local/bin/perl -w

##  author: parv, parv UNDERSCORE fm AT emailuser DOT net
##
##  date:  jan 09 2003
##
##  version: 0.012
##
##  license:
##    this software is free to be used in any form only if proper
##    credit is given.  i am not responsible for any kind of damage or
##    loss.  use it at your own risk.
##
##  name: sanefilename.perl
##
##  purpose:
##    change characters in file names which are not composed of
##    [-_.a-zA-Z0-9] characters.  and do other changes; see
##    sanitize() sub.
##
##  usage:
##    sanefilename.perl [-options] file [file2 file3 ...]
##
##    for details, issue this command...
##
##      sanefilename.perl -usage
##

use strict;

# pod module to generate, well, usage messages
use Pod::Usage;

# module to parse options
use Getopt::Long qw(:config default require_order);

# modules to move & parse file name
use File::Basename;
use File::Copy;


#  existent file name handler - sub dispatch table
#
my %existent =
    ( #  option given to handle existent file names
      #
      'opt'  => [ 'notexist' , qr/^(?:ask|force|notexist)$/ ]

      #  based on $existent{opt}->[0], will contain reference
      #  to one of the following subs
      , 'handle'  => undef

      #  TO BE IMPLEMENTED
      , 'ask'  =>  sub { return; }

      #  force file move
      , 'force'  =>
         sub { my ($path , $old, $new , $exist) = @_;

               verbose(" '${path}${new}' already exists, will be overw
+ritten...\n")
                 if -e "${path}$new";

               move_file($path , $old , $new , $exist);
               return;
             }

      #  new file does't exist, so move
      , 'notexist'  =>
         sub { my ($path , $old, $new , $exist) = @_;

               unless ( -e "${path}$new" )
               { move_file($path , $old , $new , $exist);
                 return;
               }

               verbose(" '${path}${new}' already exists, '${path}${old
+}' skipped ...\n");
               return;
             }
    );

#  get & check options
#  ----
#  save existent handling option separately so that it will be easy
#  to update valid options, $existent{opt}->[1], and related sub
#  references
#  ----
my $opt = get_opt( $existent{'opt'} );

#  check if any arguments remain which will be file names
#
pod2usage( '-msg' => " give file name(s) to change to sane version(s)\
+n"
           , '-exitval' => 0
           , '-verbose' => 0
         )
  unless scalar @ARGV;

#  save the appropriate sub reference based on dup option
#  to save time in foreach() due to double indirection
#
$existent{'handle'} = $existent{ $existent{'opt'}->[0] };

#  regex to specify valid characters
#
my $char_re = valid_char( $opt->{'lower'} );

foreach (@ARGV)
{
  my ($old_file , $path) = fileparse($_ , '');

  # sanity check

  die " '$path' - not a writable directory, exiting... \n"
    unless -d $path || -w $path;

  unless ( -e $path . $old_file )
  {
    verbose(" '${path}${old_file}' doesn't exist, skipped...\n");
    next;
  }

  # if the old name is OK, skip
  #
  if ( $old_file !~ m/$char_re->[1]/ )
  {
    print " '${path}${old_file}' - okay, skipped...\n"
      if $opt->{'verbose'} >= 3;

    next;
  }

  # calculate new file name
  my $new_file =
      sanitize( \$old_file , $char_re , $opt->{lower} );

  # very unlikely scenario that _current_ file name will be same
  # as the calculated one from this _current_ name
  #
  if ( $old_file eq $new_file )
  {
    verbose(<<_MSG_);
 sanitized new name is same as the old.
 ${path}${old_file} is not moved...
_MSG_

    next;
  }

  #  call appropriate sub depending on the -exist option given
  #
  $existent{handle}->($path , $old_file , $new_file , $existent{'opt'}
+->[0] );
}

print " ...done\n" if $opt->{verbose};


## subroutines

#  calculate new file name from the old
sub sanitize
{
  #  $old - old name
  #  $char - regex (array ref) specifying valid character set
  #  $lower - option if to exclude upper case letters
  #
  my ($old , $char , $lower) = @_;

  die "sanitize: expecting \$old and\$char to be defined\n"
    unless defined $old
       and ref $char eq 'ARRAY';

  local $_ = $$old;

  tr [A-Z] [a-z] if $lower;

  # remove end non [-_.${alpha}\d] characters
  #
  #s/(?: $char->[1] $ | ^ $char->[1] )//x;

  # change all the "wrong" characters to -
  #
  s/$char->[1]/-/g;

  # prefer - to _
  s/(?:_-|-_)/-/go;

  # prefer . to - or _
  s/(?:\.[-_]|[-_]\.)/./go;

  # minimize the consecutive occurrence of . - _ to one of each
  #
  s/([-._]){2,}/$1/go;

  return $_;
}

sub move_file
{
  my ($path , $old , $new , $exist) = @_;

  unless (defined $path and defined $old and defined $new)
  {
    verbose("move_file: \$path, \$old, \$new is/are undefined");
    return;
  }

  $old = $path . $old;
  $new = $path . $new;

  my $old_new = sub { printf " '%s'  ->  '%s'\n", ($old , $new); };

  $old_new->()
    if $opt->{'nomove'}
      or $opt->{'verbose'} >= 2
      or ($opt->{'verbose'} && ($exist eq 'force'));

  return if $opt->{'nomove'};

  move("$old" , "$new")
    || die "couldn't move '$old' to '$new': $!\n" ;

  return;
}

sub valid_char
{
  my $low = shift;

  # create list to be used as regex in file renaming
  #
  my $char = '-_.0-9a-z';

  $char .= 'A-Z' unless $low;

  #  return two regexen: latter is the complement of the former
  #
  return [ qr/[$char]/ , qr/[^$char]/ ];
}

sub get_opt
{
  #  existent handling option is saved in this array at [0]
  my $exist = shift;

  my %opt = ( #  supersedes any option, shows program usage
              'usage'  => 0

              #  control amount of output generated
              , 'verbose'  => 2

              #  check option if names needed to be lowercased
              , 'lower'  => 0

              #  instead of actual move, only show the new name
              , 'nomove'  => 0
            );

  #  get options
  GetOptions( 'usage|help' => \$opt{'usage'}

              , 'exist=s'  => \$exist->[0]
              , 'ask'      => sub { $exist->[0] = 'ask' }
              , 'force'    => sub { $exist->[0] = 'force' }
              , 'notexist|skip'
                  => sub { $exist->[0] = 'notexist' }

              , 'quiet'      => sub { $opt{'verbose'} = 0 }
              , 'verbose=i'  => \$opt{'verbose'}

              , 'lower' => \$opt{'lower'}

              , 'nomove' => \$opt{'nomove'}
            )
    || die pod2usage('-exitval'  => 2 , '-verbose'  => 1);

  #  exit normally is asked for usage
  #
  pod2usage('-exitval'  => 0 , '-verbose'  => 3)
    if $opt{'usage'};

  #  die horribly due to wrong option given
  #
  pod2usage( '-msg'  => " incorrect (existent and/or verbose options w
+ere given\n"
            , '-exitval'  => 1
            , '-verbose'  => 0
           )
    unless check_opt($exist , $opt{'verbose'});

  return \%opt;
}

sub check_opt
{
  my ($exist , $verbose) = @_;

  return ( $verbose =~ m/^\d+$/ && $exist->[0] =~ m/$exist->[1]/ );
}

sub verbose
{
  warn(@_) if $opt->{'verbose'};
  return;
}

__DATA__


## start usage documentation

=head1 NAME

sanefilename.perl - undo odd characters in file names

=head1 SYNOPSIS

B<sanefilename.perl> [-options] F<file> [F<file2> F<file3> ...]

B<sanefilename.perl> -usage

=head1 DESCRIPTION

sanefilename.perl changes characters in file names which are
not composed of C<[-_.a-zA-Z0-9]> characters.

all the characters not matching C<[-_.a-zA-Z0-9]> are converted to
I<->.

I<-_> or I<_-> sequence is changed to single I<->.

Any sequence of I<.->, I<._>, I<-.>, I<_.> is changed to single I<.>.

multiple occurrences of [-_.] are changed to one.

=head1 OPTIONS

=over

=item B<-usage>

shows the whole program documentation; supersedes any other option

=item B<-nomove>

do not actually move the files, just print the old given names and
new sanitized ones.

=item B<-lower>

excludes [A-Z] characters from the [-_.a-zA-Z0-9] character set.

=item B<-quiet> same as -verbose=0

=item B<-verbose>=I<0>|I<1>|I<2>|I<3>

controls how much output is generated; larger is the number, more the
output be.

=over

=item 0

only fatal error messages are generated

=item 1

as 0, plus non-fatal error messages are shown

=item 2

default.  as 1, plus converted file names are shown

=item 3

as 2, plus OK/unchanged old file names are shown

=back

=item B<-ask> (unimplemented)

=item B<-force>

=item B<-notexist> | B<-skip>

=item B<-exist>=I<ask>|I<force>|I<notexist>|I<skip>

controls the behaviour when sanitized new file name is as an existing
file.

if the new sanitized name is same as old given name, that is not
considered to be an already existent file.

consider there are two files: I<P^Q^R> and I<P-Q-R>.  the sanitized
name for the first file, without the B<-lower> option, would be
I<P-Q-R>.  since this soon-to-be file already exists, it creates
a good situtation for providing an option.

the file name I<P-Q-R>, without the B<-lower> option, produces the
same new name.  any further processing or file moving is skipped,
except for any messages requested via B<-verbose> option.

=over

=item ask (unimplemented)

an alternative name is asked to enter.

=item force

old given file is moved to the new calculated name, obliterating an
already existing file.

=item notexist | skip

default.  move old file to new name only if it does not exist.

=back

=back

=head1 See Also

see these fine perlpods too:

=over

=item *

File::Basename

=item *

File::Copy

=back

=head1 Distribution and such

parv, parv UNDERSCORE fm AT emailuser DOT net

jan 09 2003

version 0.012

this software is free to be used in any form only if proper credit is
given.  i am not responsible for any kind of damage or loss.  use it
at your own risk.

=cut

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://225491]
help
Chatterbox?
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others taking refuge in the Monastery: (8)
As of 2018-07-16 14:58 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    It has been suggested to rename Perl 6 in order to boost its marketing potential. Which name would you prefer?















    Results (342 votes). Check out past polls.

    Notices?