Re^4: Masking data in a MySQL dump file. Ideas? (SQL::Statement::MySQL)

Here is the supplemental SQL::Statement::MySQL. If anybody wants to release this onto CPAN, they are very welcome!
package SQL::Statement::MySQL;
use strict;
use base qw(Class::Accessor);
require SQL::Statement;
use Carp qw(croak);
use Scalar::Util qw(weaken);
use List::Util qw(max);

=head2 C<< new %ARGS >>

Creates a new instance

=cut

sub new {
    my ($class,%args) = @_;
    $class->SUPER::new(\%args);
}

__PACKAGE__->mk_accessors(qw(parser tables sql command));

sub columns { @{ $_[0]->{columns}}}

=head2 C<< where >>

Stub implementation - MySQL dumps don't contain
UPDATE, DELETE or SELECT statements.

=cut

sub where { '' };
sub colvalues {
    my ($self) = @_;

    my $values = $self->{colvalues} ||= $self->parser->parse_values($s
+elf->{_colvalues});
    my @columns = $self->columns;

    for my $v (@$values) {
        if (@$v != @columns) {
            warn ~~@$v . " / " . ~~ @columns;
            for (0.. max($#columns,$#$v)) {
                warn "$_: " . $columns[$_]->name . " => $v->[$_]"
            };
            warn "Values: @$v";
            #die "Weird statement columns: " . $self->sql;
            die;
        };
    };

    @$values
}

=head2 C<< $s->reconstruct >>

Reconstructs the SQL statement from the internal
representation. This is convenient for changing
a statement.

NOT IMPLEMENTED CURRENTLY

=cut

sub reconstruct {
    croak "reconstruct() is not implemented";
}

package SQL::Parser::MySQL;
use strict;
use base qw(Class::Accessor);
use List::Util qw( max ); # for better error reporting
require SQL::Statement;
use vars qw(%unquote $sql_name);

$sql_name = qr/\`?(\w+)\`?/;

=head1 NAME

SQL::Parser::MySQL - parse MySQL dumps

=head1 ABSTRACT

This is a parser for SQL statements as found
in MySQL dumps.

=head1 SYNOPSIS

  use SQL::Statement::MySQL;
  my $parser = SQL::Parser::MySQL->new();
  $/ = $parser->record_separator;
  while (<>) {
      my ($stmt) = $parser->parse($_);
  }

=head2 C<< new SQL >>

Parses a single statement from a MySQL dump. While this
class attempts to behave the same as a L<SQL::Statement>
instance, it is no subclass.

=cut

__PACKAGE__->mk_accessors(qw(record_separator tables current_table));

sub new {
    my $class = shift;
    my %args;
    if (@_ == 1) {
        %args = (statement => @_)
    } else {
        %args = @_
    }
    $args{record_separator} ||= ";\n";
    $args{tables} ||= {};
    $args{current_table} = undef;

    $class->SUPER::new(\%args);
}

sub line_separator { ";\n" }

=head2 C<< parse SQL >>

Returns a L<SQL::Statement::MySQL> instance
which can be used to inspect and modify the
SQL statement. The line is autochomped.

=cut

sub parse {
    my ($self,$sql) = @_;
    my $sep = $self->line_separator;
    $sql =~ s/\Q$sep\E\z//sm;
    
    my $values;
    my ($table,$columns,@columns,$command);
    if ($sql =~ /^(?>--[^\n]*\n+)*\s*INSERT INTO\s*$sql_name\s+VALUES 
+\((.*)\)\z/smi) {
        $command = 'INSERT';
        ($table,$values) = ($1,$2);
        if (! exists $self->tables->{$table}) {
            # who cares?
            #die "Insert into unknown table '$table' in statement >>$s
+ql<<";
            $self->tables->{$table}->{columns} = []
        }
    } elsif ($sql =~ /^(?>--[^\n]*\n+)*\s*DROP TABLE IF EXISTS $sql_na
+me\z/smi) {
        $command = 'DROP';
        ($table) = ($1);
        # We ignore DROP statements
        return;
    } elsif ($sql =~ /^(?>--[^\n]*\n+)*\s*LOCK TABLES $sql_name WRITE\
+z/smi) {
        $command = 'LOCK';
        ($table) = ($1);
        # We ignore LOCK statements
        return;
    } elsif ($sql =~ /^(?>--[^\n]*\n+)*\s*UNLOCK TABLES\z/smi) {
        $command = 'UNLOCK';
        ($table) = ($1);
        # We ignore UNLOCK statements
        return;
    } elsif ($sql =~ /^(?>--[^\n]*\n+)*\s*CREATE\s*TABLE $sql_name \((
+.*)\)(?: (?:TYPE|ENGINE)=(?:MyISAM|InnoDB))?(?: AUTO_INCREMENT=\d+)?(
+?: DEFAULT CHARSET=\S+)?(?: PACK_KEYS=1)?\s*\z/sm) {
        $command = 'CREATE';
        ($table,$columns) = ($1,$2);
        my (@columns,@keycolumns,@primaries,@unique);
        1 while ($columns =~ s/,\n\s*KEY\s*$sql_name\s*\((.*)\)\s*//);
        while ($columns =~ s/,\n\s*UNIQUE KEY\s*$sql_name\s*\((.*)\)\s
+*//) {
            @primaries = split /,/, $1;
        };
        if ($columns =~ s/,\n\s*PRIMARY KEY\s*\((.*)\)\s*//) {
            @primaries = split /,/, $1;
        };
        #warn $columns;
        @keycolumns = ($columns =~ s/,\n\s*KEY\s*\w+\s*\(.*\)\s*//g);
        @columns = (map { /(\w+)/ ? $1 : '<unknown>' } split /,\n/, $c
+olumns);
        $self->current_table( SQL::Statement::Table->new($table));
        $self->tables->{ $table }->{keycolumns} = \@keycolumns;
        $self->tables->{ $table }->{primarykeys} = \@primaries;
        $self->tables->{ $table }->{columns} = [ map { SQL::Statement:
+:Column->new({
                           table => $table,
                           column => $_,
                         })} @columns
                   ];

    } elsif ($sql =~ /^(?>--[^\n]*\n+)(?>\/\*[^\n]+\*\/)?\s*\z/smi) {
        # empty statement save for comments
        return
    } elsif ($sql =~ m!^(?>/\*[^\n]+\*/)?\s*\z!smi) {
        # empty statement save for comments
        return
    } else {
        die "Unknown/unparseable statement: >>$sql<<";
    }

    SQL::Statement::MySQL->new(
        sql        => $sql,
        parser     => $self,
        tables     => $table,
        columns    => [@{ $self->tables->{ $table }->{columns} }],
        _colvalues => $values,
        command    => $command,
    );
}

=head2 C<< $s->parse_values VALUES >>

Splits a comma-separated string into its components. No decoding
or unquoting of
the values is done, as the values are expected to be reused in a
subsequent C<print> statement.

The following three values are recognized:

=over 4

=item *

A literal NULL

=item *

An integer number

=item *

A single-quote quoted string. The following escapes
can occur within such a string:

  \' - a literal single quote
  \" - a literal double quote
  \\ - a backslash
  \n - a newline
  \r - a carriage return
  \0 - maybe a binary zero
  \A-\Z - something weird, maybe CTRL-A to CTRL-Z ?

=back

The function returns a reference to an array.

If you want to use the values within Perl,
for example by passing them into a DBI handle,
you can use the C<unquote_value> method to convert the
values to something directly useable by Perl.

=cut

my $string = qr/(?:(?>')(?:(?>[^\\']+)|(?>(?:\\['\\"rn0A-Z])+))*')/;
my $null   = qr/(?:(?>NULL))/;
my $num    = qr/(?:(?>-?\d+)(?:\.\d+)?)/;
my $value  = qr/(?:$string|$null|$num)/;

sub parse_values {
    my ($self,$values) = @_;
    my @r;
    
    while ($values =~ /\G(?:^|\),\(|\s*$)/gc) {
        my @res;
        while ($values =~ /\G($value)(?:,?)/gc) {
            push @res, $1;
        };
        #warn "Next: ", substr $values, pos $values, 3;
        push @r, \@res
            if @res;
    };
    
    my $consumed = pos $values;
    if (substr($values,$consumed) =~ /\S/ and $consumed != length $val
+ues) {
        while ($values =~ s/^($value),//) {
            warn "Got: $1 (and more)\n";
        };
        if ($values =~ s/^($value)//) {
            warn "Fin: $1 (nevermore)\n";
        }
        if ($values ne '') {
            warn "Left over: >$values<";
        }
        #warn $self->sql;
        #warn "Malformed VALUES clause: >$values< in $_[1]";
        #warn "Found:";
        #warn $_ for ($values =~ /((?>$value))(?>(?:,|$))/g);
        die "Cannot continue - parsing error";
    };
    return \@r;
}

=head2 C<< unquote_value VALUES >>

Takes a list of quoted values and unquotes them.
If the list contains only one element and that element is an
array reference, another reference to the mapped array
elements is returned.

The following rules apply for each value:

=item *

If the value is a literal C<NULL>, the function
returns C<undef>

=item *

If the value looks like a number, that number
is returned as a string to avoid the numeric imprecision
that's possible if conversion happens.
No explicit numerical translation is done, for example
by adding C<+0> to the number.

=item *

If the value is a string quoted within single quotes,
the following escapes get replaced
and the enclosing single quotes get stripped:

  \' - a literal single quote
  \" - a literal double quote
  \\ - a backslash
  \n - a newline
  \r - a carriage return
  \0 - a binary zero
  \A-\Z - CTRL-A to CTRL-Z ?

=cut

for my $l (qw(n r \\ 0 ' ")) {
    $unquote{$l} = eval qq{"\\$l"}
};
for my $l ('A'..'Z') {
    $unquote{$l} = chr((ord $l)-64);
}

sub unquote_value {
    my ($self) = shift;
    if (@_ == 1 and ref($_[0]) and ref($_[0]) eq 'ARRAY') {
        return [ $self->unquote_value( @$_[0]) ]
    } else {
        return map {
            if (/^$null$/) {
                undef
            } elsif (/^$num$/) {
                $_+0
            } elsif (/^$string$/) {
                # remove quotes
                chop;
                $_ = substr $_, 1;

                s/\\(.)/$unquote{$1}/eg;
                $_
            } else {
                die "Unhandled/unknown value >>$_<<"
            }
        } (@_)
    }
}

1;

=head1 SEE ALSO

L<SQL::Statement>

=cut
[download]
Comment on Re^4: Masking data in a MySQL dump file. Ideas? (SQL::Statement::MySQL) Select or Download Code

We don't bite newbies here... much
	PerlMonks