Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
#!/usr/bin/perl -w use strict; use DBI; use Text::CSV_XS; use Getopt::Std; use vars qw/ %option /; getopts( 's:d:r:vnh', \%option ); if ( $option{ h } ) { &usage; exit; } foreach ( qw/ s r d / ) { die &usage if ! $option{ $_ }; } my $schema_file = $option{ s }; my $raw_data = $option{ r }; my $data_dir = format_data_dir( $option{ d } ); my $no_header = $option{ n } || 0; # errors that are not immediately fatal will be written to the followi +ng log: my $error_log = 'err.log'; if ( ! -e $data_dir ) { if ( $option{ v } ) { # we're validating an exisisting database, so there had # darned well better be one there! die "Attempt to validate database '$data_dir' failed. Folder +does not exist.\n"; } mkdir $data_dir or die "Could not mkdir $data_dir: $!"; } my $database = schema( $schema_file ); if ( $option{ v } ) { print "\nWarning: you are about to validate an existing database.\ +n" . "This program will drop the tables prior to validation.\n" . "Did you back up the database? "; exit if 'y' ne lc substr <STDIN>, 0, 1; create_raw_data( $data_dir, $raw_data, $no_header, $database ); } my $dbh = DBI->connect("DBI:CSV:f_dir=$data_dir", { RaiseError => 1 } +); create_database( $dbh, $data_dir, $database ); print "All tables successfully created.\n"; add_data_to_database( $dbh, $raw_data, $database ); print "All data successfully added to database.\n"; open ERROR_LOG, "> $error_log" or die "Cannot open $error_log for writ +ing: $!"; print "Now validating unique fields.\n"; my $error_count = validate_unique_fields( \*ERROR_LOG, $dbh, $database + ); print "Now validating foreign key constraints.\n"; $error_count += validate_foreign_keys( \*ERROR_LOG, $dbh, $database ); print "Now validating data against regular expressions.\n"; $error_count += validate_with_regexes( \*ERROR_LOG, $dbh, $database ); close ERROR_LOG; if ( $error_count ) { my $text = $error_count == 1 ? " was" : "s were"; print "\nWARNING\n$error_count error$text found while validating t +he data. Errors were written to '$error_log'\n"; } else { unlink $error_log; print "\nNo errors were found while validating the data.\n"; } ### ### # main subs follow # ### ### sub schema { # The following datatypes are used for documentation purposes, but # DBD::CSV does not support these properties. They are mainly her +e so # that programmers can customize these attributes for their own da +tabase. # # They will also be used in this program to ensure that data added + to database # will pass validation (this will minimize database corruption ) # # currently, each key in %data_type points to a hashref. This is +done in case # we need to add extra functionality in the future. # # note that DBD::CSV relies on SQL::Statement which does not suppo +rt most # data types, such as DateTime or Boolean. my $schema_file = shift; my %data_type = ( INT => { validate => \&is_int }, VARCHAR => { validate => \&is_varchar }, CHAR => { validate => \&is_char } ); open SCHEMA, "< $schema_file" or die "Cannot open $schema_file for + reading:$!\n"; my $table = ''; my %schema; while ( <SCHEMA> ) { next if /^#/; # skip comments; next if /^\s*$/; # skip blank lines $table = $1, next if ( /^\s*\[([^\]]+)\]\s*$/ ); if ( substr( $_, 0, 1 ) ne '_' ) { if ( ! $table ) { error_in_data( 'unknown', $_, "Could not determine tab +le for line $. in schema '$schema_file'"); } my ( $field, $type, @extra ) = split; $type = uc $type; $type = 'INT' if $type eq 'INTEGER'; if ( $type !~ /^(INT|(?:VAR)?CHAR)/ ) { error_in_data( $table, $_, "Unknown type '$1' in schem +a '$schema_file'" ); } if ( ! exists $schema{ $table }{ $field } ) { $schema{ $table }{ $field } = $type; push @{ $schema{ $table }{ _field_order } }, $field; # + holds the order of fields if ( defined $extra[0] and uc $extra[0] eq 'UNIQUE' ) +{ push @{ $schema{ $table }{ _unique } }, $field; # +field must not have duplicates } if ( defined $extra[-1] and $extra[-1] =~ m!^/(.*)/$! ) { $schema{ $table }{ _regex }{ $field } = $1; } } else { error_in_data( $table, $_, "Redefined field '$field' i +n schema '$schema_file'" ); } } elsif ( substr( $_, 0, 12 ) eq '_foreign_key' ) { my ( undef, @data ) = split; if ( @data != 3 ) { error_in_data( $table, $_, "Foreign key definitions ar +e in form 'thistable.field, foreign.table, foreign.table.field"); } else { push @{ $schema{ $table }{ _foreign_key } }, \@data; } } } close SCHEMA; foreach my $table ( keys %schema ) { $schema{ $table }{ _rec_count } = 0; # stop uninitialized warn +ings later } $schema{ _data_type } = \%data_type; return \%schema; } sub create_raw_data { # This will only be called if validating an existing database # or CSV file my ( $data_dir, $raw_data, $no_header, $database ) = @_; if ( -e $raw_data ) { print "Warning: raw data file '$raw_data' exists. Overwrite? +[y/n] "; exit if 'y' ne lc substr <STDIN>, 0, 1; } open RAW_DATA, "> $raw_data" or die "Cannot open $raw_data for wri +ting: $!"; foreach my $table ( keys %$database ) { next if '_' eq substr $table, 0, 1; # these are data keys my $fields = join ', ', @{ $database->{ $table }{ _field_order + } }; $fields = "# $fields\n"; open TABLE, "< $data_dir$table" or die "Cannot open $data_dir$ +table for reading: $!"; my $header = <TABLE>; if ( ! $no_header ) { $header = ''; } print RAW_DATA "table=$table\n", $fields, $header, <TABLE>, "\ +n.\n\n"; close TABLE; } close RAW_DATA; } sub create_database { # drop tables if they exist and then recreate them. my ( $dbh, $data_dir, $database ) = @_; foreach my $table ( keys %$database ) { next if substr( $table, 0, 1 ) eq '_'; if ( -e $data_dir.$table ) { $dbh->do( "DROP TABLE $table" ); } my @fields; if ( ! exists $database->{$table}{_field_order} ) { die "Table: '$table' does not have a _field_order specific +ation.\n"; } my @field_order = @{ $database->{$table}{_field_order} }; my $order_count = unique( @field_order ); if ( @field_order != $order_count ) { die "Table: '$table' has duplicate fields in _field_order. +\n"; } my $field_count = unique( grep { /^[^_]/ } keys %{$database->{ + $table }} ); if ( $field_count != $order_count ) { my $comp = $field_count > $order_count ? "more" : "fewer"; die "Table '$table' has $comp fields than _field_order spe +cifies."; } foreach my $field ( @field_order ) { if ( ! exists $database->{ $table }{ $field } ) { die "Table: '$table' does not have field '$field' list +ed in _field_order"; } push @fields, "$field $database->{$table}{$field}"; } my $fields = join ',', @fields; $dbh->do("CREATE TABLE $table ( $fields )"); } } sub add_data_to_database { # This could use some work. Just a roungh hack to get the data. # for right now it's fine, but in the future, this could be # problematic if someone creates a huge amount of data. # # Records are separated by putting a period on a line by itself: " +\n.\n" # First line of each record is table name. Subsequent records are + fields # to be entered. Empty lines and lines beginning with a # are ski +pped my ( $dbh, $raw_data, $database ) = @_; my $csv = Text::CSV_XS->new; local $/ = "\n.\n"; open DATA_FILE, "< $raw_data" or die "Cannot open $raw_data for re +ading: $!\n"; while ( <DATA_FILE> ) { chomp; next if /^\s*$/; my @records = grep { /^[^#]/ } split /\n/; my $table = trim( shift @records ); if ( $table =~ /=/ ) { $table = trim( ( split /=/, $table, 2 )[-1] ); } if ( ! exists $database->{$table} ) { die "Table '$table' found in input data but is not in sche +ma."; } my $place_holders = '?,' x @{ $database->{$table}{_field_order +} }; chop $place_holders; # remove trailing comma my $sql = "INSERT INTO $table VALUES ( $place_holders )"; my $sth = $dbh->prepare( $sql ); foreach my $record ( @records ) { next if $record !~ /\S/ or $record =~ /^\.$/; # skip empty + records my $status; if( ! ( $status = $csv->parse($record) ) ) { # parse a CSV + string into fields error_in_data( $table, $record, "Bad argument in CSV r +ecord" ); } my @columns = $csv->fields(); # get the parsed fields if ( @columns != @{ $database->{$table}{_field_order} } ) +{ my $in_length = @columns; my $expected_length = @{ $database->{$table}{_field_or +der} }; error_in_data( $table, $record, "Expected $expected_le +ngth columns. Found $in_length columns." ); } else { validate_record( $database, $table, \@columns, $csv ); $sth->execute( @columns ); $database->{ $table }{ _rec_count }++; } } # next record } # end while close DATA_FILE; print "\n"; foreach my $table ( sort keys %$database ) { next if substr( $table, 0, 1 ) eq '_'; my $count = $database->{ $table }{ _rec_count }; if ( $count == 0 ) { print "WARNING: table '$table' created but no records adde +d.\n"; } else { print "$count records added to table '$table'.\n"; } } print "\n"; } sub validate_unique_fields { # for each defined unique field, grab all of them from the # database and add them to an autoincrementing hash. If any # value is greater than one, then we know we have more than # one unique key my ( $err, $dbh, $database ) = @_; my $error_count = 0; foreach my $table ( %$database ) { if ( exists $database->{ $table }{ _unique } ) { my @unique_fields = @{ $database->{ $table }{ _unique } }; foreach my $field ( @unique_fields ) { my %field_count; my $sql = "SELECT $field FROM $table"; my $fields = $dbh->selectall_arrayref( $sql ); foreach ( @$fields ) { $field_count{ $_->[0] }++; } foreach ( keys %field_count ) { if ( $field_count{$_} > 1 ) { print $err "Table: '$table' Unique field: '$_' + was found $field_count{$_} times\n"; $error_count++; } } } # next $field } } # next $table return $error_count; } sub validate_foreign_keys { # the foreign keys for a table are an array ref of array refs. # each inner array ref is a table key, foreign table, and foreign +key. # this routine grabs the table keys and ensures that each one matc +hes # a foreign key in the foreign table. my ( $err, $dbh, $database ) = @_; my $error_count = 0; foreach my $table ( %$database ) { my $last_error_count = $error_count; if ( exists $database->{ $table }{ _foreign_key } ) { my @fkey_array = @{ $database->{ $table }{ _foreign_key } +}; my $base_err = 'Foreign Key constraint violated.'; foreach my $fkey_data ( @fkey_array ) { my ( $this_key, $ftable, $ftable_key ) = @$fkey_data; if ( ! exists $database->{ $table }{ $this_key } ) { my $err_message = "$base_err Table '$table' does +not have field '$this_key'\n"; print $err $err_message; $error_count++; } if ( ! exists $database->{ $ftable } ) { my $err_message = "$base_err Foreign table '$ftab +le' does not exist.\n"; print $err $err_message; $error_count++; } if ( exists $database->{ $ftable } and ! exists $datab +ase->{ $ftable }{ $ftable_key } ) { my $err_message = "$base_err Foreign table '$ftab +le' does not have field '$ftable_key\n"; print $err $err_message; $error_count++; } if ( exists $database->{ $ftable } and exists $database->{ $ftable }{ $ftable_key } and ( ! exists $database->{ $ftable }{ _unique } or ! grep { /$ftable_key/ } @{$database->{ $ftable + }{ _unique }} ) ) { my $err_message = "$base_err Table '$table', fiel +d '$this_key'. ". "'$ftable_key' not defined as UN +IQUE in '$ftable'\n"; print $err $err_message; $error_count++; } next if $error_count > $last_error_count; my $sql = "SELECT $this_key FROM $table"; my $sth = $dbh->prepare( $sql ); $sth->execute; my $tbl_array_ref = $sth->fetchall_arrayref; foreach ( @$tbl_array_ref ) { my $data = $dbh->quote( $_->[ 0 ] ); my $sql = "SELECT $ftable_key FROM $ftable WHERE +$ftable_key = $data"; my $sth = $dbh->prepare( $sql ); my $rv = $sth->execute; if ( $rv eq '0E0' ) { my $err_message = "$base_err Table '$table', +field '$this_key', ". "value $data Not found in ta +ble '$ftable', field '$ftable_key'\n"; print $err $err_message; $error_count++; } } } # next $fkey_data } } # next $table return $error_count; } sub validate_record { # this function loops through each field in the record, determines # the type of field and then attempts to verify that the data in t +he # field does, in fact, match the data type supplied. # The program will die if the data type does not match my ( $database, $table, $columns, $csv ) = @_; my $data_type = $database->{ _data_type }; my @fields = @{ $database->{$table}{_field_order} }; foreach my $index ( 0 .. $#fields ) { my ( $type ) = ( $database->{ $table}{ $fields[ $index] } =~ / +^(\w+)/ ); my $function = $data_type->{ $type }{ 'validate' }; if ( ref $function ne 'CODE' ) { # this should not happen die "No validation routine found for type: '$type'."; } my $type_match = $function->( $columns->[$index], $database->{ + $table }{ $fields[$index] } ); if ( ! $type_match ) { my $status = $csv->combine( @$columns ); my $data = $csv->string; my $err_message = "Field: '$columns->[ $index ]' does not +match data type: '$database->{ $table }{ $fields[$index] }'."; error_in_data( $table, $data, $err_message ); } } } sub validate_with_regexes { # every field in the database may have an optional regex added to # describe valid data types. This sub will test whether or not th +e # data for those fields matches the regex. my ( $err, $dbh, $database ) = @_; my $error_count = 0; foreach my $table ( %$database ) { if ( exists $database->{ $table }{ _regex } ) { my %fields = %{$database->{ $table }{ _regex }}; my @bad_regexes; while ( my ( $field, $regex ) = each %fields ) { if ( ! is_valid_pattern( $regex ) ) { print $err "Table: '$table', field '$field' had in +valid pattern '$regex'. Discarded.\n"; push @bad_regexes, $field; $error_count++; } } delete @fields{ @bad_regexes } if @bad_regexes; my $fieldnames = join ',', keys %fields; my $sql = "SELECT $fieldnames FROM $table"; my $sth = $dbh->prepare( $sql ); $sth->execute; while ( my $data = $sth->fetchrow_hashref ) { foreach ( keys %$data ) { my $value = $data->{ $_ }; my $regex = $fields{ $_ }; if ( $value !~ /$regex/ ) { print $err "Table: '$table' field: '$_' value: + '$value' did not match /$fields{$_}/.\n"; $error_count++; } } } } } # next $table return $error_count; } ### ### # utility subs follow # ### ### sub usage { print <<" END_USAGE"; usage: db_validate.pl [-h] [-s schema_file] [-d database_dir] [-r raw_ +data] [-v] [-n] Validate and/or create a CSV database. -h help -? help -s schema_file File containing schema. See POD for details -d database_dir The directory where the database will be created. If +-v is supplied, this is location where data is to be read fr +om, and then written back to. -r raw_data The file containing raw database information. If -v i +s supplied, this is where the raw information will be wr +itten to. -v Validate existing database. -n If -v is used, this says that the data in -d does *not +* have a header line labeling the fields. END_USAGE } sub error_in_data { my ( $table, $record, $message ) = @_; die "\nERROR in table: '$table'\n$message\nRecord: $record\n"; } sub trim { my $data = shift; $data =~ s/^\s+//; $data =~ s/\s+$//; $data; } sub unique { my @array = @_; my %count; @array = grep { ! $count{$_}++ } @array; @array; } # each routine returns a true/false value for whether or not the data +passes validation sub is_int { my $data = shift; return $data =~ /^-?\d+$/ ? 1 : 0; } sub is_varchar { my ( $data, $desc ) = @_; # $desc will typically be something like VARCHAR(64) my ( $length ) = ( $desc =~ /\(\s*(\d+)/ ); return ( length $data <= $length ) ? 1 : 0; } sub is_char { my ( $data, $desc ) = @_; # $desc will typically be something like CHAR(8) my ( $length ) = ( $desc =~ /\(\s*(\d+)/ ); return ( length $data == $length ) ? 1 : 0; } sub is_valid_pattern { my $pattern = shift; return eval { '' =~ /$pattern/; 1 } || 0; } sub format_data_dir { my $dir = shift; # the following is stolen from CGI.pm my $os = $^O; unless ( defined $os and $os ) { require Config; $os = $Config::Config{'osname'}; } my $sep; if ($os=~/Win/i) { $sep = '\\'; } elsif ($os=~/vms/i) { $sep = '/'; } elsif ($os=~/bsdos/i) { $sep = '/'; } elsif ($os=~/dos/i) { $sep = '\\'; } elsif ($os=~/^MacOS$/i) { $sep = ':'; } elsif ($os=~/os2/i) { $sep = '\\'; } else { $sep = '/'; } $dir .= $sep if $dir !~ /$sep$/; # add the separator if it's not t +here return $dir; } __END__ =head1 NAME db_validate.pl - Simple validation for DBD::CSV database creation =head1 SYNOPSIS perl db_validate.pl -s schema.txt -r raw_data.txt -d data_dir =head1 DESCRIPTION DBD::CSV is a useful module if you are forced to work with CSV files a +s a database. Unfortunately, the module does no data validation. This program allows you to define a schema, along with UNIQUE fields and fo +reign key constraints. Further, basic data type validation for INT, CHAR, a +nd VARCHAR is supported. Oddly enough, DBD::CSV, while only allowing tho +se datatypes to be used in a C<CREATE table> statement, does not actually + check to see whether or not the data you are inserting matches those datatyp +es. If one supplies the -v option on the command line, then this program v +alidates and existing database. The -r option is then used to specify where th +e raw_data will be written to. The -n ("no header") option may be used +when validating if you are validating a CSV file while does not have field +names listed as the first line of the file. Much of the information provided in here is for those who wish to main +tain and extend this program. These sections are marked with B<MAINTENANCE BEG +IN> and B<MAINTENANCE END> and may be skipped if you do not care about this. =head1 Schema Each database to be created and validated must have an associated sche +ma created by the programmer. The schema location is passed to the program via the command line -s s +witch. The schema is then read into the C<$schema> variable in C<&schema>. C< +$schema> is a hash ref. Every key in the hash that does not begin with an unde +rscore (_) is assumed to be a table name. Each table is also a hashref. Eve +ry key in a table hash that does not begin with an underscore is assumed to b +e a field name. Keys beginning with underscores are metadata describing t +heir respectives tables. These keys are C<_field_order>, C<_foreign_key>, C<_unique>, and C<_rec_count>. These are used to validate the data in + the database. The format of the schema file must be as follows (items enclosed in cu +rly braces are optional): [tablename] field DATATYPE {UNIQUE} {/regex/} _foreign_key tablename.field foreigntable foreigntable.key Currently, the only datatypes supported are INT, CHAR, and VARCHAR. C +HAR and VARCHAR must have a number in parenthese following them which specifie +s the maximum number of characters allowed. For example: [users] user_id INT UNIQUE user_name VARCHAR(10) In the schema, blank lines and lines starting with a sharp (#) are ski +pped. Fields in the database will be in the order listed. As soon as a new +table name is encountered, the previous table definition is assumed to be co +mplete. A regex may be supplied as the last item on the field definition line. + If a field value does not match the regex, a warning will be written to the + error log. For example to ensure that user ids are positive integers: [users] user_id INT UNIQUE /^\d+$/ The regex B<must> begin and end with a forward slash. You can use a r +egex such as C</.+/> to enforce a C<NOT NULL> requirement. Also note that you may write C<INTEGER> instead of C<INT>, if you pref +er. =head2 Table Fields Each field in a table may be of type INT, VARCHAR, or CHAR. These are + the only datatypes supported by DBD::CSV when creating tables. These datatypes + are stored in C<$int>, C<$varchar>, and C<$char>, respectively. Let's exa +mine the following SQL C<CREATE> statement and see how it's translated: CREATE TABLE users ( user_id INTEGER UNIQUE, name VARCHAR(64), area CHAR(8) ) B<MAINTENANCE BEGIN> The following is the minimum legal specification for this table, as de +fined in the C<$schema> variable. users => { user_id => $int, name => "$varchar(64)", area => "$char(8)", _field_order => [qw/ user_id name area /] } Upon validating the data, and exception will be thrown if C<user_id> d +oes not match the regular expression C</^-?\d+$/>. The C<name> would throw an exception if it exceeded the length of 64 a +nd C<area> would throw an exception if the length did not match 8. When +the table is created, the order of fields in the table would match the order spe +cified in C<_field_order>. B<MAINTENANCE END> This definition for this table would be created in the schema file as +follows: [users] user_id INT name VARCHAR(64) area CHAR(8) The =head2 Metadata As mentioned previously, there are four metadata keys used for tables. + These are C<_field_order>, C<_foreign_key>, C<_unique>, and C<_rec_count>. =over 4 =item _field_order B<MAINTENANCE BEGIN> Since hashes have an effectively arbitrary order for their keys, the C<_field_order> key is used to specify the order of the fields when th +e tables are created in the database. This is an array reference in the C<$sch +ema> variable: _field_order => [qw/ user_id name area /] If the tables fields and the field order do not match, an exception wi +ll be thrown when the program attempts to create the table. B<MAINTENANCE END> =item _unique C<DBD::CSV> does not actually allow for PRIMARY KEYS. Rather than try + to write a bunch of arcane and effectively useless code to allow for this, we h +ave a C<UNIQUE> specifier. Internally, this is merely an array reference sp +ecifying the keys in the table for which we do not allow duplicate values. B<MAINTENANCE BEGIN> In the example of the C<users> table above, the table specification is + as follows: users => { user_id => $int, name => "$varchar(64)", area => "$char(8)", _field_order => [qw/ user_id name area /], _unique => [qw/ user_id /] } B<MAINTENANCE END> To make a user_id UNIQUE, we do the following: [users] user_id INT UNIQUE name VARCHAR(64) area CHAR(8) After all data has been added to the database, the program will valida +te all unique fields and an error will be written to an error log if any dupl +icate values are found in these fields. =item _foreign_key Despite arguments by some diehard MySQL supporters :), any real databa +se supports foreign key constraints. Naturally, DBD::CSV does not. The C<_foreign_key> value for a table is an array ref of array refs. Each + inner array has the following format: field foreign_table foreign_field After data is added to the database, foreign key validation will occur +. For each table that has a C<_foreign_field> key, the program will look up +the value in C<field> and ensure that C<foreign_table> has a corresponding valud + in C<foreign_field>. Further, the C<foreign_field> must be designated as + a C<_unique> field. If these conditions are not met, an entry will be a +dded to the error log. Consider a lookup table for a music CD database. Each CD could potent +ially have several artists and each artist could be present on several CDs. + Assuming we have two tables named C<artists> and C<CDs>, with C<artist_id> and +C<CD_id> respectively, we could define the lookup table as follows: B<MAINTENANCE BEGIN> Internal representation cd_artist => { artist => $int, cd => $int, _field_order => [qw/ artist_id cd_id /], _foreign_key => [ [ qw/ artist artists artist_id / ], [ qw/ cd CDs CD_id / ] ] } B<MAINTENANCE END> [cd_artist] artist INT cd INT _foreign_key artist artists artist_id _foreign_key cd CDs CD_id Note that the tables will still be created if a foreign key constraint + is violated and there is nothing in the C<DBD::CSV> module to prevent thi +s. This is for advisory purposes only. Your database will function even if th +e foreign key is not defined as C<UNIQUE>, but you should probably take a look a +t your schema to look for problems. =item _record_count This is an internal field used to track how many records were written +for each table. After data is added to the database, a list of the number of r +ecords written to each table will be displayed to the screen. A B<WARNING> m +essage will occur for all tables that have no records written. =back =head1 DATABASE CREATION =head2 Creating a schema Let's take a look at a simple database and see how it's all put togeth +er. We have three tables: CDs, artist, and cd_artist. The last is a lookup t +able. What follows is not intended to be an complete database. [CDs] CD_id INT /^\d++$/ CD_name VARCHAR(30) /.+/ # date will be in YYYY year_released CHAR(4) [artist] artist_id INT UNIQUE /^\d+$/ artist_name VARCHAR(30) /.+/ [cd_artist] artist INT cd INT _foreign_key artist artist artist_id _foreign_key cd CDs CD_id Note that when this schema is read that the lines are split on whitesp +ace. C<DBD::CSV> does not allow whitespace in a table name. Further, I would not recommend the naming convention that I have used. + Foreign key field names should (IMHO) match the field names of the tables that + they should match. I have made them distinct so that it's easier to distin +guish them in this example. =head2 Creating the raw data for the database Now that we have our schema, we need to create the raw data. The data + file contains all of the tables and data that will be added to the database +. Each table is separated by "\n.\n". The data for a table will be read in c +hunks that is split on the separator. I suppose that it's possible for some +one to have such a large amount of data that this could be a problem, but if +you do, you should be using a real database. As with the schema file, lines c +omposed of all white space or beginning with a sharp will be discarded. Field +s must be separated with a comma and if a field contains a comma, it must be quo +ted. As each chunk is read, the program looks for the first line which cont +ains data and assumes this to be a table name. You may use either of the two fo +llowing statements to declare a tablename: table=artist or simply: artist Here is a small raw data file: table=artist #artist_id, artist_name 1,Ovid 2,Yello . CDs # CD_id, CD_name,date_released 1,One Second,1987 2,"Baby",1991 3,"Ovid's Greatest Hits, Volume Zero",0000 . cd_artist # artist, cd 1,3 2,1 4,2 Let's say we save the schema in a file called C<schema.txt> and the da +ta in a file called C<raw.txt> and we want to write this to a database calle +d C<stuff>. We'd use the following command: perl db_validate.pl -s schema.txt -r raw.txt -d stuff If a directory called C<stuff> does not exist, it will be created for +us. Issuing that command, we get the following: All tables successfully created. ERROR in table: 'CDs' Field: 'Ovid's Greatest Hits, Volume Zero' does not match data type: +'VARCHAR(30)'. Record: 3,"Ovid's Greatest Hits, Volume Zero",0000 Be examing our schema, we see that we have defined C<CD_name> as C<VAR +CHAR(30)>. The field in question is over 30 characters long, so we change C<CD_na +me> in the schema to C<VARCHAR(40)>. Then we rerun the command. We get the +following output: All tables successfully created. 3 records added to table 'CDs'. 2 records added to table 'artist'. 3 records added to table 'cd_artist'. All data successfully added to database. Now validating unique fields. Now validating foreign key constraints. Now validating data against regular expressions. WARNING 3 errors were found while validating the data. Errors were written t +o 'err.log' Hmm... what errors? Opening the error.log, we see: Foreign Key constraint violated. Table 'cd_artist', field 'artist', +value '4' Not found in table 'artist', field 'artist_id' Foreign Key constraint violated. Table 'cd_artist', field 'cd'. 'CD_ +id' not defined as UNIQUE in 'CDs' Table: 'CDs', field 'CD_id' had invalid pattern '^\d++$'. Discarded. Oops. Let's change those in our schema. CD_id INT UNIQUE /^\d+$/ The regular expression, in this case, may seem superfluous, but it doe +s two things for us: 1. It effectively ensures that we will have at least one digit in th +e id. 2. It ensures that all ids are positive. Also, we see that there is no artist with an id of 4. That should be +changed to 2. Rerunning the program this time creates all of the tables and generate +s absolutely no errors. =head1 Validating an existing database If you have an existing CSV database, define a schema for the database +. Then, run the program as normal using the C<-v> switch. This tells the prog +ram that we will be validating an existing database rather than creating a new +one from scratch. If you wish to validate a CSV file which does not have the f +ield names as the first line (i.e., the first line of the file is also data +), then use the C<-n> option to tell the program that the CSV files has "no he +aders". B<WARNING>: You want to backup the database or CSV file before doing t +his! This program will copy the database or CSV file to the raw_data file a +nd, if all goes well, it will drop your tables (or CSV file) before proceedin +g. Then it will attempt to re-add the data to the files. If it does not succe +ssfully add the data, your data may very well be corrupted. Also, the files or files created by validating B<will> have a header, +as C<DBD::CSV> automatically adds them, due to the way the module works. + If this is a problem, you should remove them yourself. When you have completed validating a database, the raw data fill will +not be unlinked. I have left the file there if it's necessary to debug your application. I strongly recommend using this feature if you have created an applica +tion that uses DBD::CSV. Use your application, put it through its paces (with a +ll the tests you wrote, right?) and then use this application to test for dat +a corruption. =head1 COPYRIGHT Copyright (c) 2001 Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify i +t under the same terms as Perl itself =head1 AUTHOR Curtis "Ovid" Poe <poec@yahoo.com> Address bug reports and comments to: poec@yahoo.com. =head1 BUGS 2001/11/10 There are no known bugs at this time. However, I modified +this pretty heavily to get it "production ready". Please let me know if there are + any issues with it. =head1 SEE ALSO L<DBD::CSV>, L<DBI>, L<Text::CSV_XS> and L<SQL::Statement>. =cut

In reply to CSV Database Validation by Ovid

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others browsing the Monastery: (8)
    As of 2014-08-30 19:15 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The best computer themed movie is:











      Results (293 votes), past polls