Fellow Perlmongers,
I have been working for several weeks now on creating a piece that, if done properly, could be usefully added to CPAN somewhere, as I have found no similar tool anywhere, and I'm sure others would find it useful. In fact, I'm nearly certain to be reinventing this wheel.
I need to be able to take user input from an HTML form (textarea field) and convert it into an appropriate select query for a MySQL database that will return to the user the results of their query. The entire program will be much more complex than this, but for now, this code is my test case for producing the portion of the select query usually found between "WHERE" and "ORDER BY."
If someone knows of a package out there that can do this, I would be delighted to stop inventing this wheel again. Until then, I personally view this as a rather crude and non-CPAN-worthy piece of code that might barely get the job done for me. I'm putting this here so that others of you, much wiser than myself in the ways of Perl, can enlighten me as to some potential problems or improvements with this code. Additionally, perhaps this can be a starting block for others who need something similar, for as rough as it is, it does work at a basic level.
#!/usr/bin/perl -wT
#DEFINE EXTERNAL MODULE USES
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use strict;
use Encode;
use Encode qw(encode decode);
use open qw( :std :encoding(UTF-8) );
print "Content-type: text/html; charset=utf-8\n\n"; #print CGI::heade
+r();
binmode STDOUT, ":utf8";
=item NOTES
OPERATOR ORDER OF PRECEDENCE
NOT / ^
OR / |
XOR
AND / &
"AND" is the default operator if words are entered without an operator
+.
NOTE: These operators are currently accepted in UPPERCASE only.
=cut
our $thisprogram = "search-test.pl"; #MAY NEED TO BE FULLY QUALIFIE
+D URL
our $line=""; #I'M TOO LAZY TO DECLARE IT A
+GAIN EVERYWHERE IT'S NEEDED.
our %input;
our $buffer="";
our $pair="";
our @pairs=();
our $DEBUG=0; #SET TO TWO FOR A MORE STEP-B
+Y-STEP OUTPUT
our $PAD = qw'(<>)'; #CONSTANT THAT STILL SEEMS TO
+ONLY WORK PART OF THE TIME
our $table = 'MyTable'; #JUST AN EXAMPLE, AS THIS '
+TEST' SCRIPT DOES NOT ATTEMPT
our $columnName = 'MyTextColumn'; #TO CONNECT TO AN ACTUAL DATA
+BASE
if ($ENV{CONTENT_LENGTH}) {
read(STDIN, $buffer, $ENV{CONTENT_LENGTH});
@pairs = split(/&/,$buffer);
} else {
my $buffer = $ENV{QUERY_STRING};
@pairs = split(/\&/,$buffer);
}
foreach $pair (@pairs) {
$pair=~s/`/ /g; #REMOVE BACKTICKS AS A SECURITY AND FUNCTIONA
+LITY MEASURE TO PROTECT REMAINDER OF SCRIPT
#$input{translate}='';
my ($name, $value) = split(/=/,$pair);
$value =~ tr/+/ /;
$name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
$name = decode("utf-8", $name); $value = decode("utf-8", $valu
+e);
$input{$name} = $value;
}
$line=$input{terms};
if ($DEBUG==2) {print "10: Current line: $line<p>\n"};
$line =~ s/\(/PaReNtHeSiSLEFT/g; #REMOVE ALL () FRO
+M ORIGINAL SEARCH FOR SECURITY AND FUNCTIONALITY
$line =~ s/\)/PaReNtHeSiSRGHT/g; #REMOVE ALL () FRO
+M ORIGINAL SEARCH FOR SECURITY AND FUNCTIONALITY
$line =~ s/(\(|\))/ $1 /g; #PAD ORIGINAL PA
+RENTHESES WITH SPACES FOR NOW
$line =~ s/""/QuOtEd_StRiNg/g; #TEMPORARILY REP
+LACE DOUBLE QUOTES WITH A KEEPER TAG
my ($i,$p,$toggle,$terms)=(0,'',"FALSE",$line);
$line='';
for ($i = 0; $i < length($terms); $i++) {
$p = substr($terms, $i, 1);
#DEAL WITH QUOTED PHRASES
if ($p eq '"') { if ($toggle eq "FALSE") { $toggle = "TRUE" } else
+ { $toggle = "FALSE" }};
if ($toggle eq "TRUE") {
$p =~ s/\s/()/; #REPLACE SPACES WI
+TH () TEMPORARILY TO KEEP QUOTE TOGETHER
$p =~ s/([`~=?.+*&^%$#@!<>[\]{}:'])/\\$1/; #ESCAPE ALL OF T
+HESE CHARACTERS IN THE QUOTE (SOME OF THESE ARE FOR PERL COMPATIBILI
+TY)
#$p =~ s/\\/\\\\/g; #ESCAPE THE ESC
+APE CHARACTER ITSELF, IF IN A QUOTE
$p =~ s/(')/$PAD$1$PAD/; #PAD APOSTROPHES F
+OR NOW SO THAT THEY WILL NOT BECOME WORD BOUNDARIES
$p =~ s/(,)/$PAD$1$PAD/; #PAD COMMAS FOR NO
+W SO THAT THEY WILL NOT BECOME WORD BOUNDARIES
$p =~ s/\r/\\\\r/g; #REPLACE RETURN
+ CHARACTER WITH MYSQL-ESCAPED EQUIVALENT
$p =~ s/\n/\\\\n/g; #REPLACE LINE-F
+EED CHARACTER WITH MYSQL-ESCAPED EQUIVALENT
$p =~ s/\t/\\\\t/g; #REPLACE TAB CH
+ARACTER WITH MYSQL-ESCAPED EQUIVALENT
} else {
$p =~ s/%/\\%/g; #ESCAPE ALL PERCEN
+T SIGNS [THIS STEP *MUST* PRECEDE * WILDCARD SUBST.]
if ((substr($terms, $i-1, 1)=~/\p{IsWord}/) && (substr($terms,
+ $i+1, 1)=~/\p{IsWord}/)) {
$p =~ s/-/$PAD\-$PAD/; #PAD UNQUOTED MI
+NUS SIGNS IN WORDS
$p =~ s/\*/%/; #REPLACE ALL WIL
+DCARD ASTERISKS WITH CORRECT DATABASE WILDCARD
$p =~ s/(')/$PAD$1$PAD/; #PAD APOSTROPHES F
+OR NOW SO THAT THEY WILL NOT BECOME WORD BOUNDARIES
} else {
$p =~ s/[<>]/ /; #REPLACE THESE UNQ
+UOTED CHARACTERS WITH A SPACE, PRESERVING WORD BOUNDARY
};
#if ($DEBUG==2) {print "15: Current line: $line<p>\n"};
$p =~ s/[`~=.,+\n\r*!\/\[\]]/ /; #REPLACE THESE UNQ
+UOTED CHARACTERS WITH A SPACE, PRESERVING WORD BOUNDARY
$p =~ s/\\(?!\%)/ /g; #EXCEPT IN THE CA
+SE OF AN ESCAPED PERCENT SIGN, CONVERT SLASHES TO SPACE
$p =~ s/([~=$#\@{}:'])/\\$1/; #ESCAPE THESE UNQ
+UOTED CHARACTERS
$p =~ s/_/$PAD\\_$PAD/; #ESCAPE/PAD UND
+ERLINE CHARS [THIS STEP *MUST* PRECEDE ? WILDCARD SUBST.]
$p =~ s/\?/_/; #REPLACE ALL WIL
+DCARD QUESTON MARKS WITH CORRECT DATABASE WILDCARD
$p =~ s/\|/ OR /; #REPLACE UNQUOTED
+ PIPES WITH 'OR' OPERATOR TERM
$p =~ s/\&/ AND /; #REPLACE UNQUOTE
+D AMPERSANDS WITH 'AND' OPERATOR TERM
$p =~ s/\^/ NOT /; #REPLACE UNQUOTE
+D EXPONENT SIGNS WITH 'NOT' OPERATOR TERM
$p =~ s/^-/ /; #REMOVE UNQUOTED
+ MINUS SIGNS OUTSIDE OF WORDS
};
$line .= $p;
};
if ($DEBUG==2) {print "20: Current line: $line<p>\n"};
$line =~ s/((?:".*?")|(?:\(.*?\)))* #MATCH AND HOLD ANY QUOTED
+OR PARENTHETICAL STRING
(?:^|\s|\b)
(?<!\(<>\))
(?!["()]|(?:(?<=\d)\:)) #DO NOT ALLOW PATTERN TO MA
+TCH IF ADJACENT TO ONE OF ABOVE
(?!
(?:\s|\))
(?:OR|AND|NOT|XOR)
(?:\s|\()
) #DO NOT MATCH A SEARCH OPERAT
+OR TERM
(?=(?:\(<>\))*) #INCLUDE ANY OF OUR PADDING
+ WE INTRODUCED EARLIER
(
(?:(?:\(<>\))*
|(?:\\.{1})*
|(?:\d+\:\d+)*
|[^"() ])+) #BUT MATCH OTHER WORDS, INCL
+UDE ANY ESCAPED CHARACTER
(?>(?:\(<>\))*) #INCLUDE ANY ADDITIONAL PAD
+DING
(?!["()]|(?:\:(?>\d))) #DO NOT ALLOW PATTERN TO MAT
+CH IF ADJACENT TO ONE OF THESE
(?:\s|$|\b)
(?<!\(<>\))
((?:".*?")|(?:\(.*?\)))* #MATCH AND HOLD ANY FURTHER QU
+OTED OR PARENTHETICAL STRINGS
/$1 ($2) $3/gx; #SURROUND MATCHES FROM ABOV
+E WITH PARENTHESES (WORDS OR PHRASES)
if ($DEBUG==2) {print "30: Current line: $line<p>\n"};
$line =~ s/\(<>\)//g; #REMOVE OUR P
+ADDING NOW, AS IT HAS DONE ITS JOB
$line =~ s/\(\)/ /g; #REMOVE OUR ()
+ SPACE REPLACEMENTS AND PUT SPACES BACK
$line =~ s/\(\\'\)/\\'/g; #FIX ANY DANG
+LING APOSTROPHES WHICH WERE SEEN AS WORD BOUNDARIES
if ($DEBUG==2) {print "40: Current line: $line<p>\n"}; #
$line =~ s/( ( \([^)]+\) \s* OR (?:\s+|\b) )+ #GROUP IN PAR
+ENTHESES ALL SEARCH TERMS JOINED BY 'OR'
( \(.*?\) ) )/($1)/gx; #THIS WILL G
+IVE 'ORed' WORDS LOGICAL PRECEDENCE
if ($DEBUG==2) {print "50: Current line: $line<p>\n"};
$line =~ s/(\(\s+\()/((/g; #REMOVE SPAC
+ES BETWEEN NESTED LEFT PARENTHESES
$line =~ s/(\)\s+\))/))/g; #REMOVE SPAC
+ES BETWEEN NESTED RIGHT PARENTHESES
if ($DEBUG==2) {print "60: Current line: $line<p>\n"};
$line =~ s/"([^"]+)"/($1)/g; #REPLACE QUO
+TES FOR QUOTED STRINGS WITH PARENTHESES
if ($DEBUG==2) {print "70: Current line: $line<p>\n"};
$line =~ s% #RECURSIVEL
+Y LOCATE ALL PARENTHESIZED TERMS AND PHRASES
(\()
( (?> [^()]+) #Match no parentheses without r
+ecursion
|
(??{ $line }) # Group with matching parentheses
)*
|
(\))
%AND $1$table\.$columnName LIKE "\%$2\%"$3%gx;
+#FORMAT PARENTHETICAL TERMS INTO SELECT QUERY FORM
$line=~s/\s*(AND)*\s*$table\.$columnName\sLIKE\s"\%\s*\%"\s*//g; #R
+EMOVE EMPTY SELECT TERMS
$line=~s/\)\s*AND\s+AND\s*\(/) AND (/g; #REPLACE DO
+UBLE 'AND' WITH SINGLE 'AND' OPERATOR
$line=~s/\)\s*NOT\s+AND\s*\(/) AND NOT (/g; #CHANGE 'NO
+T AND' TO 'AND NOT'
$line=~s/\)\s*XOR\s+AND\s*\(/) XOR (/g; #REMOVE 'AN
+D' IN 'XOR AND'
$line=~s/\)\s*OR\s+AND\s*\(/) OR (/g; #REMOVE 'AND'
+ IN 'OR AND'
$line=~s/QuOtEd_StRiNg/\\"/g; #REPLACE OUR
+QUOTE MARKER WITH AN ESCAPED QUOTATION MARK
$line=~s/PaReNtHeSiSLEFT/\\(/g; #REPLACE PA
+RENTHESES
$line=~s/PaReNtHeSiSRGHT/\\)/g; #REPLACE PA
+RENTHESES
#1 while ($line=~s/\(\s*AND\s*\(/((/g);
if ($DEBUG==2) {print "80: Current line: $line<p>\n"};
1 while ($line=~s/(\()
(?: \s* | (\(*) )*
AND
(?: \s* | (\(*) )*
/$1$2$3/x);
$line=~s/^\s*AND\s*(\()/$1/;
1 while ($line=~s/^\s*\((\s|\n|\r)*\)\s*(AND)*\s*//); #REMOVE LEADI
+NG EMPTY PARENTHETICAL TERM, IF EXISTS
1 while ($line=~s/^\s*(OR|XOR|AND)+\s//); #REMOVE SURPL
+US LEADING OPERATOR TERMS, IF EXIST
if ($DEBUG==2) {print "90: Current line: $line<p>\n"};
print <<HTML;
<html lang="utf8">
<head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf8">
<title>SEARCH ENGINE TESTING</title>
</HEAD>
<BODY>
<h3 align="center">Enter your test search into the search box.</h3>
<form name="ff" method="POST" accept-encoding="UTF-8" accept-charset="
+utf-8" action="$thisprogram">
<textarea rows="5" cols="65" name="terms" style="background-color:#999
+999;" value="$input{terms}">$input{terms}</textarea>
<input type="submit" value="Submit" name="submit"></input>
</form>
$line
</body>
</html>
HTML
Blessings,
~ Polyglot ~