I've needed to write a parser to handle a limited grammar, in order to allow experienced users to specify the values to go in each column of a report (as described in How to parse a limited grammar).
It only handles single statements, which is its intended purpose, and is decidedly less flexible than a real language, but it is lightweight and does the job. The result of parsing a statement is a code ref, which can be deref'ed to return the actual 'runtime' value. I have no idea what you would call this type of parser, or whether I should have created a stack of ASTs instead of using closures, but I took a guess that, given that I was going to call each statement thousands of times, using closures would be more efficient than ASTs - I haven't tried the other route, so I may well be wrong.
Now that I have a working prototype, I'd be interested in hearing criticism.
#!/usr/bin/perl
use strict;
use warnings FATAL => 'all';
use Data::Dump::Streamer;
my %Parsers = ( 'NUM' => \&_parse_literal,
'LITERAL' => \&_parse_literal,
'ASSIGNMENT' => \&_parse_assignment,
'VAR' => \&_parse_var,
'METHOD' => \&_parse_method,
'FUNC' => \&_parse_func,
'FILTER' => \&_parse_filter,
'PARAM_LIST' => \&_parse_param_list,
'OP' => \&_parse_op,
'NOT' => \&_parse_not,
'TERM' => \&_parse_term,
'OPEN' => \&_parse_deref,
'EXPRESSION' => \&_parse_expression,
'CLAUSE' => \&_parse_clause,
);
my %OPs = ( '*' => [ 6, sub { $_[0]->() * $_[1]->() } ],
'/' => [ 6, sub { $_[0]->() / $_[1]->() } ],
'+' => [ 5, sub { $_[0]->() + $_[1]->() } ],
'-' => [ 5, sub { $_[0]->() - $_[1]->() } ],
'&&' => [ 3, sub { $_[0]->() && $_[1]->() } ],
'||' => [ 2, sub { $_[0]->() || $_[1]->() } ],
);
my %Terms = ( map {$_ => 1} qw(NUM NOT LITERAL ASSIGNMENT VAR FUNC) )
+;
my %Derefs = ( '{' => [ 'HASH', 'key', '}', sub { $_[0]->{ $_[1] }
+} ],
'[' => [ 'ARRAY', 'index', ']', sub { $_[0]->[ $_[1] ]
+} ], );
## parse("statement",{vars},{funcs})
#===================================
sub parse {
#===================================
my $text = shift;
my %state = ( vars => shift @_,
funcs => shift @_,
tokens => tokenise($text),
);
my $sub = $Parsers{EXPRESSION}->( \%state )
|| sub { };
if ( @{ $state{tokens} } ) {
die "Still have unprocessed tokens :"
. Dump( @{ $state{tokens} } )->Out;
}
return ( $sub, \%state );
}
## EXPRSSSION [OP|FILTER EXPRESSION|{HASH DEREF}|[ARRAY DEREF]]
#===================================
sub _parse_expression {
#===================================
my $state = shift;
my $next_token = $state->{tokens}[0];
my $expression = $Parsers{CLAUSE}->($state)
|| return;
# OP EXPRESSION
while ( $next_token = $state->{tokens}[0] ) {
return $expression
unless defined $next_token
&& $next_token->[0] =~ /^OP|FILTER|OPEN/;
$expression = $Parsers{ $next_token->[0] }->( $state, $express
+ion );
}
return $expression;
}
## (EXPRESSION) | TERM
#===================================
sub _parse_clause {
#===================================
my $state = shift;
my $next_token = $state->{tokens}[0];
# ( EXPRESSION )
if ( defined $next_token
&& $next_token->[0] eq 'PAREN'
&& $next_token->[1] eq '(' )
{
shift @{ $state->{tokens} };
my $expression = $Parsers{EXPRESSION}->($state);
$next_token = $state->{tokens}[0];
die "Missing closing parenthesis"
unless defined $next_token
&& $next_token->[0] eq 'PAREN'
&& $next_token->[1] eq ')';
shift @{ $state->{tokens} };
return $expression;
}
return $Parsers{TERM}->($state);
}
## OP CLAUSE
#===================================
sub _parse_op {
#===================================
my $state = shift;
my $expression = shift;
my $token = shift @{ $state->{tokens} };
my ( $precedence, $op ) = @{ $OPs{ $token->[1] } };
my $clause = $Parsers{CLAUSE}->($state)
|| die "Missing clause after $token->[1]";
my $next_token = $state->{tokens}[0];
# Next token has higher precedence
$clause = $Parsers{OP}->( $state, $clause )
if defined $next_token
&& $next_token->[0] eq 'OP'
&& $OPs{ $next_token->[1] }[0] > $precedence;
return sub { $op->( $expression, $clause ) };
}
## DEREF VAR
#===================================
sub _parse_deref {
#===================================
my $state = shift;
my $var_expression = shift;
my $token = shift @{ $state->{tokens} };
die "Unmatched '$token->[1]'"
unless exists $Derefs{ $token->[1] };
my $deref = $Derefs{ $token->[1] };
my $key_expression = $Parsers{EXPRESSION}->($state)
|| die "Missing $deref->[1] when dereferencing $deref->[0]";
$token = shift @{ $state->{tokens} };
die "Missing closing $deref->[2]"
unless defined $token
&& $token->[0] eq 'CLOSE'
&& $token->[1] eq $deref->[2];
my $ref_type = $deref->[0];
my $deref_code = $deref->[3];
return sub {
my $var = $var_expression->();
die "Var is not of type '$ref_type'"
unless ref $var && ref $var eq $ref_type;
my $key = $key_expression->();
return $deref_code->( $var, $key );
};
}
## ! CLAUSE
#===================================
sub _parse_not {
#===================================
my $state = shift;
my $token = shift @{ $state->{tokens} };
my $clause = $Parsers{CLAUSE}->($state)
|| die "Missing clause after !";
return sub { !$clause->() };
}
## NUM | NOT | LITERAL | ASSIGNMENT | VAR | FUNC
#===================================
sub _parse_term {
#===================================
my $state = shift;
my $next_token = $state->{tokens}[0];
return
unless defined $next_token
&& exists $Terms{$next_token->[0]};
return $Parsers{ $next_token->[0] }->($state);
}
## $ | $VAR [.METHOD]
#===================================
sub _parse_var {
#===================================
my $state = shift;
my $vars = $state->{vars};
my $token = shift @{ $state->{tokens} };
my $var_name = $token->[1] || '';
my $next_token = $state->{tokens}[0];
my $var = sub {
die "Undefined var '\$$var_name'" unless exists $vars->{$var_n
+ame};
$vars->{$var_name};
};
return $var
unless defined $next_token
&& $next_token->[0] eq 'METHOD';
return $Parsers{ $next_token->[0] }->( $state, $var );
}
## $ | $VAR = EXPRSSSION
#===================================
sub _parse_assignment {
#===================================
my $state = shift;
my $vars = $state->{vars};
my $token = shift @{ $state->{tokens} };
my $var_name = $token->[1] || '';
my $expression = $Parsers{EXPRESSION}->($state)
|| die "No expression passed to assignment";
return sub {
$vars->{$var_name} = $expression->();
}
}
## .METHOD(PARAM_LIST) [.METHOD]
#===================================
sub _parse_method {
#===================================
my $state = shift;
my $object = shift;
my $token = shift @{ $state->{tokens} };
my $method_name = $token->[1];
my @param_list = @{ $Parsers{PARAM_LIST}->($state) };
my $method = sub {
return $object->()->$method_name( map { $_->() } @param_list )
+;
};
my $next_token = $state->{tokens}[0];
if ( defined $next_token && $next_token->[0] eq 'METHOD' ) {
return $Parsers{METHOD}->( $state, $method );
}
return $method;
}
## | FILTER(PARAM_LIST)
#===================================
sub _parse_filter {
#===================================
my $state = shift;
my $expression = shift;
my $token = shift @{ $state->{tokens} };
my $funcs = $state->{funcs};
my $filter_name = $token->[1];
my @param_list = @{ $Parsers{PARAM_LIST}->($state) };
return sub {
die "Undefined filter '$filter_name'"
unless exists $funcs->{$filter_name};
return $funcs->{$filter_name}
->( map { $_->() } ( @param_list, $expression ) );
};
}
## FUNC(PARAM_LIST) [.METHOD]
#===================================
sub _parse_func {
#===================================
my $state = shift;
my $token = shift @{ $state->{tokens} };
my $funcs = $state->{funcs};
my $func_name = $token->[1];
my @param_list = @{ $Parsers{PARAM_LIST}->($state) };
my $next_token = $state->{tokens}[0];
my $func = sub {
die "Undefined function '$func_name'"
unless exists $funcs->{$func_name};
return $funcs->{$func_name}->( map { $_->() } @param_list );
};
return $func
unless defined $next_token
&& $next_token->[0] eq 'METHOD';
return $Parsers{ $next_token->[0] }->( $state, $func );
}
## ([PARAM[,PARAM]])
#===================================
sub _parse_param_list {
#===================================
my $state = shift;
my $next_token = $state->{tokens}[0];
return []
unless defined $next_token
&& $next_token->[0] eq 'PAREN'
&& $next_token->[1] eq '(';
shift @{ $state->{tokens} };
my @param_list;
while ( my $next_token = $state->{tokens}[0] ) {
die "Missing closing parenthesis on param list"
unless defined $next_token;
if ( $next_token->[0] eq 'COMMA' ) {
shift @{ $state->{tokens} };
next;
}
last if $next_token->[0] eq 'PAREN' && $next_token->[1] eq ')'
+;
push @param_list, $Parsers{EXPRESSION}->($state);
}
shift @{ $state->{tokens} };
return \@param_list;
}
## LITERAL
#===================================
sub _parse_literal {
#===================================
my $state = shift;
my $token = shift @{ $state->{tokens} };
my $literal = $token->[1];
return sub {$literal};
}
# Breaks up the text into an array of tokens
#===================================
sub tokenise {
#===================================
my $tokeniser = _get_tokeniser(@_);
my @tokens;
while ( my $token = $tokeniser->() ) {
# Reassemble ['OP','-'], ['NUM',123]
# into ['NUM','-123'] where appropriate
if ( $token->[0] eq 'NUM'
&& defined $tokens[-1]
&& $tokens[-1][0] eq 'OP'
&& $tokens[-1][1] =~ /[-+]/
&& ( !defined $tokens[-2]
|| $tokens[-2][0] =~ /^OP|NOT$/ )
)
{
$token->[1] = $tokens[-1][1] eq '+' ? $token->[1] : 0 - $t
+oken->[1];
pop @tokens;
}
push @tokens, $token;
}
return \@tokens;
}
# Returns a stream of tokens
#===================================
sub _get_tokeniser {
#===================================
my $text = shift;
$text = '' unless defined $text;
my $atom = qr/[a-zA-Z_]\w*/o;
return sub {
TOKEN: {
return [ 'NUM', $1 ] if $text =~ /\G ((?:\d+(?:\.\d*)?)) /
+gcx;
return [ 'LITERAL', $2 ]
if $text =~ /\G (['"]) (.*?) (?<!\\) \1/gcxs;
return [ 'ASSIGNMENT', $1 ] if $text =~ /\G \$ ($atom) \
+s* = /gcx;
return [ 'VAR', $1 ] if $text =~ /\G \$ ($atom)? /gcxo;
return [ 'METHOD', $1 ] if $text =~ /\G \.($atom) /gcxo;
return [ 'OPEN', $1 ] if $text =~ /\G ([{[]) /gcx;
return [ 'CLOSE', $1 ] if $text =~ /\G ([}\]]) /gcx;
return [ 'PAREN', $1 ] if $text =~ /\G ([()]) /gcx;
return [ 'COMMA', '' ] if $text =~ /\G , /gcx;
return [ 'NOT', '' ] if $text =~ m{\G ! }gcx;
return [ 'OP', $1 ] if $text =~ m{\G (\Q||\E | && | [-+*/]
+) }gcx;
return [ 'FILTER', $1 ] if $text =~ /\G \| \s* ($atom) /gc
+xo;
return [ 'FUNC', $1 ] if $text =~ /\G ($atom) /gcxo;
redo TOKEN if $text =~ /\G \s+ /gcx;
my $pos = pos $text || 0;
if ( $text =~ /\G (.+)/gcx ) {
substr( $text, $pos, 0 ) = ' HERE-> ';
die "Syntax error ' $1 ' : ' $text ' ";
}
return;
}
};
}
############################# END PARSER #############################
+#######
# Dummy class for method testing
package Foo;
sub new {
bless( {}, 'Foo' );
}
sub bar {
shift;
return 'Bar', @_;
}
######### TESTS #############
package main;
run_tests(@ARGV);
sub run_tests {
my $test_no = shift;
my @tests = (
'BLANK',
[ q{}, [] ],
'INTS & LITERALS',
[ q{0}, 0 ],
[ q{1}, 1 ],
[ q{2}, 2 ],
[ q{''}, '' ],
[ q{'abc'}, 'abc' ],
[ qq{'abc\ndef'}, "abc\ndef" ],
'INTS & LITERALS in ()',
[ q{('abc')}, 'abc' ],
[ q{(0)}, 0 ],
[ q{(2)}, 2 ],
'VARS',
[ q{$lang}, 'English' ],
[ q{($lang)}, 'English' ],
'FUNCS',
[ q{caps('lower')}, 'LOWER' ],
[ q{caps('lower',$lang)}, [ 'LOWER', 'ENGLISH' ] ],
'FILTER',
[ q{$lang | caps}, 'ENGLISH' ],
[ q{$lang | caps('lower')}, [ 'LOWER', 'ENGLISH' ] ],
'METHODS',
[ q{new_foo.bar}, 'Bar' ],
[ q{new_foo.bar('Foo')}, [ 'Bar', 'Foo' ] ],
[ q{new_foo.bar('Foo') | caps('other')}, [ 'OTHER', 'BAR', 'FO
+O' ] ],
'DEREF',
[ q($hash{'name'}{'first'}), 'John' ],
[ q($hash{'name'}{'third'}), undef ],
[ q{$array[2]}, 'third' ],
[ q{$array[-3]}, 'second' ],
'LOGIC',
[ q{2 || 3}, 2 ],
[ q{2 || 0}, 2 ],
[ q{0 || 3}, 3 ],
[ q{0 || 0}, 0 ],
[ q{2 && 3}, 3 ],
[ q{2 && 0}, 0 ],
[ q{0 && 3}, 0 ],
[ q{0 && 0}, 0 ],
[ q{2 || 3 && 4}, 2 ],
[ q{2 || 0 && 4}, 2 ],
[ q{2 || 3 && 0}, 2 ],
[ q{2 || 0 && 0}, 2 ],
[ q{0 || 3 && 4}, 4 ],
[ q{0 || 0 && 4}, 0 ],
[ q{0 || 3 && 0}, 0 ],
[ q{0 || 0 && 0}, 0 ],
[ q{2 && 3 || 4}, 3 ],
[ q{2 && 0 || 4}, 4 ],
[ q{2 && 3 || 0}, 3 ],
[ q{2 && 0 || 0}, 0 ],
[ q{0 && 3 || 4}, 4 ],
[ q{0 && 0 || 4}, 4 ],
[ q{0 && 3 || 0}, 0 ],
[ q{0 && 0 || 0}, 0 ],
[ q{(2 || 3) && 4}, 4 ],
[ q{(2 || 0) && 4}, 4 ],
[ q{(2 || 3) && 0}, 0 ],
[ q{(2 || 0) && 0}, 0 ],
[ q{(0 || 3) && 4}, 4 ],
[ q{(0 || 0) && 4}, 0 ],
[ q{(0 || 3) && 0}, 0 ],
[ q{(0 || 0) && 0}, 0 ],
[ q{2 && (3 || 4)}, 3 ],
[ q{2 && (0 || 4)}, 4 ],
[ q{2 && (3 || 0)}, 3 ],
[ q{2 && (0 || 0)}, 0 ],
[ q{0 && (3 || 4)}, 0 ],
[ q{0 && (0 || 4)}, 0 ],
[ q{0 && (3 || 0)}, 0 ],
[ q{0 && (0 || 0)}, 0 ],
[ q{(0 || 0) && (0 || 0)}, 0 ],
[ q{(0 || 0) && (0 || 5)}, 0 ],
[ q{(0 || 0) && (4 || 0)}, 0 ],
[ q{(0 || 0) && (4 || 5)}, 0 ],
[ q{(0 || 3) && (0 || 0)}, 0 ],
[ q{(0 || 3) && (0 || 5)}, 5 ],
[ q{(0 || 3) && (4 || 0)}, 4 ],
[ q{(0 || 3) && (4 || 5)}, 4 ],
[ q{(2 || 0) && (0 || 0)}, 0 ],
[ q{(2 || 0) && (0 || 5)}, 5 ],
[ q{(2 || 0) && (4 || 0)}, 4 ],
[ q{(2 || 0) && (4 || 5)}, 4 ],
[ q{(2 || 3) && (0 || 0)}, 0 ],
[ q{(2 || 3) && (0 || 5)}, 5 ],
[ q{(2 || 3) && (4 || 0)}, 4 ],
[ q{(2 || 3) && (4 || 5)}, 4 ],
[ q{(0 && (0 && (0 || 0)))}, 0 ],
[ q{(0 && (0 && (0 || 5)))}, 0 ],
[ q{(0 && (0 && (4 || 0)))}, 0 ],
[ q{(0 && (0 && (4 || 5)))}, 0 ],
[ q{(0 && (3 && (0 || 0)))}, 0 ],
[ q{(0 && (3 && (0 || 5)))}, 0 ],
[ q{(0 && (3 && (4 || 0)))}, 0 ],
[ q{(0 && (3 && (4 || 5)))}, 0 ],
[ q{(2 && (0 && (0 || 0)))}, 0 ],
[ q{(2 && (0 && (0 || 5)))}, 0 ],
[ q{(2 && (0 && (4 || 0)))}, 0 ],
[ q{(2 && (0 && (4 || 5)))}, 0 ],
[ q{(2 && (3 && (0 || 0)))}, 0 ],
[ q{(2 && (3 && (0 || 5)))}, 5 ],
[ q{(2 && (3 && (4 || 0)))}, 4 ],
[ q{(2 && (3 && (4 || 5)))}, 4 ],
[ q{2 && 3 && 4}, 4 ],
[ q{0 && 3 && 4}, 0 ],
[ q{2 && 0 && 4}, 0 ],
[ q{2 && 3 && 0}, 0 ],
[ q{2 || 3 || 4}, 2 ],
[ q{0 || 3 || 4}, 3 ],
[ q{2 || 3 || 0}, 2 ],
[ q{0 || 0 || 4}, 4 ],
'ARITHMETIC',
[ q{1+2}, 3 ],
[ q{5-2}, 3 ],
[ q{3*4}, 12 ],
[ q{18/3}, 6 ],
[ q{10-2-3}, 5 ],
[ q{27/3/2}, 4.5 ],
[ q{2+3*4-12/3}, 10 ],
[ q{(2+3)*4-12/3}, 16 ],
'SIGNED NUMBERS',
[ q{-3}, -3 ],
[ q{+3}, 3 ],
[ q{3+3}, 6 ],
[ q{-2-4}, -6 ],
[ q{+3 + -4}, -1 ],
'NOT',
[ q{ !3}, '' ],
[ q{ !0}, 1 ],
[ q{ !'abc'}, '' ],
[ q{ !''}, 1 ],
[ q{ !3 && 4}, '' ],
[ q{ !0 && 4}, 4 ],
'CR in statement',
[ qq{(2+3)\n*4-\n12/3}, 16 ],
'ASSIGNMENT',
[q{$abc = 123},'123'],
[q{$abc},'123'],
[q{$abc = 123 + 24},'147'],
[q{$abc},'147'],
);
my @errors = (
[ q{,}, 'unprocessed' ],
[ q{$lang,$lang}, 'unprocessed' ],
[ q{.caps()}, 'unprocessed' ],
[ q{$?}, 'Syntax error' ],
[ q{3 && 4 || && 5}, 'Missing clause' ],
[ q{$def}, 'Undefined var' ],
[ q{func}, 'Undefined function' ],
[ q{$lang | xyz}, 'Undefined filter' ],
[ q{$lang.abc}, 'locate object method' ],
[ q{$hash.name.third}, 'unblessed' ],
[ q($array{2}), "not of type 'HASH'" ],
[ q{$hash[first]}, "not of type 'ARRAY'" ],
[ q({'deref'}), 'unprocessed tokens' ],
);
if ( defined $test_no ) {
@tests = ( $tests[ $test_no - 1 ] );
$test_no--;
}
else { $test_no = 0 }
my $vars = {
lang => 'English',
hash =>
{ name => { first => 'John', second => 'Smith', '3' => 't
+hree' } },
array => [ 'first', 'second', 'third', 'fourth' ],
};
my $funcs = {
caps => sub {
return map {uc} @_;
},
new_foo => sub {
return Foo->new(@_);
},
};
my $i = 1;
foreach my $test (@tests) {
unless ( ref $test ) {
print "\n$test\n";
$i++;
next;
}
eval {
my ( $sub, $state ) = parse( $test->[0], $vars, $funcs );
my @expect = map { defined $_ ? $_ : '<undef>' }
ref $test->[1] ? @{ $test->[1] } : $test->[1];
my @return = map { defined $_ ? $_ : '<undef>' } $sub->();
eval {
die if @expect != @return;
for ( my $j = 0; $j < @expect; $j++ ) {
die if $expect[$j] ne $return[$j];
}
};
if ($@) {
die "Expected ["
. join( ',', @expect )
. "], got ["
. join( ',', @return )
. "] instead\n"
. ( Dump($sub)->Out );
}
print( $i+ $test_no, " : SUCCESS : $test->[0]\n" );
};
if ($@) {
die( "Running test "
. ( $i + $test_no )
. " : $test->[0]:\n $@\n" );
}
$i++;
}
$i = 1;
foreach my $test (@errors) {
my ( $sub, $state );
eval {
( $sub, $state ) = parse( $test->[0], $vars, $funcs );
$sub->();
};
my $error = $@ || 'ERROR: No error thrown!';
my $regex = $test->[1];
if ( $error =~ m/$regex/ ) {
print "Error test $i: SUCCESS : $test->[0]\n";
$i++;
}
else { die $test->[0] . " : $error" }
}
}