I'm trying to find out how many captures a compiled regular expression will do. My first try was good enough
for me, but hv pointed out several cases where it could break. While refining the code I came upon several other
cases and wrote a big test suite.
Here's the routine code:
sub captures {
local $_ = shift;
croak "$_ is not a compiled regexp" unless ref eq 'Regexp';
my @p; # remember the kind of opening parentheses we've seen
my ($n, $x) = (0, 1);
/\G\(\?[ism]*(x?)[^:]*:/gc # global /xism block
&& ( $x = $1 ? 1 : 0 );
while( /\G(?=.)/gcs ) {
$x > 0 ? /\G[^[\\()#]+/gc # ignore standard stuff (/x)
: /\G[^[\\()]+/gc; # ignore standard stuff
/\G(?:\\.)+/gcs; # ignore backslashed stuff
$x > 0 && /\G#.*/gc; # ignore comments under /x
/\G\[\^?\]?[^]]*\]/gc; # character class
/\G\(\?[ism]*(x?)[ism]*(?:-[ism]*(x?)[ism]*)?([:)])/gc && do {
$x++, $3 eq ':' && push @p, '+x' if $1; # (?xism:...) and
$x--, $3 eq ':' && push @p, '-x' if $2; # (?xism) blocks
};
/\G\(\?\(\d+\)/gc && push @p, 'sp'; # conditional regexp
/\G\(\?/gc && push (@p, 'sp') && next; # other special regexp
/\G\(/gc && ($n++, push @p, 'cp'); # a capturing parenthese
/\G\)/gc && do { # a closing parenthese
$x-- if $p[-1] eq '+x'; # compute /x state
$x++ if $p[-1] eq '-x';
pop @p;
};
}
$n;
}
And here's the test suite (test 12 is a simple case
where (?{ ... }) can break my code):
use Test::More;
# test the captures() method
my @regexps = (
[ qr/foo/, 0 ],
[ qr/foo(.*)bar/, 1 ],
[ qr/\(foo(bar(baz)*)/, 2 ],
[ qr/((?=.)ldkj\(.*\)(?i:bar(.*))b)/, 2 ],
[ qr/foo # (bar)/ims, 1 ], # 5
# thanks Hugo for suggesting these difficult cases
[ qr{(
x
)}x, 1 ],
[ qr{
(?x: # (comment)
)
(?-x: # (capture) )
}, 1 ],
[ qr{[()<>]}, 0 ],
[ qr{([])<(>]+)}, 1 ],
[ qr{[a # (comment)
b]}, 0 ], # 10
# other difficult cases I've found on my own,
# while browsing perlre and perlretut
[ qr/(?x) # (comment)
(?-x) # (capture) (?x) # (comment)/, 1 ],
[ qr/foo(?{ print ( "foo" ) })bar/, 0 ],
# the following are taken from perlre
[ qr<
(?{ $cnt = 0 }) # Initialize $cnt.
(
a
(?{
local $cnt = $cnt + 1; # Update $cnt, backtracking-sa
+fe.
})
)*
aaaa
(?{ $res = $cnt }) # On success copy to non-localized
# location.
>x, 1 ],
[ qr{
\(
(?:
(?> [^()]+ ) # Non-parens without backtracking
|
(??{ $re }) # Group with matching parens
)*
\)
}x, 0 ],
[ qr{ \(
(
[^()]+ # x+
|
\( [^()]* \)
)+
\)
}x, 1 ], # 15
[ qr{ \(
(
(?> [^()]+ ) # change x+ above to (?> x+ )
|
\( [^()]* \)
)+
\)
}x, 1 ],
[ qr{(?>#[ \t]*)}, 0 ],
[ qr{#[ \t]*(?![ \t])}, 0 ],
[ qr/ (?> \# [ \t]* ) ( .+ ) /x, 1 ],
[ qr/ \# [ \t]* ( [^ \t] .* ) /x, 1 ], # 20
[ qr{ ( \( )?
[^()]+
(?(1) \) )
}x, 1 ],
# taken from perlretut
[ qr/^(.+)(e|r)(.*)$/, 3 ],
[ qr/^
[+-]? # first, match an optional sign
( # then match integers or f.p. mantissas:
\d+\.\d+ # mantissa of the form a.b
|\d+\. # mantissa of the form a.
|\.\d+ # mantissa of the form .b
|\d+ # integer of the form a
)
([eE][+-]?\d+)? # finally, optionally match an exponent
$/x, 2 ],
[ qr/^
[+-]?\ * # first, match an optional sign *and space*
( # then match integers or f.p. mantissas:
\d+\.\d+ # mantissa of the form a.b
|\d+\. # mantissa of the form a.
|\.\d+ # mantissa of the form .b
|\d+ # integer of the form a
)
([eE][+-]?\d+)? # finally, optionally match an exponent
$/x, 2 ],
[ qr/^
[+-]?\ * # first, match an optional sign
( # then match integers or f.p. mantissas:
\d+ # start out with a ...
(
\.\d* # mantissa of the form a.b or a.
)? # ? takes care of integers of the form a
|\.\d+ # mantissa of the form .b
)
([eE][+-]?\d+)? # finally, optionally match an exponent
$/x, 3 ], # 25
[ qr/^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/, 3 ],
[ qr/(?# Match an integer:)[+-]?\d+/, 0 ],
[ qr/(?# Match an integer:)[+-]?\d+/, 0 ],
[ qr/(?x)( # freeform version of an integer regexp
[+-]? # match an optional sign
\d+ # match a sequence of digits
)
/x, 1 ],
[ qr/([+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)/, 4 ], # 30
[ qr/([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)/, 1 ],
[ qr/([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE]([+-]?\d+))?)/, 2 ],
[ qr/\( ( [^()]+ | \([^()]*\) )+ \)/x, 1 ],
[ qr/\( ( (?>[^()]+) | \([^()]*\) )+ \)/x, 1 ],
[ qr/^(\w+)(\w+)?(?(2)\2\1|\1)$/, 2 ], # 35
[ qr/[ATGC]+(?(?<=AA)G|C)$/, 0 ],
[ qr/(?{local $c = 0;}) # initialize count
( a # match 'a'
(?{local $c = $c + 1;}) # increment count
)* # do this any number of times,
aa # but match 'aa' at the end
(?{$count = $c;}) # copy local $c var into $count
/x, 1 ],
[ qr/(?(?{
$lang eq 'EN'; # is the language English?
})
the | # if so, then match 'the'
(die|das|der) # else, match 'die|das|der'
)
/xi, 1 ],
[ qr/^1 # match an initial '1'
(
(??{'0' x $s0}) # match $s0 of '0'
1 # and then a '1'
(?{
$largest = $s0; # largest seq so far
$s2 = $s1 + $s0; # compute next term
$s0 = $s1; # in Fibonacci sequence
$s1 = $s2;
})
)+ # repeat as needed
$ # that is all there is
/x, 1 ],
);
plan tests => scalar @regexps;
for ( @regexps ) {
is( captures( $_->[0] ), $_->[1], "$_->[1] captures" );
}
Any help appreciated...