Welcome to the Monastery PerlMonks

Numeric list to optimised regexp

by ncw (Friar)
 on Sep 07, 2000 at 01:24 UTC Need Help??

1: #!/usr/bin/perl -w
2: #
3: # The challenge - to write a function which given a list of non
4: # negative integers returns a regexp which will match those and only
5: # those numbers.
6: #
7: # Try the test program like this :-
8: #
9: #-- Simple numeric list
10: # ./numeric-list-to-regexp.pl 0..255
11: #  \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
12: #-- Sparse numeric list
13: # ./numeric-list-to-regexp.pl 0..11,15,21..33
14: # \d|1[015]|2[1-9]|3[0-3]
15: #-- Numbers divisble by 3
16: # ./numeric-list-to-regexp.pl 'map {\$_ * 3} 0..33'
17: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
18: #-- All prime numbers < 100
19: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { \$_ % all(2..sqrt(\$_)+1) != 0 } (1..100)'
20: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
21:
22: use strict;
23:
24: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
25:     unless @ARGV;
26: my @list = eval "@ARGV";
27: my \$re = numeric_list_to_regexp( @list );
28: check_numeric_list_to_regexp(\$re, \@list);
29: exit;
30:
31: ############################################################
32: # Converts a list of numbers into a regexp which will
33: # match those numbers and those numbers only.
34: #
35: # It does this by constructing a regexp and then progressively
36: # simplifying it - recursively if necessary.  It uses regexp's to
37: # transform the regexp of course!  This is almost a general purpose
38: # regexp optimiser.
39: #
40: # We assume that the caller will bound the regexp with ^( and )\$ or
41: # \D(?: and )\D or whatever takes their fancy
42: #
43: # Set \$DEBUG to 1 if you want to print lots of info and check the
44: # regexp works after each transformation.
45: #
46: # Warning: code contains heavy regexps - lift with care ;-)
47: # Caution: Code may use exponential time and space ;-(
48: ############################################################
49:
50: sub numeric_list_to_regexp
51: {
52:     my (@list) = @_;
53:     my \$DEBUG = 0;
54:
55:     # The basic regexp with |'s on the start and end to make our life
56:     # easier
57:     # Should uniq here too...
58:     my \$re = "|" . join("|", sort { \$a <=> \$b } @list) . "|";
59:
60:     # Transform the regexp in stages, making sure at all time the
61:     # regexp is correct if \$DEBUG is set
62:
63:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
64:
65:     # 1) Concatenate all the single characters a|b|c into [abc]'s
66:     \$re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
67:     {
68:         my ( \$string ) = ( \$1 );
69:         print "string = '\$string'\n" if \$DEBUG;
70:         "|[" . join("", split m{\|}, \$string) . "]"
71:     }gex;
72:
73:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
74:
75:     # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
76:     \$re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
77:     {
78:         my ( \$string, \$prefix ) = ( \$1, \$2 );
79:         print "prefix = '\$prefix', string = '\$string'\n" if \$DEBUG;
80:         "|\$prefix\(?:" . join("|", map { substr(\$_, length \$prefix) } split m{\|}, \$string) . ")"
81:     }gex;
82:
83:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
84:
85:     # 3) Find all the aX|bX|cX and change to (a|b|c)X]
86:     \$re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
87:     {
88:         my ( \$string, \$postfix ) = ( \$1, \$3 );
89:         print "postfix = '\$postfix', string = '\$string'\n" if \$DEBUG;
90:         \$string =~ s{ \Q\$postfix\E  (?= \| | \$ ) }{}gx;
91:         print "...string = '\$string'\n" if \$DEBUG;
92:         "|(?:\$string)\$postfix"
93:     }gex;
94:
95:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
96:
97:     # 4) Change (?:a|b|c) into [abc]
98:     \$re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
99:     {
100:         my ( \$string ) = ( \$1 );
101:         print "string = '\$string'\n" if \$DEBUG;
102:         "[" . join("", split m{\|}, \$string) . "]"
103:     }gex;
104:
105:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
106:
107:     # 5) Optimise [abc] into [a-c] or \d
108:     # This doesn't optimise all the cases only the complete continuous
109:     # range in the [ ... ]
110:     \$re =~ s{ \[ ( \d{3,} ) \] }
111:     {
112:         my ( \$string, \$start, \$end ) = ( \$1, substr(\$1, 0, 1), substr(\$1, -1, 1) );
113:         print "match ['\$string']...range [\$start-\$end]\n" if \$DEBUG;
114:         if (\$end - \$start + 1 == length \$string)
115:         {
116:             \$start == 0 && \$end == 9 ? '\d' : "[\$start-\$end]";
117:         }
118:         else
119:         {
120:             "[\$string]";
121:         }
122:     }gex;
123:
124:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
125:
126:     # 6) recurse on any digit sequences left (?:ab|cd|ef)
127:     \$re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
128:     {
129:         my ( \$string ) = ( \$1 );
130:         print "**** Recursing on '\$string'\n" if \$DEBUG;
131:         "(?:" . numeric_list_to_regexp(split m{\|}, \$string) . ")";
132:     }gex;
133:
134:     check_numeric_list_to_regexp(\$re, \@list) if \$DEBUG;
135:
136:     # 7) fix the | on each end
137:     \$re =~ s{^\|}{};
138:     \$re =~ s{\|\$}{};
139:
140:     print "**** Returning '\$re'\n" if \$DEBUG;
141:
142:     return \$re;
143: }
144:
145: ############################################################
146: # Test subroutine to check the regexp performs as advertised
147: #
148: # Call with a regexp and a reference to a list of numbers
149: # it will check that the regexp matches all the list and
150: # doesn't match some others (obviously it can't check them
151: # all can it!) die-ing on any failures.
152: ############################################################
153:
154: sub check_numeric_list_to_regexp
155: {
156:     my (\$re, \$list) = @_;
157:     my %list = map { \$_ => 1 } @\$list;
158:     print "Re: \$re\n";
159:
160:     # Put some other test cases in
161:     \$list{\$_} += 0 for (0..999);
162:     \$list{int(rand()*1000)} += 0 for (0..99);
163:     \$list{int(rand()*10000)} += 0 for (0..99);
164:     \$list{int(rand()*100000)} += 0 for (0..99);
165:
166:     # print join(", ", map {"\$_ => \$list{\$_}"} keys %list), "\n";
167:     \$re =~ s{^\|}{};		# fix | on start and end
168:     \$re =~ s{\|\$}{};
169:     \$re = "^(?:\$re)\\$";		# put in ^(?: ... )\$
170:     \$re = qr{\$re};		# compile the regexp for speed
171:
172:     # Check all the keys in list against the regexp - some should pass
173:     # and some should fail
174:     for my \$item (keys %list)
175:     {
176:         if (\$list{\$item} xor (\$item =~ /\$re/))
177:         {
178:             die "*** FAILED '\$re' for '\$item' ShouldMatch: \$list{\$item}\n";
179:         }
180:         else
181:         {
182:             # print "OK '\$re' for '\$item'\n";
183:         }
184:     }
185: }

Replies are listed 'Best First'.
RE (tilly) 1: Numeric list to optimised regexp
by tilly (Archbishop) on Sep 07, 2000 at 01:38 UTC
Out of curiosity, how does its output compare to my more general-purpose "create an optimized RE for this list" that I gave in RE (tilly) 4: SAS log scanner?
I didn't realise anyone had had a go at this sort of thing already, though it is inevitable really!

I'll anwser tilly's question with an example:-

My code gives (for the list 1..255)

[1-9]|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
((?:1(?:|0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0| +1|2|3|4|5|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9 +)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4| +5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9))|2(?:| +0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0|1|2|3|4|5| +6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9)|5(?:|0|1 +|2|3|4|5)|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9 +)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4| +5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9)))
My aim was to get rid of as many alternations as possible (which are slow) and turn them into character classes (which are fast). I wanted also to factor the regexp as much as possible.

If you change my code replacing all \d's with \w or whatever it should work fine for any list of words, but I designed and tested it with numeric lists in mind.

My first attempt at this problem used a trie like data structure but I abandonded it once I had the idea of using backtracking regexps - the irony of using regexps to optimise regexps was irresistable!

OK, this is nice. OTOH I really want to see the win moved down to the RE engine, and at least one optimization that was discussed with Ilya would move all of the wins from both of our approaches down.

So someday you should see all matches speed up because of this kind of logic, without having to do any work for it... :-)

You might check out Text::Trie (by Ilya). So it might be useful to have "right" Tries to go with these "left" Tries.

- tye (but my friends call me "Tye")
I didn't know that he released that as a module. I would hope that someday this win is native in the RE...

Create A New User
Node Status?
node history
Node Type: perlcraft [id://31329]
Approved by root
help
Chatterbox?
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others perusing the Monastery: (4)
As of 2018-03-23 19:00 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
When I think of a mole I think of:

Results (296 votes). Check out past polls.

Notices?