1: #!/usr/bin/perl -w
2: #
3: # The challenge - to write a function which given a list of non
4: # negative integers returns a regexp which will match those and only
5: # those numbers.
6: #
7: # Try the test program like this :-
8: #
9: #-- Simple numeric list
10: # ./numeric-list-to-regexp.pl 0..255
11: # \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
12: #-- Sparse numeric list
13: # ./numeric-list-to-regexp.pl 0..11,15,21..33
14: # \d|1[015]|2[1-9]|3[0-3]
15: #-- Numbers divisble by 3
16: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'
17: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
18: #-- All prime numbers < 100
19: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
20: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
21:
22: use strict;
23:
24: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
25: unless @ARGV;
26: my @list = eval "@ARGV";
27: my $re = numeric_list_to_regexp( @list );
28: check_numeric_list_to_regexp($re, \@list);
29: exit;
30:
31: ############################################################
32: # Converts a list of numbers into a regexp which will
33: # match those numbers and those numbers only.
34: #
35: # It does this by constructing a regexp and then progressively
36: # simplifying it - recursively if necessary. It uses regexp's to
37: # transform the regexp of course! This is almost a general purpose
38: # regexp optimiser.
39: #
40: # We assume that the caller will bound the regexp with ^( and )$ or
41: # \D(?: and )\D or whatever takes their fancy
42: #
43: # Set $DEBUG to 1 if you want to print lots of info and check the
44: # regexp works after each transformation.
45: #
46: # Warning: code contains heavy regexps - lift with care ;-)
47: # Caution: Code may use exponential time and space ;-(
48: ############################################################
49:
50: sub numeric_list_to_regexp
51: {
52: my (@list) = @_;
53: my $DEBUG = 0;
54:
55: # The basic regexp with |'s on the start and end to make our life
56: # easier
57: # Should uniq here too...
58: my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
59:
60: # Transform the regexp in stages, making sure at all time the
61: # regexp is correct if $DEBUG is set
62:
63: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
64:
65: # 1) Concatenate all the single characters a|b|c into [abc]'s
66: $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
67: {
68: my ( $string ) = ( $1 );
69: print "string = '$string'\n" if $DEBUG;
70: "|[" . join("", split m{\|}, $string) . "]"
71: }gex;
72:
73: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
74:
75: # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
76: $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
77: {
78: my ( $string, $prefix ) = ( $1, $2 );
79: print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
80: "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
81: }gex;
82:
83: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
84:
85: # 3) Find all the aX|bX|cX and change to (a|b|c)X]
86: $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
87: {
88: my ( $string, $postfix ) = ( $1, $3 );
89: print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
90: $string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx;
91: print "...string = '$string'\n" if $DEBUG;
92: "|(?:$string)$postfix"
93: }gex;
94:
95: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
96:
97: # 4) Change (?:a|b|c) into [abc]
98: $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
99: {
100: my ( $string ) = ( $1 );
101: print "string = '$string'\n" if $DEBUG;
102: "[" . join("", split m{\|}, $string) . "]"
103: }gex;
104:
105: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
106:
107: # 5) Optimise [abc] into [a-c] or \d
108: # This doesn't optimise all the cases only the complete continuous
109: # range in the [ ... ]
110: $re =~ s{ \[ ( \d{3,} ) \] }
111: {
112: my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
113: print "match ['$string']...range [$start-$end]\n" if $DEBUG;
114: if ($end - $start + 1 == length $string)
115: {
116: $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
117: }
118: else
119: {
120: "[$string]";
121: }
122: }gex;
123:
124: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
125:
126: # 6) recurse on any digit sequences left (?:ab|cd|ef)
127: $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
128: {
129: my ( $string ) = ( $1 );
130: print "**** Recursing on '$string'\n" if $DEBUG;
131: "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
132: }gex;
133:
134: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
135:
136: # 7) fix the | on each end
137: $re =~ s{^\|}{};
138: $re =~ s{\|$}{};
139:
140: print "**** Returning '$re'\n" if $DEBUG;
141:
142: return $re;
143: }
144:
145: ############################################################
146: # Test subroutine to check the regexp performs as advertised
147: #
148: # Call with a regexp and a reference to a list of numbers
149: # it will check that the regexp matches all the list and
150: # doesn't match some others (obviously it can't check them
151: # all can it!) die-ing on any failures.
152: ############################################################
153:
154: sub check_numeric_list_to_regexp
155: {
156: my ($re, $list) = @_;
157: my %list = map { $_ => 1 } @$list;
158: print "Re: $re\n";
159:
160: # Put some other test cases in
161: $list{$_} += 0 for (0..999);
162: $list{int(rand()*1000)} += 0 for (0..99);
163: $list{int(rand()*10000)} += 0 for (0..99);
164: $list{int(rand()*100000)} += 0 for (0..99);
165:
166: # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
167: $re =~ s{^\|}{}; # fix | on start and end
168: $re =~ s{\|$}{};
169: $re = "^(?:$re)\$"; # put in ^(?: ... )$
170: $re = qr{$re}; # compile the regexp for speed
171:
172: # Check all the keys in list against the regexp - some should pass
173: # and some should fail
174: for my $item (keys %list)
175: {
176: if ($list{$item} xor ($item =~ /$re/))
177: {
178: die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
179: }
180: else
181: {
182: # print "OK '$re' for '$item'\n";
183: }
184: }
185: }
RE (tilly) 1: Numeric list to optimised regexp by tilly (Archbishop) on Sep 07, 2000 at 01:38 UTC |
Out of curiosity, how does its output compare to my more
general-purpose "create an optimized RE for this list"
that I gave in RE (tilly) 4: SAS log scanner? | [reply] |
|
| [reply] |
|
I didn't know that he released that as a module. I would
hope that someday this win is native in the RE...
| [reply] |
|
[1-9]|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
Whereas your code gives
((?:1(?:|0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0|
+1|2|3|4|5|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9
+)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4|
+5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9))|2(?:|
+0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0|1|2|3|4|5|
+6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9)|5(?:|0|1
+|2|3|4|5)|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9
+)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4|
+5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9)))
My aim was to get rid of as many alternations as possible
(which are slow) and turn them into character classes (which
are fast). I wanted also to factor the regexp as much as possible.
If you change my code replacing all \d's with \w or whatever
it should work fine for any list of words, but I designed and tested
it with numeric lists in mind.
My first attempt at this problem used a trie like data structure
but I abandonded it once I had the idea of using backtracking
regexps - the irony of using regexps to optimise regexps was
irresistable! | [reply] [d/l] [select] |
|
| [reply] |
|
|