comment on

0: #!/usr/bin/perl -w
1: #
2: # The challenge - to write a function which given a list of non
3: # negative integers returns a regexp which will match those and only
4: # those numbers.
5: #
6: # Try the test program like this :-
7: #
8: #-- Simple numeric list
9: # ./numeric-list-to-regexp.pl 0..255
10: #  \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
11: #-- Sparse numeric list
12: # ./numeric-list-to-regexp.pl 0..11,15,21..33
13: # \d|1[015]|2[1-9]|3[0-3]
14: #-- Numbers divisble by 3
15: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'            
16: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
17: #-- All prime numbers < 100
18: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
19: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
20: 
21: use strict;
22: 
23: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
24:     unless @ARGV;
25: my @list = eval "@ARGV";
26: my $re = numeric_list_to_regexp( @list );
27: check_numeric_list_to_regexp($re, \@list);
28: exit;
29: 
30: ############################################################
31: # Converts a list of numbers into a regexp which will
32: # match those numbers and those numbers only.
33: #
34: # It does this by constructing a regexp and then progressively
35: # simplifying it - recursively if necessary.  It uses regexp's to
36: # transform the regexp of course!  This is almost a general purpose
37: # regexp optimiser.
38: #
39: # We assume that the caller will bound the regexp with ^( and )$ or
40: # \D(?: and )\D or whatever takes their fancy
41: #
42: # Set $DEBUG to 1 if you want to print lots of info and check the
43: # regexp works after each transformation.
44: #
45: # Warning: code contains heavy regexps - lift with care ;-)
46: # Caution: Code may use exponential time and space ;-(
47: ############################################################
48: 
49: sub numeric_list_to_regexp
50: {
51:     my (@list) = @_;
52:     my $DEBUG = 0;
53: 
54:     # The basic regexp with |'s on the start and end to make our life
55:     # easier
56:     # Should uniq here too...
57:     my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
58: 
59:     # Transform the regexp in stages, making sure at all time the
60:     # regexp is correct if $DEBUG is set
61: 
62:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
63: 
64:     # 1) Concatenate all the single characters a|b|c into [abc]'s
65:     $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
66:     {
67:         my ( $string ) = ( $1 );
68:         print "string = '$string'\n" if $DEBUG;
69:         "|[" . join("", split m{\|}, $string) . "]"
70:     }gex;
71: 
72:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
73: 
74:     # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
75:     $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
76:     {
77:         my ( $string, $prefix ) = ( $1, $2 );
78:         print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
79:         "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
80:     }gex;
81: 
82:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
83: 
84:     # 3) Find all the aX|bX|cX and change to (a|b|c)X]
85:     $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
86:     {
87:         my ( $string, $postfix ) = ( $1, $3 );
88:         print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
89:         $string =~ s{ \Q$postfix\E  (?= \| | $ ) }{}gx;
90:         print "...string = '$string'\n" if $DEBUG;
91:         "|(?:$string)$postfix"
92:     }gex;
93: 
94:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
95: 
96:     # 4) Change (?:a|b|c) into [abc]
97:     $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
98:     {
99:         my ( $string ) = ( $1 );
100:         print "string = '$string'\n" if $DEBUG;
101:         "[" . join("", split m{\|}, $string) . "]"
102:     }gex;
103: 
104:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
105: 
106:     # 5) Optimise [abc] into [a-c] or \d
107:     # This doesn't optimise all the cases only the complete continuous
108:     # range in the [ ... ]
109:     $re =~ s{ \[ ( \d{3,} ) \] }
110:     {
111:         my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
112:         print "match ['$string']...range [$start-$end]\n" if $DEBUG;
113:         if ($end - $start + 1 == length $string)
114:         {
115:             $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
116:         }
117:         else
118:         {
119:             "[$string]";
120:         }
121:     }gex;
122: 
123:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
124: 
125:     # 6) recurse on any digit sequences left (?:ab|cd|ef)
126:     $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
127:     {
128:         my ( $string ) = ( $1 );
129:         print "**** Recursing on '$string'\n" if $DEBUG;
130:         "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
131:     }gex;
132: 
133:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
134: 
135:     # 7) fix the | on each end
136:     $re =~ s{^\|}{};
137:     $re =~ s{\|$}{};
138: 
139:     print "**** Returning '$re'\n" if $DEBUG;
140: 
141:     return $re;
142: }
143: 
144: ############################################################
145: # Test subroutine to check the regexp performs as advertised
146: #
147: # Call with a regexp and a reference to a list of numbers
148: # it will check that the regexp matches all the list and
149: # doesn't match some others (obviously it can't check them
150: # all can it!) die-ing on any failures.
151: ############################################################
152: 
153: sub check_numeric_list_to_regexp
154: {
155:     my ($re, $list) = @_;
156:     my %list = map { $_ => 1 } @$list;
157:     print "Re: $re\n";
158: 
159:     # Put some other test cases in
160:     $list{$_} += 0 for (0..999);
161:     $list{int(rand()*1000)} += 0 for (0..99);
162:     $list{int(rand()*10000)} += 0 for (0..99);
163:     $list{int(rand()*100000)} += 0 for (0..99);
164: 
165:     # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
166:     $re =~ s{^\|}{};		# fix | on start and end
167:     $re =~ s{\|$}{};
168:     $re = "^(?:$re)\$";		# put in ^(?: ... )$
169:     $re = qr{$re};		# compile the regexp for speed
170: 
171:     # Check all the keys in list against the regexp - some should pass
172:     # and some should fail
173:     for my $item (keys %list)
174:     {
175:         if ($list{$item} xor ($item =~ /$re/))
176:         {
177:             die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
178:         }
179:         else
180:         {
181:             # print "OK '$re' for '$item'\n";
182:         }
183:     }
184: }

In reply to Numeric list to optimised regexp by ncw

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


laziness, impatience, and hubris
	PerlMonks