0: #!/usr/bin/perl -w
1: #
2: # The challenge - to write a function which given a list of non
3: # negative integers returns a regexp which will match those and only
4: # those numbers.
5: #
6: # Try the test program like this :-
7: #
8: #-- Simple numeric list
9: # ./numeric-list-to-regexp.pl 0..255
10: # \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
11: #-- Sparse numeric list
12: # ./numeric-list-to-regexp.pl 0..11,15,21..33
13: # \d|1[015]|2[1-9]|3[0-3]
14: #-- Numbers divisble by 3
15: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'
16: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
17: #-- All prime numbers < 100
18: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
19: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
20:
21: use strict;
22:
23: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
24: unless @ARGV;
25: my @list = eval "@ARGV";
26: my $re = numeric_list_to_regexp( @list );
27: check_numeric_list_to_regexp($re, \@list);
28: exit;
29:
30: ############################################################
31: # Converts a list of numbers into a regexp which will
32: # match those numbers and those numbers only.
33: #
34: # It does this by constructing a regexp and then progressively
35: # simplifying it - recursively if necessary. It uses regexp's to
36: # transform the regexp of course! This is almost a general purpose
37: # regexp optimiser.
38: #
39: # We assume that the caller will bound the regexp with ^( and )$ or
40: # \D(?: and )\D or whatever takes their fancy
41: #
42: # Set $DEBUG to 1 if you want to print lots of info and check the
43: # regexp works after each transformation.
44: #
45: # Warning: code contains heavy regexps - lift with care ;-)
46: # Caution: Code may use exponential time and space ;-(
47: ############################################################
48:
49: sub numeric_list_to_regexp
50: {
51: my (@list) = @_;
52: my $DEBUG = 0;
53:
54: # The basic regexp with |'s on the start and end to make our life
55: # easier
56: # Should uniq here too...
57: my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
58:
59: # Transform the regexp in stages, making sure at all time the
60: # regexp is correct if $DEBUG is set
61:
62: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
63:
64: # 1) Concatenate all the single characters a|b|c into [abc]'s
65: $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
66: {
67: my ( $string ) = ( $1 );
68: print "string = '$string'\n" if $DEBUG;
69: "|[" . join("", split m{\|}, $string) . "]"
70: }gex;
71:
72: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
73:
74: # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
75: $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
76: {
77: my ( $string, $prefix ) = ( $1, $2 );
78: print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
79: "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
80: }gex;
81:
82: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
83:
84: # 3) Find all the aX|bX|cX and change to (a|b|c)X]
85: $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
86: {
87: my ( $string, $postfix ) = ( $1, $3 );
88: print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
89: $string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx;
90: print "...string = '$string'\n" if $DEBUG;
91: "|(?:$string)$postfix"
92: }gex;
93:
94: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
95:
96: # 4) Change (?:a|b|c) into [abc]
97: $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
98: {
99: my ( $string ) = ( $1 );
100: print "string = '$string'\n" if $DEBUG;
101: "[" . join("", split m{\|}, $string) . "]"
102: }gex;
103:
104: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
105:
106: # 5) Optimise [abc] into [a-c] or \d
107: # This doesn't optimise all the cases only the complete continuous
108: # range in the [ ... ]
109: $re =~ s{ \[ ( \d{3,} ) \] }
110: {
111: my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
112: print "match ['$string']...range [$start-$end]\n" if $DEBUG;
113: if ($end - $start + 1 == length $string)
114: {
115: $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
116: }
117: else
118: {
119: "[$string]";
120: }
121: }gex;
122:
123: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
124:
125: # 6) recurse on any digit sequences left (?:ab|cd|ef)
126: $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
127: {
128: my ( $string ) = ( $1 );
129: print "**** Recursing on '$string'\n" if $DEBUG;
130: "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
131: }gex;
132:
133: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
134:
135: # 7) fix the | on each end
136: $re =~ s{^\|}{};
137: $re =~ s{\|$}{};
138:
139: print "**** Returning '$re'\n" if $DEBUG;
140:
141: return $re;
142: }
143:
144: ############################################################
145: # Test subroutine to check the regexp performs as advertised
146: #
147: # Call with a regexp and a reference to a list of numbers
148: # it will check that the regexp matches all the list and
149: # doesn't match some others (obviously it can't check them
150: # all can it!) die-ing on any failures.
151: ############################################################
152:
153: sub check_numeric_list_to_regexp
154: {
155: my ($re, $list) = @_;
156: my %list = map { $_ => 1 } @$list;
157: print "Re: $re\n";
158:
159: # Put some other test cases in
160: $list{$_} += 0 for (0..999);
161: $list{int(rand()*1000)} += 0 for (0..99);
162: $list{int(rand()*10000)} += 0 for (0..99);
163: $list{int(rand()*100000)} += 0 for (0..99);
164:
165: # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
166: $re =~ s{^\|}{}; # fix | on start and end
167: $re =~ s{\|$}{};
168: $re = "^(?:$re)\$"; # put in ^(?: ... )$
169: $re = qr{$re}; # compile the regexp for speed
170:
171: # Check all the keys in list against the regexp - some should pass
172: # and some should fail
173: for my $item (keys %list)
174: {
175: if ($list{$item} xor ($item =~ /$re/))
176: {
177: die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
178: }
179: else
180: {
181: # print "OK '$re' for '$item'\n";
182: }
183: }
184: }
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.