1: #!/usr/bin/perl -w 2: # 3: # The challenge - to write a function which given a list of non 4: # negative integers returns a regexp which will match those and only 5: # those numbers. 6: # 7: # Try the test program like this :- 8: # 9: #-- Simple numeric list 10: # ./numeric-list-to-regexp.pl 0..255 11: # \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5] 12: #-- Sparse numeric list 13: # ./numeric-list-to-regexp.pl 0..11,15,21..33 14: # \d|1[015]|2[1-9]|3[0-3] 15: #-- Numbers divisble by 3 16: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33' 17: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369] 18: #-- All prime numbers < 100 19: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)' 20: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97 21: 22: use strict; 23: 24: die "Pass some perl in please, eg 1..20 or 1, 4, 5" 25: unless @ARGV; 26: my @list = eval "@ARGV"; 27: my $re = numeric_list_to_regexp( @list ); 28: check_numeric_list_to_regexp($re, \@list); 29: exit; 30: 31: ############################################################ 32: # Converts a list of numbers into a regexp which will 33: # match those numbers and those numbers only. 34: # 35: # It does this by constructing a regexp and then progressively 36: # simplifying it - recursively if necessary. It uses regexp's to 37: # transform the regexp of course! This is almost a general purpose 38: # regexp optimiser. 39: # 40: # We assume that the caller will bound the regexp with ^( and )$ or 41: # \D(?: and )\D or whatever takes their fancy 42: # 43: # Set $DEBUG to 1 if you want to print lots of info and check the 44: # regexp works after each transformation. 45: # 46: # Warning: code contains heavy regexps - lift with care ;-) 47: # Caution: Code may use exponential time and space ;-( 48: ############################################################ 49: 50: sub numeric_list_to_regexp 51: { 52: my (@list) = @_; 53: my $DEBUG = 0; 54: 55: # The basic regexp with |'s on the start and end to make our life 56: # easier 57: # Should uniq here too... 58: my $re = "|" . join("|", sort { $a <=> $b } @list) . "|"; 59: 60: # Transform the regexp in stages, making sure at all time the 61: # regexp is correct if $DEBUG is set 62: 63: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 64: 65: # 1) Concatenate all the single characters a|b|c into [abc]'s 66: $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) } 67: { 68: my ( $string ) = ( $1 ); 69: print "string = '$string'\n" if $DEBUG; 70: "|[" . join("", split m{\|}, $string) . "]" 71: }gex; 72: 73: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 74: 75: # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)] 76: $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) } 77: { 78: my ( $string, $prefix ) = ( $1, $2 ); 79: print "prefix = '$prefix', string = '$string'\n" if $DEBUG; 80: "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")" 81: }gex; 82: 83: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 84: 85: # 3) Find all the aX|bX|cX and change to (a|b|c)X] 86: $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) } 87: { 88: my ( $string, $postfix ) = ( $1, $3 ); 89: print "postfix = '$postfix', string = '$string'\n" if $DEBUG; 90: $string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx; 91: print "...string = '$string'\n" if $DEBUG; 92: "|(?:$string)$postfix" 93: }gex; 94: 95: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 96: 97: # 4) Change (?:a|b|c) into [abc] 98: $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) } 99: { 100: my ( $string ) = ( $1 ); 101: print "string = '$string'\n" if $DEBUG; 102: "[" . join("", split m{\|}, $string) . "]" 103: }gex; 104: 105: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 106: 107: # 5) Optimise [abc] into [a-c] or \d 108: # This doesn't optimise all the cases only the complete continuous 109: # range in the [ ... ] 110: $re =~ s{ \[ ( \d{3,} ) \] } 111: { 112: my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) ); 113: print "match ['$string']...range [$start-$end]\n" if $DEBUG; 114: if ($end - $start + 1 == length $string) 115: { 116: $start == 0 && $end == 9 ? '\d' : "[$start-$end]"; 117: } 118: else 119: { 120: "[$string]"; 121: } 122: }gex; 123: 124: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 125: 126: # 6) recurse on any digit sequences left (?:ab|cd|ef) 127: $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) } 128: { 129: my ( $string ) = ( $1 ); 130: print "**** Recursing on '$string'\n" if $DEBUG; 131: "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")"; 132: }gex; 133: 134: check_numeric_list_to_regexp($re, \@list) if $DEBUG; 135: 136: # 7) fix the | on each end 137: $re =~ s{^\|}{}; 138: $re =~ s{\|$}{}; 139: 140: print "**** Returning '$re'\n" if $DEBUG; 141: 142: return $re; 143: } 144: 145: ############################################################ 146: # Test subroutine to check the regexp performs as advertised 147: # 148: # Call with a regexp and a reference to a list of numbers 149: # it will check that the regexp matches all the list and 150: # doesn't match some others (obviously it can't check them 151: # all can it!) die-ing on any failures. 152: ############################################################ 153: 154: sub check_numeric_list_to_regexp 155: { 156: my ($re, $list) = @_; 157: my %list = map { $_ => 1 } @$list; 158: print "Re: $re\n"; 159: 160: # Put some other test cases in 161: $list{$_} += 0 for (0..999); 162: $list{int(rand()*1000)} += 0 for (0..99); 163: $list{int(rand()*10000)} += 0 for (0..99); 164: $list{int(rand()*100000)} += 0 for (0..99); 165: 166: # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n"; 167: $re =~ s{^\|}{}; # fix | on start and end 168: $re =~ s{\|$}{}; 169: $re = "^(?:$re)\$"; # put in ^(?: ... )$ 170: $re = qr{$re}; # compile the regexp for speed 171: 172: # Check all the keys in list against the regexp - some should pass 173: # and some should fail 174: for my $item (keys %list) 175: { 176: if ($list{$item} xor ($item =~ /$re/)) 177: { 178: die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n"; 179: } 180: else 181: { 182: # print "OK '$re' for '$item'\n"; 183: } 184: } 185: }
|
---|
Replies are listed 'Best First'. | |
---|---|
RE (tilly) 1: Numeric list to optimised regexp
by tilly (Archbishop) on Sep 07, 2000 at 01:38 UTC | |
by ncw (Friar) on Sep 07, 2000 at 12:54 UTC | |
by tilly (Archbishop) on Sep 07, 2000 at 14:29 UTC | |
by tye (Sage) on Sep 07, 2000 at 02:56 UTC | |
by tilly (Archbishop) on Sep 07, 2000 at 14:25 UTC |
Back to
Craft