http://www.perlmonks.org?node_id=31329

   1: #!/usr/bin/perl -w
   2: #
   3: # The challenge - to write a function which given a list of non
   4: # negative integers returns a regexp which will match those and only
   5: # those numbers.
   6: #
   7: # Try the test program like this :-
   8: #
   9: #-- Simple numeric list
  10: # ./numeric-list-to-regexp.pl 0..255
  11: #  \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
  12: #-- Sparse numeric list
  13: # ./numeric-list-to-regexp.pl 0..11,15,21..33
  14: # \d|1[015]|2[1-9]|3[0-3]
  15: #-- Numbers divisble by 3
  16: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'            
  17: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
  18: #-- All prime numbers < 100
  19: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
  20: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
  21: 
  22: use strict;
  23: 
  24: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
  25:     unless @ARGV;
  26: my @list = eval "@ARGV";
  27: my $re = numeric_list_to_regexp( @list );
  28: check_numeric_list_to_regexp($re, \@list);
  29: exit;
  30: 
  31: ############################################################
  32: # Converts a list of numbers into a regexp which will
  33: # match those numbers and those numbers only.
  34: #
  35: # It does this by constructing a regexp and then progressively
  36: # simplifying it - recursively if necessary.  It uses regexp's to
  37: # transform the regexp of course!  This is almost a general purpose
  38: # regexp optimiser.
  39: #
  40: # We assume that the caller will bound the regexp with ^( and )$ or
  41: # \D(?: and )\D or whatever takes their fancy
  42: #
  43: # Set $DEBUG to 1 if you want to print lots of info and check the
  44: # regexp works after each transformation.
  45: #
  46: # Warning: code contains heavy regexps - lift with care ;-)
  47: # Caution: Code may use exponential time and space ;-(
  48: ############################################################
  49: 
  50: sub numeric_list_to_regexp
  51: {
  52:     my (@list) = @_;
  53:     my $DEBUG = 0;
  54: 
  55:     # The basic regexp with |'s on the start and end to make our life
  56:     # easier
  57:     # Should uniq here too...
  58:     my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
  59: 
  60:     # Transform the regexp in stages, making sure at all time the
  61:     # regexp is correct if $DEBUG is set
  62: 
  63:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
  64: 
  65:     # 1) Concatenate all the single characters a|b|c into [abc]'s
  66:     $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
  67:     {
  68:         my ( $string ) = ( $1 );
  69:         print "string = '$string'\n" if $DEBUG;
  70:         "|[" . join("", split m{\|}, $string) . "]"
  71:     }gex;
  72: 
  73:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
  74: 
  75:     # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
  76:     $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
  77:     {
  78:         my ( $string, $prefix ) = ( $1, $2 );
  79:         print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
  80:         "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
  81:     }gex;
  82: 
  83:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
  84: 
  85:     # 3) Find all the aX|bX|cX and change to (a|b|c)X]
  86:     $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
  87:     {
  88:         my ( $string, $postfix ) = ( $1, $3 );
  89:         print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
  90:         $string =~ s{ \Q$postfix\E  (?= \| | $ ) }{}gx;
  91:         print "...string = '$string'\n" if $DEBUG;
  92:         "|(?:$string)$postfix"
  93:     }gex;
  94: 
  95:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
  96: 
  97:     # 4) Change (?:a|b|c) into [abc]
  98:     $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
  99:     {
 100:         my ( $string ) = ( $1 );
 101:         print "string = '$string'\n" if $DEBUG;
 102:         "[" . join("", split m{\|}, $string) . "]"
 103:     }gex;
 104: 
 105:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
 106: 
 107:     # 5) Optimise [abc] into [a-c] or \d
 108:     # This doesn't optimise all the cases only the complete continuous
 109:     # range in the [ ... ]
 110:     $re =~ s{ \[ ( \d{3,} ) \] }
 111:     {
 112:         my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
 113:         print "match ['$string']...range [$start-$end]\n" if $DEBUG;
 114:         if ($end - $start + 1 == length $string)
 115:         {
 116:             $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
 117:         }
 118:         else
 119:         {
 120:             "[$string]";
 121:         }
 122:     }gex;
 123: 
 124:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
 125: 
 126:     # 6) recurse on any digit sequences left (?:ab|cd|ef)
 127:     $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
 128:     {
 129:         my ( $string ) = ( $1 );
 130:         print "**** Recursing on '$string'\n" if $DEBUG;
 131:         "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
 132:     }gex;
 133: 
 134:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
 135: 
 136:     # 7) fix the | on each end
 137:     $re =~ s{^\|}{};
 138:     $re =~ s{\|$}{};
 139: 
 140:     print "**** Returning '$re'\n" if $DEBUG;
 141: 
 142:     return $re;
 143: }
 144: 
 145: ############################################################
 146: # Test subroutine to check the regexp performs as advertised
 147: #
 148: # Call with a regexp and a reference to a list of numbers
 149: # it will check that the regexp matches all the list and
 150: # doesn't match some others (obviously it can't check them
 151: # all can it!) die-ing on any failures.
 152: ############################################################
 153: 
 154: sub check_numeric_list_to_regexp
 155: {
 156:     my ($re, $list) = @_;
 157:     my %list = map { $_ => 1 } @$list;
 158:     print "Re: $re\n";
 159: 
 160:     # Put some other test cases in
 161:     $list{$_} += 0 for (0..999);
 162:     $list{int(rand()*1000)} += 0 for (0..99);
 163:     $list{int(rand()*10000)} += 0 for (0..99);
 164:     $list{int(rand()*100000)} += 0 for (0..99);
 165: 
 166:     # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
 167:     $re =~ s{^\|}{};		# fix | on start and end
 168:     $re =~ s{\|$}{};
 169:     $re = "^(?:$re)\$";		# put in ^(?: ... )$
 170:     $re = qr{$re};		# compile the regexp for speed
 171: 
 172:     # Check all the keys in list against the regexp - some should pass
 173:     # and some should fail
 174:     for my $item (keys %list)
 175:     {
 176:         if ($list{$item} xor ($item =~ /$re/))
 177:         {
 178:             die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
 179:         }
 180:         else
 181:         {
 182:             # print "OK '$re' for '$item'\n";
 183:         }
 184:     }
 185: }

Replies are listed 'Best First'.
RE (tilly) 1: Numeric list to optimised regexp
by tilly (Archbishop) on Sep 07, 2000 at 01:38 UTC
    Out of curiosity, how does its output compare to my more general-purpose "create an optimized RE for this list" that I gave in RE (tilly) 4: SAS log scanner?
      I didn't realise anyone had had a go at this sort of thing already, though it is inevitable really!

      I'll anwser tilly's question with an example:-

      My code gives (for the list 1..255)

      [1-9]|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
      Whereas your code gives
      ((?:1(?:|0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0| +1|2|3|4|5|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9 +)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4| +5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9))|2(?:| +0(?:|0|1|2|3|4|5|6|7|8|9)|1(?:|0|1|2|3|4|5|6|7|8|9)|2(?:|0|1|2|3|4|5| +6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9)|5(?:|0|1 +|2|3|4|5)|6|7|8|9)|3(?:|0|1|2|3|4|5|6|7|8|9)|4(?:|0|1|2|3|4|5|6|7|8|9 +)|5(?:|0|1|2|3|4|5|6|7|8|9)|6(?:|0|1|2|3|4|5|6|7|8|9)|7(?:|0|1|2|3|4| +5|6|7|8|9)|8(?:|0|1|2|3|4|5|6|7|8|9)|9(?:|0|1|2|3|4|5|6|7|8|9)))
      My aim was to get rid of as many alternations as possible (which are slow) and turn them into character classes (which are fast). I wanted also to factor the regexp as much as possible.

      If you change my code replacing all \d's with \w or whatever it should work fine for any list of words, but I designed and tested it with numeric lists in mind.

      My first attempt at this problem used a trie like data structure but I abandonded it once I had the idea of using backtracking regexps - the irony of using regexps to optimise regexps was irresistable!

        OK, this is nice. OTOH I really want to see the win moved down to the RE engine, and at least one optimization that was discussed with Ilya would move all of the wins from both of our approaches down.

        So someday you should see all matches speed up because of this kind of logic, without having to do any work for it... :-)

      You might check out Text::Trie (by Ilya). So it might be useful to have "right" Tries to go with these "left" Tries.

              - tye (but my friends call me "Tye")
        I didn't know that he released that as a module. I would hope that someday this win is native in the RE...