Beefy Boxes and Bandwidth Generously Provided by pair Networks
laziness, impatience, and hubris
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
    0: #!/usr/bin/perl -w
    1: #
    2: # The challenge - to write a function which given a list of non
    3: # negative integers returns a regexp which will match those and only
    4: # those numbers.
    5: #
    6: # Try the test program like this :-
    7: #
    8: #-- Simple numeric list
    9: # ./numeric-list-to-regexp.pl 0..255
    10: #  \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
    11: #-- Sparse numeric list
    12: # ./numeric-list-to-regexp.pl 0..11,15,21..33
    13: # \d|1[015]|2[1-9]|3[0-3]
    14: #-- Numbers divisble by 3
    15: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'            
    16: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
    17: #-- All prime numbers < 100
    18: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
    19: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
    20: 
    21: use strict;
    22: 
    23: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
    24:     unless @ARGV;
    25: my @list = eval "@ARGV";
    26: my $re = numeric_list_to_regexp( @list );
    27: check_numeric_list_to_regexp($re, \@list);
    28: exit;
    29: 
    30: ############################################################
    31: # Converts a list of numbers into a regexp which will
    32: # match those numbers and those numbers only.
    33: #
    34: # It does this by constructing a regexp and then progressively
    35: # simplifying it - recursively if necessary.  It uses regexp's to
    36: # transform the regexp of course!  This is almost a general purpose
    37: # regexp optimiser.
    38: #
    39: # We assume that the caller will bound the regexp with ^( and )$ or
    40: # \D(?: and )\D or whatever takes their fancy
    41: #
    42: # Set $DEBUG to 1 if you want to print lots of info and check the
    43: # regexp works after each transformation.
    44: #
    45: # Warning: code contains heavy regexps - lift with care ;-)
    46: # Caution: Code may use exponential time and space ;-(
    47: ############################################################
    48: 
    49: sub numeric_list_to_regexp
    50: {
    51:     my (@list) = @_;
    52:     my $DEBUG = 0;
    53: 
    54:     # The basic regexp with |'s on the start and end to make our life
    55:     # easier
    56:     # Should uniq here too...
    57:     my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
    58: 
    59:     # Transform the regexp in stages, making sure at all time the
    60:     # regexp is correct if $DEBUG is set
    61: 
    62:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    63: 
    64:     # 1) Concatenate all the single characters a|b|c into [abc]'s
    65:     $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
    66:     {
    67:         my ( $string ) = ( $1 );
    68:         print "string = '$string'\n" if $DEBUG;
    69:         "|[" . join("", split m{\|}, $string) . "]"
    70:     }gex;
    71: 
    72:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    73: 
    74:     # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
    75:     $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
    76:     {
    77:         my ( $string, $prefix ) = ( $1, $2 );
    78:         print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
    79:         "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
    80:     }gex;
    81: 
    82:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    83: 
    84:     # 3) Find all the aX|bX|cX and change to (a|b|c)X]
    85:     $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
    86:     {
    87:         my ( $string, $postfix ) = ( $1, $3 );
    88:         print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
    89:         $string =~ s{ \Q$postfix\E  (?= \| | $ ) }{}gx;
    90:         print "...string = '$string'\n" if $DEBUG;
    91:         "|(?:$string)$postfix"
    92:     }gex;
    93: 
    94:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    95: 
    96:     # 4) Change (?:a|b|c) into [abc]
    97:     $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
    98:     {
    99:         my ( $string ) = ( $1 );
    100:         print "string = '$string'\n" if $DEBUG;
    101:         "[" . join("", split m{\|}, $string) . "]"
    102:     }gex;
    103: 
    104:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    105: 
    106:     # 5) Optimise [abc] into [a-c] or \d
    107:     # This doesn't optimise all the cases only the complete continuous
    108:     # range in the [ ... ]
    109:     $re =~ s{ \[ ( \d{3,} ) \] }
    110:     {
    111:         my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
    112:         print "match ['$string']...range [$start-$end]\n" if $DEBUG;
    113:         if ($end - $start + 1 == length $string)
    114:         {
    115:             $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
    116:         }
    117:         else
    118:         {
    119:             "[$string]";
    120:         }
    121:     }gex;
    122: 
    123:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    124: 
    125:     # 6) recurse on any digit sequences left (?:ab|cd|ef)
    126:     $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
    127:     {
    128:         my ( $string ) = ( $1 );
    129:         print "**** Recursing on '$string'\n" if $DEBUG;
    130:         "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
    131:     }gex;
    132: 
    133:     check_numeric_list_to_regexp($re, \@list) if $DEBUG;
    134: 
    135:     # 7) fix the | on each end
    136:     $re =~ s{^\|}{};
    137:     $re =~ s{\|$}{};
    138: 
    139:     print "**** Returning '$re'\n" if $DEBUG;
    140: 
    141:     return $re;
    142: }
    143: 
    144: ############################################################
    145: # Test subroutine to check the regexp performs as advertised
    146: #
    147: # Call with a regexp and a reference to a list of numbers
    148: # it will check that the regexp matches all the list and
    149: # doesn't match some others (obviously it can't check them
    150: # all can it!) die-ing on any failures.
    151: ############################################################
    152: 
    153: sub check_numeric_list_to_regexp
    154: {
    155:     my ($re, $list) = @_;
    156:     my %list = map { $_ => 1 } @$list;
    157:     print "Re: $re\n";
    158: 
    159:     # Put some other test cases in
    160:     $list{$_} += 0 for (0..999);
    161:     $list{int(rand()*1000)} += 0 for (0..99);
    162:     $list{int(rand()*10000)} += 0 for (0..99);
    163:     $list{int(rand()*100000)} += 0 for (0..99);
    164: 
    165:     # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
    166:     $re =~ s{^\|}{};		# fix | on start and end
    167:     $re =~ s{\|$}{};
    168:     $re = "^(?:$re)\$";		# put in ^(?: ... )$
    169:     $re = qr{$re};		# compile the regexp for speed
    170: 
    171:     # Check all the keys in list against the regexp - some should pass
    172:     # and some should fail
    173:     for my $item (keys %list)
    174:     {
    175:         if ($list{$item} xor ($item =~ /$re/))
    176:         {
    177:             die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
    178:         }
    179:         else
    180:         {
    181:             # print "OK '$re' for '$item'\n";
    182:         }
    183:     }
    184: }
    

In reply to Numeric list to optimised regexp by ncw

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others pondering the Monastery: (3)
As of 2024-04-25 07:02 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found