Beefy Boxes and Bandwidth Generously Provided by pair Networks
Keep It Simple, Stupid
 
PerlMonks  

expand unicode property (eg \p{Print}) to regex character class range

by Anonymous Monk
on Aug 07, 2010 at 10:55 UTC ( #853539=CUFP: print w/ replies, xml ) Need Help??

JavaScript regular expressions do not have Perl's \p{Print} feature, but with the help of this program you can hardcode such properties as character class ranges.

#!/usr/bin/perl --
# Expand Unicode properties (e.g. Print) into a regex character class written
# with \uXXXX escapes, one "[...]" class per property name.
#
# Usage: perl thisfile.pl [PropertyName ...]
#
# "Print" is always processed first because it is unshifted onto @ARGV below.
# Only simple property names are looked up; each name is handed unchanged to
# utf8->SWASHNEW.
#
# NOTE(review): code points above FFFF are emitted as "\u" followed by five
# or six hex digits (see the sample output after __END__); confirm that the
# target regex engine reads that as a single code point before relying on it.
use strict;
use warnings;
use utf8;

unshift @ARGV, qw[ Print ];

for my $item (@ARGV) {
    my $swash;

    # The trailing arguments are passed explicitly: calling SWASHNEW with the
    # property name alone produced "Use of uninitialized value $minbits"
    # warnings from utf8_heavy.pl.
    # NOTE(review): ('', 1, 0) is assumed to mean "no extra list, 1-bit
    # (binary) property, no default value" -- confirm against utf8_heavy.pl.
    unless ( eval { $swash = utf8->SWASHNEW( $item, '', 1, 0 ); 1; } ) {
        warn $@;
        next;
    }
    next unless ref $swash;

    # The LIST lines are tab separated, as written by mktables:
    #~ http://perl5.git.perl.org/perl.git/blob?f=lib/unicore/mktables
    #~     push @OUT, sprintf "%04X\t\t%s\n", $start, $name;
    #~ } else {
    #~     push @OUT, sprintf "%04X\t%04X\t%s\n", $start, $end, $name;
    # The pattern below is the one used to read them back:
    #~ http://perl5.git.perl.org/perl.git/blob?f=lib/utf8_heavy.pl
    my $out = '[';
    while ( $swash->{LIST}
        =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg )
    {
        # $1 is the first code point of the entry, $2 the last one; $2 is
        # undefined when the entry covers a single code point.
        my ( $min, $max ) = ( $1, $2 );
        $max = $min unless defined $max;

        $out .= '\\u' . $min;
        $out .= '-' . '\\u' . $max if $min ne $max;
    }
    $out .= ']';

    print "\n$item => $out\n\n";
}
__END__
Print => [\u0009-\u000D\u0020-\u007E\u0085\u00A0-\u0377\u037A-\u037E\u +0384-\u038A\u038C\u038E-\u03A1\u03A3-\u0523\u0531-\u0556\u0559-\u055F +\u0561-\u0587\u0589-\u058A\u0591-\u05C7\u05D0-\u05EA\u05F0-\u05F4\u06 +00-\u0603\u0606-\u061B\u061E-\u061F\u0621-\u065E\u0660-\u070D\u070F-\ +u074A\u074D-\u07B1\u07C0-\u07FA\u0901-\u0939\u093C-\u094D\u0950-\u095 +4\u0958-\u0972\u097B-\u097F\u0981-\u0983\u0985-\u098C\u098F-\u0990\u0 +993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BC-\u09C4\u09C7-\u09C8\ +u09CB-\u09CE\u09D7\u09DC-\u09DD\u09DF-\u09E3\u09E6-\u09FA\u0A01-\u0A0 +3\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33\u0 +A35-\u0A36\u0A38-\u0A39\u0A3C\u0A3E-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D\ +u0A51\u0A59-\u0A5C\u0A5E\u0A66-\u0A75\u0A81-\u0A83\u0A85-\u0A8D\u0A8F +-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABC-\u0 +AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AD0\u0AE0-\u0AE3\u0AE6-\u0AEF\u0AF1\u +0B01-\u0B03\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32 +-\u0B33\u0B35-\u0B39\u0B3C-\u0B44\u0B47-\u0B48\u0B4B-\u0B4D\u0B56-\u0 +B57\u0B5C-\u0B5D\u0B5F-\u0B63\u0B66-\u0B71\u0B82-\u0B83\u0B85-\u0B8A\ +u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA 
+4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0 +BD0\u0BD7\u0BE6-\u0BFA\u0C01-\u0C03\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\ +u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3D-\u0C44\u0C46-\u0C48\u0C4A-\u0C4 +D\u0C55-\u0C56\u0C58-\u0C59\u0C60-\u0C63\u0C66-\u0C6F\u0C78-\u0C7F\u0 +C82-\u0C83\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5- +\u0CB9\u0CBC-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5-\u0CD6\u0CDE\u0CE +0-\u0CE3\u0CE6-\u0CEF\u0CF1-\u0CF2\u0D02-\u0D03\u0D05-\u0D0C\u0D0E-\u +0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D3D-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D +\u0D57\u0D60-\u0D63\u0D66-\u0D75\u0D79-\u0D7F\u0D82-\u0D83\u0D85-\u0D +96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0DCA\u0DCF-\u0DD4\u0 +DD6\u0DD8-\u0DDF\u0DF2-\u0DF4\u0E01-\u0E3A\u0E3F-\u0E5B\u0E81-\u0E82\ +u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3 +\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EB9\u0EBB-\u0EBD\u0EC0-\u0EC4\u0EC +6\u0EC8-\u0ECD\u0ED0-\u0ED9\u0EDC-\u0EDD\u0F00-\u0F47\u0F49-\u0F6C\u0 +F71-\u0F8B\u0F90-\u0F97\u0F99-\u0FBC\u0FBE-\u0FCC\u0FCE-\u0FD4\u1000- +\u1099\u109E-\u10C5\u10D0-\u10FC\u1100-\u1159\u115F-\u11A2\u11A8-\u11 +F9\u1200-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u +1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2- +\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u135F-\u13 +7C\u1380-\u1399\u13A0-\u13F4\u1401-\u1676\u1680-\u169C\u16A0-\u16F0\u +1700-\u170C\u170E-\u1714\u1720-\u1736\u1740-\u1753\u1760-\u176C\u176E +-\u1770\u1772-\u1773\u1780-\u17DD\u17E0-\u17E9\u17F0-\u17F9\u1800-\u1 +80E\u1810-\u1819\u1820-\u1877\u1880-\u18AA\u1900-\u191C\u1920-\u192B\ +u1930-\u193B\u1940\u1944-\u196D\u1970-\u1974\u1980-\u19A9\u19B0-\u19C +9\u19D0-\u19D9\u19DE-\u1A1B\u1A1E-\u1A1F\u1B00-\u1B4B\u1B50-\u1B7C\u1 +B80-\u1BAA\u1BAE-\u1BB9\u1C00-\u1C37\u1C3B-\u1C49\u1C4D-\u1C7F\u1D00- +\u1DE6\u1DFE-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F +57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FC4\u1FC6-\u1 
+FD3\u1FD6-\u1FDB\u1FDD-\u1FEF\u1FF2-\u1FF4\u1FF6-\u1FFE\u2000-\u2064\ +u206A-\u2071\u2074-\u208E\u2090-\u2094\u20A0-\u20B5\u20D0-\u20F0\u210 +0-\u214F\u2153-\u2188\u2190-\u23E7\u2400-\u2426\u2440-\u244A\u2460-\u +269D\u26A0-\u26BC\u26C0-\u26C3\u2701-\u2704\u2706-\u2709\u270C-\u2727 +\u2729-\u274B\u274D\u274F-\u2752\u2756\u2758-\u275E\u2761-\u2794\u279 +8-\u27AF\u27B1-\u27BE\u27C0-\u27CA\u27CC\u27D0-\u2B4C\u2B50-\u2B54\u2 +C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2C6F\u2C71-\u2C7D\u2C80-\u2CEA\u2CF9- +\u2D25\u2D30-\u2D65\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB +0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u +2DDE\u2DE0-\u2E30\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB +\u3000-\u303F\u3041-\u3096\u3099-\u30FF\u3105-\u312D\u3131-\u318E\u31 +90-\u31B7\u31C0-\u31E3\u31F0-\u321E\u3220-\u3243\u3250-\u32FE\u3300-\ +u4DB5\u4DC0-\u9FC3\uA000-\uA48C\uA490-\uA4C6\uA500-\uA62B\uA640-\uA65 +F\uA662-\uA673\uA67C-\uA697\uA700-\uA78C\uA7FB-\uA82B\uA840-\uA877\uA +880-\uA8C4\uA8CE-\uA8D9\uA900-\uA953\uA95F\uAA00-\uAA36\uAA40-\uAA4D\ +uAA50-\uAA59\uAA5C-\uAA5F\uAC00-\uD7A3\uE000-\uFA2D\uFA30-\uFA6A\uFA7 +0-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D-\uFB36\uFB38-\uFB3C\uFB3E\uF +B40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3F\uFD50-\uFD8F\uFD92- +\uFDC7\uFDF0-\uFDFD\uFE00-\uFE19\uFE20-\uFE26\uFE30-\uFE52\uFE54-\uFE +66\uFE68-\uFE6B\uFE70-\uFE74\uFE76-\uFEFC\uFEFF\uFF01-\uFFBE\uFFC2-\u +FFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE0-\uFFE6\uFFE8-\uFFEE +\uFFF9-\uFFFD\u10000-\u1000B\u1000D-\u10026\u10028-\u1003A\u1003C-\u1 +003D\u1003F-\u1004D\u10050-\u1005D\u10080-\u100FA\u10100-\u10102\u101 +07-\u10133\u10137-\u1018A\u10190-\u1019B\u101D0-\u101FD\u10280-\u1029 +C\u102A0-\u102D0\u10300-\u1031E\u10320-\u10323\u10330-\u1034A\u10380- +\u1039D\u1039F-\u103C3\u103C8-\u103D5\u10400-\u1049D\u104A0-\u104A9\u +10800-\u10805\u10808\u1080A-\u10835\u10837-\u10838\u1083C\u1083F\u109 +00-\u10919\u1091F-\u10939\u1093F\u10A00-\u10A03\u10A05-\u10A06\u10A0C 
+-\u10A13\u10A15-\u10A17\u10A19-\u10A33\u10A38-\u10A3A\u10A3F-\u10A47\ +u10A50-\u10A58\u12000-\u1236E\u12400-\u12462\u12470-\u12473\u1D000-\u +1D0F5\u1D100-\u1D126\u1D129-\u1D1DD\u1D200-\u1D245\u1D300-\u1D356\u1D +360-\u1D371\u1D400-\u1D454\u1D456-\u1D49C\u1D49E-\u1D49F\u1D4A2\u1D4A +5-\u1D4A6\u1D4A9-\u1D4AC\u1D4AE-\u1D4B9\u1D4BB\u1D4BD-\u1D4C3\u1D4C5- +\u1D505\u1D507-\u1D50A\u1D50D-\u1D514\u1D516-\u1D51C\u1D51E-\u1D539\u +1D53B-\u1D53E\u1D540-\u1D544\u1D546\u1D54A-\u1D550\u1D552-\u1D6A5\u1D +6A8-\u1D7CB\u1D7CE-\u1D7FF\u1F000-\u1F02B\u1F030-\u1F093\u20000-\u2A6 +D6\u2F800-\u2FA1D\uE0001\uE0020-\uE007F\uE0100-\uE01EF\uF0000-\uFFFFD +\u100000-\u10FFFD]

See perlunicode,perluniprops, perlrecharclass

Comment on expand unicode property (eg \p{Print}) to regex character class range
Select or Download Code
Re: expand unicode property (eg \p{Print}) to regex character class range
by sflitman (Hermit) on Aug 07, 2010 at 22:54 UTC
    I got
    Use of uninitialized value $minbits in numeric ne (!=) at C:/Perl/lib/ +utf8_heavy.pl line 225. at C:/Perl/lib/utf8_heavy.pl line 225 utf8::SWASHNEW('utf8', 'Print') called at unicode_printable.pl + line 11 eval {...} called at unicode_printable.pl line 11 Use of uninitialized value $minbits in numeric lt (<) at C:/Perl/lib/u +tf8_heavy.pl line 225. at C:/Perl/lib/utf8_heavy.pl line 225 utf8::SWASHNEW('utf8', 'Print') called at unicode_printable.pl + line 11 eval {...} called at unicode_printable.pl line 11 Use of uninitialized value $bits in numeric lt (<) at C:/Perl/lib/utf8 +_heavy.pl line 237. at C:/Perl/lib/utf8_heavy.pl line 237 utf8::SWASHNEW('utf8', 'Print') called at unicode_printable.pl + line 11 eval {...} called at unicode_printable.pl line 11 main::(unicode_printable.pl:11): unless( eval { $swash = ut +f8->SWASHNEW( $item ); 1; } ){
    unicode_printable.pl is the file name under which I stepped through your script. It does work, and I like the trick where you pull the ranges from after __END__ with @ARGV, but I cannot find that documented anywhere in the perldocs; maybe I'm not looking hard enough?
    SSF

      I like the trick where you pull the ranges from after __END__ with @ARGV

      He does no such thing. He simply placed the output he got after __END__ so you could see it without breaking the program.

      If you want what's after __END__, you can read it from the DATA file handle.

      See what ikegami said. Usage of program is
      perl fileyousaveditas.pl Propertyname propertyname propertyname
      For example (i'm omitting Print):
      $ perl unicode-regex-range.pl PerlSpace Title Bopo Dingbats PerlSpace => [\u0009-\u000A\u000C-\u000D\u0020] Title => [\u01C5\u01C8\u01CB\u01F2\u1F88-\u1F8F\u1F98-\u1F9F\u1FA8-\u1 +FAF\u1FBC\u1FCC\u1FFC] Bopo => [\u3105-\u312D\u31A0-\u31B7] Dingbats => [\u2700-\u27BF]
      For a list of properties see perluniprops. This program will work only for \w+ properties; it won't work for compound ones like Script:something or Block:something.

      I suppose Unicode::UCD ought to provide this functionality or really javascript ought to provide \p{} and \P{} ...

Re: expand unicode property (eg \p{Print}) to regex character class range (unicharproptoregexrange.pl)
by Anonymous Monk on May 30, 2013 at 22:57 UTC

    This updated version needs a Unicode::UCD that provides prop_invlist, which first shipped with perl 5.16.

    See perlunicode, perluniprops, perlrecharclass, Unicode::UCD, Data::Dump, List::MoreUtils

    #!/usr/bin/perl --
    # unicharproptoregexrange.pl - expand Unicode properties into code point
    # ranges, printed in one of four formats:
    #
    #   -j (default)  \uXXXX-\uXXXX ranges
    #   -p            \N{U+XXXX}-\N{U+XXXX} ranges
    #   -i            a "sub IsName" user-defined property listing the ranges
    #   -u            a "sub IsName" user-defined property that includes
    #                 +utf8::Name
    #
    # Needs a Unicode::UCD that exports prop_invlist.
    #
    # NOTE(review): with -j, code points above FFFF are formatted as "\u"
    # followed by more than four hex digits; confirm that the target regex
    # engine accepts that form.
    use strict;
    use warnings;
    use Data::Dump qw/ dd pp /;
    use List::MoreUtils qw' uniq ';
    use Unicode::UCD qw/ prop_invlist /;

    Main( @ARGV );
    exit( 0 );

    # Ranges of the given property formatted as \uXXXX escapes.
    sub uRanges { RangeIt( shift, '\\u%04.4X' ) }

    # Ranges of the given property formatted as \N{U+XXXX} escapes.
    sub pRanges { RangeIt( shift, '\\N{U+%04.4X}' ) }

    # Ranges of the given property as bare hex, "LOWER UPPER" for a range.
    sub iRanges { RangeIt( shift, '%04.4X', '%04.4X %04.4X' ) }

    # RangeIt( $property, $format1, $format2 )
    #
    # Looks up $property with prop_invlist and returns a list of strings, one
    # per contiguous run of code points.
    #
    #   $format1  sprintf format for a run of exactly one code point
    #             (defaults to '\N{U+%04.4X}')
    #   $format2  sprintf format taking (lower, upper) for a longer run
    #             (defaults to "$format1-$format1")
    #
    # Warns and returns nothing when prop_invlist yields an empty list.
    sub RangeIt {
        my ( $punct, $format1, $format2 ) = @_;
        $format1 ||= '\\N{U+%04.4X}';
        $format2 ||= join '-', $format1, $format1;

        my @invlist = prop_invlist( $punct );
        unless ( @invlist ) {
            warn "## empty for $punct \n";
            return;
        }

        # The list is read in pairs: the even-indexed element starts a run,
        # and the element after it is one past the end of that run.
        my @ranges;
        for ( my $i = 0; $i < @invlist; $i += 2 ) {
            my $lower = $invlist[ $i ];
            my $upper = ( $i + 1 ) < @invlist
                ? $invlist[ $i + 1 ] - 1     # In range
                : $Unicode::UCD::MAX_CP;     # To infinity. You may want
                                             # to stop much much earlier;
                                             # going this high may expose
                                             # perl deficiencies with very
                                             # large numbers.
            if ( $lower != $upper ) {
                push @ranges, sprintf $format2, $lower, $upper;
            }
            else {
                push @ranges, sprintf $format1, $lower;
            }
        }
        return @ranges;
    }

    # Main( @args )
    #
    # Parses the option flags out of @args and prints every remaining
    # (de-duplicated) property name in the selected format.  Prints the usage
    # text instead when -h is given or when no property names remain.
    sub Main {
        use Getopt::Long qw/ GetOptionsFromArray /;
        my %opt;
        GetOptionsFromArray(
            \@_, \%opt,
            q{i|is|in!},
            q{p|perl!},
            q{j|js|java|javascript!},
            q{u|utf!},
            q{h|help!},
        );
        $opt{h} and return Usage();
        @_      or  return Usage();

        my %rangers = (
            j => sub {
                printf "%s => %s\n\n", $_[0], join '', uRanges( $_[0] );
            },
            p => sub {
                printf "%s => %s\n\n", $_[0], join '', pRanges( $_[0] );
            },
            i => sub {
                my ( $name ) = @_;
                my $body = join "\n", iRanges( $name );
                print "sub Is${name} { return <<'${name}';\n${body}\n${name}\n} ## end of Is${name}\n\n";
            },
            u => sub {
                my ( $name ) = @_;
                print "sub Is${name} { return <<'${name}';\n+utf8::${name}\n${name}\n} ## end of Is${name}\n\n";
            },
        );

        # Pick the first *enabled* format flag in a fixed order.  Taking
        # ( keys %opt )[0] depended on hash ordering when several flags were
        # given, honoured negated flags such as --no-p, and called an
        # undefined code ref when the only key present was "h" (--no-help).
        my ( $mode ) = grep { $opt{$_} } qw/ j p i u /;
        my $ranger = $rangers{ defined $mode ? $mode : 'j' };

        for my $k ( uniq @_ ) {
            $ranger->( $k );
        }
    }

    # Prints the usage summary followed by one worked example per format.
    sub Usage {
        print "\nUsage:\n $0 [ -i -j -u -p ] Punctuation\n";
        print "\n $0 PerlSpace Title Bopo Dingbats AHex \n";
        #~ print "\n $0 ASCII_Hex_Digit=Yes ASCII_Hex_Digit=No \n";

        # One example line per element, so that the indentation of this
        # source file never leaks into the printed text.
        my @examples = (
            q[],
            q[$ perl unicharproptoregexrange.pl Dingbats],
            q[Dingbats => \u2700-\u27BF],
            q[],
            q[$ perl unicharproptoregexrange.pl -j Dingbats],
            q[Dingbats => \u2700-\u27BF],
            q[],
            q[$ perl unicharproptoregexrange.pl -p Dingbats],
            q[Dingbats => \N{U+2700}-\N{U+27BF}],
            q[],
            q[$ perl unicharproptoregexrange.pl -i Dingbats],
            q[sub IsDingbats { return <<'Dingbats';],
            q[2700 27BF],
            q[Dingbats],
            q[} ## end of IsDingbats],
            q[],
            q[$ perl unicharproptoregexrange.pl -u Dingbats],
            q[sub IsDingbats { return <<'Dingbats';],
            q[+utf8::Dingbats],
            q[Dingbats],
            q[} ## end of IsDingbats],
            q[],
            q[See perldoc perluniprops],
        );
        print map { "$_\n" } @examples;
    }
    __END__

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: CUFP [id://853539]
Approved by sflitman
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chilling in the Monastery: (7)
As of 2014-12-27 18:01 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (177 votes), past polls