in continuation to
Re^2: solution wanted for break-on-spaces (w/specifics):
Please note how readable and maintainable the regexes become now!
This solves AnomalousMonk's test case here but is easily adaptable to various interpretations.
(I disagree about the handling of unbalanced quotes; I'd rather ignore them. To get that behaviour, drop the `$` branch commented with "EOL".)
use v5.12;
use warnings;
use Test::More;

# Split a string into whitespace-separated tokens, except that whitespace
# inside a '...' or "..." quoted section does not split the token.
# The patterns are built from small named pieces so each one stays readable.

# A backslash followed by any single character (except newline, as /s is
# not used).
my $escaped = qr/\\./;

# A quoted section: an opening quote, then escape pairs or other characters
# up to the matching closing quote. If no closing quote exists, the section
# runs to the end of the line instead.
my $quoted =
    qr/
        (['"])              # --- start-quote
        (?:                 # --- inside
            $escaped        # any escape-pair
          |
            .               # anything else
        )*?                 # non-greedy
        (?:                 # --- end
            \g{-1}          # same quote (relative backreference, so the
                            # pattern still works when embedded in $re)
          |
            $               # EOL ends missing pair
        )
    /x;

# One token: a run of escape pairs, quoted sections and non-whitespace.
my $re =
    qr/
        (?:
            $escaped        # any escape pair
          |
            $quoted         # any quoted string
          |
            \S              # any non-whitespace character
        )+                  # at least once
    /x;

# Return a reference to the list of tokens found in $text, in order.
# The list is empty (never undef) when $text contains no token.
# ${^MATCH} with /p is used rather than $& to avoid the program-wide regex
# slowdown $& causes on older perls; a plain list-context /g match cannot be
# used because $quoted contains a capture group.
sub tokenize {
    my ($text) = @_;
    my @tokens;
    push @tokens, ${^MATCH} while $text =~ /$re/gp;
    return \@tokens;
}

# Each entry is [ input string, [ expected tokens ] ].
my @tests = (
    # q{all '- and "-quotes properly balanced},
    [ q{This is simple.},            [ q{This}, q{is}, q{simple.} ] ],
    [ q{ This is simple. },          [ q{This}, q{is}, q{simple.} ] ],
    [ q{This is "so very simple".},  [ q{This}, q{is}, q{"so very simple".} ] ],
    [ q{This "is so" very simple.},  [ q{This}, q{"is so"}, q{very}, q{simple.} ] ],
    [ q{This 'isn\'t nice.'},        [ q{This}, q{'isn\'t nice.'} ] ],
    [ q{This "isn\"t nice."},        [ q{This}, q{"isn\"t nice."} ] ],
    [ q{This 'isn\\\\'t nice.'},     [ q{This}, q{'isn\\\\'t}, q{nice.'} ] ],
    [ q{This "isn\\\\"t nice."},     [ q{This}, q{"isn\\\\"t}, q{nice."} ] ],
    [ q{This 'is not unnice.'},      [ q{This}, q{'is not unnice.'} ] ],
    [ q{This "is not unnice."},      [ q{This}, q{"is not unnice."} ] ],
    [ q{a "bb cc" d},                [ q{a}, q{"bb cc"}, q{d} ] ],
    # q{UNbalanced '- and "-quotes at absolute end of string},
    [ q{This is "so very simple},    [ q{This}, q{is}, q{"so very simple} ] ],
    [ q{This 'isn\'t nice.},         [ q{This}, q{'isn\'t nice.} ] ],
    [ q{This "isn\"t nice.},         [ q{This}, q{"isn\"t nice.} ] ],
    [ q{This 'isn\\\\'t nice.},      [ q{This}, q{'isn\\\\'t}, q{nice.} ] ],
    [ q{This "isn\\\\"t nice.},      [ q{This}, q{"isn\\\\"t}, q{nice.} ] ],
    [ q{This 'is not unnice.},       [ q{This}, q{'is not unnice.} ] ],
    [ q{This "is not unnice.},       [ q{This}, q{"is not unnice.} ] ],
    # 'what about these questionable cases?',
    [ q{is this"really so"simple now?}, [ q{is}, q{this"really so"simple}, q{now?} ] ],
    [ q{is this"really so" now?},       [ q{is}, q{this"really so"}, q{now?} ] ],
    [ q{is "really so"simple now?},     [ q{is}, q{"really so"simple}, q{now?} ] ],
    [ q{is this'really so'simple now?}, [ q{is}, q{this'really so'simple}, q{now?} ] ],
    [ q{is this'really so' now?},       [ q{is}, q{this'really so'}, q{now?} ] ],
    [ q{is 'really so'simple now?},     [ q{is}, q{'really so'simple}, q{now?} ] ],
);

plan tests => scalar @tests;

for my $test (@tests) {
    my ($str, $exp) = @$test;
    my $got = tokenize($str);
    is_deeply($got, $exp, qq{<$str>: } . join('|', @$exp));
}
-*- mode: compilation; default-directory: "d:/tmp/pm/" -*-
Compilation started at Sun Oct 24 14:00:21
C:/Strawberry/perl/bin\perl.exe -w d:/tmp/pm/break_not_quoted.pl
1..24
ok 1 - <This is simple.>: This|is|simple.
ok 2 - < This is simple. >: This|is|simple.
ok 3 - <This is "so very simple".>: This|is|"so very simple".
ok 4 - <This "is so" very simple.>: This|"is so"|very|simple.
ok 5 - <This 'isn\'t nice.'>: This|'isn\'t nice.'
ok 6 - <This "isn\"t nice.">: This|"isn\"t nice."
ok 7 - <This 'isn\\'t nice.'>: This|'isn\\'t|nice.'
ok 8 - <This "isn\\"t nice.">: This|"isn\\"t|nice."
ok 9 - <This 'is not unnice.'>: This|'is not unnice.'
ok 10 - <This "is not unnice.">: This|"is not unnice."
ok 11 - <a "bb cc" d>: a|"bb cc"|d
ok 12 - <This is "so very simple>: This|is|"so very simple
ok 13 - <This 'isn\'t nice.>: This|'isn\'t nice.
ok 14 - <This "isn\"t nice.>: This|"isn\"t nice.
ok 15 - <This 'isn\\'t nice.>: This|'isn\\'t|nice.
ok 16 - <This "isn\\"t nice.>: This|"isn\\"t|nice.
ok 17 - <This 'is not unnice.>: This|'is not unnice.
ok 18 - <This "is not unnice.>: This|"is not unnice.
ok 19 - <is this"really so"simple now?>: is|this"really so"simple|now?
ok 20 - <is this"really so" now?>: is|this"really so"|now?
ok 21 - <is "really so"simple now?>: is|"really so"simple|now?
ok 22 - <is this'really so'simple now?>: is|this'really so'simple|now?
ok 23 - <is this'really so' now?>: is|this'really so'|now?
ok 24 - <is 'really so'simple now?>: is|'really so'simple|now?
Compilation finished at Sun Oct 24 14:00:21
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.