BrowserUK thank you very much for the answer!
I have made two new versions of the example parser using the LVALUE returned by substr.
In version 2 I use [ $type, $first, $length ] to represent a part in @part.
In version 3 I use [ $type, $sub_txt_ref ], where $sub_txt_ref is a LVALUE returned by substr.
The scalar type LVALUE is new form me. Where can I read about it?
Example of parser version 2
use strict;
use warnings;
sub parse {
my ( $txt_ref, $part_aref, $spec_aref ) = @_;
if ( not defined $part_aref ) { # use the whole $$txt_ref
$part_aref = [ [ '??', 0, length($$txt_ref) ] ];
}
my @part; # result from the parsing
for my $part (@$part_aref) {
my ( $type, $first, $length ) = @$part;
if ( $type ne '??' ) {
push @part, $part;
next;
}
my $sub_txt_ref = \substr( $$txt_ref, $first, $length );
my $end_last_match = 0;
MATCH:
{
for my $spec (@$spec_aref) {
my ( $reg_exp, $type ) = @$spec;
if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) {
push @part, [ $type, $-[0] + $first, $+[0] - $-[0]
+ ];
$end_last_match = $+[0];
redo MATCH;
}
}
}
if ( $end_last_match < $length ) {
warn 'ERROR: Stopped before string end at pos: ',
"$first + $end_last_match\n<",
substr( $$sub_txt_ref, $end_last_match ), '>';
}
}
return \@part;
}
my @spec_1 = (
[ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ],
[ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ],
[ qr{\G//[^\n]*[\n]?}, 'cn' ],
[ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ],
[ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ],
);
my $dig1 = qr{-?\d+\.\d*};
my $dig2 = qr{-?\d*\.\d+};
my $dig3 = qr{-?\d+};
my $dig4 = qr{E-?\d+};
my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?};
my @spec_2 = (
[ qr{\G(?:var|alert)}, 'rw' ],
[ qr{\G$digit}, 'di' ],
[ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ],
[ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ],
);
sub to_string_part_aref {
my ( $txt_ref, $part_aref ) = @_;
return join '', map {
my ( $type, $first, $length ) = @$_;
if ( $type eq '??' ) {
substr( $$txt_ref, $first, $length );
}
else {
"<$type>" . substr( $$txt_ref, $first, $length ) . "</$typ
+e>";
}
} @{$part_aref};
}
my $text = do { local $/; <DATA> };
my $text_ref = \$text;
my $part_ref_1 = parse( $text_ref, undef, \@spec_1 );
my $part_ref_2 = parse( $text_ref, $part_ref_1, \@spec_2 );
warn to_string_part_aref( $text_ref, $part_ref_2 );
__DATA__
// This is a single-line comment
var x = 4; // Single
/* Multiple-line
comment that can
span any number
of lines */
/*
This is a multi-line comment
// Still a multi-line comment
*/
/* Stop code
var x = 4;
var y = 5;
/* Bug? *
x = "cool";
End Stop code */
// This is a single-line comment /* ...still a single-line comment
'string\' // still a string'; // comment /* not-a-nested-comment
var = 0.5; // comment */* still-a-comment
' /**/ string ' /* "comment..."
// still-a-comment */ alert('This isn\'t a comment!');
/\/* this isn't a comment! */; //* comment
/*
//a comment... // still-a-comment
12345
"Foo /bar/ ""
*/
/*//Boo*/
/*/**/
Example of parser version 3
Is updated! Removed the parameter $txt_ref.
use strict;
use warnings;
sub first_part_aref {
my ($txt_ref) = @_;
my $sub_txt_ref = \substr( $$txt_ref, 0 );
return [ [ '??', $sub_txt_ref ] ];
}
sub parse {
my ( $part_aref, $spec_aref ) = @_;
my @part; # result from the parsing
for my $part (@$part_aref) {
my ( $type, $sub_txt_ref ) = @$part;
if ( $type ne '??' ) {
push @part, $part;
next;
}
my $end_last_match = 0;
MATCH:
{
for my $spec (@$spec_aref) {
my ( $reg_exp, $type ) = @$spec;
if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) {
my $sub_sub_txt_ref =
\substr( $$sub_txt_ref, $-[0], $+[0] - $-[0] );
push @part, [ $type, $sub_sub_txt_ref ];
$end_last_match = $+[0];
redo MATCH;
}
}
}
if ( $end_last_match < length $$sub_txt_ref ) {
warn 'ERROR: Stopped before string end at pos: ',
"$end_last_match\n<",
substr( $$sub_txt_ref, $end_last_match ), '>';
}
}
return \@part;
}
my @spec_1 = (
[ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ],
[ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ],
[ qr{\G//[^\n]*[\n]?}, 'cn' ],
[ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ],
[ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ],
);
my $dig1 = qr{-?\d+\.\d*};
my $dig2 = qr{-?\d*\.\d+};
my $dig3 = qr{-?\d+};
my $dig4 = qr{E-?\d+};
my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?};
my @spec_2 = (
[ qr{\G(?:var|alert)}, 'rw' ],
[ qr{\G$digit}, 'di' ],
[ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ],
[ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ],
);
sub to_string_part_aref {
my ($part_aref) = @_;
return join '', map {
my ( $type, $sub_txt_ref ) = @$_;
$type ne '??' ? "<$type>" : (),
$$sub_txt_ref,
$type ne '??' ? "</$type>" : ();
} @{$part_aref};
}
my $text = do { local $/; <DATA> };
my $part_ref_0 = first_part_aref( \$text );
my $part_ref_1 = parse( $part_ref_0, \@spec_1 );
my $part_ref_2 = parse( $part_ref_1, \@spec_2 );
warn to_string_part_aref($part_ref_2);
__DATA__
// This is a single-line comment
var x = 4; // Single
/* Multiple-line
comment that can
span any number
of lines */
/*
This is a multi-line comment
// Still a multi-line comment
*/
/* Stop code
var x = 4;
var y = 5;
/* Bug? *
x = "cool";
End Stop code */
// This is a single-line comment /* ...still a single-line comment
'string\' // still a string'; // comment /* not-a-nested-comment
var = 0.5; // comment */* still-a-comment
' /**/ string ' /* "comment..."
// still-a-comment */ alert('This isn\'t a comment!');
/\/* this isn't a comment! */; //* comment
/*
//a comment... // still-a-comment
12345
"Foo /bar/ ""
*/
/*//Boo*/
/*/**/
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.
|
|