comment on

BrowserUK thank you very much for the answer!

I have made two new versions of the example parser using the LVALUE returned by substr.

In version 2 I use [ $type, $first, $length ] to represent a part in @part.

In version 3 I use [ $type, $sub_txt_ref ], where $sub_txt_ref is a LVALUE returned by substr.

The scalar type LVALUE is new form me. Where can I read about it?

Example of parser version 2

use strict;
use warnings;

sub parse {
    my ( $txt_ref, $part_aref, $spec_aref ) = @_;

    if ( not defined $part_aref ) {    # use the whole $$txt_ref
        $part_aref = [ [ '??', 0, length($$txt_ref) ] ];
    }

    my @part;                          # result from the parsing

    for my $part (@$part_aref) {
        my ( $type, $first, $length ) = @$part;

        if ( $type ne '??' ) {
            push @part, $part;
            next;
        }

        my $sub_txt_ref = \substr( $$txt_ref, $first, $length );
        my $end_last_match = 0;
      MATCH:
        {
            for my $spec (@$spec_aref) {
                my ( $reg_exp, $type ) = @$spec;
                if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) {
                    push @part, [ $type, $-[0] + $first, $+[0] - $-[0]
+ ];
                    $end_last_match = $+[0];
                    redo MATCH;
                }
            }
        }

        if ( $end_last_match < $length ) {
            warn 'ERROR: Stopped before string end at pos: ',
              "$first + $end_last_match\n<",
              substr( $$sub_txt_ref, $end_last_match ), '>';
        }
    }

    return \@part;
}

my @spec_1 = (
    [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'},      'sq' ],
    [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"},      'dq' ],
    [ qr{\G//[^\n]*[\n]?},                 'cn' ],
    [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ],

    [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ],
);

my $dig1  = qr{-?\d+\.\d*};
my $dig2  = qr{-?\d*\.\d+};
my $dig3  = qr{-?\d+};
my $dig4  = qr{E-?\d+};
my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?};

my @spec_2 = (
    [ qr{\G(?:var|alert)},                    'rw' ],
    [ qr{\G$digit},                           'di' ],
    [ qr{\G[_a-zA-Z0-9.\$]+},                 'na' ],
    [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ],
);

sub to_string_part_aref {
    my ( $txt_ref, $part_aref ) = @_;
    return join '', map {
        my ( $type, $first, $length ) = @$_;
        if ( $type eq '??' ) {
            substr( $$txt_ref, $first, $length );
        }
        else {
            "<$type>" . substr( $$txt_ref, $first, $length ) . "</$typ
+e>";
        }

    } @{$part_aref};
}

my $text = do { local $/; <DATA> };
my $text_ref = \$text;

my $part_ref_1 = parse( $text_ref, undef, \@spec_1 );

my $part_ref_2 = parse( $text_ref, $part_ref_1, \@spec_2 );
warn to_string_part_aref( $text_ref, $part_ref_2 );

__DATA__
// This is a single-line comment

var x = 4; // Single

/* Multiple-line
comment that can
span any number
of lines */
/*
    This is a multi-line comment
    // Still a multi-line comment
*/
/* Stop code
var x = 4;
var y = 5;
/* Bug? *
x = "cool";
End Stop code */
// This is a single-line comment /* ...still a single-line comment
'string\' // still a string'; // comment /* not-a-nested-comment
var = 0.5; // comment */* still-a-comment
' /**/ string ' /* "comment..."
// still-a-comment */ alert('This isn\'t a comment!');
/\/* this isn't a comment! */; //* comment
/*
    //a comment... // still-a-comment
    12345
    "Foo /bar/ ""
*/
/*//Boo*/
/*/**/
[download]

Example of parser version 3

Is updated! Removed the parameter $txt_ref.

use strict;
use warnings;

sub first_part_aref {
    my ($txt_ref) = @_;
    my $sub_txt_ref = \substr( $$txt_ref, 0 );
    return [ [ '??', $sub_txt_ref ] ];
}

sub parse {
    my ( $part_aref, $spec_aref ) = @_;

    my @part;    # result from the parsing

    for my $part (@$part_aref) {
        my ( $type, $sub_txt_ref ) = @$part;

        if ( $type ne '??' ) {
            push @part, $part;
            next;
        }

        my $end_last_match = 0;
      MATCH:
        {
            for my $spec (@$spec_aref) {
                my ( $reg_exp, $type ) = @$spec;
                if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) {
                    my $sub_sub_txt_ref =
                      \substr( $$sub_txt_ref, $-[0], $+[0] - $-[0] );
                    push @part, [ $type, $sub_sub_txt_ref ];
                    $end_last_match = $+[0];
                    redo MATCH;
                }
            }
        }

        if ( $end_last_match < length $$sub_txt_ref ) {
            warn 'ERROR: Stopped before string end at pos: ',
              "$end_last_match\n<",
              substr( $$sub_txt_ref, $end_last_match ), '>';
        }
    }

    return \@part;
}

my @spec_1 = (
    [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'},      'sq' ],
    [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"},      'dq' ],
    [ qr{\G//[^\n]*[\n]?},                 'cn' ],
    [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ],

    [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ],
);

my $dig1  = qr{-?\d+\.\d*};
my $dig2  = qr{-?\d*\.\d+};
my $dig3  = qr{-?\d+};
my $dig4  = qr{E-?\d+};
my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?};

my @spec_2 = (
    [ qr{\G(?:var|alert)},                    'rw' ],
    [ qr{\G$digit},                           'di' ],
    [ qr{\G[_a-zA-Z0-9.\$]+},                 'na' ],
    [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ],
);

sub to_string_part_aref {
    my ($part_aref) = @_;
    return join '', map {
        my ( $type, $sub_txt_ref ) = @$_;
        $type ne '??' ? "<$type>" : (),
          $$sub_txt_ref,
          $type ne '??' ? "</$type>" : ();
    } @{$part_aref};
}

my $text = do { local $/; <DATA> };

my $part_ref_0 = first_part_aref( \$text );

my $part_ref_1 = parse( $part_ref_0, \@spec_1 );

my $part_ref_2 = parse( $part_ref_1, \@spec_2 );
warn to_string_part_aref($part_ref_2);

__DATA__
// This is a single-line comment

var x = 4; // Single

/* Multiple-line
comment that can
span any number
of lines */
/*
    This is a multi-line comment
    // Still a multi-line comment
*/
/* Stop code
var x = 4;
var y = 5;
/* Bug? *
x = "cool";
End Stop code */
// This is a single-line comment /* ...still a single-line comment
'string\' // still a string'; // comment /* not-a-nested-comment
var = 0.5; // comment */* still-a-comment
' /**/ string ' /* "comment..."
// still-a-comment */ alert('This isn\'t a comment!');
/\/* this isn't a comment! */; //* comment
/*
    //a comment... // still-a-comment
    12345
    "Foo /bar/ ""
*/
/*//Boo*/
/*/**/
[download]

In reply to Re^2: Setting end position for the regexp engine using LVALUE by bojinlund
in thread Setting end position for the regexp engine by bojinlund

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


We don't bite newbies here... much
	PerlMonks