Beefy Boxes and Bandwidth Generously Provided by pair Networks
Problems? Is your data what you think it is?
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
BrowserUK thank you very much for the answer!

I have made two new versions of the example parser using the LVALUE returned by substr.

In version 2 I use [ $type, $first, $length ] to represent a part in @part.

In version 3 I use [ $type, $sub_txt_ref ], where $sub_txt_ref is a LVALUE returned by substr.

The scalar type LVALUE is new form me. Where can I read about it?

Example of parser version 2

use strict; use warnings; sub parse { my ( $txt_ref, $part_aref, $spec_aref ) = @_; if ( not defined $part_aref ) { # use the whole $$txt_ref $part_aref = [ [ '??', 0, length($$txt_ref) ] ]; } my @part; # result from the parsing for my $part (@$part_aref) { my ( $type, $first, $length ) = @$part; if ( $type ne '??' ) { push @part, $part; next; } my $sub_txt_ref = \substr( $$txt_ref, $first, $length ); my $end_last_match = 0; MATCH: { for my $spec (@$spec_aref) { my ( $reg_exp, $type ) = @$spec; if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) { push @part, [ $type, $-[0] + $first, $+[0] - $-[0] + ]; $end_last_match = $+[0]; redo MATCH; } } } if ( $end_last_match < $length ) { warn 'ERROR: Stopped before string end at pos: ', "$first + $end_last_match\n<", substr( $$sub_txt_ref, $end_last_match ), '>'; } } return \@part; } my @spec_1 = ( [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ], [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ], [ qr{\G//[^\n]*[\n]?}, 'cn' ], [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ], [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ], ); my $dig1 = qr{-?\d+\.\d*}; my $dig2 = qr{-?\d*\.\d+}; my $dig3 = qr{-?\d+}; my $dig4 = qr{E-?\d+}; my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?}; my @spec_2 = ( [ qr{\G(?:var|alert)}, 'rw' ], [ qr{\G$digit}, 'di' ], [ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ], [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ], ); sub to_string_part_aref { my ( $txt_ref, $part_aref ) = @_; return join '', map { my ( $type, $first, $length ) = @$_; if ( $type eq '??' ) { substr( $$txt_ref, $first, $length ); } else { "<$type>" . substr( $$txt_ref, $first, $length ) . "</$typ +e>"; } } @{$part_aref}; } my $text = do { local $/; <DATA> }; my $text_ref = \$text; my $part_ref_1 = parse( $text_ref, undef, \@spec_1 ); my $part_ref_2 = parse( $text_ref, $part_ref_1, \@spec_2 ); warn to_string_part_aref( $text_ref, $part_ref_2 ); __DATA__ // This is a single-line comment var x = 4; // Single /* Multiple-line comment that can span any number of lines */ /* This is a multi-line comment // Still a multi-line comment */ /* Stop code var x = 4; var y = 5; /* Bug? * x = "cool"; End Stop code */ // This is a single-line comment /* ...still a single-line comment 'string\' // still a string'; // comment /* not-a-nested-comment var = 0.5; // comment */* still-a-comment ' /**/ string ' /* "comment..." // still-a-comment */ alert('This isn\'t a comment!'); /\/* this isn't a comment! */; //* comment /* //a comment... // still-a-comment 12345 "Foo /bar/ "" */ /*//Boo*/ /*/**/

Example of parser version 3

Is updated! Removed the parameter $txt_ref.

use strict; use warnings; sub first_part_aref { my ($txt_ref) = @_; my $sub_txt_ref = \substr( $$txt_ref, 0 ); return [ [ '??', $sub_txt_ref ] ]; } sub parse { my ( $part_aref, $spec_aref ) = @_; my @part; # result from the parsing for my $part (@$part_aref) { my ( $type, $sub_txt_ref ) = @$part; if ( $type ne '??' ) { push @part, $part; next; } my $end_last_match = 0; MATCH: { for my $spec (@$spec_aref) { my ( $reg_exp, $type ) = @$spec; if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) { my $sub_sub_txt_ref = \substr( $$sub_txt_ref, $-[0], $+[0] - $-[0] ); push @part, [ $type, $sub_sub_txt_ref ]; $end_last_match = $+[0]; redo MATCH; } } } if ( $end_last_match < length $$sub_txt_ref ) { warn 'ERROR: Stopped before string end at pos: ', "$end_last_match\n<", substr( $$sub_txt_ref, $end_last_match ), '>'; } } return \@part; } my @spec_1 = ( [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ], [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ], [ qr{\G//[^\n]*[\n]?}, 'cn' ], [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ], [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ], ); my $dig1 = qr{-?\d+\.\d*}; my $dig2 = qr{-?\d*\.\d+}; my $dig3 = qr{-?\d+}; my $dig4 = qr{E-?\d+}; my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?}; my @spec_2 = ( [ qr{\G(?:var|alert)}, 'rw' ], [ qr{\G$digit}, 'di' ], [ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ], [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ], ); sub to_string_part_aref { my ($part_aref) = @_; return join '', map { my ( $type, $sub_txt_ref ) = @$_; $type ne '??' ? "<$type>" : (), $$sub_txt_ref, $type ne '??' ? "</$type>" : (); } @{$part_aref}; } my $text = do { local $/; <DATA> }; my $part_ref_0 = first_part_aref( \$text ); my $part_ref_1 = parse( $part_ref_0, \@spec_1 ); my $part_ref_2 = parse( $part_ref_1, \@spec_2 ); warn to_string_part_aref($part_ref_2); __DATA__ // This is a single-line comment var x = 4; // Single /* Multiple-line comment that can span any number of lines */ /* This is a multi-line comment // Still a multi-line comment */ /* Stop code var x = 4; var y = 5; /* Bug? * x = "cool"; End Stop code */ // This is a single-line comment /* ...still a single-line comment 'string\' // still a string'; // comment /* not-a-nested-comment var = 0.5; // comment */* still-a-comment ' /**/ string ' /* "comment..." // still-a-comment */ alert('This isn\'t a comment!'); /\/* this isn't a comment! */; //* comment /* //a comment... // still-a-comment 12345 "Foo /bar/ "" */ /*//Boo*/ /*/**/

In reply to Re^2: Setting end position for the regexp engine using LVALUE by bojinlund
in thread Setting end position for the regexp engine by bojinlund

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others wandering the Monastery: (7)
    As of 2014-08-23 01:52 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The best computer themed movie is:











      Results (171 votes), past polls