http://www.perlmonks.org?node_id=998445


in reply to Help with regular expression

Simple nested language structures can be parsed easily with a loop and a stack. Whenever you encounter the character sequence that marks the start of the nested language/data, you push onto the stack. Whenever you encounter the character sequence that marks the end of the nested language/data, you pop the stack. It looks something like this:

Note: the /\G..../gc idiom means "start matching exactly where the previous match left off, and on success move that position to just past the new match". \G is the "where we left off" anchor, and the /c flag keeps that position unchanged when a match fails instead of resetting it to the start of the string. For example, qr(\Ga) requires an "a" right where we left off, whereas qr(a) would find the first "a" anywhere after that point, even 1000 characters later.

use strict;
use warnings;
use Data::Dumper;

# Store $value under $key in the hash $node.
# The first value for a key is stored as-is; a second value turns the
# slot into an array ref, and further values are appended to it.  This
# applies to scalar values and nested hashes alike, so repeated keys
# never overwrite each other.
sub _store_value {
    my ($node, $key, $value) = @_;

    if (!exists $node->{$key}) {
        $node->{$key} = $value;
    }
    elsif (ref $node->{$key} eq 'ARRAY') {
        push @{ $node->{$key} }, $value;
    }
    else {
        $node->{$key} = [ $node->{$key}, $value ];
    }
    return;
}

# Parse text made of "(KEY=VALUE)" pairs and "(KEY=( ... ))" nested
# groups into a tree of hashes.
#
# Arguments: the input lines (trailing newlines are chomped).  Lines
#            are concatenated, so an item may be split across lines at
#            any point.
# Returns:   ($tree, $open_depth) where $tree is a hash ref holding the
#            parsed data and $open_depth is the number of nested groups
#            still open at end of input (0 for balanced input).
sub parse_nested_lines {
    my @lines = @_;

    my %root;
    my @stack;             # hashes enclosing the current one
    my $node = \%root;     # hash currently receiving keys
    my $buf  = '';         # text not yet consumed by the parser

    for my $line (@lines) {
        chomp $line;
        $buf .= $line;

        # Offset just past the last *complete* item in $buf.  It must
        # start at 0 for every buffer: a stale offset from the previous
        # buffer would chop off text that has not been parsed yet.
        my $consumed = 0;

        ITEM:
        while ($buf =~ / \( \s* (\w+) \s* = /gcx) {    # item start, e.g. "(S="
            my $key = $1;

            if ($buf =~ / \G \s* (?= \( ) /gcx) {
                # "(KEY=(" opens a nested group.  The lookahead leaves
                # the "(" in place so the next iteration reads it as
                # the start of the first child item.
                my $child = {};
                _store_value($node, $key, $child);
                push @stack, $node;
                $node = $child;
            }
            elsif ($buf =~ / \G \s* ([^)]*?) \s* \) /gcx) {
                # "(KEY=VALUE)": a plain pair.  The non-greedy capture
                # lets the following \s* trim trailing whitespace.
                _store_value($node, $key, $1);
            }
            else {
                # The item continues on a later line.  Leave it in the
                # buffer (key included) and wait for more input.
                last ITEM;
            }

            # Each extra ")" closes one enclosing nested group.
            while ($buf =~ / \G \s* \) /gcx) {
                if (@stack) {
                    $node = pop @stack;
                }
                else {
                    warn "unbalanced ')' ignored\n";
                }
            }

            $consumed = pos $buf;
        }

        # Keep only the unparsed tail; assigning also resets pos($buf).
        $buf = substr $buf, $consumed;
    }

    return (\%root, scalar @stack);
}

$Data::Dumper::Sortkeys = 1;    # stable, reproducible dump order

my ($tree, $open_depth) = parse_nested_lines(<DATA>);

print Data::Dumper->Dump([$tree]);
print "stack = $open_depth\n";

# The sample below is the data from the original post with the "+"
# line-wrap markers added by the web page removed and the wrapped line
# rejoined.

__DATA__
(S=(SN=ac2.bd)
(I1=(IN=s%1)(NM=1)
(HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0)(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=kkk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk))))
(ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp))
(PPP=1)
(AA=LLI))