Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW

Help with regular expression

by ng (Initiate)
on Oct 10, 2012 at 16:43 UTC ( #998258=perlquestion: print w/replies, xml ) Need Help??
ng has asked for the wisdom of the Perl Monks concerning the following question:

I have multiple lines of the following text that i need split and extract data

(S=( (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI))

For each, i need to retrieve values of PAD/SED/PPP. How do i split them? Any ideas?

Replies are listed 'Best First'.
Re: Help with regular expression
by Corion (Pope) on Oct 10, 2012 at 16:48 UTC

    I would fake it by replacing the = sign with . (dot) and then using Data::SExpression or something to turn the strings into Perl expressions.

    If faking is too much effort, then Parse::RecDescent or some other parser can also easily turn the given string into a data structure that then can be examined.

      Thanks for your reponse. I don't have any of these modules installed in my perl and i can't use them.
Re: Help with regular expression
by choroba (Chancellor) on Oct 10, 2012 at 23:25 UTC
    Parsing from leaves to root using regular expressions:
    #!/usr/bin/perl use Data::Dumper; use warnings; use strict; undef $/; my $input = <DATA>; $input =~ s/[ \n]//g; my %tree; while ($input =~ /[()]/) { if (my ($parent, $son) = $input =~ /\(([^()]+)=\(([^()]+)\)/) { my ($name, $value) = split /=/, $son; if (length $value) { push @{ $tree{$parent} }, {$name => $value}; } else { push @{ $tree{$parent} }, { $name => $tree{$name} }; delete $tree{$name}; } $input =~ s/\($son\)//; } elsif (my ($root, $value) = $input =~ /\(([^()]+)=([^()]+)\)/) { $tree{$root} = $value; $input =~ s/\($root=$value\)//; } else { die "Invalid input\n" unless $input =~ /^\([^()]+=\)$/; last; } } if (keys %tree > 1) { die "More than one root\n"; } print Dumper \%tree; __DATA__ (S=( (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI))
    لսႽ ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ

      Whoa, modifying and matching the same string? That can get tricky :) You appear to collapse whitespace ( "REM SSS" becomes "REMSSS" )

      Here is a familiar pattern I've used before, jazzed up with perlfaq6#What good is \G in a regular expression?

      #!/usr/bin/perl -- use strict; use warnings; use Data::Dump; Main( @ARGV ); exit( 0 ); sub Main { my $pp = q{ (S= ( (I1= (IN=s%1) (NM=1) (HL=(HLD=kkk kjkjk) (ST=abdc) (HI=REM SSS) (H_M=9) (HL=72) (EB=0) (ER=0) (HI=E043-93A-DF0-0AB63E) (PE=aaa) (HN=DEE) (SS=NS) (SED= ( APR=(PAD=kkk) (PN=9905) (HH=llkjk) ) (DD=(LLL=kkk)) ) ) (ppp=1) (RAW=kkk) (DN=kkk) (RIN=ppp) ) (PPP=1) (AA=LLI) )}; dd blahs2aoa(""); dd blahs2aoa( $pp ); } #~ What good is \G in a regular expression? #~\G-in-a-regular- +expression%3f sub blahs2aoa { my $curr_stack = my $root = [] ; my $prev_stack; local $_ = $_[0]; pos = 0; while( length > pos ){ m/\G\s++/gcx and next; m/\G\s++/gcx and next; # ignore space "greedily" # ignore space without backtracki +ng # Match 1 or more times and give +nothing back # ?? ignore space while ratchetin +g # ?? ratchet and ignore space m/\G=/gcx and next; # ignore equals m/\G\(/gcx and do { # open push @$curr_stack, [] ; push @$prev_stack, $curr_stack; $curr_stack = $$curr_stack[-1]; next; }; m/\G\)/gcx and do { # close if( @$prev_stack ){ $curr_stack = pop @$prev_stack; } else { warn join ' ', "error extra ) close at pos ", pos, "\n +"; } next; }; m/\G([^\(\)=]++)/gcx and do { # key or value push @$curr_stack, $1; next; }; } if( $prev_stack and @$prev_stack ){ warn "Trouble!\nprev_stack ", int @$prev_stack , "\n", 'curr_stack ', int @$curr_stack , "\n", Data::Dump::pp({ prev => $prev_stack, curr => $curr_stack }), +"\n"; } return $root; } __END__ [] [ [ "S", ["SN", ""], [ "I1", ["IN", "s%1"], ["NM", 1], [ "HL", ["HLD", "kkk kjkjk"], ["ST", "abdc"], ["HI", "REM SSS"], ["H_M", 9], ["HL", 72], ["EB", 0], ["ER", 0], ["HI", "E043-93A-DF0-0AB63E"], ["PE", "aaa"], ["HN", "DEE"], ["SS", "NS"], [ "SED", ["APR", ["PAD", "kkk"], ["PN", 9905], ["HH", "llkjk"]], ["DD", ["LLL", "kkk"]], ], ], ["ppp", 1], ["RAW", "kkk"], ["DN", "kkk"], ["RIN", "ppp"], ], ["PPP", 1], ["AA", "LLI"], ], ]
Re: Help with regular expression
by Anonymous Monk on Oct 10, 2012 at 22:29 UTC


    #!/usr/bin/perl -- use strict; use warnings; use Data::Dump; use Safe; Main( @ARGV ); exit( 0 ); sub Main { my $pp = q{ (S=( (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI)) }; dd blahs2aoa( $pp ); } BEGIN { my %reps = ( '(' => '[ ', ')' => " ], ", '=' => ' "=" ,', '' => '', ); sub blahs2aoa { my( $j ) = @_; no warnings 'uninitialized'; $j =~ s{ ( [\(\)=] ) | ( \s+ ) | ( [^\(\)=]+ ) }{ $reps{$1} || $2 || "'$3'," }gsex; return Safe->new->reval( $j ); } } __END__ [ "S", "=", ["SN", "=", ""], [ "I1", "=", ["IN", "=", "s%1"], ["NM", "=", 1], [ "HL", "=", ["HLD", "=", "kkk kjkjk"], ["ST", "=", "abdc"], ["HI", "=", "REM SSS"], ["H_M", "=", 9], ["HL", "=", 72], ["EB", "=", 0], ["ER", "=", 0], ["HI", "=", "E043-93A-DF0-0AB63E"], ["PE", "=", "aaa"], ["HN", "=", "DEE"], ["SS", "=", "NS"], [ "SED", "=", [ "APR", "=", ["PAD", "=", "kkk"], ["PN", "=", 9905], ["HH", "=", "llkjk"], ], ["DD", "=", ["LLL", "=", "kkk"]], ], ], ["ppp", "=", 1], ["RAW", "=", "kkk"], ["DN", "=", "kkk"], ["RIN", "=", "ppp"], ], ["PPP", "=", 1], ["AA", "=", "LLI"], ]
Re: Help with regular expression
by ELISHEVA (Prior) on Oct 11, 2012 at 13:38 UTC

    Simple nested language structures can be easily parsed with a loop and a stack. Whenever you encounter the character sequence that marks the start of the nested language/data, you add to the stack. Whenever you encounter the character sequence that marks the end of the nested language/data you pop the stack. It looks something like this:

    Note: the /\G..../gc idiom means "start matching where we left off and reset \G to the character after the match". \G means "where we left off". For example qr(\Ga) would require there to be an "a" right where we left off whereas qr(a) would look for the first "a" any place after we left off, even a 1000 characters later.

    use strict; use warnings; use Data::Dumper; my %hData; my @stack; my $h=\%hData; my $buf = ''; my $iPos=0; while (my $line = <DATA>) { chomp $line; $buf .= $line; #print STDERR "<$buf>\n"; while ( $buf =~ /\s*\(\s*(\w+)\s*=/g) { #get start, e.g. (S= my $k = $1; #print STDERR "k=$k stack=" . @stack ." pos=". pos($buf) . "\n"; # decide if what comes after start is nested data (S=(... # or a key value pair (S=V) if ( $buf =~ /\G\s*\(/gc) { #print STDERR "nested data: pushing stack\n"; # we have nested data! push @stack, $h; $h = $h->{$k} = {}; # position to just before the ( so we can read in the # next item. pos($buf) = pos($buf) - 1; } elsif ($buf =~ /\G\s*([^)]*)\s*(\))/gc) { # we have a key value pair, so add it to the hash # Note: in case there are two values for a key, store # values in an array my $v = $1; if (exists $h->{$k}) { if ( ref($h->{$k}) eq 'ARRAY') { push @{$h->{$k}}, $v; } else { $h->{$k} = [ $h->{$k}, $v ]; } } else { $h->{$k} = $v; } } # look for extra closing ) that signal the end of nested data while ( $buf =~ /\G\s*\)/gc ) { #print STDERR "end of nested data: popping stack\n"; $h = pop @stack; } # store the position so we can add what is left to the next # parse buffer if the regex above fails an pos is reset to 0. $iPos=pos($buf); } # get the unparsed tail. $buf = $iPos < length($buf) ? substr($buf, $iPos) : ''; } print Data::Dumper->Dump([\%hData]); print "stack = " . @stack . "\n"; __DATA__ (S=( (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI))

      Missed you! Good to see you again!

        Thanks! It's good to be back "home".

      Very elegant solution. I am still a baby in Perl, and I don`t know how to build more ADTs with trees and lists so I couldn`t figure it out... But the stack idea just slipped away. I am still a noob, I guess :). Good suggestion, btw.

      Exactly what i needed. Thank you so much.

        I have another question...for the same input

        (S=(SN=a1) (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=st1)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI) (SN=a2) (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=st2)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=PPP))

        How do i retrieve the value of "AA" or "ST" For each (SN) above? Thank you

Re: Help with regular expression
by choroba (Chancellor) on Oct 11, 2012 at 09:59 UTC
    A solution using the Marpa::XS parser: Update: Hashes cannot be used easily, because the "key" can be repeated (as in HI). Switched to arrays.
    Update 2: Simplified the code.
    لսႽ ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ
Re: Help with regular expression
by Kenosis (Priest) on Oct 10, 2012 at 20:29 UTC

    It may help if you show the exact output you want from your data. Also, what do you mean by "...multiple lines of the following text..."? Is your example one of many records in a file from which you want to extract values?

      Yes, i have multiple records of each of these in a file. (S=( (I1=(IN=s%1)(NM=1) (HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0) +(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=k +kk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk)))) (ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp)) (PPP=1) (AA=LLI)) I want output similar to a tree structure.
Re: Help with regular expression
by heatblazer (Scribe) on Oct 11, 2012 at 07:56 UTC

    Hello, I have a similar soluttion mentioned here by simply substitutin all brackets with a whitespace then split everything by whitespaces.

    #!/usr/bin/perl -w use strict; $ARGV[0] = ""; open FH, $ARGV[0] or die "Can`t open it: $!\n"; my @parsed = grep { s/[()]/ /igsx } <FH>; close FH; foreach (@parsed) { print $_, "\n"; } #some testing outputs my @vars=(); foreach (@parsed) { push @vars, split(" ", $_); } foreach (@vars) { print $_, "\n"; }

    It just parsed them with the corresponding vars, but you may need more and more parsing for youur desires.

    Tree structure will be a good solution for that task.

      You lose the structure. Also, it does not work for values containig whitespace, as in HLD.
      لսႽ ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ

        Point taken. I am still a learner and I try to catch up with you guys, I`ve just beat Lama book and I am going to Alpaca, so I didn`t know how to make C`s arrays of arrays or hashes with arrays, so building a real tree wasn`t possible for me yet. However one of the users offered a stack, something I did not come up with, and quite easy one in Perl... More practice I guess.

Re: Help with regular expression
by ambrus (Abbot) on Oct 11, 2012 at 10:50 UTC

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://998258]
Approved by Corion
Front-paged by Tanktalus
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others chanting in the Monastery: (7)
As of 2017-05-26 17:59 GMT
Find Nodes?
    Voting Booth?