#!/usr/bin/perl use warnings; use strict; use Data::Dump::Streamer 'Dumper', 'Dump'; use List::Util 'max'; $|++; my $DEBUG = 1; # $x -- virtual line number. # $y -- position along the virtual line of the leftmost point in the physical line. my ($x, $y)=(0, 0); # The position along the virtual line of the last EOL-continuation. my $y_saved; # The line that has the start/end markers on it. my $mainline; # NOTE: (y, x) order, because it makes it easier to iterate over. my @info; while (<>) { chomp; s/\cM//; # Deal with DOSish newlines. print "Init : $_\n"; next if /^#/; if ($_ =~ /^\s*$/) { print "Empty line\n"; $x=0; if (not defined $y_saved) { # warn "Blank line with no continuation"; } $y=$y_saved; $y_saved=undef; next; } if (not defined $y) { warn "Nonempty line at undefined location?"; warn "$_\n"; last; } s(^>>){ my $y=$y+pos($_); $mainline=$x; $info[$x][$y] = ['START', $x, $y]; '--'; }gex; s(^>){ if ($x != $mainline) { print "???: $_\n" if $DEBUG; die "Line mismatch at >--. x=$x, mainline=$mainline"; } # if (not defined $y_saved) { # die ">- with no matching ->"; # } # $y=$y_saved; '-'; }gex; print "($x, $y): $_\n" if $DEBUG; s(>$){ $y_saved=$y+pos($_); '-'; }gex; s(><$){ my $y=$y+pos($_); $info[$x][$y] = ['END', $x, $y]; '--'; }gex; # Quoted literals. Parse these early, but not before we've # determined our position. s( \xA0(.*?)\xA0 ){ my $y=$y+pos($_); # print "Normal literal: $1\n"; $info[$x][$y] = ['LITERAL', $x, $y, $1]; '-' x (length($1)+2); }gex; # s(V([ ]*)\|){ # my $y=$y+pos($_); # $info[$x][$y] = ['REPEAT', $x, $y, length($1)]; # }gex; # # Ugly as fuck, but neccessary -- check if there's a repeat on the # # previous line at this position, and take care of the + at it's # # right-hand-side, to avoid the choice processor becomming confused. # if ($info[$x-1][$y][0] eq 'REPEAT') { # substr($_, # } # 1 while because it's vital that we get all of these before # continuing, and they can nest. # Collect all possible combinations (eg 1st '+' to 2nd '+' AND 1st '+' # to 3rd '+' AND 2nd '+' to 3rd etc, where contents are not \s # and throw away the ones that dont make sense later? 1 while (s( ([+\'.]) ([^\s]*) \1 ) { my $inside = $2; my $y=$y+pos($_); $info[$x][$y] = ['CHOICE', $x, $y, length($inside)]; # print "Choice, inside=$inside\n"; if ($inside =~ /^-+$/) { # print "Emptystring choice\n"; $info[$x][$y+1] = ['LITERAL', $x, $y+1, '']; } '-' . $inside . '-'; }gex); # Quoted variables. These are actually "blocks" that are supposed to be # defined later in the same file, but we treat them the same. s(\| ([^ |]*) \|){ my $y=$y+pos($_); $info[$x][$y] = ['VARIABLE', $x, $y, $1]; '--'.('-' x length($1)).'--'; }ge; # Footnotes. I'm really not sure how to deal with them, but parse them # here to prevent them showing up later as literals and parse failures. # Note that we can't use () delimiters here, or \) is just an end-paren, # not a literal end-paren. s{\((\d+)\)}{ my $y=$y+pos($_); $info[$x][$y] = ['FOOTNOTE', $x, $y, $1]; '-' x (length($1)+2); }gex; # Thanks, [tye], for the regex. # We include . and * as valid chars for the inside of variables, because of # "exposed-name.*", in select-clause. Uck. # : is used as part of a variable in ch2host, but only at the beginning, so # religate it to that position until we see more examples. # More examples: psmbod s( ([a-z:0-9\[\]]+(?:-[:a-z.*0-9\[\]]+)*) ){ my $y=$y+pos($_); $info[$x][$y] = ['VARIABLE', $x, $y, $1]; '-' x length ($1) }gex; # Variable then literal, since both can include *s. # = -- psmasn s( ([A-Z*,()_=]+) ){ my $y=$y+pos($_); # print "Normal literal: $1\n"; $info[$x][$y] = ['LITERAL', $x, $y, $1]; '-' x length($1); }gex; die "Unparsed bits? Final: $_" if (m/[^- |]/); # print "Final: $_\n"; $x++; } exit if !@info; Dump \@info if $DEBUG>=2; sub item2atom { my $item = shift; my $linenumber = shift; my $world = shift; return '' if !$item; my $type = $item->[0]; die "Line number passed wrong $linenumber / $item->[1]" if $linenumber != $item->[1]; # print "item2atom(".Dumper($item).")"; if ($type eq 'START' or $type eq 'END') { return ''; } elsif ($type eq 'LITERAL') { return '"' . quotemeta($item->[3]) . '"'; } elsif ($type eq 'CHOICE') { my @subrules; # Fixme: Should look up and down from the source line, until it finds a position # which is not part of the choice. # But watch out for: # ---+--A--+--- # | | # '--B--' my $choicesfound=0; foreach my $linenumber (0..$#{$world}) { my $line = $world->[$linenumber]; print "Choice, checking $linenumber\n" if $DEBUG>=2; # Keep going unless this line also begins a choice here. next if not defined $line->[$item->[2]]; next unless $line->[$item->[2]][0] eq 'CHOICE'; # Make sure it has the same length. next unless $line->[$item->[2]][3] == $item->[3]; print "$linenumber is choice, following...\n" if $DEBUG>=2; $choicesfound++; push @subrules, line2rule($linenumber, $item->[2]+1, $item->[3]-1, $world); # Replace bit we just iterated over with an equal number of # undefs, so we don't see it twice. splice(@$line, $item->[2]+1, $item->[3]-1, (undef)x($item->[3]-1)); } if ($choicesfound < 2) { warn "Choice with one ($choicesfound) option at ($item->[1], $item->[2])"; $world->[$linenumber]->[$item->[2]] = undef; } return "(".join(' | ', @subrules).")"; } elsif ($type eq 'VARIABLE') { return $item->[3]; } else { die "Unknown item type: $type" . Dumper($item); } } sub line2rule { my ($linenumber, $colnumber, $length, $world) = @_; my @result; for my $i ( $colnumber .. $colnumber+$length-1 ) { # print "line2rule: ", Dumper $world->[$linenumber][$i]; my $result = item2atom($world->[$linenumber][$i], $linenumber, $world); next unless $result; push @result, $result; } return join ' ', @result; } # print "Mainline: $mainline\n"; print line2rule($mainline, 0, 0+@{$info[$mainline]}, \@info), "\n";