There are two parts. In the first one, there are very simple "tags", with a key and a value. Piece of cake. The second part is the game, which you can take as a whole and consider your job done, as the CPAN module does, or you can read the Standard and extract the information that I need to feed a database with.[Event "Botvinnik Memorial"] [Site "Moscow"] [Date "2001.12.05"] [Round "4"] [White "Kasparov, Garry"] [Black "Kramnik, Vladimir"] [Result "1/2-1/2"] [ECO "C80"] [WhiteElo "2839"] [BlackElo "2808"] [PlyCount "37"] [EventDate "2001.12.01"] 1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 $1 {first comment} 4. Ba4 Nf6 5. O-O Nxe4 {second comment} 6. d4 ; comment starting with ";" up to EOL b5 7. Bb3 d5 8. dxe5 Be6 9. Be3 {third comment} 9... Bc5 10. Qd3 O-O 11. Nc3 Nb4 (11... Bxe3 12. Qxe3 Nxc3 13. Qxc3 Qd7 14. Rad1 Nd8 $1 15. Nd4 c6 $14 (15... Nb7 16. Qc6 $1 $16)) 12. Qe2 Nxc3 13. bxc3 Bxe3 % escaped line - it will be discarded up to the EOL 14. Qxe3 Nc6 {wrong } comment} 15. a4 Na5 oh? 16. axb5 {yet another comment} (16. Nd4 {nested comment}) 16... axb5 17. Nd4 (17. Qc5 c6 18. + Nd4 Ra6 19. f4 g6 20. Ra3 Qd7 21. Rfa1 Rfa8) 17... Qe8 18. f4 c5 19. Nxe6 the end 1/2-1/2
Notice that the moves are stored into an array, and the comments recorded with reference to the move where they belonged ('3b' = move 3, black; '6w' = move 6, white, and so on) so that the game, once stored, can be easily recreated.$game = { 'Event' => 'Botvinnik Memorial', 'Site' => 'Moscow', 'Round' => '4', 'White' => 'Kasparov, Garry', 'Black' => 'Kramnik, Vladimir', 'Date' => '2001.12.05', 'Result' => '1/2-1/2', 'WhiteElo' => '2839', 'BlackElo' => '2808', 'ECO' => 'C80', 'PlyCount' => '37', 'EventDate' => '2001.12.01', GameMoves => [ qw( e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 b5 Bb3 d +5 dxe5 Be6 Be3 Bc5 Qd3 O-O Nc3 Nb4 Qe2 Nxc3 bxc3 Bxe3 Qxe3 Nc6 a4 Na5 axb5 axb5 Nd4 Qe8 f4 c5 Nxe6) ], 'GameComments' => { '3b' => ' $1 {first comment}', '5b' => ' {second comment}', '6w' => ' ; comment starting with ";" up to EOL ', '9w' => ' {third comment}', '11b' => ' (11... Bxe3 12. Qxe3 Nxc3 13. Qxc3 Qd7 14. Rad1 Nd8 $1 + 15. Nd4 c6 $14 (15... Nb7 16. Qc6 $1 $16))', '14b' => ' {wrong }', '16w' => ' {yet another comment} (16. Nd4 {nested comment})', '17w' => ' (17. Qc5 c6 18. Nd4 Ra6 19. f4 g6 20. Ra3 Qd7 21. Rfa1 Rfa8)' }, 'GameErrors' => { '14b' => 'comment}', '15b' => 'oh?', '19w' => 'the end' } }
Here $REnumber and $REmove are regular expressions describin how a number and a chess move should be (see below), while _find_end_symbol(), _find_EOL() and _find_chunk() are functions that scan the char array for the desired portion of text, and return the text or a failure code (I didn't include here for simplicity.)my %switchcolor = ( 'w' => 'b', 'b' => 'w'); my @gamechars = split //, $gametext; for ($position =0; $position < scalar @gamechars; ) { if ($gamechars[$position] =~ /([\(\[\{])/) { $end_comment = _find_end_symbol(\@gamechars, $position, $1) +; $comment = substr($gametext, $position, $end_comment - $position +1); $game{comments}->{$movecount.$color} .= " " . $comment; $position = $end_comment + 1; } elsif ($gamechars[$position] =~ /([;%])/) { $end_comment = _find_EOL(\@gamechars, $position); # store comment as above } elsif (($gamechars[$position] =~ /[1-9]/) { $num = _find_chunk (\@gamechars, $position, $REnumber); $movecount++; $position += length($num); } elsif ($gamechars[$position] =~ /([OKQRBNa-h])/) { $move = _find_chunk(\@gamechars, $position, $REmove); $color = $switchcolor{$color}; $position += length($move); } else { $position++; # store into errors unless /\n|\s/; } } sub _find_chunck (\@array $pos $regexp ){ # returns a portion of @array starting from $pos # matching $regexp } sub _find_EOL (\@array $pos){ # returns the nearest EOL from @array starting from $pos # } sub _find_end_symbol (\@array $pos $symbol ){ # returns the position of the closing symbol # "{" => "}", "(" =>")", "[" => "]" # in \@array, starting at $pos }
It was the key to the real solution. Previously, I had only tried something likewhile (<>) { chomp; PARSER: { m/ \G( \d+\b )/gcx && do { print "number: $1\n"; redo; }; m/ \G( \w+ )/gcx && do { print "word: $1\n"; redo; }; m/ \G( \s+ )/gcx && do { print "space: $1\n"; redo; }; m/ \G( [^\w\d]+ )/gcx && do { print "other: $1\n"; redo; }; } }
And I could catch every piece of the text, but without being able to tell which one I got!.while (m/$REmove|$REnumber|$REcomment|$REeolcomment/g) { print $1; }
This second parser is much more "perlish" than the previous one. It's 240 lines shorter and slightly faster. It will produce the same data structure as the previous one, dealing with comments and errors in a very efficient way, at the price of using an external module (thanks to TheDamian for Regexp::Common).package PGNParser; # temporary name use FileHandle; use Regexp::Common; sub new { my $class = shift; my $filename = shift || return undef; my $fh = new FileHandle "< $filename"; unless (defined $fh) { return undef } my $self = bless { GameMoves =>[], # game moves GameComments =>{}, # comments with reference to the move gamedescr => {}, # will contain the PGN tags GameErrors => {}, # will contain the parsing errors fh => \$fh # filehandle to the PGN file }, $class; return $self; } sub read_game{ # will read the game from a PGN file # after this, the game text will be in $self->{gamedescr}{Game} } my $REresult = qr{(?:1\-0|0\-1|1\/2\-1\/2|\*)}; my $REmove = qr{[KQRBN]?[a-h]?[1-8]?x?[a-h][1-8](?:\=[QRBN])?}; my $REcastling = qr/O\-O(?:\-O)?/; my $REcheck = qr/(?:(?:\#|\+(\+)?))?/; my $REanymove = qr/(?:$REmove|$REcastling)$REcheck/; my $RENAG = qr/\$\d+/; my $REnumber = qr/\d+\.(?:\.\.)?/; my $REescape = qr/^\%[^\n]*\n/; my $REeolcomment= qr/;.*$/; my $REcomment = $RE{balanced}{-parens=>'{}'}; my $RERAV = $RE{balanced}{-parens=>'()'}; my %switchcolor = ('w' => 'b', 'b' => 'w'); sub parse_game { my $self = shift; return undef unless $self->{gamedescr}{Game}; my $movecount = 0; my $color = 'b'; $self->{gamedescr}{Game} =~ s/$REresult\s*\Z//o; PARSER: { $self->{gamedescr}{Game} =~ m/\G($REnumber)\s*/mgc && do { my $num=$1; if (( $num =~ tr/\.//d) > 1) { $color = 'w'; } if ($movecount == 0) { $movecount = $num; } elsif ($movecount == ($num -1)) { $movecount++; } elsif ($movecount != $num) { $self->{GameErrors}->{$movecount.$color} .= " invalid move sequence ($num <=> $movecount)"; $movecount++; } redo }; $self->{gamedescr}{Game} =~ m/\G($REanymove)\s*/mgc && do { push @{$self->{GameMoves}}, $1; $color = $switchcolor{$color}; redo }; $self->{gamedescr}{Game} =~ m/\G($REcomment|$REeolcomment|$RERAV|$RENAG|$REescape)\s*/mg +c && do { $self->{GameComments}->{$movecount.$color} .= " " . $1; $self->{GameComments}->{$movecount.$color} =~ tr/\r//d; $self->{GameComments}->{$movecount.$color} =~ s/\n/ /g; redo }; $self->{gamedescr}{Game} =~ m/\G(\S+\s*)/mgc && do { $self->{GameErrors}->{$movecount.$color} .= " " . $1; $self->{GameErrors}->{$movecount.$color} =~ tr/\r//d; $self->{GameErrors}->{$movecount.$color} =~ s/\n/ /g; redo }; } }
_ _ _ _ (_|| | |(_|>< _|
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: A demanding parser
by blakem (Monsignor) on Jan 25, 2002 at 15:14 UTC | |
Re: A demanding parser
by Juerd (Abbot) on Jan 25, 2002 at 16:43 UTC | |
by TheDamian (Priest) on Jan 25, 2002 at 17:12 UTC | |
by Juerd (Abbot) on Jan 25, 2002 at 20:34 UTC | |
by TheDamian (Priest) on Jan 27, 2002 at 15:14 UTC | |
by gmax (Abbot) on Jan 31, 2002 at 06:59 UTC | |
by gmax (Abbot) on Jan 26, 2002 at 18:56 UTC | |
by Aristotle (Chancellor) on Jan 25, 2002 at 17:07 UTC | |
by Juerd (Abbot) on Jan 25, 2002 at 20:31 UTC | |
(tye)Re: A demanding parser
by tye (Sage) on Jan 26, 2002 at 00:11 UTC |