I just remembered that I actually have Perl code doing this. It's from a research project, so there's not much comment and it's dashed off quickly so I'm sure it can be optimized a lot.
The code is intended to generate all strings described by the CFG up to a given length. In your case, you'll probably want to include some probabilities when choosing alternatives.
Hope this helps, -gjb-
#!/usr/bin/perl
use strict;
use warnings;
use Set::Scalar;
# Maximum number of rounds in which the productions (rules) are applied.
my $maxNrRuleApplications = 9;
# Maximum length, counted in tokens, of the strings produced.
my $maxStringLength = 11;
# The productions (or rules) themselves; '.' is concatenation of symbols:
#   A -> c.A | a.B.b
#   B -> f | d.A.e
my %rules = ('A' => ['c.A', 'a.B.b'],
             'B' => ['f', 'd.A.e']);
# The root of the derivation trees.
my $startSymbol = 'A';

# The set of nonterminals contains the head of every rule.
my $nonTerminals = Set::Scalar->new(keys %rules);
my $nonTerminalSymbolsStr =
    join("|", map { quotemeta($_) } $nonTerminals->members());
# The alternation must be grouped: an ungrouped "\bA|B\b" parses as
# "\bA" or "B\b", so the word boundaries would not apply to every symbol.
my $nonTerminalRegex = qr/\b(?:$nonTerminalSymbolsStr)\b/;

# The terminals are every symbol on a right-hand side that is not a
# nonterminal.
my $terminals = Set::Scalar->new();
foreach my $rhs (values %rules) {
    foreach my $expr (@$rhs) {
        $terminals->insert(split(/\s*\.\s*/, $expr));
    }
}
$terminals = $terminals - $nonTerminals;
my $terminalSymbolsStr =
    join("|", map { quotemeta($_) } $terminals->members());
my $terminalRegex = qr/\b(?:$terminalSymbolsStr)\b/;

# %result maps a token count to the set (hash ref) of fully derived strings
# of that length.
my %result;
# $set holds the sentential forms produced by the previous round.
my $set = {$startSymbol => 1};
foreach my $round (1..$maxNrRuleApplications) {
    # Expand every form of the previous round by one application of the rules.
    my $resultSet = {};
    foreach my $member (keys %$set) {
        my $newSet = applyRules($member, \%rules, $maxStringLength);
        foreach my $newMember (keys %$newSet) {
            $resultSet->{$newMember} = 1;
        }
    }
    $set = $resultSet;

    # Forms without any nonterminal are finished strings; file them by length.
    foreach my $member (keys %$set) {
        next if $member =~ /$nonTerminalRegex/;
        $result{nrOfTokens($member)}->{$member} = 1;
    }
}

# Report: for each length, the number of strings followed by the strings.
foreach my $length (sort { $a <=> $b } keys %result) {
    print "$length (", scalar(keys %{$result{$length}}), "):", "\n",
        join("\n", sort keys %{$result{$length}}), "\n\n";
}
# Apply the productions to a sentential form and return the derivations found.
#
# For every nonterminal that has a rule, the leftmost occurrence in $expr is
# replaced by each of that rule's right-hand sides.  The text to the right of
# the occurrence is expanded recursively: when that yields derivations, each
# one is combined with the replacement; otherwise the right part is kept
# unchanged.  The text to the left of the occurrence is copied through as is.
#
# Parameters:
#   $expr            - sentential form, symbols joined by '.'
#   $rules           - hash ref: nonterminal => array ref of right-hand sides
#   $maxStringLength - optional; when defined, derivations for which
#                      nrOfTokens() exceeds this value are discarded
#
# Returns a hash ref whose keys are the derivations (all values are 1).
sub applyRules {
    my ($expr, $rules, $maxStringLength) = @_;
    my $resultSet = {};
    foreach my $nonTerminal (keys %$rules) {
        # \Q...\E: match the symbol literally, never as a regex fragment.
        next unless $expr =~ /\b\Q$nonTerminal\E\b/;

        # Take the context from the match offsets (@- and @+) rather than
        # from $` and $', which slow down every regex on older perls.
        my $left  = substr($expr, 0, $-[0]);
        my $right = substr($expr, $+[0]);

        # The expansion of the right context does not depend on which
        # alternative is chosen, so compute it once per nonterminal.
        my @tails = keys %{ applyRules($right, $rules, $maxStringLength) };
        # Nothing derivable on the right: keep the right context as is.
        @tails = ($right) unless @tails;

        foreach my $rhs (@{$rules->{$nonTerminal}}) {
            foreach my $tail (@tails) {
                my $derivation = $left.$rhs.$tail;
                next if defined $maxStringLength
                    && nrOfTokens($derivation) > $maxStringLength;
                $resultSet->{$derivation} = 1;
            }
        }
    }
    return $resultSet;
}
# Count the tokens in a '.'-separated string: one more than the number of
# dots it contains (so a string without dots counts as a single token).
sub nrOfTokens {
    my ($string) = @_;
    # tr/// with an empty replacement only counts the matching characters.
    my $dotCount = ($string =~ tr/.//);
    return $dotCount + 1;
}
Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
Read Where should I post X? if you're not absolutely sure you're posting in the right place.
Please read these before you post! —
Posts may use any of the Perl Monks Approved HTML tags:
- a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
| For: | | Use: |
| & | | &amp;amp; |
| < | | &amp;lt; |
| > | | &amp;gt; |
| [ | | &amp;#91; |
| ] | | &amp;#93; |
Link using PerlMonks shortcuts! What shortcuts can I use for linking?
See Writeup Formatting Tips and other pages linked from there for more info.
|
|