I just remembered that I actually have Perl code doing this. It's from a research project, so there's not much comment and it's dashed off quickly so I'm sure it can be optimized a lot.
The code is intended to generate all strings described by the CFG upto a given length. In your case, you'll probably want to include some probabilities when choosing alterantives.
#!/usr/bin/perl
use strict;
use warnings;
use Set::Scalar;
# maximum number of times productions (or rules) are applied
my $maxNrRuleApplications = 9;
# maximum length of the strings produced
my $maxStringLength = 11;
# the productions (or rules themselves, . is concatenation of
# symbols
# A -> c.A | a.B.b
# B -> f | d.A.e
my %rules = ('A' => ['c.A', 'a.B.b'],
'B' => ['f', 'd.A.e']);
# the root of the derivation trees
my $startSymbol = 'A';
# the set of terminals contains the head of all rules
my $nonTerminals = Set::Scalar->new(keys %rules);
my $nonTerminalSymbolsStr = join("|", $nonTerminals->members());
my $nonTerminalRegex = qr/\b$nonTerminalSymbolsStr\b/;
# terminals is everything that is not a nonterminal
my $terminals = Set::Scalar->new();
foreach my $rhs (values %rules) {
foreach my $expr (@$rhs) {
$terminals->insert(split(/\s*\.\s*/, $expr));
}
}
$terminals = $terminals - $nonTerminals;
my $terminalSymbolsStr = join("|", $terminals->members());
my $terminalRegex = qr/\b$terminalSymbolsStr\b/;
my %result;
my $set = {$startSymbol => 1};
foreach (1..$maxNrRuleApplications) {
my $resultSet = {};
foreach my $member (keys %$set) {
my $newSet = applyRules($member, \%rules, $maxStringLength);
foreach my $newMember (keys %$newSet) {
$resultSet->{$newMember} = 1;
}
}
$set = $resultSet;
foreach my $member (keys %$set) {
if ($member !~ /$nonTerminalRegex/) {
my $length = nrOfTokens($member);
if (!exists $result{$length}) {
$result{$length} = {};
}
$result{$length}->{$member} = 1;
}
}
}
foreach my $length (sort { $a <=> $b } keys %result) {
print "$length (", scalar(keys %{$result{$length}}), "):", "\n",
join("\n", sort keys %{$result{$length}}), "\n\n";
}
sub applyRules {
my $expr = shift(@_);
my $rules = shift(@_);
my $maxStringLength = shift(@_);
my $resultSet = {};
foreach my $nonTerminal (keys %$rules) {
if ($expr =~ /\b($nonTerminal)\b/) {
my $left = $`;
my $right = "$'";
foreach my $rhs (@{$rules->{$nonTerminal}}) {
my $rightResultSet = applyRules($right, $rules,
$maxStringLength);
if (scalar(keys %$rightResultSet) > 0) {
foreach my $rightResult (keys %$rightResultSet) {
my $derivation = $left.$rhs.$rightResult;
if (defined $maxStringLength) {
if (nrOfTokens($derivation)
<= $maxStringLength) {
$resultSet->{$derivation} = 1;;
}
} else {
$resultSet->{$derivation} = 1;
}
}
} else {
my $derivation = $left.$rhs.$right;
if (defined $maxStringLength) {
if (nrOfTokens($derivation)
<= $maxStringLength) {
$resultSet->{$derivation} = 1;
}
} else {
$resultSet->{$derivation} = 1;
}
}
}
}
}
return $resultSet;
}
sub nrOfTokens {
my $string = shift(@_);
my @matches = ($string =~ m/\./g);
return 1 + scalar(@matches);
}