# Matches one XML-like element, optionally surrounded by whitespace:
# either a self-closing tag or an opening/closing pair whose names agree.
# Only the start of the input is anchored (^); :i ignores case.
rule xml :i {
^
\s*
[
# Single (self-closing) tags like <tag/>
\<
\s*
<[a-z:]>+
# Zero or more attributes of the form name = quoted-value.
# NOTE(review): this alternative groups with the Perl 5 form (?: ... ),
# while the paired-tag alternative below groups with [ ... ] -- confirm
# which spelling is intended.
(?:
\s*<[a-z:]>*
\s* = \s*
# NOTE(review): the caret inside <[...]> reads as Perl 5 style negation;
# other rules in this file write a negated class as <-[...]> -- confirm.
(?:' <[^']>* ' | " <[^"]>* ")
)*
\s*
/\s*
\>
|
# Tags in pairs like <tag>content</tag>
\<
\s*
# Bind the tag name so that the closing tag below has to repeat it.
$1 := (<[a-z:]>+)
[
\s*<[a-z:]>*
\s* = \s*
[ ' <[^']>* ' | " <[^"]>* " ]
]*
\s*
\>
# Element content: runs of characters other than < and >.
# NOTE(review): the second alternative is empty -- presumably something
# (nested markup?) was lost here; TODO confirm.
[ <[^<>]>* | ]*
\< \s* / \s* $1 \s* \>
]
\s*
}
####
document ::= prolog element Misc*
Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
S ::= (#x20 | #x9 | #xD | #xA)+
etcetera, etcetera
##
##
# document ::= prolog element Misc*
rule document { <prolog> <element> <Misc>* }
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# \xE000-\xFFFD is a range; listing the two end points without the dash
# would admit only those two characters.
rule Char { <[\x9\xA\xD\x20-\xD7FF\xE000-\xFFFD\x10000-\x10FFFF]> }
# White space: one or more of tab, line feed, carriage return or space.
rule S { <[\x9\xA\xD\x20]>+ }
# NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
rule NameChar { <Letter> | <Digit> | <[._:-]> | <CombiningChar> | <Extender> }
# Name ::= (Letter | '_' | ':') (NameChar)*
rule Name { [ <Letter> | <[_:]> ] <NameChar>* }
# Names ::= Name (S Name)*
rule Names { <Name> [ <S> <Name> ]* }
# Nmtoken ::= (NameChar)+
rule Nmtoken { <NameChar>+ }
# Nmtokens ::= Nmtoken (S Nmtoken)*
rule Nmtokens { <Nmtoken> [ <S> <Nmtoken> ]* }
etcetera, etcetera
##
##
#!/usr/bin/perl -w
# Convert the grammar productions found in an XML document into Perl 6
# "rule" declarations, printed on STDOUT.
#
# Input:  the file 'rec' in the current directory (see the URL below for
#         the document it is meant to hold), parsed with XML::TreeBuilder.
# Output: for every <prod> element, "rule NAME {", the translated contents
#         of its <rhs> elements, and a closing "}".
use strict;
use warnings;
use XML::TreeBuilder;

my $tree = XML::TreeBuilder->new;
$tree->parse_file('rec');
# http://www.w3.org/TR/2000/REC-xml-20001006.xml

PROD: for my $prod ($tree->look_down(_tag => 'prod')) {
    my $lhs = $prod->look_down(_tag => 'lhs');
    if (not $lhs) {
        # Without a name there is nothing to declare; report instead of
        # dying on a method call on undef.
        warn "skipping <prod> without <lhs>\n";
        next PROD;
    }

    print 'rule ';
    print $lhs->content_list;
    print " {\n ";

    # $not remembers the value of $p at which a '-' was seen (0 = none).
    # NOTE(review): $p is never modified, so a pending '-' is always closed
    # by the very next term; presumably $p was meant to track nesting depth.
    my $not = 0;
    my $p   = 1;

    # Emit the '>' that closes a pending '-' construct, if one is open.
    my $close_exclusion = sub {
        return if not $not or $p != $not;
        $not = 0;
        print '>';
        return;
    };

    NODE: for my $node (map { $_->content_list } $prod->look_down(_tag => 'rhs')) {

        # Child elements: only <nt> (non-terminal) references produce output.
        if (ref $node) {
            if (my $nt = $node->look_down(_tag => 'nt')) {
                print '<', $nt->content_list, '>';
            }
            $close_exclusion->();
            next NODE;
        }

        # Plain text: normalise white space once, then consume it token by
        # token from the front until nothing is left.
        my $text = $node;
        $text =~ s/\xC2\xA0/ /g;    # Non breaking space
        $text =~ s/\s+/ /g;

        TOKEN: while (length $text) {

            # A bare S in the text.  Tokenising continues afterwards so the
            # remainder of the text node is not silently discarded.
            if ($text =~ s/^S//) {
                # NOTE(review): prints the empty string; presumably this was
                # meant to emit a reference to the S rule -- confirm.
                print '';
                $close_exclusion->();
                next TOKEN;
            }

            # Quoted literal.  Anything containing a non-word character is
            # wrapped in <'...'>, because characters such as # ( ) [ ] * + ? |
            # are metasyntax inside a rule and must not be emitted raw.
            if ($text =~ s/^'([^']*)'// or $text =~ s/^"([^"]*)"//) {
                my $literal = $1;
                if ($literal =~ /\W/) {
                    $literal =~ s/\\/\\\\/g;
                    $literal =~ s/'/\\'/g;
                    print "<'$literal'>";
                }
                else {
                    print $literal;
                }
                $close_exclusion->();
                next TOKEN;
            }

            # Character class: [...] becomes <[...]>, [^...] becomes <-[...]>.
            if ($text =~ s/^\[([^]]*)\]//) {
                my $class = $1;
                $class =~ s/#x/\\x/g;
                my $neg = ($class =~ s/^\^//) ? '-' : '';
                print "<$neg\[$class]>";
                $close_exclusion->();
                next TOKEN;
            }

            # Single character given by code point: #xNN becomes \xNN.
            if ($text =~ s/^#x([0-9A-Fa-f]+)//) {
                print "\\x$1";
                $close_exclusion->();
                next TOKEN;
            }

            # '-' (exclusion): remember that a closing '>' is owed.
            if ($text =~ s/^-//) {
                $not = $p;
                # NOTE(review): prints the empty string; presumably the
                # opening of the construct that the later '>' closes was
                # meant to be printed here -- confirm.
                print '' if $not and $p == $not;
                next TOKEN;
            }

            # Operators, blanks and grouping.  The class includes ( and ) so
            # that the translation to [ and ] below can actually take effect.
            if ($text =~ s/^([|*+?() ]+)//) {
                (my $ops = $1) =~ tr/()/[]/;
                print $ops;
                next TOKEN;
            }

            # Nothing matched: report the leftover in the output and give up
            # on this text node.
            print "ERROR: $text\n";
            last TOKEN;
        }
    }

    print "\n}\n\n";
}

$tree->delete;
##
##
# Perl 6 rule declarations, one per grammar production; the layout
# (rule NAME, body on its own line, closing brace) is the one printed by the
# converter script earlier in this file, so this appears to be its output.
# NOTE(review): many bodies look damaged -- empty rules (PITarget, CDSect,
# Attribute, AttDef), bare quantifiers and empty alternatives where a
# subrule reference would be expected.  Presumably angle-bracket markup was
# stripped when the listing was pasted; regenerate it from the script rather
# than relying on this copy -- TODO confirm.
rule document {
*
}
rule Char {
\x9 | \xA | \xD | <[\x20-\xD7FF]> | <[\xE000-\xFFFD]> | <[\x10000-\x10FFFF]>
}
rule S {
[\x20 | \x9 | \xD | \xA]+
}
rule NameChar {
| | <'.'> | - | _ | <':'> | |
}
rule Name {
[ | _ | <':'>] []*
}
rule Names {
[ ]*
}
rule Nmtoken {
[]+
}
rule Nmtokens {
[ ]*
}
rule EntityValue {
" [<-[%&"]> | | ]* " | ' [<-[%&']> | | ]* '
}
rule AttValue {
" [<-[<&"]> | ]* " | ' [<-[<&']> | ]* '
}
rule SystemLiteral {
[" <-["]>* "] | [' <-[']>* ']
}
rule PubidLiteral {
" * " | ' [ ]* '
}
rule PubidChar {
\x20 | \xD | \xA | <[a-zA-Z0-9]> | <[-'()+,./:=?;!*#@$_%]>
}
rule CharData {
<-[<&]>* * <']]>'> <-[<&]>*]>
}
rule Comment {
<''>
}
rule PI {
<''> [ [* * <'?>'> *]>]]? <'?>'>
}
rule PITarget {
}
rule CDSect {
}
rule CDStart {
<'
}
rule CData {
[* * <']]>'> *]>]
}
rule CDEnd {
<']]>'>
}
rule prolog {
? * [ *]?
}
rule XMLDecl {
<' ? ? ? <'?>'>
}
rule VersionInfo {
version [' ' | " "]
}
rule Eq {
? = ?
}
rule VersionNum {
[<[a-zA-Z0-9_.:]> | -]+
}
rule Misc {
| |
}
rule doctypedecl {
<' [ ]? ? [[ [ | ]* ] ?]? <'>'>
}
rule DeclSep {
|
}
rule markupdecl {
| | | | |
}
rule extSubset {
?
}
rule extSubsetDecl {
[ | | ]*
}
rule SDDecl {
standalone [[' [yes | no] '] | [" [yes | no] "]]
}
rule LanguageID {
[- ]*
}
rule Langcode {
| |
}
rule ISO639Code {
[<[a-z]> | <[A-Z]>] [<[a-z]> | <[A-Z]>]
}
rule IanaCode {
[i | I] - [<[a-z]> | <[A-Z]>]+
}
rule UserCode {
[x | X] - [<[a-z]> | <[A-Z]>]+
}
rule Subcode {
[<[a-z]> | <[A-Z]>]+
}
rule element {
|
}
rule STag {
<'<'> [ ]* ? <'>'>
}
rule Attribute {
}
rule ETag {
<''> ? <'>'>
}
rule content {
? [[ | | | | ] ?]*
}
rule EmptyElemTag {
<'<'> [ ]* ? <'/>'>
}
rule elementdecl {
<' ? <'>'>
}
rule contentspec {
EMPTY | ANY | |
}
rule children {
[ | ] [? | * | +]?
}
rule cp {
[ | | ] [? | * | +]?
}
rule choice {
( ? [ ? | ? ]+ ? )
}
rule seq {
( ? [ ? , ? ]* ? )
}
rule Mixed {
( ? #PCDATA [? | ? ]* ? )* | ( ? #PCDATA ? )
}
rule AttlistDecl {
<' * ? <'>'>
}
rule AttDef {
}
rule AttType {
| |
}
rule StringType {
CDATA
}
rule TokenizedType {
ID| IDREF| IDREFS| ENTITY| ENTITIES| NMTOKEN| NMTOKENS
}
rule EnumeratedType {
|
}
rule NotationType {
NOTATION ( ? [? | ? ]* ? )
}
rule Enumeration {
( ? [? | ? ]* ? )
}
rule DefaultDecl {
#REQUIRED | #IMPLIED | [[#FIXED ]
}
rule conditionalSect {
|
}
rule includeSect {
<' <']]>'>
}
rule ignoreSect {
<' * <']]>'>
}
rule ignoreSectContents {
[<' <']]>'> ]*
}
rule Ignore {
* * [<' | <']]>'>] *]>
}
rule CharRef {
<[0-9]>+ ; | <[0-9a-fA-F]>+ ;
}
rule Reference {
|
}
rule EntityRef {
& ;
}
rule PEReference {
<'%'> ;
}
rule EntityDecl {
|
}
rule GEDecl {
<' ? <'>'>
}
rule PEDecl {
<' <'%'> ? <'>'>
}
rule EntityDef {
| [ ?]
}
rule PEDef {
|
}
rule ExternalID {
SYSTEM | PUBLIC
}
rule NDataDecl {
NDATA
}
rule TextDecl {
<' ? ? <'?>'>
}
rule extParsedEnt {
?
}
rule extPE {
?
}
rule EncodingDecl {
encoding [" " | ' ' ]
}
rule EncName {
<[A-Za-z]> [<[A-Za-z0-9._]> | -]*
}
rule NotationDecl {
<' [ | ] ? <'>'>
}
rule PublicID {
PUBLIC
}
rule Letter {
|
}
rule BaseChar {
# Large block manually removed.
}
rule Ideographic {
<[\x4E00-\x9FA5]> | \x3007 | <[\x3021-\x3029]>
}
rule CombiningChar {
# Large block manually removed.
}
rule Digit {
# Large block manually removed.
}
rule Extender {
# Large block manually removed.
}