# This file holds four snippets separated by '####' lines:
#   1. a hand-written rule matching simple XML tags,
#   2. an excerpt of EBNF productions with hand-translated rules,
#   3. a Perl 5 script that prints one rule per 'prod' element of an XML file,
#   4. a listing of rules in the format that script prints.
# NOTE(review): text of the form <name> appears to have been stripped from
# the snippets (e.g. "rule document { * }" versus the EBNF
# "prolog element Misc*") - confirm against the original before relying on
# any rule body below.
rule xml :i {
    ^ \s*
    [
        # Single (self-closing) tags
        \< \s* <[a-z:]>+ (?: \s*<[a-z:]>* \s* = \s* (?:' <[^']>* ' | " <[^"]>* ") )* \s* /\s* \>
    |
        # Tags in pairs enclosing content
        \< \s* $1 := (<[a-z:]>+) [ \s*<[a-z:]>* \s* = \s* [ ' <[^']>* ' | " <[^"]>* " ] ]* \s* \>
        # NOTE(review): the second alternative below is empty as found;
        # presumably a nested-element subrule call was lost - confirm.
        [ <[^<>]>* | ]*
        \< \s* / \s* $1 \s* \>
    ]
    \s*
}
####
document ::= prolog element Misc*
Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
S ::= (#x20 | #x9 | #xD | #xA)+
etcetera, etcetera
####
rule document { * }
rule Char { <[\x9\xA\xD\x20-\xD7FF\xE000-\xFFFD\x10000-\x10FFFF]> }
rule S { <[\x20\x9\xD\xA]>+ }
rule NameChar { | | <[._:-]> | | }
rule Name { [ ] * }
rule Names { [ ]* }
rule Nmtoken { + }
rule Nmtokens { [ ]* }
etcetera, etcetera
####
#!/usr/bin/perl
use strict;
use warnings;
use XML::TreeBuilder;

# Parses the XML file 'rec' and, for every 'prod' element in it, prints
#
#   rule <content of the 'lhs' child> {
#    <translation of the content of all 'rhs' children>
#   }
#
# Child elements inside an 'rhs' are printed as <name>, where name is the
# content of the first 'nt' element found at or below that child.  Text
# between elements is consumed token by token from the left: quoted
# literals, [...] character classes, #xHEX code points, '-' and runs of
# operator characters.  Text that matches none of these is reported on
# STDOUT as "ERROR: <remaining text>" and the rest of that text is dropped.

my $tree = XML::TreeBuilder->new;
$tree->parse_file('rec');    # http://www.w3.org/TR/2000/REC-xml-20001006.xml

for my $prod ($tree->look_down(_tag => 'prod')) {
    print 'rule ';
    print $prod->look_down(_tag => 'lhs')->content_list;
    print " {\n ";

    # $not is set by a '-' token and cleared again, printing '>', right
    # after the next element, literal, character class or code point.
    # NOTE(review): $p is never modified in this script, so "$p == $not"
    # holds whenever $not is set; it looks like a leftover nesting-depth
    # counter - confirm before removing.
    my $not = 0;
    my $p   = 1;
    my $close_negation = sub {
        return unless $not and $p == $not;
        $not = 0;
        print '>';
        return;
    };

    my @rhs_elements = $prod->look_down(_tag => 'rhs');

    ITEM:
    for my $item (map { $_->content_list } @rhs_elements) {

        # Child element: emit a subrule call named after its 'nt' content.
        if (ref $item) {
            if (my $nt = $item->look_down(_tag => 'nt')) {
                print '<', $nt->content_list, '>';
            }
            $close_negation->();
            next ITEM;
        }

        # Plain text.  Normalising once is enough: tokens are only ever
        # removed from the front, so no new whitespace runs can form.
        my $text = $item;
        $text =~ s/\xC2\xA0/ /g;    # Non breaking space
        $text =~ s/\s+/ /g;

        TOKEN:
        while (1) {

            # A leading 'S' ends processing of this text item.
            # NOTE(review): the printed literal is empty as found;
            # presumably its content was lost - confirm.
            if ($text =~ s/^S//) {
                print '';
                $close_negation->();
                next ITEM;
            }

            # Quoted literal: wrap in <'...'> only when it contains one of
            # the characters listed in the class below, else print it bare.
            # NOTE(review): other punctuation such as '#', '(' or '&' is
            # therefore printed unquoted - confirm that this is intended.
            if ($text =~ s/^'([^']*)'// or $text =~ s/^"([^"]*)"//) {
                my $literal = $1;
                if ($literal =~ /[\@\$%<>:\\.]/) {
                    $literal =~ s/\\/\\\\/g;
                    $literal =~ s/'/\\'/g;
                    print "<'$literal'>";
                }
                else {
                    print $literal;
                }
                $close_negation->();
                next TOKEN;
            }

            # Character class: [#x20-#x7F] -> <[\x20-\x7F]>, [^...] -> <-[...]>.
            if ($text =~ s/^\[([^]]*)\]//) {
                my $class = $1;
                $class =~ s/#x/\\x/g;
                my $neg = ($class =~ s/^\^//) ? '-' : '';
                print "<$neg\[$class]>";
                $close_negation->();
                next TOKEN;
            }

            # Single code point: #x9 -> \x9.
            if ($text =~ s/^#x([0-9A-Fa-f]+)//) {
                print "\\x$1";
                $close_negation->();
                next TOKEN;
            }

            # '-' arms the negation that $close_negation later closes.
            # NOTE(review): the printed literal is empty as found;
            # presumably the opening half of the '>' pair was lost - confirm.
            if ($text =~ s/^-//) {
                $not = $p;
                print '' if $not and $p == $not;
                next TOKEN;
            }

            # Operators, spaces and grouping parentheses; '(' and ')' are
            # printed as '[' and ']'.  The parentheses must be part of the
            # class, otherwise the tr/// below never has anything to do and
            # every group ends up in the ERROR branch.  Whitespace has
            # already been collapsed to single spaces above.
            if ($text =~ s/^([|*+?() ]+)//) {
                (my $operators = $1) =~ tr/()/[]/;
                print $operators;
                next TOKEN;
            }

            next ITEM if not length $text;

            print "ERROR: $text\n";
            next ITEM;
        }
    }

    print "\n}\n\n";
}
####
rule document { * }
rule Char { \x9 | \xA | \xD | <[\x20-\xD7FF]> | <[\xE000-\xFFFD]> | <[\x10000-\x10FFFF]> }
rule S { [\x20 | \x9 | \xD | \xA]+ }
rule NameChar { | | <'.'> | - | _ | <':'> | | }
rule Name { [ | _ | <':'>] []* }
rule Names { [ ]* }
rule Nmtoken { []+ }
rule Nmtokens { [ ]* }
rule EntityValue { " [<-[%&"]> | | ]* " | ' [<-[%&']> | | ]* ' }
rule AttValue { " [<-[<&"]> | ]* " | ' [<-[<&']> | ]* ' }
rule SystemLiteral { [" <-["]>* "] | [' <-[']>* '] }
rule PubidLiteral { " * " | ' [ ]* ' }
rule PubidChar { \x20 | \xD | \xA | <[a-zA-Z0-9]> | <[-'()+,./:=?;!*#@$_%]> }
rule CharData { <-[<&]>* * <']]>'> <-[<&]>*]> }
rule Comment { <''> }
rule PI { <' [ [* * <'?>'> *]>]]? <'?>'> }
rule PITarget { }
rule CDSect { }
rule CDStart { <' }
rule CData { [* * <']]>'> *]>] }
rule CDEnd { <']]>'> }
rule prolog { ? * [ *]? }
rule XMLDecl { <' ? ? ? <'?>'> }
rule VersionInfo { version [' ' | " "] }
rule Eq { ? = ? }
rule VersionNum { [<[a-zA-Z0-9_.:]> | -]+ }
rule Misc { | | }
rule doctypedecl { <' [ ]? ? [[ [ | ]* ] ?]? <'>'> }
rule DeclSep { | }
rule markupdecl { | | | | | }
rule extSubset { ? }
rule extSubsetDecl { [ | | ]* }
rule SDDecl { standalone [[' [yes | no] '] | [" [yes | no] "]] }
rule LanguageID { [- ]* }
rule Langcode { | | }
rule ISO639Code { [<[a-z]> | <[A-Z]>] [<[a-z]> | <[A-Z]>] }
rule IanaCode { [i | I] - [<[a-z]> | <[A-Z]>]+ }
rule UserCode { [x | X] - [<[a-z]> | <[A-Z]>]+ }
rule Subcode { [<[a-z]> | <[A-Z]>]+ }
rule element { | }
rule STag { <'<'> [ ]* ? <'>'> }
rule Attribute { }
rule ETag { <' ? <'>'> }
rule content { ? [[ | | | | ] ?]* }
rule EmptyElemTag { <'<'> [ ]* ? <'/>'> }
rule elementdecl { <' ? <'>'> }
rule contentspec { EMPTY | ANY | | }
rule children { [ | ] [? | * | +]? }
rule cp { [ | | ] [? | * | +]? }
rule choice { ( ? [ ? | ? ]+ ? ) }
rule seq { ( ? [ ? , ? ]* ? ) }
rule Mixed { ( ? #PCDATA [? | ? ]* ? )* | ( ? #PCDATA ? ) }
rule AttlistDecl { <' * ? <'>'> }
rule AttDef { }
rule AttType { | | }
rule StringType { CDATA }
rule TokenizedType { ID| IDREF| IDREFS| ENTITY| ENTITIES| NMTOKEN| NMTOKENS }
rule EnumeratedType { | }
rule NotationType { NOTATION ( ? [? | ? ]* ? ) }
rule Enumeration { ( ? [? | ? ]* ? ) }
rule DefaultDecl { #REQUIRED | #IMPLIED | [[#FIXED ] }
rule conditionalSect { | }
rule includeSect { <' <']]>'> }
rule ignoreSect { <' * <']]>'> }
rule ignoreSectContents { [<' <']]>'> ]* }
rule Ignore { * * [<' | <']]>'>] *]> }
rule CharRef { &# <[0-9]>+ ; | &#x <[0-9a-fA-F]>+ ; }
rule Reference { | }
rule EntityRef { & ; }
rule PEReference { <'%'> ; }
rule EntityDecl { | }
rule GEDecl { <' ? <'>'> }
rule PEDecl { <' <'%'> ? <'>'> }
rule EntityDef { | [ ?] }
rule PEDef { | }
rule ExternalID { SYSTEM | PUBLIC }
rule NDataDecl { NDATA }
rule TextDecl { <' ? ? <'?>'> }
rule extParsedEnt { ? }
rule extPE { ? }
rule EncodingDecl { encoding [" " | ' ' ] }
rule EncName { <[A-Za-z]> [<[A-Za-z0-9._]> | -]* }
rule NotationDecl { <' [ | ] ? <'>'> }
rule PublicID { PUBLIC }
rule Letter { | }
rule BaseChar {
    # Large block manually removed.
}
rule Ideographic { <[\x4E00-\x9FA5]> | \x3007 | <[\x3021-\x3029]> }
rule CombiningChar {
    # Large block manually removed.
}
rule Digit {
    # Large block manually removed.
}
rule Extender {
    # Large block manually removed.
}