Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW
 
PerlMonks  

XML-related Regular Expressions

by John M. Dlugosz (Monsignor)
on Jun 11, 2001 at 22:32 UTC ( [id://87612]=CUFP: print w/replies, xml ) Need Help??

I had a need, and found a canned solution (XML::RegExp) was out of date and would not work at all due to changes in Perl's handling of UTF-8. So I massaged the formal spec into working code. It's not in a module, just in my script, but I thought I'd share it in this section to save the next person the effort.
# Regular expressions for the XML 1.0 "Name" production and the character
# classes it is built from.
# see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Name
#
# Each class is written as a list of double-quoted fragments, so Perl itself
# (not the regex compiler) expands every \x{...} escape into the literal
# character before the bracketed class is compiled.  Fragments are only ever
# split between complete ranges, never inside one.

# Build a compiled bracketed character class from a list of range fragments.
my $char_class = sub {
    my $body = join q{}, @_;
    return qr/[$body]/;
};

# BaseChar: alphabetic/syllabic letters.
my $XML_BaseChar = $char_class->(
    # U+0041 .. U+02C1
    "\x{0041}-\x{005A}\x{0061}-\x{007A}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}",
    "\x{00F8}-\x{00FF}",
    "\x{0100}-\x{0131}\x{0134}-\x{013E}\x{0141}-\x{0148}\x{014A}-\x{017E}",
    "\x{0180}-\x{01C3}\x{01CD}-\x{01F0}\x{01F4}-\x{01F5}",
    "\x{01FA}-\x{0217}\x{0250}-\x{02A8}\x{02BB}-\x{02C1}",
    # U+0386 .. U+03F3
    "\x{0386}\x{0388}-\x{038A}\x{038C}\x{038E}-\x{03A1}\x{03A3}-\x{03CE}",
    "\x{03D0}-\x{03D6}\x{03DA}\x{03DC}\x{03DE}\x{03E0}\x{03E2}-\x{03F3}",
    # U+0401 .. U+04F9
    "\x{0401}-\x{040C}\x{040E}-\x{044F}\x{0451}-\x{045C}\x{045E}-\x{0481}",
    "\x{0490}-\x{04C4}\x{04C7}-\x{04C8}\x{04CB}-\x{04CC}\x{04D0}-\x{04EB}",
    "\x{04EE}-\x{04F5}\x{04F8}-\x{04F9}",
    # U+0531 .. U+05F2
    "\x{0531}-\x{0556}\x{0559}\x{0561}-\x{0586}\x{05D0}-\x{05EA}",
    "\x{05F0}-\x{05F2}",
    # U+0621 .. U+06E6
    "\x{0621}-\x{063A}\x{0641}-\x{064A}\x{0671}-\x{06B7}\x{06BA}-\x{06BE}",
    "\x{06C0}-\x{06CE}\x{06D0}-\x{06D3}\x{06D5}\x{06E5}-\x{06E6}",
    # U+0905 .. U+09F1
    "\x{0905}-\x{0939}\x{093D}\x{0958}-\x{0961}\x{0985}-\x{098C}",
    "\x{098F}-\x{0990}\x{0993}-\x{09A8}\x{09AA}-\x{09B0}\x{09B2}",
    "\x{09B6}-\x{09B9}\x{09DC}-\x{09DD}\x{09DF}-\x{09E1}\x{09F0}-\x{09F1}",
    # U+0A05 .. U+0AE0
    "\x{0A05}-\x{0A0A}\x{0A0F}-\x{0A10}\x{0A13}-\x{0A28}\x{0A2A}-\x{0A30}",
    "\x{0A32}-\x{0A33}\x{0A35}-\x{0A36}\x{0A38}-\x{0A39}\x{0A59}-\x{0A5C}",
    "\x{0A5E}\x{0A72}-\x{0A74}\x{0A85}-\x{0A8B}\x{0A8D}\x{0A8F}-\x{0A91}",
    "\x{0A93}-\x{0AA8}\x{0AAA}-\x{0AB0}\x{0AB2}-\x{0AB3}\x{0AB5}-\x{0AB9}",
    "\x{0ABD}\x{0AE0}",
    # U+0B05 .. U+0BB9
    "\x{0B05}-\x{0B0C}\x{0B0F}-\x{0B10}\x{0B13}-\x{0B28}\x{0B2A}-\x{0B30}",
    "\x{0B32}-\x{0B33}\x{0B36}-\x{0B39}\x{0B3D}\x{0B5C}-\x{0B5D}",
    "\x{0B5F}-\x{0B61}\x{0B85}-\x{0B8A}\x{0B8E}-\x{0B90}\x{0B92}-\x{0B95}",
    "\x{0B99}-\x{0B9A}\x{0B9C}\x{0B9E}-\x{0B9F}\x{0BA3}-\x{0BA4}",
    "\x{0BA8}-\x{0BAA}\x{0BAE}-\x{0BB5}\x{0BB7}-\x{0BB9}",
    # U+0C05 .. U+0CE1
    "\x{0C05}-\x{0C0C}\x{0C0E}-\x{0C10}\x{0C12}-\x{0C28}\x{0C2A}-\x{0C33}",
    "\x{0C35}-\x{0C39}\x{0C60}-\x{0C61}\x{0C85}-\x{0C8C}\x{0C8E}-\x{0C90}",
    "\x{0C92}-\x{0CA8}\x{0CAA}-\x{0CB3}\x{0CB5}-\x{0CB9}\x{0CDE}",
    "\x{0CE0}-\x{0CE1}",
    # U+0D05 .. U+0D61
    "\x{0D05}-\x{0D0C}\x{0D0E}-\x{0D10}\x{0D12}-\x{0D28}\x{0D2A}-\x{0D39}",
    "\x{0D60}-\x{0D61}",
    # U+0E01 .. U+0EC4
    "\x{0E01}-\x{0E2E}\x{0E30}\x{0E32}-\x{0E33}\x{0E40}-\x{0E45}",
    "\x{0E81}-\x{0E82}\x{0E84}\x{0E87}-\x{0E88}\x{0E8A}\x{0E8D}",
    "\x{0E94}-\x{0E97}\x{0E99}-\x{0E9F}\x{0EA1}-\x{0EA3}\x{0EA5}\x{0EA7}",
    "\x{0EAA}-\x{0EAB}\x{0EAD}-\x{0EAE}\x{0EB0}\x{0EB2}-\x{0EB3}\x{0EBD}",
    "\x{0EC0}-\x{0EC4}",
    # U+0F40 .. U+10F6
    "\x{0F40}-\x{0F47}\x{0F49}-\x{0F69}",
    "\x{10A0}-\x{10C5}\x{10D0}-\x{10F6}",
    # U+1100 .. U+11F9
    "\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}\x{110B}-\x{110C}",
    "\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}\x{1150}",
    "\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}\x{1167}",
    "\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}",
    "\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}",
    "\x{11EB}\x{11F0}\x{11F9}",
    # U+1E00 .. U+1FFC
    "\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}",
    "\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}",
    "\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}",
    "\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}\x{1FBE}\x{1FC2}-\x{1FC4}",
    "\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}",
    "\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}",
    # U+2126 .. U+D7A3
    "\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}",
    "\x{3041}-\x{3094}\x{30A1}-\x{30FA}\x{3105}-\x{312C}",
    "\x{AC00}-\x{D7A3}",
);

# Ideographic
my $XML_Ideographic = $char_class->(
    "\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}",
);

# Digit
my $XML_Digit = $char_class->(
    "\x{30}-\x{39}\x{660}-\x{669}\x{6F0}-\x{6F9}\x{966}-\x{96F}",
    "\x{9E6}-\x{9EF}\x{A66}-\x{A6F}\x{AE6}-\x{AEF}\x{B66}-\x{B6F}",
    "\x{BE7}-\x{BEF}\x{C66}-\x{C6F}\x{CE6}-\x{CEF}\x{D66}-\x{D6F}",
    "\x{E50}-\x{E59}\x{ED0}-\x{ED9}\x{F20}-\x{F29}",
);

# CombiningChar
my $XML_CombiningChar = $char_class->(
    "\x{300}-\x{345}\x{360}-\x{361}",
    "\x{483}-\x{486}",
    "\x{591}-\x{5A1}\x{5A3}-\x{5B9}\x{5BB}-\x{5BD}\x{5BF}\x{5C1}-\x{5C2}",
    "\x{5C4}",
    "\x{64B}-\x{652}\x{670}\x{6D6}-\x{6DC}\x{6DD}-\x{6DF}\x{6E0}-\x{6E4}",
    "\x{6E7}-\x{6E8}\x{6EA}-\x{6ED}",
    "\x{901}-\x{903}\x{93C}\x{93E}-\x{94C}\x{94D}\x{951}-\x{954}",
    "\x{962}-\x{963}\x{981}-\x{983}\x{9BC}\x{9BE}\x{9BF}\x{9C0}-\x{9C4}",
    "\x{9C7}-\x{9C8}\x{9CB}-\x{9CD}\x{9D7}\x{9E2}-\x{9E3}",
    "\x{A02}\x{A3C}\x{A3E}\x{A3F}\x{A40}-\x{A42}\x{A47}-\x{A48}",
    "\x{A4B}-\x{A4D}\x{A70}-\x{A71}\x{A81}-\x{A83}\x{ABC}\x{ABE}-\x{AC5}",
    "\x{AC7}-\x{AC9}\x{ACB}-\x{ACD}",
    "\x{B01}-\x{B03}\x{B3C}\x{B3E}-\x{B43}\x{B47}-\x{B48}\x{B4B}-\x{B4D}",
    "\x{B56}-\x{B57}\x{B82}-\x{B83}\x{BBE}-\x{BC2}\x{BC6}-\x{BC8}",
    "\x{BCA}-\x{BCD}\x{BD7}",
    "\x{C01}-\x{C03}\x{C3E}-\x{C44}\x{C46}-\x{C48}\x{C4A}-\x{C4D}",
    "\x{C55}-\x{C56}\x{C82}-\x{C83}\x{CBE}-\x{CC4}\x{CC6}-\x{CC8}",
    "\x{CCA}-\x{CCD}\x{CD5}-\x{CD6}",
    "\x{D02}-\x{D03}\x{D3E}-\x{D43}\x{D46}-\x{D48}\x{D4A}-\x{D4D}\x{D57}",
    "\x{E31}\x{E34}-\x{E3A}\x{E47}-\x{E4E}\x{EB1}\x{EB4}-\x{EB9}",
    "\x{EBB}-\x{EBC}",
    "\x{EC8}-\x{ECD}",
    "\x{F18}-\x{F19}\x{F35}\x{F37}\x{F39}\x{F3E}\x{F3F}\x{F71}-\x{F84}",
    "\x{F86}-\x{F8B}\x{F90}-\x{F95}\x{F97}\x{F99}-\x{FAD}\x{FB1}-\x{FB7}",
    "\x{FB9}",
    "\x{20D0}-\x{20DC}\x{20E1}\x{302A}-\x{302F}\x{3099}\x{309A}",
);

# Extender
my $XML_Extender = $char_class->(
    "\x{B7}\x{2D0}\x{2D1}\x{387}\x{640}\x{E46}\x{EC6}\x{3005}",
    "\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}",
);

# Letter ::= BaseChar | Ideographic
my $XML_Letter = qr/$XML_BaseChar|$XML_Ideographic/;

# NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
# The hyphen must come first in the bracketed class: written as [.-_:] it
# forms a range from '.' to '_' that admits characters such as '/', '<' and
# '@' while leaving out the hyphen itself.
my $XML_NameChar =
    qr/$XML_Letter|$XML_Digit|$XML_CombiningChar|$XML_Extender|[-._:]/;

# Name ::= (Letter | '_' | ':') (NameChar)*
my $XML_Name = qr/(?:$XML_Letter|[_:])$XML_NameChar*/;

# OK_name($candidate)
#
# Returns the result of matching $candidate, in its entirety, against the
# XML Name pattern: true when the whole string is a Name, false otherwise.
#
# The ($) prototype is kept from the original so existing call sites keep
# parsing the same way (the argument is evaluated in scalar context).
# \A and \z anchor the match to the real start and end of the string; a plain
# "$" would also accept a name followed by a trailing newline.
sub OK_name ($) {
    return $_[0] =~ /\A$XML_Name\z/;
}

Replies are listed 'Best First'.
Re: XML-related Regular Expressions
by extremely (Priest) on Jun 12, 2001 at 07:26 UTC
    No offense but I think I went blind looking at your code. =) =)

    My way looks longer but is ever so much more readable:

    # Just the CombiningChar class, as an example of the layout.
    #
    # Each qw() word is one code point or one range.  The words are joined
    # with no separator, so the whitespace used for layout never reaches the
    # pattern.  qw() does not interpolate, so the \x{...} escapes stay as
    # text: the first print shows them readably (no raw wide characters sent
    # to an unencoded handle) and the regex compiler expands them later.
    my $XML_CombiningChar = join q{}, qw(
        \x{300}-\x{345}   \x{360}-\x{361}
        \x{483}-\x{486}
        \x{591}-\x{5A1}   \x{5A3}-\x{5B9}   \x{5BB}-\x{5BD}   \x{5BF}
        \x{5C1}-\x{5C2}   \x{5C4}
        \x{64B}-\x{652}   \x{670}           \x{6D6}-\x{6DC}   \x{6DD}-\x{6DF}
        \x{6E0}-\x{6E4}   \x{6E7}-\x{6E8}   \x{6EA}-\x{6ED}
        \x{901}-\x{903}   \x{93C}           \x{93E}-\x{94C}   \x{94D}
        \x{951}-\x{954}   \x{962}-\x{963}
        \x{981}-\x{983}   \x{9BC}           \x{9BE}           \x{9BF}
        \x{9C0}-\x{9C4}   \x{9C7}-\x{9C8}   \x{9CB}-\x{9CD}   \x{9D7}
        \x{9E2}-\x{9E3}
        \x{A02}           \x{A3C}           \x{A3E}           \x{A3F}
        \x{A40}-\x{A42}   \x{A47}-\x{A48}   \x{A4B}-\x{A4D}   \x{A70}-\x{A71}
        \x{A81}-\x{A83}   \x{ABC}           \x{ABE}-\x{AC5}   \x{AC7}-\x{AC9}
        \x{ACB}-\x{ACD}
        \x{B01}-\x{B03}   \x{B3C}           \x{B3E}-\x{B43}   \x{B47}-\x{B48}
        \x{B4B}-\x{B4D}   \x{B56}-\x{B57}
        \x{B82}-\x{B83}   \x{BBE}-\x{BC2}   \x{BC6}-\x{BC8}   \x{BCA}-\x{BCD}
        \x{BD7}
        \x{C01}-\x{C03}   \x{C3E}-\x{C44}   \x{C46}-\x{C48}   \x{C4A}-\x{C4D}
        \x{C55}-\x{C56}
        \x{C82}-\x{C83}   \x{CBE}-\x{CC4}   \x{CC6}-\x{CC8}   \x{CCA}-\x{CCD}
        \x{CD5}-\x{CD6}
        \x{D02}-\x{D03}   \x{D3E}-\x{D43}   \x{D46}-\x{D48}   \x{D4A}-\x{D4D}
        \x{D57}
        \x{E31}           \x{E34}-\x{E3A}   \x{E47}-\x{E4E}
        \x{EB1}           \x{EB4}-\x{EB9}   \x{EBB}-\x{EBC}   \x{EC8}-\x{ECD}
        \x{F18}-\x{F19}   \x{F35}           \x{F37}           \x{F39}
        \x{F3E}           \x{F3F}           \x{F71}-\x{F84}   \x{F86}-\x{F8B}
        \x{F90}-\x{F95}   \x{F97}           \x{F99}-\x{FAD}   \x{FB1}-\x{FB7}
        \x{FB9}
        \x{20D0}-\x{20DC} \x{20E1}
        \x{302A}-\x{302F} \x{3099}          \x{309A}
    );

    # Show the flattened class body before it is compiled.
    print "$XML_CombiningChar\n";

    # Reuse the same variable for the compiled bracketed class.
    $XML_CombiningChar = qr/[$XML_CombiningChar]/;

    # Show the compiled pattern.
    print "$XML_CombiningChar\n";

    --
    $you = new YOU;
    honk() if $you->love(perl)

      $XML_CombiningChar = qr/[$XML_CombiningChar]/; Nice! Reusing the same variable eliminates the need for another scratch variable in the same scope.

      —John

Re: XML-related Regular Expressions
by mirod (Canon) on Jun 11, 2001 at 23:44 UTC

    I get an error when trying to run this (perl 5.6.1 under linux) in the definition of Ideographic characters:

    Illegal hexadecimal digit '{' ignored at test_name line 45. Invalid [] range "}-\x" before HERE mark in regex m/[\x{4E00}-\x << HERE {9FA5}\x{3007}\x{3021}-\x{3029}]/

    If this regexp works it would be a good idea to send an email (or even better a patch!) to the maintainer of XML::RegExp: T.J. Mather <tjmather@anidea.com> (or even take over maintenance for the module)

    update: I managed to get the code to run by replacing all qr/[class]/ by qr/@{["[class]"]}/ but now nothing gets matched...

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: CUFP [id://87612]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others lurking in the Monastery: (3)
As of 2024-04-24 20:53 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found