Here is my attempt at commenting the Perl regex Bray uses. Note that it relies heavily on Unicode general-category properties (\p{Lu}, \p{Nd}, and so on).
(<[^/]([^>]*[^/>])?>) # xml start tag, e.g. <tag>
|(</[^>]*>) # xml end tag, e.g. </tag>
|(<[^>]*/>) # xml empty tag, e.g. <tag />
|
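# Get text one or more characters long that begins with a letter,
# number, or CJK ideograph, that may also contain some punctuation
# in its interior, and that ends with a letter, number, or CJK
# ideograph. (The tail group is optional, so a single letter,
# number, or ideograph on its own also matches.)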
(
(
\p{Lu} # Uppercase Letter
|\p{Ll} # Lowercase Letter
|\p{Lt} # Titlecase Letter
|\p{Nd} # Decimal Digit Number, e.g. 0-9 (also the decimal digits of other scripts)
|\p{Nl} # Letter Number, e.g. Roman numerals
|\p{No} # Other Number
|[\x{4e00}-\x{9fa5}] # CJK Unified Ideographs
|\x{3007} # Ideographic number zero
|[\x{3021}-\x{3029}] # Hangzhou numerals 1-9
)
(
(
\p{Lu}
|\p{Ll}
|\p{Lt}
|\p{Nd}
|\p{Nl}
|\p{No}
|[-._:'] # some punctuation
|[\x{4e00}-\x{9fa5}]
|\x{3007}
|[\x{3021}-\x{3029}]
)*
(
\p{Lu}
|\p{Ll}
|\p{Lt}
|\p{Nd}
|\p{Nl}
|\p{No}
|[\x{4e00}-\x{9fa5}]
|\x{3007}
|[\x{3021}-\x{3029}]
)
)?
)
|