This is what I have come up with so far for a ThML parser...
#!/usr/bin/perl -wT #DEFINE EXTERNAL MODULE USES use CGI; use CGI::Carp qw(fatalsToBrowser); use strict; use warnings; use Regexp::Assemble; use Encode; use Encode qw(encode decode); use open qw( :std :encoding(UTF-8) ); print "Content-type: text/html; charset=utf-8\n\n"; #print CGI::heade +r(); binmode STDOUT, ":utf8"; our $booksfile = 'BibleBooks_Abbreviations_PlusApoc_FULL.txt'; # Filen +ame says it all! our $thisprogram = ""; #MAY NEED TO BE FULLY QUALIFIED +URL our $book = ""; # For Bible book names assemble +d in regex via Regexp::Assemble our $text = ""; # Holds text from HTML form to +be processed our %input; # For holding name/value key pa +irs from HTML form input ###################### ### BEGIN PROGRAM ### &parseinputs; &assemblebooks; &parsetext; &printHTML; exit; ### FINISH PROGRAM ### ###################### sub parseinputs { my $buffer=""; my $pair=""; my @pairs=(); if ($ENV{CONTENT_LENGTH}) { read(STDIN, $buffer, $ENV{CONTENT_LENGTH}); @pairs = split(/&/,$buffer); } else { $buffer = $ENV{QUERY_STRING}; @pairs = split(/\&/,$buffer); } foreach $pair (@pairs) { $pair=~s/`/ /g; #REMOVE BACKTICKS AS A SECURITY AND FUNCTIONA +LITY MEASURE TO PROTECT REMAINDER OF SCRIPT #$input{translate}=''; my ($name, $value) = split(/=/,$pair); $value =~ tr/+/ /; $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; $name = decode("utf-8", $name); $value = decode("utf-8", $valu +e); $input{$name} = $value; } $text=$input{terms}; } # END SUB sub assemblebooks { my @data = (); # To hold file source data my @books = (); # To hold book tokens for entire fi +le my @temp = (); # To hold single book's tokens (one +line in file) my $line = ""; # For each line of the file my $piece = ""; # For each record in line open (DATA, '<:encoding(utf8)', $booksfile) or die "Cannot open the Bi +ble book names file! $!\n"; @data = <DATA>; close DATA; foreach $line (@data) { chomp $line; @temp = split/\t|,\s/, $line; foreach $piece (@temp) { push @books, "$piece\n"; } } $book = Regexp::Assemble->new; foreach $line (@books) { chomp $line; $book->add( "$line" ); } } # END SUB sub parsetext { $text =~ s% (?<!(?:\[TAGGED_START\])) # TO BE REMOVED AFTER MATCHING ALL REFS ( # BEGIN CAPTURE OF $1 *** \(* # MATCH OPTIONAL PARENTHESIS ($book) # MATCH BIBLE BOOK NAMES FROM ASSEM +BLED REGEX $2 \s # MATCH A SPACE FOLLOWING BOOK NAME (\d+(?:[:\-, ]\d+)*) # MATCH A NUMBER (USUALLY A CHAPT. +)=$3 # OPTIONALLY FOLLOWED BY VERSE NUMBERS # WHICH MAY BE DELIMITED BY THE [:-, ] + (\; # .MATCH A SEMI-COLON...ETC. (BELOW +)=$4 (?:\s*\d+ # .MATCH AN OPTIONAL SPACE, THEN A NU +MBER (?:[:\-,]\d+)* ) # .MATCH ZERO+ VERSE DELIMITERS/NU +MBERS )* # .MAKE CLAUSE OPTIONAL & REPEATABLE ( # >LOOK FOR OPTIONAL VERSION NOTATION +=$5 (?:\(\w+\)*) # >MATCH WORD IN PARENTHESES | # > -OR- (?:[, ]+\w+)* # >MATCH COMMA OR SPACE FOLLOWED BY W +ORD )* # >CLOSE CAPTURE FOR $5 \)* # MATCH OPTIONAL PARENTHESIS ) # CLOSE CAPTURE OF $1 *** (?>!(?:\[TAGGED_END\])) # WE CAN REMOVE THESE TAGS LATER %\[TAGGED_START\]$1\[TAGGED_END\]%gx; #$text =~ s/(<\!--TAGGED_START-->)|(<\!--TAGGED_END-->)//g; } #END SUB parsetext sub printHTML { print <<HTML; <html lang="utf8"> <head> <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf8"> <title>Text/HTML --> ThML Parser</title> </HEAD> <BODY> <h3 align="center">Enter your text in the box below, then submit for T +hML.</h3> <form name="ff" method="POST" accept-encoding="UTF-8" accept-charset=" +utf-8" action="$thisprogram"> <textarea rows="5" cols="65" name="terms" style="background-color:#999 +999;" value="$input{terms}">$input{terms}</textarea> <input type="submit" value="Submit" name="submit"></input> </form> <hr> $text <hr> $book <hr> </body> </html> HTML } # END SUB
And here's contents of the external file referenced above:
Book Name Abbreviations Genesis Gen, Ge, Gn Exodus Exo, Ex, Exod Leviticus Lev, Le, Lv Numbers Num, Nu, Nm, Nb Deuteronomy Deut, Dt Joshua Josh, Jos, Jsh Judges Judg, Jdg, Jg, Jdgs Ruth Rth, Ru 1 Samuel 1 Sam, 1 Sa, 1Samuel, 1S, I Sa, 1 Sm, 1Sa, I Sam, 1Sam, I + Samuel, 1st Samuel, First Samuel 2 Samuel 2 Sam, 2 Sa, 2Samuel, 2S, II Sa, 2 Sm, 2Sa, II Sam, 2Sam, + II Samuel, 2Samuel, 2nd Samuel, Second Samuel 1 Kings 1 Kgs, 1 Ki, 1K, I Kgs, 1Kgs, I Ki, 1Ki, I Kings, 1Kings, +1st Kgs, 1st Kings, First Kings, First Kgs, 1Kin 2 Kings 2 Kgs, 2 Ki, 2K, II Kgs, 2Kgs, II Ki, 2Ki, II Kings, 2King +s, 2nd Kgs, 2nd Kings, Second Kings, Second Kgs, 2Kin 1 Chronicles 1 Chron, 1 Ch, I Ch, 1Ch, 1 Chr, I Chr, 1Chr, I Chron +, 1Chron, I Chronicles, 1Chronicles, 1st Chronicles, First Chronicles +, 3 Kgs, 3 Ki, 3K, III Kgs, 3Kgs, III Ki, 3Ki, III Kings, 3Kings, 3rd + Kgs, 3rd Kings, Third Kings, Third Kgs, 3Kin 2 Chronicles 2 Chron, 2 Ch, II Ch, 2Ch, 2 Chr, II Chr, 2Chr, II Ch +ron, 2Chron, II Chronicles, 2Chronicles, 2nd Chronicles, Second Chron +icles, 4 Kgs, 4 Ki, 4K, IV Kgs, 4Kgs, IV Ki, 4Ki, IV Kings, 4Kings, 4 +th Kgs, 4th Kings, Fourth Kings, Fourth Kgs, 4Kin Ezra Ezra, Ezr Nehemiah Neh, Ne Esther Esth, Es Job Job, Job, Jb Psalm Pslm, Ps, Psalms, Psa, Psm, Pss Proverbs Prov, Pr, Prv Ecclesiastes Eccles, Ec, Qoh, Qoheleth Song of Solomon Song, So, Canticle of Canticles, Canticles, Song o +f Songs, SOS Isaiah Isa, Is Jeremiah Jer, Je, Jr Lamentations Lam, La Ezekiel Ezek, Eze, Ezk Daniel Dan, Da, Dn Hosea Hos, Ho Joel Joel, Joe, Jl Amos Amos, Am Obadiah Obad, Ob Jonah Jnh, Jon Micah Micah, Mic Nahum Nah, Na Habakkuk Hab, Hab Zephaniah Zeph, Zep, Zp Haggai Haggai, Hag, Hg Zechariah Zech, Zec, Zc Malachi Mal, Mal, Ml Tobit Tobit, Tob, Tb Judith Jdth, Jdt, Jth Additions to Esther Add Esth, Add Es, Rest of Esther, The Rest of +Esther, AEs, AddEsth Wisdom of Solomon Wisd of Sol, Wis, Ws, Wisdom Sirach Sirach, Sir, Ecclesiasticus, Ecclus Baruch Baruch, Bar Letter of Jeremiah Let Jer, Let Jer, LJe, Ltr Jer Song of Three Youths Song of Three, Song Thr, The Song of Three Yo +uths, Pr Az, Prayer of Azariah, Azariah, The Song of the Three Holy C +hildren, The Song of Three Jews, Song of the Three Holy Children, Son +g of Thr, Song of Three Children, Song of Three Jews Susanna Susanna, Sus Bel and the Dragon Bel, Bel 1 Maccabees 1 Macc, 1 Mac, 1M, I Ma, 1Ma, I Mac, 1Mac, I Macc, 1Ma +cc, I Maccabees, 1Maccabees, 1st Maccabees, First Maccabees 2 Maccabees 2 Macc, 2 Mac, 2M, II Ma, 2Ma, II Mac, 2Mac, II Macc, +2Macc, II Maccabees, 2Maccabees, 2nd Maccabees, Second Maccabees 1 Esdras 1 Esdr, 1 Esd, I Es, 1Es, I Esd, 1Esd, I Esdr, 1Esdr, I E +sdras, 1Esdras, 1st Esdras, First Esdras Prayer of Manasseh Pr of Man, Pr Man, PMa, Prayer of Manasses Additional Psalm Add Psalm, Add Ps 3 Maccabees 3 Macc, 3 Mac, III Ma, 3Ma, III Mac, 3Mac, III Macc, 3 +Macc, III Maccabees, 3rd Maccabees, Third Maccabees 2 Esdras 2 Esdr, 2 Esd, II Es, 2Es, II Esd, 2Esd, II Esdr, 2Esdr, +II Esdras, 2Esdras, 2nd Esdras, Second Esdras 4 Maccabees 4 Macc, 4 Mac, IV Ma, 4Ma, IV Mac, 4Mac, IV Macc, 4Mac +c, IV Maccabees, IIII Maccabees, 4Maccabees, 4th Maccabees, Fourth Ma +ccabees Ode Ode, Ode Psalms of Solomon Ps Solomon, Ps Sol, Psalms Solomon, PsSol Epistle to the Laodiceans Laodiceans, Laod, Ep Laod, Epist Laodice +ans, Epistle Laodiceans, Epistle to Laodiceans 3 Esdras 3 Esd, 3Esd 4 Esdras 4 Esd, 4Esd 5 Esdras 5 Esd, 5Esd Matthew Matt, Mt Mark Mrk, Mk, Mr Luke Luk, Lk John John, Jn, Jhn Acts Acts, Ac Romans Rom, Ro, Rm 1 Corinthians 1 Cor, 1 Co, I Co, 1Co, I Cor, 1Cor, I Corinthians, +1Corinthians, 1st Corinthians, First Corinthians 2 Corinthians 2 Cor, 2 Co, II Co, 2Co, II Cor, 2Cor, II Corinthian +s, 2Corinthians, 2nd Corinthians, Second Corinthians Galatians Gal, Ga Ephesians Ephes, Eph Philippians Phil, Php Colossians Col, Col 1 Thessalonians 1 Thess, 1 Th, I Th, 1Th, I Thes, 1Thes, I Thess, +1Thess, I Thessalonians, 1Thessalonians, 1st Thessalonians, First The +ssalonians 2 Thessalonians 2 Thess, 2 Th, II Th, 2Th, II Thes, 2Thes, II Thes +s, 2Thess, II Thessalonians, 2Thessalonians, 2nd Thessalonians, Secon +d Thessalonians 1 Timothy 1 Tim, 1 Ti, I Ti, 1Ti, I Tim, 1Tim, I Timothy, 1Timothy +, 1st Timothy, First Timothy 2 Timothy 2 Tim, 2 Ti, II Ti, 2Ti, II Tim, 2Tim, II Timothy, 2Timo +thy, 2nd Timothy, Second Timothy Titus Titus, Tit Philemon Philem, Phm Hebrews Hebrews, Heb James James, Jas, Jm 1 Peter 1 Pet, 1 Pe, I Pe, 1Pe, I Pet, 1Pet, I Pt, 1 Pt, 1Pt, I Pe +ter, 1Peter, 1st Peter, First Peter 2 Peter 2 Pet, 2 Pe, II Pe, 2Pe, II Pet, 2Pet, II Pt, 2 Pt, 2Pt, I +I Peter, 2Peter, 2nd Peter, Second Peter 1 John 1 John, 1 Jn, I Jn, 1Jn, I Jo, 1Jo, I Joh, 1Joh, I Jhn, 1 J +hn, 1Jhn, I John, 1John, 1st John, First John 2 John 2 John, 2 Jn, II Jn, 2Jn, II Jo, 2Jo, II Joh, 2Joh, II Jhn, + 2 Jhn, 2Jhn, II John, 2John, 2nd John, Second John 3 John 3 John, 3 Jn, III Jn, 3Jn, III Jo, 3Jo, III Joh, 3Joh, III +Jhn, 3 Jhn, 3Jhn, III John, 3John, 3rd John, Third John Jude Jude, Jud Revelation Rev, Re, The Revelation, Apocalypse, Apoc