Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
#!/usr/local/bin/perl # # Uses docx2txt project of Sandeep Kumar: # http://docx2txt.sourceforge.net/ # Uses newbie Perl/Tk example code from: # http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/ # Uses CakeCMD unzipper because other commandline unzippers # not extract corrupt word/xml and word/_rels/document.xml.rels files # use Tk; # # Create the Main Window # my $mw = new MainWindow; # # Hides TK logo with my own logo # my $icon = $mw->Photo(-file => 'ddte.gif'); $mw->iconimage($icon); # # Declare that there is a menu, create text # editor and create a vertical scroll bar # my $mbar = $mw -> Menu(); $mw -> configure(-menu => $mbar); my $textarea = $mw -> Frame(); #Creating Another Frame my $txt = $textarea -> Text(-width=>80, -height=>22); my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $t +xt]); $txt -> configure(-yscrollcommand=>['set', $srl_y]); $txt -> grid(-row=>1,-column=>1); $srl_y -> grid(-row=>1,-column=>2,-sticky=>"ns"); $textarea -> grid(-row=>5,-column=>1,-columnspan=>2); # # Main Menu Choices Setup section # my $file = $mbar -> cascade(-label=>"File", -underline=>0, -tearoff => + 0); my $help = $mbar -> cascade(-label =>"Help", -underline=>0, -tearoff = +> 0); # # File Menu Choices section # $file -> checkbutton(-label =>"Open", -underline => 0, -command => [\&menuopenClicked, "Open"]); $file -> command(-label =>"Save", -underline => 0, -command => [\&menusavedClicked, "Save"]); $file -> separator(); $file -> command(-label =>"Exit", -underline => 1, -command => sub { exit } ); # # Help Menu Choices section # $help -> command(-label =>"About", -command => sub { $txt->delete('1.0','end'); $txt->insert('end', "About ---------- How to use this program: 1. Click on the File Menu and choose Opem. 2. Choose your docx Word 2007 file from which you wish to extract text. 3. You extracted text will be displayed. 4. Choose the Save menu choice on the File Menu. 5. Save the text file to the name and file location you wish. This program is made by Paul D Pruitt (socrtwo) and uses a modification of the docx2txt Perl script by Sandeep Kumar for it's main logic. See http://docx2txt.sourceforge.net/. It also uses Binny V A's Perl/Tk code for the GUI elements from http://www.geocities.com/binnyva/code. CakeCMD is by Leung Yat Chun Joseph. http://www.quickzip.org/softwares-cakecmd It requires Microsoft .NET Framework Version 2.0 http://tinyurl.com/ms2-0-netframework My software website is http://www.godskingsandheroes.info/software/. Also visit my data recovery software list http://www.s2services.com. My E-Mail : socrtwo\@s2services"); }); # # Open Dialog Box File Extension Declaration section # my $typesopen = [ ['Word 2007 files', '.docx'], ['All files', '*'],]; # # Main loop currently activated by selecting the file # MainLoop; sub menuopenClicked { my $mainfilepath = $mw->getOpenFile(-filetypes => $typesop +en, -defaultextension => '.docx'); return if undefined $mainfilepath; # # Delete old XML data found in the word folder section # use Win32::OLE; $dir = '<word>'; if(-e $dir){ $Win32::OLE::Warn = 3; # ------ SCRIPT CONFIGURATION ------ $strFolderPath = '<word>'; # e.g. "d:\temp" # ------ END CONFIGURATION --------- $objFSO = Win32::OLE->new('Scripting.FileSystemObject'); $objFSO->DeleteFolder($strFolderPath); } else { print "\n"; } # # Docx file rename to zip section necessary for CakeCMD to unzip. # my $zipwordfilepath = $mainfilepath . '.zip'; rename($mainfilepath,$zipwordfilepath); # # Unzip docx/zip file section # my $unzip = "cakecmd.exe"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/document.xml + \"\" " or die "Could not start $unzip: $!"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/_rels/docume +nt.xml.rels \"\" " or die "Could not start $unzip: $!"; # # Script sleep section to allow unzipping action to catch up with scri +pt # $num = 1; while($num--){ sleep(1); } close $zipwordfilepath; # # Revert the target file to it's original extension # rename ($zipwordfilepath,$mainfilepath); # # Housekeeping section to make the program # run well in Windows? Check with docx2xml author. # my $nl = "\r\n"; # Alternative is "\n". my $lindent = " "; # Indent nested lists by "\t", " " etc. my $lwidth = 80; # Line width, used for short line justificatio +n. # # ToDo: Better list handling. Currently assumed 8 level nesting. # my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--'); # # Added routine for reading file into $_ variable. # This is necessary because the unzipper, cakecmd, does not write to S +TDIN or STDOUT # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/_rels/document.xml.rels" or die "Couldn't open file +: $!"; binmode FILE; $_= <FILE>; close FILE; } # # Gather information about header, footer, hyperlinks, images, footnot +es etc. # my %docurels; while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( + .*?)?\/>/g) { $docurels{"$2:$1"} = $3; } # # Subroutines for center and right justification of text in a line. # sub cjustify { my $len = length $_[0]; if ($len < ($lwidth - 1)) { my $lsp = ($lwidth - $len) / 2; return ' ' x $lsp . $_[0]; } else { return $_[0]; } } # sub rjustify { my $len = length $_[0]; if ($len < $lwidth) { return ' ' x ($lwidth - $len) . $_[0]; } else { return $_[0]; } } # # Subroutines for dealing with embedded links and images # sub hyperlink { return "{$_[1]}[HYPERLINK: $docurels{\"hyperlink:$_[0]\"}]"; } # # Routine for reading file into $content variable. # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/document.xml" or die "Couldn't open file: $!"; binmode FILE; $content = <FILE>; close FILE; } # # Text extraction begins section # $content =~ s/<?xml .*?\?>(\r)?\n//; $content =~ s{<w:p [^/>]+?/>|</w:p>}|$nl|og; $content =~ s|<w:br/>|$nl|og; $content =~ s|<w:tab/>|\t|og; # my $hr = '-' x 78 . $nl; $content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og; $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . "$le +vchar[$1] "|oge; # # Uncomment either of below two lines and comment above line, if deali +ng # with more than 8 level nested lists. # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . '* + '|oge; # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' ' +|oge; # $content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge; $content =~ s{<w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>(.*?)</w: +t></w:r>}/cjustify($1)/oge; $content =~ s{<w:pPr><w:jc w:val="right"/></w:pPr><w:r><w:t>(.*?)</w:t +></w:r>}/rjustify($1)/oge; $content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperl +ink($1,$2)/oge; $content =~ s/<.*?>//g; # # Convert non-ASCII characters/character sequences to ASCII characters +. # $content =~ s/\xE2\x82\xAC/\xC8/og; # euro symbol as saved by MSOf +fice # $content =~ s/\xE2\x82\xAC/E/og; # euro symbol expressed as E $content =~ s/\xE2\x80\xA6/.../og; $content =~ s/\xE2\x80\xA2/::/og; # four dot diamond symbol $content =~ s/\xE2\x80\x9C/"/og; $content =~ s/\xE2\x80\x99/'/og; $content =~ s/\xE2\x80\x98/'/og; $content =~ s/\xE2\x80\x93/-/og; $content =~ s/\xC2\xA0//og; $content =~ s/&amp;/&/ogi; $content =~ s/&lt;/</ogi; $content =~ s/&gt;/>/ogi; # # Write the extracted and converted text contents to output. # #Text Area # $txt->delete('1.0','end'); $txt -> insert('end',$content); my $typessaved = [ ['Text files', '.txt'], ['All files', '*'],]; # # Subroutine actived by clicking on the save menu # sub menusavedClicked { my $saved = $mw->getSaveFile(-filetypes => $typessaved, -defaultextension => '.txt', -initialfile => "$mainfilepath" . '.txt' +); return if not defined $saved; # # Opens results text file for writing. # open($saved, "> $saved") || die "Can't create <$docx_name> for output! +\n"; print ($saved $content) if $saved; close $saved; }}

In reply to Corrupt MS Word 2007 Text Extractor by socrtwo

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others chilling in the Monastery: (6)
    As of 2014-08-02 02:47 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      Who would be the most fun to work for?















      Results (53 votes), past polls