Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

Corrupt MS Word 2007 Text Extractor

by socrtwo (Sexton)
on Apr 03, 2009 at 16:58 UTC ( #755314=sourcecode: print w/ replies, xml ) Need Help??

Category: Win32 Stuff
Author/Contact Info socrtwo (Paul Pruitt) socrtwo@s2services.com
Description: DOCX files are really zipped collections of XML and other binary files. The text in the docx files is contained in the word/document.xml file within the collections. Some basic formatting information is also contained in the word/_rels/document.xml.rels file. The code presented here uses a modification of the docx2txt script to extract text from corrupt Word 2007 files where Word 2007 and the original doc2txt fails to. It uses CakeCMD to unzip instead of Info-Zip found in the original docx2txt script, because CakeCMD will unzip corrupt xml files where Info-Zip fails or does not work as well. This script also uses Perl/Tk to make a GUI, with the aim of a wider audience. The Perl/Tk code was inspired by the easy great beginner tutorials by Binny V A here: http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/. CakeCMD is by Leung Yat Chun Joseph and can be found here: http://www.quickzip.org/softwares-cakecmd. It uses the open source ICSharpCode.SharpZipLib.dll library. Cake CMD also requires the Microsoft dotNet 2.0 framework here: http://tinyurl.com/8vgak. Docx2txt is by Sandeep Kumar and can be found starting here: http://docx2txt.sourceforge.net/. My real name is Paul Pruitt, my E-mail is socrtwo@s2services.com. My website containing a couple hundred links of data recovery resources is here: http://www.s2services.com. I built an executable for this package which can be found there too on the index page.
#!/usr/local/bin/perl
# 
# Uses docx2txt project of Sandeep Kumar:
# http://docx2txt.sourceforge.net/
# Uses newbie Perl/Tk example code from:
# http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/
# Uses CakeCMD unzipper because other commandline unzippers 
# not extract corrupt word/xml and word/_rels/document.xml.rels files
#
use Tk;
# 
# Create the Main Window
# 
my $mw = new MainWindow;
# 
# Hides TK logo with my own logo
#
my $icon = $mw->Photo(-file => 'ddte.gif');
$mw->iconimage($icon);
# 
# Declare that there is a menu, create text 
# editor and create a vertical scroll bar
#
my $mbar = $mw -> Menu();
$mw -> configure(-menu => $mbar);
my $textarea = $mw -> Frame(); #Creating Another Frame
my $txt = $textarea -> Text(-width=>80, -height=>22);
my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $t
+xt]);
$txt -> configure(-yscrollcommand=>['set', $srl_y]);
$txt -> grid(-row=>1,-column=>1);
$srl_y -> grid(-row=>1,-column=>2,-sticky=>"ns");
$textarea -> grid(-row=>5,-column=>1,-columnspan=>2);
# 
# Main Menu Choices Setup section
# 
my $file = $mbar -> cascade(-label=>"File", -underline=>0, -tearoff =>
+ 0);
my $help = $mbar -> cascade(-label =>"Help", -underline=>0, -tearoff =
+> 0);
# 
# File Menu Choices section
# 
$file -> checkbutton(-label =>"Open", -underline => 0,
        -command => [\&menuopenClicked, "Open"]);
$file -> command(-label =>"Save", -underline => 0,
        -command => [\&menusavedClicked, "Save"]);
$file -> separator();
$file -> command(-label =>"Exit", -underline => 1,
        -command => sub { exit } );
# 
# Help Menu Choices section 
# 
$help -> command(-label =>"About", -command => sub { 
    $txt->delete('1.0','end');
    $txt->insert('end',
    "About
----------
How to use this program: 
1.  Click on the File Menu and choose Opem.
2.  Choose your docx Word 2007 file from which you
wish to extract text.
3. You extracted text will be displayed.
4.  Choose the Save menu choice on the File Menu.
5.  Save the text file to the name and 
file location you wish.

This program is made by Paul D Pruitt (socrtwo)
and uses a modification of the docx2txt Perl 
script by Sandeep Kumar for it's main logic. 
See http://docx2txt.sourceforge.net/.
It also uses Binny V A's Perl/Tk code for the GUI elements 
from http://www.geocities.com/binnyva/code.
CakeCMD is by Leung Yat Chun Joseph.  
http://www.quickzip.org/softwares-cakecmd
It requires Microsoft .NET Framework Version 2.0
http://tinyurl.com/ms2-0-netframework

My software website is 
http://www.godskingsandheroes.info/software/.
Also visit my data recovery software list
http://www.s2services.com.
My E-Mail : socrtwo\@s2services"); });
# 
# Open Dialog Box File Extension Declaration section
# 
my $typesopen = [ ['Word 2007 files', '.docx'],
                    ['All files',   '*'],];
# 
# Main loop currently activated by selecting the file
# 
MainLoop;
sub menuopenClicked {
            my $mainfilepath = $mw->getOpenFile(-filetypes => $typesop
+en,
                              -defaultextension => '.docx');      
            return if undefined $mainfilepath;
# 
# Delete old XML data found in the word folder section
# 
use Win32::OLE;
$dir = '<word>';
if(-e $dir){
$Win32::OLE::Warn = 3;
# ------ SCRIPT CONFIGURATION ------
$strFolderPath = '<word>'; # e.g. "d:\temp"
# ------ END CONFIGURATION ---------
$objFSO = Win32::OLE->new('Scripting.FileSystemObject');
$objFSO->DeleteFolder($strFolderPath);
} else {
        print "\n";
    }
# 
# Docx file rename to zip section necessary for CakeCMD to unzip.
#     
    my $zipwordfilepath = $mainfilepath . '.zip'; 
        rename($mainfilepath,$zipwordfilepath);
# 
# Unzip docx/zip file section
#     
my $unzip = "cakecmd.exe";
open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/document.xml
+ \"\" " or die "Could not start $unzip: $!";
open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/_rels/docume
+nt.xml.rels \"\" " or die "Could not start $unzip: $!";
#
# Script sleep section to allow unzipping action to catch up with scri
+pt
# 
$num = 1;
while($num--){
    sleep(1);
}
close $zipwordfilepath;
# 
# Revert the target file to it's original extension
#
rename ($zipwordfilepath,$mainfilepath);
# 
# Housekeeping section to make the program 
# run well in Windows? Check with docx2xml author.
# 
my $nl = "\r\n";                # Alternative is "\n".
my $lindent = "  ";     # Indent nested lists by "\t", " " etc.
my $lwidth = 80;        # Line width, used for short line justificatio
+n.
#
# ToDo: Better list handling. Currently assumed 8 level nesting.
# 
my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');
# 
# Added routine for reading file into $_ variable.  
# This is necessary because the unzipper, cakecmd, does not write to S
+TDIN or STDOUT
# Source is Perl Monks: http://www.perlmonks.org/?node_id=1952
# 
{
  local $/=undef;
  open FILE, "word/_rels/document.xml.rels" or die "Couldn't open file
+: $!";
  binmode FILE;
  $_= <FILE>;
  close FILE;
}
# 
# Gather information about header, footer, hyperlinks, images, footnot
+es etc.
# 
my %docurels;
while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"(
+ .*?)?\/>/g)
{
    $docurels{"$2:$1"} = $3;
}
#
# Subroutines for center and right justification of text in a line.
#
sub cjustify {
    my $len = length $_[0];
    if ($len < ($lwidth - 1)) {
        my $lsp = ($lwidth - $len) / 2;
        return ' ' x $lsp . $_[0];
    } else {
        return $_[0];
    }
}
#
sub rjustify {
    my $len = length $_[0];

    if ($len < $lwidth) {
        return ' ' x ($lwidth - $len) . $_[0];
    } else {
        return $_[0];
    }
}
#
# Subroutines for dealing with embedded links and images
#
sub hyperlink {
    return "{$_[1]}[HYPERLINK: $docurels{\"hyperlink:$_[0]\"}]";
}
# 
# Routine for reading file into $content variable.  
# Source is Perl Monks: http://www.perlmonks.org/?node_id=1952
#
{
  local $/=undef;
  open FILE, "word/document.xml" or die "Couldn't open file: $!";
  binmode FILE;
  $content = <FILE>;
  close FILE;
}
#
# Text extraction begins section
# 
 
$content =~ s/<?xml .*?\?>(\r)?\n//;
$content =~ s{<w:p [^/>]+?/>|</w:p>}|$nl|og;
$content =~ s|<w:br/>|$nl|og;
$content =~ s|<w:tab/>|\t|og;
#
my $hr = '-' x 78 . $nl;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;
$content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . "$le
+vchar[$1] "|oge;
 
#
# Uncomment either of below two lines and comment above line, if deali
+ng
# with more than 8 level nested lists.
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . '*
+ '|oge;
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '
+|oge;
#
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;
$content =~ s{<w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>(.*?)</w:
+t></w:r>}/cjustify($1)/oge;
$content =~ s{<w:pPr><w:jc w:val="right"/></w:pPr><w:r><w:t>(.*?)</w:t
+></w:r>}/rjustify($1)/oge;
$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperl
+ink($1,$2)/oge;
$content =~ s/<.*?>//g;
#
# Convert non-ASCII characters/character sequences to ASCII characters
+.
# $content =~ s/\xE2\x82\xAC/\xC8/og;   # euro symbol as saved by MSOf
+fice
#
$content =~ s/\xE2\x82\xAC/E/og;        # euro symbol expressed as E
$content =~ s/\xE2\x80\xA6/.../og;
$content =~ s/\xE2\x80\xA2/::/og;       # four dot diamond symbol
$content =~ s/\xE2\x80\x9C/"/og;
$content =~ s/\xE2\x80\x99/'/og;
$content =~ s/\xE2\x80\x98/'/og;
$content =~ s/\xE2\x80\x93/-/og;
$content =~ s/\xC2\xA0//og;
$content =~ s/&amp;/&/ogi;
$content =~ s/&lt;/</ogi;
$content =~ s/&gt;/>/ogi;
# 
# Write the extracted and converted text contents to output.
# 
#Text Area
# 
$txt->delete('1.0','end');
$txt -> insert('end',$content);
my $typessaved = [ ['Text files', '.txt'],
                    ['All files',   '*'],];
#
# Subroutine actived by clicking on the save menu
#
sub menusavedClicked {
  my $saved = $mw->getSaveFile(-filetypes => $typessaved,
                              -defaultextension => '.txt',
                              -initialfile => "$mainfilepath" . '.txt'
+);
                              return if not defined $saved;
# 
# Opens results text file for writing. 
# 
open($saved, "> $saved") || die "Can't create <$docx_name> for output!
+\n";
  print ($saved $content) if $saved;    
close $saved;
}}


Comment on Corrupt MS Word 2007 Text Extractor
Download Code

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://755314]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others lurking in the Monastery: (6)
As of 2014-12-26 03:03 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (164 votes), past polls