#!/usr/local/bin/perl # # Uses docx2txt project of Sandeep Kumar: # http://docx2txt.sourceforge.net/ # Uses newbie Perl/Tk example code from: # http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/ # Uses CakeCMD unzipper because other commandline unzippers # not extract corrupt word/xml and word/_rels/document.xml.rels files # use Tk; # # Create the Main Window # my $mw = new MainWindow; # # Hides TK logo with my own logo # my $icon = $mw->Photo(-file => 'ddte.gif'); $mw->iconimage($icon); # # Declare that there is a menu, create text # editor and create a vertical scroll bar # my $mbar = $mw -> Menu(); $mw -> configure(-menu => $mbar); my $textarea = $mw -> Frame(); #Creating Another Frame my $txt = $textarea -> Text(-width=>80, -height=>22); my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $txt]); $txt -> configure(-yscrollcommand=>['set', $srl_y]); $txt -> grid(-row=>1,-column=>1); $srl_y -> grid(-row=>1,-column=>2,-sticky=>"ns"); $textarea -> grid(-row=>5,-column=>1,-columnspan=>2); # # Main Menu Choices Setup section # my $file = $mbar -> cascade(-label=>"File", -underline=>0, -tearoff => 0); my $help = $mbar -> cascade(-label =>"Help", -underline=>0, -tearoff => 0); # # File Menu Choices section # $file -> checkbutton(-label =>"Open", -underline => 0, -command => [\&menuopenClicked, "Open"]); $file -> command(-label =>"Save", -underline => 0, -command => [\&menusavedClicked, "Save"]); $file -> separator(); $file -> command(-label =>"Exit", -underline => 1, -command => sub { exit } ); # # Help Menu Choices section # $help -> command(-label =>"About", -command => sub { $txt->delete('1.0','end'); $txt->insert('end', "About ---------- How to use this program: 1. Click on the File Menu and choose Opem. 2. Choose your docx Word 2007 file from which you wish to extract text. 3. You extracted text will be displayed. 4. Choose the Save menu choice on the File Menu. 5. Save the text file to the name and file location you wish. This program is made by Paul D Pruitt (socrtwo) and uses a modification of the docx2txt Perl script by Sandeep Kumar for it's main logic. See http://docx2txt.sourceforge.net/. It also uses Binny V A's Perl/Tk code for the GUI elements from http://www.geocities.com/binnyva/code. CakeCMD is by Leung Yat Chun Joseph. http://www.quickzip.org/softwares-cakecmd It requires Microsoft .NET Framework Version 2.0 http://tinyurl.com/ms2-0-netframework My software website is http://www.godskingsandheroes.info/software/. Also visit my data recovery software list http://www.s2services.com. My E-Mail : socrtwo\@s2services"); }); # # Open Dialog Box File Extension Declaration section # my $typesopen = [ ['Word 2007 files', '.docx'], ['All files', '*'],]; # # Main loop currently activated by selecting the file # MainLoop; sub menuopenClicked { my $mainfilepath = $mw->getOpenFile(-filetypes => $typesopen, -defaultextension => '.docx'); return if undefined $mainfilepath; # # Delete old XML data found in the word folder section # use Win32::OLE; $dir = ''; if(-e $dir){ $Win32::OLE::Warn = 3; # ------ SCRIPT CONFIGURATION ------ $strFolderPath = ''; # e.g. "d:\temp" # ------ END CONFIGURATION --------- $objFSO = Win32::OLE->new('Scripting.FileSystemObject'); $objFSO->DeleteFolder($strFolderPath); } else { print "\n"; } # # Docx file rename to zip section necessary for CakeCMD to unzip. # my $zipwordfilepath = $mainfilepath . '.zip'; rename($mainfilepath,$zipwordfilepath); # # Unzip docx/zip file section # my $unzip = "cakecmd.exe"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/document.xml \"\" " or die "Could not start $unzip: $!"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/_rels/document.xml.rels \"\" " or die "Could not start $unzip: $!"; # # Script sleep section to allow unzipping action to catch up with script # $num = 1; while($num--){ sleep(1); } close $zipwordfilepath; # # Revert the target file to it's original extension # rename ($zipwordfilepath,$mainfilepath); # # Housekeeping section to make the program # run well in Windows? Check with docx2xml author. # my $nl = "\r\n"; # Alternative is "\n". my $lindent = " "; # Indent nested lists by "\t", " " etc. my $lwidth = 80; # Line width, used for short line justification. # # ToDo: Better list handling. Currently assumed 8 level nesting. # my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--'); # # Added routine for reading file into $_ variable. # This is necessary because the unzipper, cakecmd, does not write to STDIN or STDOUT # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/_rels/document.xml.rels" or die "Couldn't open file: $!"; binmode FILE; $_= ; close FILE; } # # Gather information about header, footer, hyperlinks, images, footnotes etc. # my %docurels; while (//g) { $docurels{"$2:$1"} = $3; } # # Subroutines for center and right justification of text in a line. # sub cjustify { my $len = length $_[0]; if ($len < ($lwidth - 1)) { my $lsp = ($lwidth - $len) / 2; return ' ' x $lsp . $_[0]; } else { return $_[0]; } } # sub rjustify { my $len = length $_[0]; if ($len < $lwidth) { return ' ' x ($lwidth - $len) . $_[0]; } else { return $_[0]; } } # # Subroutines for dealing with embedded links and images # sub hyperlink { return "{$_[1]}[HYPERLINK: $docurels{\"hyperlink:$_[0]\"}]"; } # # Routine for reading file into $content variable. # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/document.xml" or die "Couldn't open file: $!"; binmode FILE; $content = ; close FILE; } # # Text extraction begins section # $content =~ s/(\r)?\n//; $content =~ s{]+?/>|}|$nl|og; $content =~ s||$nl|og; $content =~ s||\t|og; # my $hr = '-' x 78 . $nl; $content =~ s|.*?|$hr|og; $content =~ s||$lindent x $1 . "$levchar[$1] "|oge; # # Uncomment either of below two lines and comment above line, if dealing # with more than 8 level nested lists. # $content =~ s||$lindent x $1 . '* '|oge; # $content =~ s||'*' x ($1+1) . ' '|oge; # $content =~ s{.*?(|]+>)(.*?)}/uc $2/oge; $content =~ s{(.*?)}/cjustify($1)/oge; $content =~ s{(.*?)}/rjustify($1)/oge; $content =~ s{(.*?)}/hyperlink($1,$2)/oge; $content =~ s/<.*?>//g; # # Convert non-ASCII characters/character sequences to ASCII characters. # $content =~ s/\xE2\x82\xAC/\xC8/og; # euro symbol as saved by MSOffice # $content =~ s/\xE2\x82\xAC/E/og; # euro symbol expressed as E $content =~ s/\xE2\x80\xA6/.../og; $content =~ s/\xE2\x80\xA2/::/og; # four dot diamond symbol $content =~ s/\xE2\x80\x9C/"/og; $content =~ s/\xE2\x80\x99/'/og; $content =~ s/\xE2\x80\x98/'/og; $content =~ s/\xE2\x80\x93/-/og; $content =~ s/\xC2\xA0//og; $content =~ s/&/&/ogi; $content =~ s/<//ogi; # # Write the extracted and converted text contents to output. # #Text Area # $txt->delete('1.0','end'); $txt -> insert('end',$content); my $typessaved = [ ['Text files', '.txt'], ['All files', '*'],]; # # Subroutine actived by clicking on the save menu # sub menusavedClicked { my $saved = $mw->getSaveFile(-filetypes => $typessaved, -defaultextension => '.txt', -initialfile => "$mainfilepath" . '.txt'); return if not defined $saved; # # Opens results text file for writing. # open($saved, "> $saved") || die "Can't create <$docx_name> for output!\n"; print ($saved $content) if $saved; close $saved; }}