Here is a small program that uses OLE::Storage_Lite to distinguish Microsoft .doc and .xls files.
#!/usr/bin/perl
use strict;
use warnings;
use OLE::Storage_Lite;
# Sample inputs covering each outcome shown in the example output:
# an Excel file, a Word file, another OLE file (PowerPoint) and a
# plain text file that is not an OLE container.
my @files = (
    'test.xls',
    'test.doc',
    'test.ppt',
    'test.txt',
);

# Print one "name = type" line per file; the name is left-aligned and
# padded to 20 columns so the results line up.
for my $filename ( @files ) {
    printf( "%-20s = %s\n", $filename, check_ole_filetype( $filename ) );
}
# Classify a file by looking at the names of the entries directly below
# the root of its OLE container.
#
# Parameters:
#   $filename - path of the file to inspect.
#
# Returns one of the following strings:
#   'not_found'        - nothing exists at $filename
#   'not_ole_file'     - getPpsTree() did not return a tree
#   'xls'              - a child named 'Workbook' or 'Book' was found
#   'doc'              - a child named 'WordDocument' was found
#   'unknown_ole_file' - a tree was read but no child name matched
#
# Children are examined in the order they appear in the tree; the first
# recognised name decides the result.
sub check_ole_filetype {
    my $filename = shift;

    # Check that the file exists.
    return 'not_found' if !-e $filename;

    # Create an OLE::Storage_Lite object to read the file.
    my $ole = OLE::Storage_Lite->new( $filename );
    my $pps = $ole->getPpsTree();

    # If getPpsTree() failed then this isn't an OLE file.
    return 'not_ole_file' if !$pps;

    # Loop through the PPS children below the root. Fall back to an empty
    # list so a root without children is not modified by the dereference.
    for my $child_pps ( @{ $pps->{Child} || [] } ) {

        # PPS names are passed through Ucs2Asc() so they can be compared
        # against plain string literals.
        my $pps_name = OLE::Storage_Lite::Ucs2Asc( $child_pps->{Name} );

        # Match an Excel xls file.
        if ( $pps_name eq 'Workbook' || $pps_name eq 'Book' ) {
            return 'xls';
        }

        # Match a Word document.
        if ( $pps_name eq 'WordDocument' ) {
            return 'doc';
        }
    }

    return 'unknown_ole_file';
}
__END__
Output:
$ perl ole_check.pl
test.xls = xls
test.doc = doc
test.ppt = unknown_ole_file
test.txt = not_ole_file
You will probably have to harden it a little for your needs. For example, it is possible that some older Word files might have a different $pps_name. A little testing should highlight if that is the case. Also, this won't find Office 2007+ style docx or xlsx files.
--
John.