#!/usr/bin/perl
# ppt2doc.pl v1.00
use strict;
use warnings;
if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i )
{
system( "perldoc", $0 ) and die "For usage, perldoc $0\n";
exit 0;
}
use Cwd qw( abs_path );
# Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin-
+1.
use Win32::OLE qw( in CP_UTF8 );
Win32::OLE->Option( CP => CP_UTF8 );
$Win32::OLE::Warn = 3;
# Obtain the mso/wd/pp constants in the cleanest way. Asking for plain
+
# 'Microsoft Office' doesn't work for some reason: we need details.
use Win32::OLE::Const;
my $ole_const =
{
%{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") }
+,
%{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Libra
+ry") },
%{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library")
+ },
};
# Set up files - must be absolute paths as OLE server doesn't use your
+ CWD.
my ( $pptfile, $docfile ) = @ARGV;
$pptfile = abs_path( $pptfile );
( my $file_stem = $pptfile ) =~ s/\.ppt$//i;
if ( defined $docfile )
{
$docfile = abs_path( $docfile );
}
else
{
$docfile = $file_stem . '.doc';
}
####################### Extract from PowerPoint ######################
+########
# Create PowerPoint OLE server. Subroutines may legitimately manipulat
+e this.
#OLE server should 'Quit' if the script dies unexpectedly.
my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' )
or die "Can't create PowerPoint OLE: $!\n";
# Can't set this to 0, for reasons I can't/don't understand.
$ppt->{Visible} = 1;
my $pres = $ppt->Presentations->Open( $pptfile )
or die "Can't open PowerPoint file '$pptfile': $!\n";
my @slides;
SLIDE:
for my $slide ( in $ppt->ActivePresentation->Slides )
{
# To select and extract images, we need to actively 'View' the sli
+de in
# question, which is an item from a 1-based 'array' of slides.
# This also makes it clear where the processing has reached.
$ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} );
next SLIDE unless slide_has_title( $slide );
# Slide type will be 'frontispiece', 'chapter', 'list' or 'normal'
+.
my $slide_type = get_slide_type( $slide );
my $picture_number = 1;
my @slide_text;
PLACEHOLDER:
for my $placeholder ( in $slide->Shapes->Placeholders )
{
if ( contains_table( $placeholder ) )
{
push @slide_text, extract_table( $placeholder );
+
}
elsif ( contains_text( $placeholder ) )
{
my $text_type = text_or_title( $placeholder );
PARAGRAPH:
for my $paragraph
( in $placeholder->TextFrame->TextRange->Paragraphs )
{
next PARAGRAPH unless is_worthwhile_text( $paragraph )
+;
push @slide_text, extract_text( $paragraph, $text_type
+ );
}
# Deal with icky special cases.
if ( $slide_type eq 'frontispiece' )
{
# The second placeholder of the 'frontispiece' slide c
+ontains
# a code number that should be prepended onto the titl
+e.
my ( $code_number ) =
$slide->Shapes->Placeholders(2)
->TextFrame->TextRange->{Text}
=~ /^(\w+)\s/i;
# Ugly, but easiest way to do this: but this isn't OOP
+, so
# it's OK :)
$slide_text[-1][1][0] = "$code_number $slide_text[-1][
+1][0]";
# Everything else on the slide should be discarded.
+
last PLACEHOLDER;
}
elsif ( $slide_type eq 'chapter' )
{
# We also discard the non-title text on 'chapter' slid
+es.
last PLACEHOLDER;
}
}
elsif ( contains_placeheld_picture( $placeholder ) )
{
my $picture_filename =
make_picture_filename
( $file_stem, $slide->{SlideNumber}, $picture_numb
+er);
push @slide_text, extract_picture
( $placeholder, $picture_filename );
$picture_number++;
}
else
{
next PLACEHOLDER;
}
}
FLOATING_PICTURE:
for my $shape ( in $slide->Shapes )
{
# Harvest floating pictures in addition to those in placeholde
+rs.
# This will not include autoshapes or embedded OLE objects.
next FLOATING_PICTURE unless contains_floating_picture( $shape
+ );
my $picture_filename =
make_picture_filename
( $file_stem, $slide->{SlideNumber}, $picture_number);
push @slide_text, extract_picture( $shape, $picture_filename )
+;
$picture_number++;
}
push @slides, [ $slide_type => @slide_text ];
}
########################## Insert into Word ##########################
+########
# Create Word OLE server.
my $word = Win32::OLE->new( 'Word.Application', 'Quit' )
or die "Can't create Word OLE: $!\n";
$word->{Visible} = 1;
my $doc = $word->Documents->Add
or die "Can't create new Word document: $!\n";
# All the preprocessing leg-work has been done by the extractor. Print
+ing
# to Word is trivial with the type_paragraph subroutine.
for my $slide ( @slides )
{
my ( $slide_type, @paragraphs ) = @{ $slide };
for my $paragraph ( @paragraphs )
{
type_paragraph( $paragraph, $slide_type );
}
}
$word->ActiveDocument->SaveAs( $docfile );
exit 0;
################################ Slides ##############################
+########
sub get_slide_type
{
my ( $slide ) = @_;
if ( $slide->{SlideNumber} == 1 )
{
# 'frontispiece' is for the presentation's first slide;
return 'frontispiece';
}
elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} )
{
# 'chapter' is for the presentation's section heading slides;
return 'chapter';
}
elsif
(
$slide->Shapes->Title->TextFrame->TextRange->{Text}
=~ /^(Objectives|Summary|Test yourself|Answers)/i
)
{
# 'list' is for slides whose bulleted layouts should be mainta
+ined;
return 'list';
}
else
{
# 'normal' is for generic text slides.
return 'normal';
}
}
sub slide_has_title
{
my ( $slide ) = @_;
if
(
$slide->{Layout} == $ole_const->{ppLayoutBlank}
|| ! $slide->Shapes->Count
)
{
warn "Skipping slide $slide->{SlideNumber}: no content\n";
return;
}
elsif ( ! $slide->Shapes->{HasTitle} )
{
warn "Skipping slide $slide->{SlideNumber}: no title\n";
return;
}
else
{
return 1;
}
}
############################# Formatting #############################
+########
sub get_style_for
{
my ( $slide_type, $text_type ) = @_;
my %style_for =
(
frontispiece =>
{ title => 'Heading 1'
+ },
chapter =>
{ title => 'Heading 2'
+ },
list =>
{ title => 'Heading 2', text => 'Bullets'
+ },
normal =>
{ title => 'Heading 3', text => 'Normal', picture => 'Figu
+re' },
);
return exists $style_for{$slide_type}{$text_type}
? $style_for{$slide_type}{$text_type}
: 'Normal'
}
############################ Titles and Text #########################
+########
sub contains_text
{
my ( $placeholder ) = @_;
return 1 if $placeholder->{HasTextFrame};
return;
}
sub is_worthwhile_text
{
my ( $paragraph ) = @_;
return 1 if $paragraph->{Text} !~ /^\s*$/;
return;
}
sub text_or_title
{
my ( $placeholder ) = @_;
for ( $placeholder->PlaceholderFormat->{Type} )
{
return 'title' if $_ == $ole_const->{ppPlaceholderTitle};
return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle}
+;
return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitl
+e};
return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle};
}
return 'text';
}
sub extract_text
{
my ( $paragraph, $text_type ) = @_;
$text_type ||= 'text';
my @paragraph_text;
RUN:
for my $run ( in $paragraph->Runs )
{
# A run is a section of text with the same font properties.
my $run_item = _extract_run( $run );
push @paragraph_text, $run_item;
}
return [ $text_type, @paragraph_text ];
}
sub _type_run
{
my ( $run ) = @_;
my $selection = $word->Selection;
my $run_text = shift @{ $run };
my %font_properties = @{ $run };
while ( my ( $font_property, $value ) = each %font_properties )
{
$selection->Font->{ $font_property } = $value;
}
$selection->TypeText( $run_text );
}
sub _extract_run
{
my ( $run ) = @_;
my %font_properties;
my $run_text = $run->{Text};
for my $name ( qw{ Italic Bold Subscript Superscript } )
{
# Save these important font properties for each run,
# so we don't have to manually reformat Arabidopsis and
# CH4. Note that this assumes ppTrue == wdTrue == msoTrue, whi
+ch
# is probably true.
$font_properties{ $name } = $run->Font->$name;
}
# Remove 'smart' "quotes" -- and em dashes.
$run_text
=~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/;
# Remove trailing newlines from bulleted lists.
$run_text
=~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x;
return [ $run_text => %font_properties ];
}
################################ Paragraph ###########################
+########
sub type_paragraph
{
my ( $paragraph, $slide_type ) = @_;
my $selection = $word->Selection;
my $paragraph_type = shift @{ $paragraph };
if ( $paragraph_type eq 'table' )
{
my $table = shift @{ $paragraph };
_type_table( $table, $slide_type );
}
elsif ( $paragraph_type eq 'picture' )
{
my ( $filename ) = shift @{ $paragraph };
_type_picture( $filename, $slide_type );
}
elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' )
{
_type_formatted_text
( $paragraph, get_style_for( $slide_type, $paragraph_type
+) );
}
else
{
warn "Unsupported paragraph type $paragraph_type\n";
}
}
sub _type_formatted_text
{
my ( $paragraph, $style ) = @_;
my $selection = $word->Selection;
$selection->{Style} = $style;
for my $run ( @{ $paragraph } )
{
_type_run( $run );
}
$selection->TypeParagraph();
$selection->{Style} = 'Normal';
}
############################### Pictures #############################
+########
sub make_picture_filename
{
my ( $file_stem, $slide_number, $picture_number ) = @_;
return $file_stem . '_' . $slide_number . '_' . $picture_number .
+'.png';
}
sub contains_floating_picture
{
my ( $shape ) = @_;
return 1 if $shape->{Type} == $ole_const->{msoPicture};
return;
}
sub contains_placeheld_picture
{
my ( $placeholder ) = @_;
# This is a hack, but if you check a placeholder's Type property
# for == msoPicture, it's false, because its placeholderiness
# overrides its picturiness. Why isn't there a
# HasPicture property?
return 1 if defined $placeholder->{PictureFormat};
return;
}
sub extract_picture
{
my ( $shape, $picture_filename ) = @_;
# Select the shape containing the picture.
$shape->Select;
# Export the selected shape range as a PNG graphic. We'll store
# the filename for later.
$ppt->ActiveWindow->Selection->ShapeRange->Export
( $picture_filename, $ole_const->{ppShapeFormatPNG} );
return [ 'picture' => $picture_filename ];
}
sub _type_picture
{
my ( $picture_filename, $slide_type ) = @_;
my $selection = $word->Selection;
# Insert inlined picture (a simple ->Shapes will float the picture
+).
$selection->InlineShapes->AddPicture
( $picture_filename,
{
LinkToFile => $ole_const->{msoFalse},
SaveWithDocument => $ole_const->{msoTrue},
}
);
$selection->{Style} = get_style_for( $slide_type, 'picture' );
# Add a newline so we don't end up typing on the same line as the
+picture.
$selection->TypeParagraph();
}
################################ Tables ##############################
+########
sub contains_table
{
my ( $placeholder ) = @_;
return 1 if $placeholder->{HasTable};
return;
}
sub extract_table
{
my ( $shape ) = @_;
my $table;
# Extract data from table cell-by-cell, store in simple AoA.
ROW:
for my $i ( 1 .. $shape->Table->Rows->Count )
{
CELL:
for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count )
{
my $cell = $shape->Table->Rows($i)->Cells($j)->Shape;
my @cell_text;
PARAGRAPH:
for my $paragraph ( in $cell->TextFrame->TextRange->Paragr
+aphs )
{
push @cell_text, extract_text( $paragraph );
}
# Convert to Perl 0-based arrays.
$table->[ $i - 1 ][ $j - 1 ] = \@cell_text;
}
}
return [ 'table' => $table ];
}
sub _type_table
{
my ( $table, $slide_type ) = @_;
my $selection = $word->Selection;
# Collate table size and placement (range) parameters.
my $number_of_rows = scalar @{ $table };
my $number_of_columns = scalar @{ $table->[0] };
my $end_of_doc = $selection->End;
my $range = $word->ActiveDocument->Range
( { Start => $end_of_doc, End => $end_of_doc } );
# Insert table of calculated size at given range.
my $table_obj = $selection->Tables->Add
( $range, $number_of_rows, $number_of_columns );
# Add data cell-by-cell.
for my $i ( 0 .. $#{ $table } )
{
for my $j ( 0 .. $#{ $table->[$i] } )
{
# Use Perl 0-based arrays.
$table_obj->Cell( $i + 1, $j + 1 )->Select;
for my $paragraph ( @{ $table->[$i][$j] } )
{
type_paragraph( $paragraph, $slide_type );
# It'd be nice to supress the newline at the end
# of each cell, but I can't think of a nice way to do
+this.
}
}
}
# Set selection outside the end of the table (end of document, act
+ually).
$selection->EndKey( $ole_const->{wdStory} );
}
######################################################################
+########
__END__
=head1 NAME
ppt2doc.pl - Perl script to convert PowerPoint presentations to Word d
+ocuments
=head1 SYNOPSIS
perl ppt2doc.pl PPTFILE [WORDFILE]
=head1 ABSTRACT
Convert PowerPoint presentation to Word document
=head1 DESCRIPTION
Converts PowerPoint presentation C<PPTFILE> to Word document C<WORDFIL
+E>.
If C<WORDFILE> is not specified, the data will be output to a file
of the same name with the C<.ppt> extension exchanged for <.doc>.
Pictures in PowerPoint placeholders will be extracted to PNG files nam
+ed
C<pptname_slidenumber_picturenumber.png>, where C<pptname> is the name
+ of
the presentation, less its extension, C<slidenumber> is the number of
+the
slide on which the picture was found, and C<picturenumber> is the numb
+er of
the picture on that particular slide.
The script is obviously tuned for converting my lecture slides to lect
+ure
notes, but the code shows how you can handle tables, paragraphs, title
+s and
images using OLE.
=head1 SEE ALSO
L<perl>
L<Win32::OLE>
L<Win32::OLE::Const>
L<http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vba
+pp10/html/pptocObjectModelApplication.asp> -
Microsoft Office XP PowerPoint object model.
=head1 AUTHOR
Steve Cook, E<lt>steve@steve.gb.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2005 by Steve Cook
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
|