<?xml version="1.0" encoding="windows-1252"?>
<node id="491983" title="Convert PowerPoint Presentation to Word Document with Win32::OLE" created="2005-09-14 15:31:15" updated="2005-09-14 11:31:15">
<type id="1748">
sourcecode</type>
<author id="205290">
polypompholyx</author>
<data>
<field name="doctext">
&lt;code&gt;
#!/usr/bin/perl 

# ppt2doc.pl v1.00

use strict;
use warnings;

# Show the embedded POD (via perldoc) when no input file is given, or when
# help is requested with -h, -help or -? (one or two dashes, any case).
my $wants_usage = ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i;
if ( $wants_usage )
{
	# system() returns non-zero when perldoc could not be run successfully;
	# in that case just tell the user how to get at the documentation.
	my $perldoc_status = system( "perldoc", $0 );
	die "For usage, perldoc $0\n" if $perldoc_status;
	exit 0;
}

use Cwd qw( abs_path );

# Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin-1.
# 'in' is imported so OLE collections can be iterated in the for-loops below.
use Win32::OLE qw( in CP_UTF8 );
Win32::OLE-&gt;Option( CP =&gt; CP_UTF8 );
# Per the Win32::OLE documentation, warn level 3 makes the module croak on
# OLE errors, so a failed OLE call aborts the script instead of passing
# silently.
$Win32::OLE::Warn = 3;

# Obtain the mso/wd/pp constants in the cleanest way. Asking for plain 
# 'Microsoft Office' doesn't work for some reason: we need details.
use Win32::OLE::Const;
# Merge the constants of all three type libraries into one hash reference.
# The libraries are loaded in the order listed and a later library overrides
# an earlier one on a name clash, just as a flat hash merge would.
# The library names are version-specific ("11.0"), so a machine with a
# different Office version will not have them: fail with a readable message
# rather than dying on an undefined hash dereference.
my $ole_const = {};
for my $typelib_name
(
	"Microsoft Word 11.0 Object Library",
	"Microsoft PowerPoint 11.0 Object Library",
	"Microsoft Office 11.0 Object Library",
)
{
	my $constants = Win32::OLE::Const-&gt;Load( $typelib_name )
		or die "Can't load OLE constants from '$typelib_name': "
			. "is that version of Office installed?\n";
	@{ $ole_const }{ keys %{ $constants } } = values %{ $constants };
}

# Set up files - must be absolute paths as OLE server doesn't use your CWD.
# First argument is the presentation, the optional second one the document.
my ( $pptfile, $docfile ) = @ARGV;
$pptfile = abs_path( $pptfile );

# The presentation path minus a trailing '.ppt' (any case). It names the
# default output document and is also the prefix of exported pictures.
my $file_stem = $pptfile;
$file_stem =~ s/\.ppt$//i;

# An explicit output name is made absolute; otherwise derive it from the stem.
$docfile = defined $docfile
	? abs_path( $docfile )
	: $file_stem . '.doc';

####################### Extract from PowerPoint ##############################

# Create PowerPoint OLE server. Subroutines may legitimately manipulate this.
# OLE server should 'Quit' if the script dies unexpectedly.
# OLE failures are reported through Win32::OLE-&gt;LastError(), not through $!,
# so that is what the error messages print.
my $ppt = Win32::OLE-&gt;new( 'PowerPoint.Application', 'Quit' )
	or die "Can't create PowerPoint OLE: ", Win32::OLE-&gt;LastError(), "\n";

# Can't set this to 0, for reasons I can't/don't understand.
$ppt-&gt;{Visible} = 1;

my $pres = $ppt-&gt;Presentations-&gt;Open( $pptfile )
	or die "Can't open PowerPoint file '$pptfile': ",
		Win32::OLE-&gt;LastError(), "\n";

# Each element of @slides is [ $slide_type, @entries ], where every entry is
# one of the array references built by extract_text, extract_table or
# extract_picture.
my @slides;

SLIDE:
for my $slide ( in $ppt-&gt;ActivePresentation-&gt;Slides )
{
	# To select and extract images, we need to actively 'View' the slide in
	# question, which is an item from a 1-based 'array' of slides.
	# This also makes it clear where the processing has reached.
	$ppt-&gt;ActiveWindow-&gt;View-&gt;GotoSlide( $slide-&gt;{SlideNumber} );

	next SLIDE unless slide_has_title( $slide );

	# Slide type will be 'frontispiece', 'chapter', 'list' or 'normal'.
	my $slide_type = get_slide_type( $slide );

	# Pictures are numbered per slide, placeholders first, then floating ones.
	my $picture_number = 1;
	my @slide_text;

	PLACEHOLDER:
	for my $placeholder ( in $slide-&gt;Shapes-&gt;Placeholders )
	{
		if ( contains_table( $placeholder ) )
		{
			push @slide_text, extract_table( $placeholder );
		}
		elsif ( contains_text( $placeholder ) )
		{
			my $text_type = text_or_title( $placeholder );

			# Remember how many entries existed before this placeholder, so
			# the special cases below can tell whether it contributed any.
			my $entries_before = scalar @slide_text;

			PARAGRAPH:
			for my $paragraph
			( in $placeholder-&gt;TextFrame-&gt;TextRange-&gt;Paragraphs )
			{
				next PARAGRAPH unless is_worthwhile_text( $paragraph );
				push @slide_text, extract_text( $paragraph, $text_type );
			}

			# Deal with icky special cases.
			if ( $slide_type eq 'frontispiece' )
			{
				# The second placeholder of the 'frontispiece' slide contains
				# a code number that should be prepended onto the title.
				# Only look for it when that placeholder exists and holds
				# text; otherwise the OLE access itself would be an error.
				my $code_number;
				if ( $slide-&gt;Shapes-&gt;Placeholders-&gt;Count &gt; 1 )
				{
					my $code_holder = $slide-&gt;Shapes-&gt;Placeholders(2);
					if ( contains_text( $code_holder ) )
					{
						( $code_number ) =
							$code_holder-&gt;TextFrame-&gt;TextRange-&gt;{Text}
							=~ /^(\w+)\s/i;
					}
				}

				# The paragraph entry is [ type, run, run, ... ] and each run
				# is [ text, font properties... ], so [-1][1][0] is the text
				# of the first run of the paragraph just added. Touch it only
				# if this placeholder really added a paragraph with a run,
				# so an earlier table/picture entry is never overwritten.
				my $paragraph_added = @slide_text &gt; $entries_before;
				if ( ! defined $code_number )
				{
					warn "Slide $slide-&gt;{SlideNumber}: "
						. "no code number found for the title\n";
				}
				elsif
				(
					$paragraph_added
						and ref( $slide_text[-1][1] ) eq 'ARRAY'
				)
				{
					# Ugly, but easiest way to do this: but this isn't OOP,
					# so it's OK :)
					$slide_text[-1][1][0]
						= "$code_number $slide_text[-1][1][0]";
				}

				# Everything else on the slide should be discarded.
				last PLACEHOLDER;
			}
			elsif ( $slide_type eq 'chapter' )
			{
				# We also discard the non-title text on 'chapter' slides.
				last PLACEHOLDER;
			}
		}
		elsif ( contains_placeheld_picture( $placeholder ) )
		{
			my $picture_filename =
				make_picture_filename
					( $file_stem, $slide-&gt;{SlideNumber}, $picture_number);
			push @slide_text, extract_picture
				( $placeholder, $picture_filename );
			$picture_number++;
		}
		else
		{
			next PLACEHOLDER;
		}
	}

	FLOATING_PICTURE:
	for my $shape ( in $slide-&gt;Shapes )
	{
		# Harvest floating pictures in addition to those in placeholders.
		# This will not include autoshapes or embedded OLE objects.
		next FLOATING_PICTURE unless contains_floating_picture( $shape );
		my $picture_filename =
			make_picture_filename
				( $file_stem, $slide-&gt;{SlideNumber}, $picture_number);
		push @slide_text, extract_picture( $shape, $picture_filename );
		$picture_number++;
	}

	push @slides, [ $slide_type =&gt; @slide_text ];
}

########################## Insert into Word ##################################

# Create Word OLE server; it should 'Quit' when the script ends or dies.
# OLE failures are reported through Win32::OLE-&gt;LastError(), not through $!,
# so that is what the error messages print.
my $word = Win32::OLE-&gt;new( 'Word.Application', 'Quit' )
	or die "Can't create Word OLE: ", Win32::OLE-&gt;LastError(), "\n";
$word-&gt;{Visible} = 1;
my $doc = $word-&gt;Documents-&gt;Add
	or die "Can't create new Word document: ",
		Win32::OLE-&gt;LastError(), "\n";

# All the preprocessing leg-work has been done by the extractor. Printing
# to Word is trivial with the type_paragraph subroutine.
for my $slide ( @slides )
{
	# Each slide record is [ $slide_type, @paragraphs ].
	my ( $slide_type, @paragraphs ) = @{ $slide };
	for my $paragraph ( @paragraphs )
	{
		type_paragraph( $paragraph, $slide_type );
	}
}

$word-&gt;ActiveDocument-&gt;SaveAs( $docfile );

exit 0;

################################ Slides ######################################

# Classify a slide as 'frontispiece', 'chapter', 'list' or 'normal'.
#   $slide - slide OLE object. Its title text is read when it is neither
#            the first slide nor a title-layout slide.
# Returns the slide type as a string.
sub get_slide_type
{
	my ( $slide ) = @_;

	# 'frontispiece' is for the presentation's first slide;
	return 'frontispiece' if $slide-&gt;{SlideNumber} == 1;

	# 'chapter' is for the presentation's section heading slides;
	return 'chapter' if $slide-&gt;{Layout} == $ole_const-&gt;{ppLayoutTitle};

	# 'list' is for slides whose bulleted layouts should be maintained,
	# recognised by how their title starts; 'normal' is for everything else.
	my $title_text = $slide-&gt;Shapes-&gt;Title-&gt;TextFrame-&gt;TextRange-&gt;{Text};
	return $title_text =~ /^(Objectives|Summary|Test yourself|Answers)/i
		? 'list'
		: 'normal';
}

# Decide whether a slide is worth processing.
#   $slide - slide OLE object.
# Returns 1 when the slide has content and a title; otherwise warns with the
# reason and returns nothing (undef / empty list).
sub slide_has_title
{
	my ( $slide ) = @_;

	# A blank layout or a slide without any shapes has nothing to extract.
	my $is_empty = $slide-&gt;{Layout} == $ole_const-&gt;{ppLayoutBlank}
		|| ! $slide-&gt;Shapes-&gt;Count;
	if ( $is_empty )
	{
		warn "Skipping slide $slide-&gt;{SlideNumber}: no content\n";
		return;
	}

	if ( ! $slide-&gt;Shapes-&gt;{HasTitle} )
	{
		warn "Skipping slide $slide-&gt;{SlideNumber}: no title\n";
		return;
	}

	return 1;
}

############################# Formatting #####################################

# Look up the Word style name for a kind of text on a kind of slide.
#   $slide_type - 'frontispiece', 'chapter', 'list' or 'normal'
#   $text_type  - 'title', 'text' or 'picture'
# Returns the style name, or 'Normal' for any combination not in the table.
sub get_style_for
{
	my ( $slide_type, $text_type ) = @_;

	my %style_for =
	(
		frontispiece =&gt; { title =&gt; 'Heading 1' },
		chapter      =&gt; { title =&gt; 'Heading 2' },
		list         =&gt; { title =&gt; 'Heading 2', text =&gt; 'Bullets' },
		normal       =&gt;
		{
			title   =&gt; 'Heading 3',
			text    =&gt; 'Normal',
			picture =&gt; 'Figure',
		},
	);

	my $styles_of_slide = $style_for{$slide_type};
	return 'Normal'
		unless $styles_of_slide and exists $styles_of_slide-&gt;{$text_type};
	return $styles_of_slide-&gt;{$text_type};
}

############################ Titles and Text #################################

# True (1) when the placeholder's HasTextFrame property is set; otherwise
# returns nothing (undef / empty list).
sub contains_text
{
	my ( $placeholder ) = @_;
	return unless $placeholder-&gt;{HasTextFrame};
	return 1;
}

# True (1) when the paragraph's Text holds something other than whitespace;
# otherwise returns nothing (undef / empty list).
sub is_worthwhile_text
{
	my ( $paragraph ) = @_;
	my $is_blank = $paragraph-&gt;{Text} =~ /^\s*$/;
	return if $is_blank;
	return 1;
}

# Classify a placeholder by its PlaceholderFormat Type.
# Returns 'title' for the title, centre title, vertical title and subtitle
# placeholder types, and 'text' for every other type.
sub text_or_title
{
	my ( $placeholder ) = @_;
	my $placeholder_type = $placeholder-&gt;PlaceholderFormat-&gt;{Type};

	my @title_constants = qw(
		ppPlaceholderTitle
		ppPlaceholderCenterTitle
		ppPlaceholderVerticalTitle
		ppPlaceholderSubtitle
	);
	for my $constant_name ( @title_constants )
	{
		return 'title' if $placeholder_type == $ole_const-&gt;{$constant_name};
	}
	return 'text';
}

# Turn one paragraph into a plain Perl structure.
#   $paragraph - paragraph OLE object whose Runs are read
#   $text_type - optional tag for the result; a false value means 'text'
# Returns [ $text_type, @runs ], each run being what _extract_run returns.
sub extract_text
{
	my ( $paragraph, $text_type ) = @_;
	$text_type ||= 'text';

	# A run is a section of text with the same font properties.
	my @runs = map { _extract_run( $_ ) } in( $paragraph-&gt;Runs );

	return [ $text_type, @runs ];
}

# Type one run of text at the current Word selection with its font settings.
#   $run - array reference [ $text, property =&gt; value, ... ] as built by
#          _extract_run. It is only read, never modified, so the extracted
#          data stays intact after typing.
sub _type_run
{
	my ( $run ) = @_;
	my $selection = $word-&gt;Selection;
	my ( $run_text, %font_properties ) = @{ $run };

	# Apply the properties in a fixed order instead of random hash order:
	# first those being switched off, then those being switched on. Word
	# documents Subscript and Superscript as mutually exclusive, so an "on"
	# value must never be followed by the other property's assignment.
	my @property_names = sort keys %font_properties;
	my @switched_off   = grep { ! $font_properties{ $_ } } @property_names;
	my @switched_on    = grep {   $font_properties{ $_ } } @property_names;

	for my $font_property ( @switched_off, @switched_on )
	{
		$selection-&gt;Font-&gt;{ $font_property }
			= $font_properties{ $font_property };
	}
	$selection-&gt;TypeText( $run_text );
}

# Turn one run (text with uniform font properties) into a Perl structure.
#   $run - run OLE object; its Text and four Font properties are read.
# Returns [ $text, Italic =&gt; ..., Bold =&gt; ..., Subscript =&gt; ...,
# Superscript =&gt; ... ] (property order is not significant), with 'smart'
# punctuation replaced by ASCII and trailing line ends removed.
sub _extract_run
{
	my ( $run ) = @_;
	my %font_properties;
	my $run_text = $run-&gt;{Text};
	for my $name ( qw{ Italic Bold Subscript Superscript } )
	{
		# Save these important font properties for each run,
		# so we don't have to manually reformat Arabidopsis and
		# CH4. Note that this assumes ppTrue == wdTrue == msoTrue, which
		# is probably true.
		$font_properties{ $name } = $run-&gt;Font-&gt;$name;
	}

	# The script asks Win32::OLE for UTF-8 strings. If the text arrives as
	# undecoded UTF-8 octets, decode it so the translation below works on
	# whole characters and cannot damage the bytes of multi-byte characters.
	# utf8::decode leaves the string untouched when it is not valid UTF-8.
	if ( defined $run_text and ! utf8::is_utf8( $run_text ) )
	{
		utf8::decode( $run_text );
	}

	# Remove 'smart' "quotes" -- and en/em dashes. The first six characters
	# are the Windows-1252 values the script always handled; the second six
	# are the Unicode code points of the same characters, which is what
	# they actually are in UTF-8 text.
	$run_text
		=~ tr/\x91\x92\x93\x94\x96\x97\x{2018}\x{2019}\x{201C}\x{201D}\x{2013}\x{2014}/''""--''""--/;

	# Remove trailing newlines from bulleted lists.
	$run_text
		=~ s/ [\x{0a}\x{0d}]+ \z//x;

	return [ $run_text =&gt; %font_properties ];
}

################################ Paragraph ###################################

# Type one extracted entry into Word, dispatching on its type tag.
#   $paragraph  - array reference [ $type, ... ] where $type is 'table',
#                 'picture', 'text' or 'title'
#   $slide_type - type of the slide the entry came from; selects the style
# Unknown types only produce a warning.
# NOTE: the type tag (and the table/filename payload) is shifted off
# $paragraph. For text and titles this is required: _type_formatted_text
# expects the array to contain nothing but runs.
sub type_paragraph
{
	my ( $paragraph, $slide_type ) = @_;
	my $paragraph_type = shift @{ $paragraph };

	if ( $paragraph_type eq 'table' )
	{
		my $table = shift @{ $paragraph };
		_type_table( $table, $slide_type );
	}
	elsif ( $paragraph_type eq 'picture' )
	{
		my $filename = shift @{ $paragraph };
		_type_picture( $filename, $slide_type );
	}
	elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' )
	{
		_type_formatted_text
			( $paragraph, get_style_for( $slide_type, $paragraph_type ) );
	}
	else
	{
		warn "Unsupported paragraph type $paragraph_type\n";
	}
}

# Type a paragraph of runs in the given Word style, end the paragraph, and
# switch the selection back to the 'Normal' style.
#   $paragraph - array reference containing only runs (type tag removed)
#   $style     - name of the Word style to apply
sub _type_formatted_text
{
	my ( $paragraph, $style ) = @_;
	my $cursor = $word-&gt;Selection;

	$cursor-&gt;{Style} = $style;
	foreach my $formatted_run ( @{ $paragraph } )
	{
		_type_run( $formatted_run );
	}
	$cursor-&gt;TypeParagraph();

	$cursor-&gt;{Style} = 'Normal';
}

############################### Pictures #####################################

# Build the name of an exported picture file:
# "<file stem>_<slide number>_<picture number>.png".
sub make_picture_filename
{
	my ( $file_stem, $slide_number, $picture_number ) = @_;
	my $base_name = join '_', $file_stem, $slide_number, $picture_number;
	return $base_name . '.png';
}

# True (1) when the shape's Type equals the msoPicture constant; otherwise
# returns nothing (undef / empty list).
sub contains_floating_picture
{
	my ( $shape ) = @_;
	my $is_picture = $shape-&gt;{Type} == $ole_const-&gt;{msoPicture};
	return unless $is_picture;
	return 1;
}

# True (1) when the placeholder has a defined PictureFormat property;
# otherwise returns nothing (undef / empty list).
sub contains_placeheld_picture
{
	my ( $placeholder ) = @_;

	# This is a hack: checking the placeholder's Type against msoPicture
	# does not work, because being a placeholder overrides being a picture,
	# and there is no HasPicture property to ask instead.
	my $picture_format = $placeholder-&gt;{PictureFormat};
	return unless defined $picture_format;
	return 1;
}

# Export a picture shape to a PNG file.
#   $shape            - shape OLE object holding the picture
#   $picture_filename - file to export to
# Returns [ 'picture', $picture_filename ] so the file can be inserted later.
sub extract_picture
{
	my ( $shape, $picture_filename ) = @_;

	# The export works on the window's current selection, so select the
	# shape first and then pick up the resulting shape range.
	$shape-&gt;Select;
	my $selected_shapes = $ppt-&gt;ActiveWindow-&gt;Selection-&gt;ShapeRange;

	# Write the selected shape range out as a PNG graphic.
	$selected_shapes-&gt;Export
		( $picture_filename, $ole_const-&gt;{ppShapeFormatPNG} );

	return [ 'picture', $picture_filename ];
}

# Insert a picture file at the current Word selection as an inline shape,
# apply the picture style for the slide type and start a new paragraph.
#   $picture_filename - picture file to insert
#   $slide_type       - slide type, used to look up the 'picture' style
sub _type_picture
{
	my ( $picture_filename, $slide_type ) = @_;
	my $cursor = $word-&gt;Selection;

	# Named arguments for AddPicture: embed the picture in the document
	# rather than linking to the file.
	my %picture_options =
	(
		LinkToFile       =&gt; $ole_const-&gt;{msoFalse},
		SaveWithDocument =&gt; $ole_const-&gt;{msoTrue},
	);

	# Insert inlined picture (a simple -&gt;Shapes will float the picture).
	$cursor-&gt;InlineShapes-&gt;AddPicture
		( $picture_filename, \%picture_options );

	$cursor-&gt;{Style} = get_style_for( $slide_type, 'picture' );

	# Add a newline so we don't end up typing on the same line as the picture.
	$cursor-&gt;TypeParagraph();
}

################################ Tables ######################################

# True (1) when the placeholder's HasTable property is set; otherwise
# returns nothing (undef / empty list).
sub contains_table
{
	my ( $placeholder ) = @_;
	return unless $placeholder-&gt;{HasTable};
	return 1;
}

# Copy a PowerPoint table into a Perl array of arrays.
#   $shape - shape OLE object whose Table is read
# Returns [ 'table', $table ], where $table-&gt;[row][column] (0-based) is an
# array reference of the cell's paragraphs as built by extract_text.
sub extract_table
{
	my ( $shape ) = @_;
	my $table;
	my $ppt_table = $shape-&gt;Table;

	# PowerPoint numbers rows and cells from 1; Perl arrays start at 0.
	ROW:
	for my $row_number ( 1 .. $ppt_table-&gt;Rows-&gt;Count )
	{
		my $row = $ppt_table-&gt;Rows( $row_number );

		CELL:
		for my $cell_number ( 1 .. $row-&gt;Cells-&gt;Count )
		{
			my $cell = $row-&gt;Cells( $cell_number )-&gt;Shape;

			# Every paragraph of the cell is extracted with the default
			# text type.
			my @cell_text = map { extract_text( $_ ) }
				in( $cell-&gt;TextFrame-&gt;TextRange-&gt;Paragraphs );

			$table-&gt;[ $row_number - 1 ][ $cell_number - 1 ] = \@cell_text;
		}
	}
	return [ 'table', $table ];
}

# Insert an extracted table at the current Word selection and fill it in.
#   $table      - array of rows, each an array of cells, each cell an array
#                 of paragraph entries (as built by extract_table)
#   $slide_type - slide type, passed on to type_paragraph for styling
# A missing or empty table is skipped. Afterwards the selection is moved to
# the end of the document.
sub _type_table
{
	my ( $table, $slide_type ) = @_;

	# Nothing to insert: avoid dereferencing an undefined table.
	return unless ref( $table ) eq 'ARRAY' and @{ $table };

	my $selection = $word-&gt;Selection;

	# Collate table size and placement (range) parameters. The Word table
	# must be as wide as the widest row, which need not be the first one.
	my $number_of_rows    = scalar @{ $table };
	my $number_of_columns = 0;
	for my $row ( @{ $table } )
	{
		my $cells_in_row = scalar @{ $row || [] };
		$number_of_columns = $cells_in_row
			if $cells_in_row &gt; $number_of_columns;
	}
	return unless $number_of_columns;

	my $end_of_selection = $selection-&gt;End;
	my $range = $word-&gt;ActiveDocument-&gt;Range
		( { Start =&gt; $end_of_selection, End =&gt; $end_of_selection } );

	# Insert table of calculated size at given range.
	my $table_obj = $selection-&gt;Tables-&gt;Add
		( $range, $number_of_rows, $number_of_columns );

	# Add data cell-by-cell.
	for my $i ( 0 .. $#{ $table } )
	{
		for my $j ( 0 .. $#{ $table-&gt;[$i] } )
		{
			# Perl arrays are 0-based, Word cells are 1-based.
			$table_obj-&gt;Cell( $i + 1, $j + 1 )-&gt;Select;
			for my $paragraph ( @{ $table-&gt;[$i][$j] } )
			{
				type_paragraph( $paragraph, $slide_type );
				# It'd be nice to suppress the newline at the end
				# of each cell, but I can't think of a nice way to do this.
			}
		}
	}
	# Set selection outside the end of the table (end of document, actually).
	$selection-&gt;EndKey( $ole_const-&gt;{wdStory} );
}

##############################################################################

__END__

=head1 NAME

ppt2doc.pl - Perl script to convert PowerPoint presentations to Word documents

=head1 SYNOPSIS

	perl ppt2doc.pl PPTFILE [WORDFILE]

=head1 ABSTRACT

Convert PowerPoint presentation to Word document

=head1 DESCRIPTION

Converts PowerPoint presentation C&lt;PPTFILE&gt; to Word document C&lt;WORDFILE&gt;.
If C&lt;WORDFILE&gt; is not specified, the data will be output to a file
of the same name with the C&lt;.ppt&gt; extension exchanged for C&lt;.doc&gt;.

Pictures in PowerPoint placeholders, and pictures floating freely on a slide, will be extracted to PNG files named
C&lt;pptname_slidenumber_picturenumber.png&gt;, where C&lt;pptname&gt; is the name of
the presentation, less its extension, C&lt;slidenumber&gt; is the number of the 
slide on which the picture was found, and C&lt;picturenumber&gt; is the number of 
the picture on that particular slide.
 
The script is obviously tuned for converting my lecture slides to lecture 
notes, but the code shows how you can handle tables, paragraphs, titles and 
images using OLE.

=head1 SEE ALSO

L&lt;perl&gt;

L&lt;Win32::OLE&gt;

L&lt;Win32::OLE::Const&gt;

L&lt;http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vbapp10/html/pptocObjectModelApplication.asp&gt; -
Microsoft Office XP PowerPoint object model.

=head1 AUTHOR

Steve Cook, E&lt;lt&gt;steve@steve.gb.comE&lt;gt&gt;

=head1 COPYRIGHT AND LICENSE

Copyright 2005 by Steve Cook

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut
&lt;/code&gt;</field>
<field name="codedescription">
I wrote this because I needed to convert a large number of presentations into lecture notes. The script does a fairly robust job of transferring pictures, tables, formatted text and titles from PowerPoint to Word. Although the code may be a little specific to the job I was doing, it's entirely done using OLE, and will be useful to anyone who would like to use &lt;code&gt;Win32::OLE&lt;/code&gt; with PowerPoint, for which I have never found any sizeable online examples before. There are lots of handy comments too, so you can avoid all the pitfalls I fell into...</field>
<field name="codecategory">
Win32 Stuff</field>
<field name="codeauthor">
polypompholyx (Steve Cook - &lt;steve@steve.gb.com&gt;)</field>
</data>
</node>
