#!/usr/bin/perl # ppt2doc.pl v1.00 use strict; use warnings; if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i ) { system( "perldoc", $0 ) and die "For usage, perldoc $0\n"; exit 0; } use Cwd qw( abs_path ); # Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin-1. use Win32::OLE qw( in CP_UTF8 ); Win32::OLE->Option( CP => CP_UTF8 ); $Win32::OLE::Warn = 3; # Obtain the mso/wd/pp constants in the cleanest way. Asking for plain # 'Microsoft Office' doesn't work for some reason: we need details. use Win32::OLE::Const; my $ole_const = { %{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") }, %{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Library") }, %{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library") }, }; # Set up files - must be absolute paths as OLE server doesn't use your CWD. my ( $pptfile, $docfile ) = @ARGV; $pptfile = abs_path( $pptfile ); ( my $file_stem = $pptfile ) =~ s/\.ppt$//i; if ( defined $docfile ) { $docfile = abs_path( $docfile ); } else { $docfile = $file_stem . '.doc'; } ####################### Extract from PowerPoint ############################## # Create PowerPoint OLE server. Subroutines may legitimately manipulate this. #OLE server should 'Quit' if the script dies unexpectedly. my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' ) or die "Can't create PowerPoint OLE: $!\n"; # Can't set this to 0, for reasons I can't/don't understand. $ppt->{Visible} = 1; my $pres = $ppt->Presentations->Open( $pptfile ) or die "Can't open PowerPoint file '$pptfile': $!\n"; my @slides; SLIDE: for my $slide ( in $ppt->ActivePresentation->Slides ) { # To select and extract images, we need to actively 'View' the slide in # question, which is an item from a 1-based 'array' of slides. # This also makes it clear where the processing has reached. $ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} ); next SLIDE unless slide_has_title( $slide ); # Slide type will be 'frontispiece', 'chapter', 'list' or 'normal'. my $slide_type = get_slide_type( $slide ); my $picture_number = 1; my @slide_text; PLACEHOLDER: for my $placeholder ( in $slide->Shapes->Placeholders ) { if ( contains_table( $placeholder ) ) { push @slide_text, extract_table( $placeholder ); } elsif ( contains_text( $placeholder ) ) { my $text_type = text_or_title( $placeholder ); PARAGRAPH: for my $paragraph ( in $placeholder->TextFrame->TextRange->Paragraphs ) { next PARAGRAPH unless is_worthwhile_text( $paragraph ); push @slide_text, extract_text( $paragraph, $text_type ); } # Deal with icky special cases. if ( $slide_type eq 'frontispiece' ) { # The second placeholder of the 'frontispiece' slide contains # a code number that should be prepended onto the title. my ( $code_number ) = $slide->Shapes->Placeholders(2) ->TextFrame->TextRange->{Text} =~ /^(\w+)\s/i; # Ugly, but easiest way to do this: but this isn't OOP, so # it's OK :) $slide_text[-1][1][0] = "$code_number $slide_text[-1][1][0]"; # Everything else on the slide should be discarded. last PLACEHOLDER; } elsif ( $slide_type eq 'chapter' ) { # We also discard the non-title text on 'chapter' slides. last PLACEHOLDER; } } elsif ( contains_placeheld_picture( $placeholder ) ) { my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_number); push @slide_text, extract_picture ( $placeholder, $picture_filename ); $picture_number++; } else { next PLACEHOLDER; } } FLOATING_PICTURE: for my $shape ( in $slide->Shapes ) { # Harvest floating pictures in addition to those in placeholders. # This will not include autoshapes or embedded OLE objects. next FLOATING_PICTURE unless contains_floating_picture( $shape ); my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_number); push @slide_text, extract_picture( $shape, $picture_filename ); $picture_number++; } push @slides, [ $slide_type => @slide_text ]; } ########################## Insert into Word ################################## # Create Word OLE server. my $word = Win32::OLE->new( 'Word.Application', 'Quit' ) or die "Can't create Word OLE: $!\n"; $word->{Visible} = 1; my $doc = $word->Documents->Add or die "Can't create new Word document: $!\n"; # All the preprocessing leg-work has been done by the extractor. Printing # to Word is trivial with the type_paragraph subroutine. for my $slide ( @slides ) { my ( $slide_type, @paragraphs ) = @{ $slide }; for my $paragraph ( @paragraphs ) { type_paragraph( $paragraph, $slide_type ); } } $word->ActiveDocument->SaveAs( $docfile ); exit 0; ################################ Slides ###################################### sub get_slide_type { my ( $slide ) = @_; if ( $slide->{SlideNumber} == 1 ) { # 'frontispiece' is for the presentation's first slide; return 'frontispiece'; } elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} ) { # 'chapter' is for the presentation's section heading slides; return 'chapter'; } elsif ( $slide->Shapes->Title->TextFrame->TextRange->{Text} =~ /^(Objectives|Summary|Test yourself|Answers)/i ) { # 'list' is for slides whose bulleted layouts should be maintained; return 'list'; } else { # 'normal' is for generic text slides. return 'normal'; } } sub slide_has_title { my ( $slide ) = @_; if ( $slide->{Layout} == $ole_const->{ppLayoutBlank} || ! $slide->Shapes->Count ) { warn "Skipping slide $slide->{SlideNumber}: no content\n"; return; } elsif ( ! $slide->Shapes->{HasTitle} ) { warn "Skipping slide $slide->{SlideNumber}: no title\n"; return; } else { return 1; } } ############################# Formatting ##################################### sub get_style_for { my ( $slide_type, $text_type ) = @_; my %style_for = ( frontispiece => { title => 'Heading 1' }, chapter => { title => 'Heading 2' }, list => { title => 'Heading 2', text => 'Bullets' }, normal => { title => 'Heading 3', text => 'Normal', picture => 'Figure' }, ); return exists $style_for{$slide_type}{$text_type} ? $style_for{$slide_type}{$text_type} : 'Normal' } ############################ Titles and Text ################################# sub contains_text { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTextFrame}; return; } sub is_worthwhile_text { my ( $paragraph ) = @_; return 1 if $paragraph->{Text} !~ /^\s*$/; return; } sub text_or_title { my ( $placeholder ) = @_; for ( $placeholder->PlaceholderFormat->{Type} ) { return 'title' if $_ == $ole_const->{ppPlaceholderTitle}; return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle}; return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitle}; return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle}; } return 'text'; } sub extract_text { my ( $paragraph, $text_type ) = @_; $text_type ||= 'text'; my @paragraph_text; RUN: for my $run ( in $paragraph->Runs ) { # A run is a section of text with the same font properties. my $run_item = _extract_run( $run ); push @paragraph_text, $run_item; } return [ $text_type, @paragraph_text ]; } sub _type_run { my ( $run ) = @_; my $selection = $word->Selection; my $run_text = shift @{ $run }; my %font_properties = @{ $run }; while ( my ( $font_property, $value ) = each %font_properties ) { $selection->Font->{ $font_property } = $value; } $selection->TypeText( $run_text ); } sub _extract_run { my ( $run ) = @_; my %font_properties; my $run_text = $run->{Text}; for my $name ( qw{ Italic Bold Subscript Superscript } ) { # Save these important font properties for each run, # so we don't have to manually reformat Arabidopsis and # CH4. Note that this assumes ppTrue == wdTrue == msoTrue, which # is probably true. $font_properties{ $name } = $run->Font->$name; } # Remove 'smart' "quotes" -- and em dashes. $run_text =~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/; # Remove trailing newlines from bulleted lists. $run_text =~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x; return [ $run_text => %font_properties ]; } ################################ Paragraph ################################### sub type_paragraph { my ( $paragraph, $slide_type ) = @_; my $selection = $word->Selection; my $paragraph_type = shift @{ $paragraph }; if ( $paragraph_type eq 'table' ) { my $table = shift @{ $paragraph }; _type_table( $table, $slide_type ); } elsif ( $paragraph_type eq 'picture' ) { my ( $filename ) = shift @{ $paragraph }; _type_picture( $filename, $slide_type ); } elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' ) { _type_formatted_text ( $paragraph, get_style_for( $slide_type, $paragraph_type ) ); } else { warn "Unsupported paragraph type $paragraph_type\n"; } } sub _type_formatted_text { my ( $paragraph, $style ) = @_; my $selection = $word->Selection; $selection->{Style} = $style; for my $run ( @{ $paragraph } ) { _type_run( $run ); } $selection->TypeParagraph(); $selection->{Style} = 'Normal'; } ############################### Pictures ##################################### sub make_picture_filename { my ( $file_stem, $slide_number, $picture_number ) = @_; return $file_stem . '_' . $slide_number . '_' . $picture_number . '.png'; } sub contains_floating_picture { my ( $shape ) = @_; return 1 if $shape->{Type} == $ole_const->{msoPicture}; return; } sub contains_placeheld_picture { my ( $placeholder ) = @_; # This is a hack, but if you check a placeholder's Type property # for == msoPicture, it's false, because its placeholderiness # overrides its picturiness. Why isn't there a # HasPicture property? return 1 if defined $placeholder->{PictureFormat}; return; } sub extract_picture { my ( $shape, $picture_filename ) = @_; # Select the shape containing the picture. $shape->Select; # Export the selected shape range as a PNG graphic. We'll store # the filename for later. $ppt->ActiveWindow->Selection->ShapeRange->Export ( $picture_filename, $ole_const->{ppShapeFormatPNG} ); return [ 'picture' => $picture_filename ]; } sub _type_picture { my ( $picture_filename, $slide_type ) = @_; my $selection = $word->Selection; # Insert inlined picture (a simple ->Shapes will float the picture). $selection->InlineShapes->AddPicture ( $picture_filename, { LinkToFile => $ole_const->{msoFalse}, SaveWithDocument => $ole_const->{msoTrue}, } ); $selection->{Style} = get_style_for( $slide_type, 'picture' ); # Add a newline so we don't end up typing on the same line as the picture. $selection->TypeParagraph(); } ################################ Tables ###################################### sub contains_table { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTable}; return; } sub extract_table { my ( $shape ) = @_; my $table; # Extract data from table cell-by-cell, store in simple AoA. ROW: for my $i ( 1 .. $shape->Table->Rows->Count ) { CELL: for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count ) { my $cell = $shape->Table->Rows($i)->Cells($j)->Shape; my @cell_text; PARAGRAPH: for my $paragraph ( in $cell->TextFrame->TextRange->Paragraphs ) { push @cell_text, extract_text( $paragraph ); } # Convert to Perl 0-based arrays. $table->[ $i - 1 ][ $j - 1 ] = \@cell_text; } } return [ 'table' => $table ]; } sub _type_table { my ( $table, $slide_type ) = @_; my $selection = $word->Selection; # Collate table size and placement (range) parameters. my $number_of_rows = scalar @{ $table }; my $number_of_columns = scalar @{ $table->[0] }; my $end_of_doc = $selection->End; my $range = $word->ActiveDocument->Range ( { Start => $end_of_doc, End => $end_of_doc } ); # Insert table of calculated size at given range. my $table_obj = $selection->Tables->Add ( $range, $number_of_rows, $number_of_columns ); # Add data cell-by-cell. for my $i ( 0 .. $#{ $table } ) { for my $j ( 0 .. $#{ $table->[$i] } ) { # Use Perl 0-based arrays. $table_obj->Cell( $i + 1, $j + 1 )->Select; for my $paragraph ( @{ $table->[$i][$j] } ) { type_paragraph( $paragraph, $slide_type ); # It'd be nice to supress the newline at the end # of each cell, but I can't think of a nice way to do this. } } } # Set selection outside the end of the table (end of document, actually). $selection->EndKey( $ole_const->{wdStory} ); } ############################################################################## __END__ =head1 NAME ppt2doc.pl - Perl script to convert PowerPoint presentations to Word documents =head1 SYNOPSIS perl ppt2doc.pl PPTFILE [WORDFILE] =head1 ABSTRACT Convert PowerPoint presentation to Word document =head1 DESCRIPTION Converts PowerPoint presentation C to Word document C. If C is not specified, the data will be output to a file of the same name with the C<.ppt> extension exchanged for <.doc>. Pictures in PowerPoint placeholders will be extracted to PNG files named C, where C is the name of the presentation, less its extension, C is the number of the slide on which the picture was found, and C is the number of the picture on that particular slide. The script is obviously tuned for converting my lecture slides to lecture notes, but the code shows how you can handle tables, paragraphs, titles and images using OLE. =head1 SEE ALSO L L L L - Microsoft Office XP PowerPoint object model. =head1 AUTHOR Steve Cook, Esteve@steve.gb.comE =head1 COPYRIGHT AND LICENSE Copyright 2005 by Steve Cook This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut