http://www.perlmonks.org?node_id=491983
Category: Win32 Stuff
Author/Contact Info polypompholyx (Steve Cook - <steve@steve.gb.com>)
Description: I wrote this because I needed to convert a large number of presentations into lecture notes. The script does a fairly robust job of transferring pictures, tables, formatted text and titles from PowerPoint to Word. Although the code may be a little specific to the job I was doing, it's entirely done using OLE, and will be useful to anyone who would like to use Win32::OLE with PowerPoint, for which I have never found any sizeable online examples before. There are lots of handy comments too, so you can avoid all the pitfalls I fell into...
#!/usr/bin/perl 

# ppt2doc.pl v1.00

use strict;
use warnings;

if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i )
{
    system( "perldoc", $0 ) and die "For usage, perldoc $0\n";
    exit 0;
}

use Cwd qw( abs_path );

# Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin-
+1.
use Win32::OLE qw( in CP_UTF8 );
Win32::OLE->Option( CP => CP_UTF8 );
$Win32::OLE::Warn = 3;

# Obtain the mso/wd/pp constants in the cleanest way. Asking for plain
+ 
# 'Microsoft Office' doesn't work for some reason: we need details.
use Win32::OLE::Const;
my $ole_const =
{ 
    %{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") }
+, 
    %{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Libra
+ry") }, 
    %{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library")
+ }, 
};

# Set up files - must be absolute paths as OLE server doesn't use your
+ CWD.
my ( $pptfile, $docfile ) = @ARGV;
$pptfile = abs_path( $pptfile );
( my $file_stem = $pptfile ) =~ s/\.ppt$//i;
if ( defined $docfile )
{
    $docfile = abs_path( $docfile );
}
else
{
    $docfile = $file_stem . '.doc';
}

####################### Extract from PowerPoint ######################
+########

# Create PowerPoint OLE server. Subroutines may legitimately manipulat
+e this.
#OLE server should 'Quit' if the script dies unexpectedly.
my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' )
    or die "Can't create PowerPoint OLE: $!\n";

# Can't set this to 0, for reasons I can't/don't understand.
$ppt->{Visible} = 1;

my $pres = $ppt->Presentations->Open( $pptfile ) 
    or die "Can't open PowerPoint file '$pptfile': $!\n";        

my @slides;

SLIDE:
for my $slide ( in $ppt->ActivePresentation->Slides )
{    
    # To select and extract images, we need to actively 'View' the sli
+de in 
    # question, which is an item from a 1-based 'array' of slides. 
    # This also makes it clear where the processing has reached.
    $ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} );
    
    next SLIDE unless slide_has_title( $slide );
    
    # Slide type will be 'frontispiece', 'chapter', 'list' or 'normal'
+.
    my $slide_type = get_slide_type( $slide );
    
    my $picture_number = 1;
    my @slide_text;
    
    PLACEHOLDER:
    for my $placeholder ( in $slide->Shapes->Placeholders ) 
    {            
        if ( contains_table( $placeholder ) )
        {
            push @slide_text, extract_table( $placeholder );          
+  
        }
        elsif ( contains_text( $placeholder ) )
        {
            my $text_type = text_or_title( $placeholder );
            PARAGRAPH:
            for my $paragraph 
            ( in $placeholder->TextFrame->TextRange->Paragraphs )
            {    
                next PARAGRAPH unless is_worthwhile_text( $paragraph )
+;
                push @slide_text, extract_text( $paragraph, $text_type
+ );
            }        

            # Deal with icky special cases.
            if ( $slide_type eq 'frontispiece' )
            {
                # The second placeholder of the 'frontispiece' slide c
+ontains 
                # a code number that should be prepended onto the titl
+e.
                my ( $code_number ) = 
                    $slide->Shapes->Placeholders(2)
                        ->TextFrame->TextRange->{Text}
                    =~ /^(\w+)\s/i;
                # Ugly, but easiest way to do this: but this isn't OOP
+, so
                # it's OK :)
                $slide_text[-1][1][0] = "$code_number $slide_text[-1][
+1][0]";

                # Everything else on the slide should be discarded.   
+             
                last PLACEHOLDER;
            }
            elsif ( $slide_type eq 'chapter' )
            {
                # We also discard the non-title text on 'chapter' slid
+es.
                last PLACEHOLDER;
            }
        }
        elsif ( contains_placeheld_picture( $placeholder ) )
        {
            my $picture_filename = 
                make_picture_filename
                    ( $file_stem, $slide->{SlideNumber}, $picture_numb
+er);        
            push @slide_text, extract_picture
                ( $placeholder, $picture_filename );
            $picture_number++;
        }
        else
        {
            next PLACEHOLDER;
        }
    }
    
    FLOATING_PICTURE:
    for my $shape ( in $slide->Shapes ) 
    {
        # Harvest floating pictures in addition to those in placeholde
+rs.
        # This will not include autoshapes or embedded OLE objects.
        next FLOATING_PICTURE unless contains_floating_picture( $shape
+ );
        my $picture_filename = 
            make_picture_filename
                ( $file_stem, $slide->{SlideNumber}, $picture_number);
        push @slide_text, extract_picture( $shape, $picture_filename )
+;    
        $picture_number++;
    }
    
    push @slides, [ $slide_type => @slide_text ];
}

########################## Insert into Word ##########################
+########

# Create Word OLE server.
my $word = Win32::OLE->new( 'Word.Application', 'Quit' )
    or die "Can't create Word OLE: $!\n";
$word->{Visible} = 1;
my $doc = $word->Documents->Add
    or die "Can't create new Word document: $!\n";

# All the preprocessing leg-work has been done by the extractor. Print
+ing
# to Word is trivial with the type_paragraph subroutine.
for my $slide ( @slides )
{
    my ( $slide_type, @paragraphs ) = @{ $slide };
    for my $paragraph ( @paragraphs )
    {
        type_paragraph( $paragraph, $slide_type );
    }
}

$word->ActiveDocument->SaveAs( $docfile );

exit 0;

################################ Slides ##############################
+########

sub get_slide_type
{
    my ( $slide ) = @_;
    if ( $slide->{SlideNumber} == 1 )
    {
        # 'frontispiece' is for the presentation's first slide;
        return 'frontispiece';
    }
    elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} )
    {
        # 'chapter' is for the presentation's section heading slides;
        return 'chapter';
    }
    elsif
    (
        $slide->Shapes->Title->TextFrame->TextRange->{Text}
            =~ /^(Objectives|Summary|Test yourself|Answers)/i
    )
    {
        # 'list' is for slides whose bulleted layouts should be mainta
+ined;
        return 'list';
    }
    else
    {
        # 'normal' is for generic text slides.
        return 'normal';
    }
}

sub slide_has_title
{
    my ( $slide ) = @_;
    if
    (
        $slide->{Layout} == $ole_const->{ppLayoutBlank} 
            || ! $slide->Shapes->Count
    )
    {
        warn "Skipping slide $slide->{SlideNumber}: no content\n";
        return;
    }
    elsif ( ! $slide->Shapes->{HasTitle} )
    {
        warn "Skipping slide $slide->{SlideNumber}: no title\n";
        return;
    }
    else
    {
        return 1;
    }
}

############################# Formatting #############################
+########

sub get_style_for
{
    my ( $slide_type, $text_type ) = @_;
    my %style_for =
    (
        frontispiece => 
            { title => 'Heading 1'                                    
+    },
        chapter      => 
            { title => 'Heading 2'                                    
+    },
        list         => 
            { title => 'Heading 2', text => 'Bullets'                 
+    },
        normal       => 
            { title => 'Heading 3', text => 'Normal', picture => 'Figu
+re' },    
    );
    return exists $style_for{$slide_type}{$text_type} 
        ? $style_for{$slide_type}{$text_type} 
        : 'Normal'
}

############################ Titles and Text #########################
+########

sub contains_text
{
    my ( $placeholder ) = @_;
    return 1 if $placeholder->{HasTextFrame};
    return;
}

sub is_worthwhile_text
{
    my ( $paragraph ) = @_;
    return 1 if $paragraph->{Text} !~ /^\s*$/;
    return;        
}

sub text_or_title
{
    my ( $placeholder ) = @_;
    for ( $placeholder->PlaceholderFormat->{Type} )
    {
        return 'title' if $_ == $ole_const->{ppPlaceholderTitle};
        return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle}
+;
        return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitl
+e};
        return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle};
    }
    return 'text';
}

sub extract_text
{
    my ( $paragraph, $text_type ) = @_;
    $text_type ||= 'text';
    my @paragraph_text;                    
    RUN:
    for my $run ( in $paragraph->Runs )
    {    
        # A run is a section of text with the same font properties.
        my $run_item = _extract_run( $run );
        push @paragraph_text, $run_item;
    }
    return [ $text_type, @paragraph_text ];
}

sub _type_run
{    
    my ( $run ) = @_;
    my $selection = $word->Selection;
    my $run_text  = shift @{ $run };
    my %font_properties = @{ $run }; 
    while ( my ( $font_property, $value ) = each %font_properties )
    {
        $selection->Font->{ $font_property } = $value; 
    }
    $selection->TypeText( $run_text );
}

sub _extract_run
{
    my ( $run ) = @_;
    my %font_properties;
    my $run_text = $run->{Text};
    for my $name ( qw{ Italic Bold Subscript Superscript } )
    {
        # Save these important font properties for each run,
        # so we don't have to manually reformat Arabidopsis and 
        # CH4. Note that this assumes ppTrue == wdTrue == msoTrue, whi
+ch
        # is probably true.
        $font_properties{ $name } = $run->Font->$name;
    }
    
    # Remove 'smart' "quotes" -- and em dashes.
    $run_text 
        =~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/; 
    
    # Remove trailing newlines from bulleted lists.
    $run_text 
        =~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x;    
    
    return [ $run_text => %font_properties ];
}

################################ Paragraph ###########################
+########

sub type_paragraph
{
    my ( $paragraph, $slide_type ) = @_;
    my $selection = $word->Selection;
    my $paragraph_type = shift @{ $paragraph };

    if ( $paragraph_type eq 'table' )
    {
        my $table = shift @{ $paragraph };
        _type_table( $table, $slide_type );
    }
    elsif ( $paragraph_type eq 'picture' )
    {
        my ( $filename ) = shift @{ $paragraph };
        _type_picture( $filename, $slide_type );
    }
    elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' )
    {
        _type_formatted_text
            ( $paragraph, get_style_for( $slide_type, $paragraph_type 
+) );
    }
    else
    {
        warn "Unsupported paragraph type $paragraph_type\n";
    }
}

sub _type_formatted_text
{
    my ( $paragraph, $style ) = @_;
    my $selection = $word->Selection;
    $selection->{Style} = $style;
    for my $run ( @{ $paragraph } )
    {
        _type_run( $run );
    }
    $selection->TypeParagraph();
    $selection->{Style} = 'Normal';    
}

############################### Pictures #############################
+########

sub make_picture_filename
{
    my ( $file_stem, $slide_number, $picture_number ) = @_;
    return $file_stem . '_' . $slide_number . '_' . $picture_number . 
+'.png';
}

sub contains_floating_picture
{
    my ( $shape ) = @_;
    return 1 if $shape->{Type} == $ole_const->{msoPicture};
    return;
}

sub contains_placeheld_picture
{
    my ( $placeholder ) = @_;
    # This is a hack, but if you check a placeholder's Type property
    # for == msoPicture, it's false, because its placeholderiness
    # overrides its picturiness. Why isn't there a 
    # HasPicture property?
    return 1 if defined $placeholder->{PictureFormat};
    return;    
}

sub extract_picture
{
    my ( $shape, $picture_filename ) = @_;
    
    # Select the shape containing the picture.
    $shape->Select;            
    
    # Export the selected shape range as a PNG graphic. We'll store
    # the filename for later.
    $ppt->ActiveWindow->Selection->ShapeRange->Export
        ( $picture_filename, $ole_const->{ppShapeFormatPNG} );
    
    return [ 'picture' => $picture_filename ];
}

sub _type_picture
{
    my ( $picture_filename, $slide_type ) = @_;
    my $selection = $word->Selection;
    
    # Insert inlined picture (a simple ->Shapes will float the picture
+).
    $selection->InlineShapes->AddPicture
    ( $picture_filename, 
        { 
            LinkToFile       => $ole_const->{msoFalse}, 
            SaveWithDocument => $ole_const->{msoTrue},
        } 
    );
    $selection->{Style} = get_style_for( $slide_type, 'picture' );
    # Add a newline so we don't end up typing on the same line as the 
+picture.
    $selection->TypeParagraph();
}

################################ Tables ##############################
+########

sub contains_table
{
    my ( $placeholder ) = @_;
    return 1 if $placeholder->{HasTable};
    return;
}

sub extract_table
{
    my ( $shape ) = @_;
    my $table;
    
    # Extract data from table cell-by-cell, store in simple AoA.
    ROW:
    for my $i ( 1 .. $shape->Table->Rows->Count )
    {
        CELL:
        for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count )
        {
            my $cell = $shape->Table->Rows($i)->Cells($j)->Shape;
            
            my @cell_text;
            
            PARAGRAPH:
            for my $paragraph ( in $cell->TextFrame->TextRange->Paragr
+aphs )
            {    
                push @cell_text, extract_text( $paragraph );
            }
            
            # Convert to Perl 0-based arrays.
            $table->[ $i - 1 ][ $j - 1 ] = \@cell_text;    
        }
    }
    return [ 'table' => $table ];
}

sub _type_table
{
    my ( $table, $slide_type ) = @_;
    
    my $selection = $word->Selection;
    
    # Collate table size and placement (range) parameters.
    my $number_of_rows    = scalar @{ $table };
    my $number_of_columns = scalar @{ $table->[0] };
    my $end_of_doc        = $selection->End;
    my $range = $word->ActiveDocument->Range
        ( { Start => $end_of_doc, End => $end_of_doc } );
    
    # Insert table of calculated size at given range.
    my $table_obj = $selection->Tables->Add
        ( $range, $number_of_rows, $number_of_columns );
        
    # Add data cell-by-cell.
    for my $i ( 0 .. $#{ $table } )
    {
        for my $j ( 0 .. $#{ $table->[$i] } )
        {
            # Use Perl 0-based arrays.
            $table_obj->Cell( $i + 1, $j + 1 )->Select;
            for my $paragraph ( @{ $table->[$i][$j] } )
            {
                type_paragraph( $paragraph, $slide_type );
                # It'd be nice to supress the newline at the end
                # of each cell, but I can't think of a nice way to do 
+this.
            }
        }
    }
    # Set selection outside the end of the table (end of document, act
+ually).
    $selection->EndKey( $ole_const->{wdStory} );
}

######################################################################
+########

__END__

=head1 NAME

ppt2doc.pl - Perl script to convert PowerPoint presentations to Word d
+ocuments

=head1 SYNOPSIS

    perl ppt2doc.pl PPTFILE [WORDFILE]

=head1 ABSTRACT

Convert PowerPoint presentation to Word document

=head1 DESCRIPTION

Converts PowerPoint presentation C<PPTFILE> to Word document C<WORDFIL
+E>.
If C<WORDFILE> is not specified, the data will be output to a file
of the same name with the C<.ppt> extension exchanged for <.doc>.

Pictures in PowerPoint placeholders will be extracted to PNG files nam
+ed
C<pptname_slidenumber_picturenumber.png>, where C<pptname> is the name
+ of
the presentation, less its extension, C<slidenumber> is the number of 
+the 
slide on which the picture was found, and C<picturenumber> is the numb
+er of 
the picture on that particular slide.
 
The script is obviously tuned for converting my lecture slides to lect
+ure 
notes, but the code shows how you can handle tables, paragraphs, title
+s and 
images using OLE.

=head1 SEE ALSO

L<perl>

L<Win32::OLE>

L<Win32::OLE::Const>

L<http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vba
+pp10/html/pptocObjectModelApplication.asp> -
Microsoft Office XP PowerPoint object model.

=head1 AUTHOR

Steve Cook, E<lt>steve@steve.gb.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2005 by Steve Cook

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut