Beefy Boxes and Bandwidth Generously Provided by pair Networks
Keep It Simple, Stupid
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
#!/usr/bin/perl # ppt2doc.pl v1.00 use strict; use warnings; if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i ) { system( "perldoc", $0 ) and die "For usage, perldoc $0\n"; exit 0; } use Cwd qw( abs_path ); # Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin- +1. use Win32::OLE qw( in CP_UTF8 ); Win32::OLE->Option( CP => CP_UTF8 ); $Win32::OLE::Warn = 3; # Obtain the mso/wd/pp constants in the cleanest way. Asking for plain + # 'Microsoft Office' doesn't work for some reason: we need details. use Win32::OLE::Const; my $ole_const = { %{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") } +, %{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Libra +ry") }, %{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library") + }, }; # Set up files - must be absolute paths as OLE server doesn't use your + CWD. my ( $pptfile, $docfile ) = @ARGV; $pptfile = abs_path( $pptfile ); ( my $file_stem = $pptfile ) =~ s/\.ppt$//i; if ( defined $docfile ) { $docfile = abs_path( $docfile ); } else { $docfile = $file_stem . '.doc'; } ####################### Extract from PowerPoint ###################### +######## # Create PowerPoint OLE server. Subroutines may legitimately manipulat +e this. #OLE server should 'Quit' if the script dies unexpectedly. my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' ) or die "Can't create PowerPoint OLE: $!\n"; # Can't set this to 0, for reasons I can't/don't understand. $ppt->{Visible} = 1; my $pres = $ppt->Presentations->Open( $pptfile ) or die "Can't open PowerPoint file '$pptfile': $!\n"; my @slides; SLIDE: for my $slide ( in $ppt->ActivePresentation->Slides ) { # To select and extract images, we need to actively 'View' the sli +de in # question, which is an item from a 1-based 'array' of slides. # This also makes it clear where the processing has reached. $ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} ); next SLIDE unless slide_has_title( $slide ); # Slide type will be 'frontispiece', 'chapter', 'list' or 'normal' +. my $slide_type = get_slide_type( $slide ); my $picture_number = 1; my @slide_text; PLACEHOLDER: for my $placeholder ( in $slide->Shapes->Placeholders ) { if ( contains_table( $placeholder ) ) { push @slide_text, extract_table( $placeholder ); + } elsif ( contains_text( $placeholder ) ) { my $text_type = text_or_title( $placeholder ); PARAGRAPH: for my $paragraph ( in $placeholder->TextFrame->TextRange->Paragraphs ) { next PARAGRAPH unless is_worthwhile_text( $paragraph ) +; push @slide_text, extract_text( $paragraph, $text_type + ); } # Deal with icky special cases. if ( $slide_type eq 'frontispiece' ) { # The second placeholder of the 'frontispiece' slide c +ontains # a code number that should be prepended onto the titl +e. my ( $code_number ) = $slide->Shapes->Placeholders(2) ->TextFrame->TextRange->{Text} =~ /^(\w+)\s/i; # Ugly, but easiest way to do this: but this isn't OOP +, so # it's OK :) $slide_text[-1][1][0] = "$code_number $slide_text[-1][ +1][0]"; # Everything else on the slide should be discarded. + last PLACEHOLDER; } elsif ( $slide_type eq 'chapter' ) { # We also discard the non-title text on 'chapter' slid +es. last PLACEHOLDER; } } elsif ( contains_placeheld_picture( $placeholder ) ) { my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_numb +er); push @slide_text, extract_picture ( $placeholder, $picture_filename ); $picture_number++; } else { next PLACEHOLDER; } } FLOATING_PICTURE: for my $shape ( in $slide->Shapes ) { # Harvest floating pictures in addition to those in placeholde +rs. # This will not include autoshapes or embedded OLE objects. next FLOATING_PICTURE unless contains_floating_picture( $shape + ); my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_number); push @slide_text, extract_picture( $shape, $picture_filename ) +; $picture_number++; } push @slides, [ $slide_type => @slide_text ]; } ########################## Insert into Word ########################## +######## # Create Word OLE server. my $word = Win32::OLE->new( 'Word.Application', 'Quit' ) or die "Can't create Word OLE: $!\n"; $word->{Visible} = 1; my $doc = $word->Documents->Add or die "Can't create new Word document: $!\n"; # All the preprocessing leg-work has been done by the extractor. Print +ing # to Word is trivial with the type_paragraph subroutine. for my $slide ( @slides ) { my ( $slide_type, @paragraphs ) = @{ $slide }; for my $paragraph ( @paragraphs ) { type_paragraph( $paragraph, $slide_type ); } } $word->ActiveDocument->SaveAs( $docfile ); exit 0; ################################ Slides ############################## +######## sub get_slide_type { my ( $slide ) = @_; if ( $slide->{SlideNumber} == 1 ) { # 'frontispiece' is for the presentation's first slide; return 'frontispiece'; } elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} ) { # 'chapter' is for the presentation's section heading slides; return 'chapter'; } elsif ( $slide->Shapes->Title->TextFrame->TextRange->{Text} =~ /^(Objectives|Summary|Test yourself|Answers)/i ) { # 'list' is for slides whose bulleted layouts should be mainta +ined; return 'list'; } else { # 'normal' is for generic text slides. return 'normal'; } } sub slide_has_title { my ( $slide ) = @_; if ( $slide->{Layout} == $ole_const->{ppLayoutBlank} || ! $slide->Shapes->Count ) { warn "Skipping slide $slide->{SlideNumber}: no content\n"; return; } elsif ( ! $slide->Shapes->{HasTitle} ) { warn "Skipping slide $slide->{SlideNumber}: no title\n"; return; } else { return 1; } } ############################# Formatting ############################# +######## sub get_style_for { my ( $slide_type, $text_type ) = @_; my %style_for = ( frontispiece => { title => 'Heading 1' + }, chapter => { title => 'Heading 2' + }, list => { title => 'Heading 2', text => 'Bullets' + }, normal => { title => 'Heading 3', text => 'Normal', picture => 'Figu +re' }, ); return exists $style_for{$slide_type}{$text_type} ? $style_for{$slide_type}{$text_type} : 'Normal' } ############################ Titles and Text ######################### +######## sub contains_text { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTextFrame}; return; } sub is_worthwhile_text { my ( $paragraph ) = @_; return 1 if $paragraph->{Text} !~ /^\s*$/; return; } sub text_or_title { my ( $placeholder ) = @_; for ( $placeholder->PlaceholderFormat->{Type} ) { return 'title' if $_ == $ole_const->{ppPlaceholderTitle}; return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle} +; return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitl +e}; return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle}; } return 'text'; } sub extract_text { my ( $paragraph, $text_type ) = @_; $text_type ||= 'text'; my @paragraph_text; RUN: for my $run ( in $paragraph->Runs ) { # A run is a section of text with the same font properties. my $run_item = _extract_run( $run ); push @paragraph_text, $run_item; } return [ $text_type, @paragraph_text ]; } sub _type_run { my ( $run ) = @_; my $selection = $word->Selection; my $run_text = shift @{ $run }; my %font_properties = @{ $run }; while ( my ( $font_property, $value ) = each %font_properties ) { $selection->Font->{ $font_property } = $value; } $selection->TypeText( $run_text ); } sub _extract_run { my ( $run ) = @_; my %font_properties; my $run_text = $run->{Text}; for my $name ( qw{ Italic Bold Subscript Superscript } ) { # Save these important font properties for each run, # so we don't have to manually reformat Arabidopsis and # CH4. Note that this assumes ppTrue == wdTrue == msoTrue, whi +ch # is probably true. $font_properties{ $name } = $run->Font->$name; } # Remove 'smart' "quotes" -- and em dashes. $run_text =~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/; # Remove trailing newlines from bulleted lists. $run_text =~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x; return [ $run_text => %font_properties ]; } ################################ Paragraph ########################### +######## sub type_paragraph { my ( $paragraph, $slide_type ) = @_; my $selection = $word->Selection; my $paragraph_type = shift @{ $paragraph }; if ( $paragraph_type eq 'table' ) { my $table = shift @{ $paragraph }; _type_table( $table, $slide_type ); } elsif ( $paragraph_type eq 'picture' ) { my ( $filename ) = shift @{ $paragraph }; _type_picture( $filename, $slide_type ); } elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' ) { _type_formatted_text ( $paragraph, get_style_for( $slide_type, $paragraph_type +) ); } else { warn "Unsupported paragraph type $paragraph_type\n"; } } sub _type_formatted_text { my ( $paragraph, $style ) = @_; my $selection = $word->Selection; $selection->{Style} = $style; for my $run ( @{ $paragraph } ) { _type_run( $run ); } $selection->TypeParagraph(); $selection->{Style} = 'Normal'; } ############################### Pictures ############################# +######## sub make_picture_filename { my ( $file_stem, $slide_number, $picture_number ) = @_; return $file_stem . '_' . $slide_number . '_' . $picture_number . +'.png'; } sub contains_floating_picture { my ( $shape ) = @_; return 1 if $shape->{Type} == $ole_const->{msoPicture}; return; } sub contains_placeheld_picture { my ( $placeholder ) = @_; # This is a hack, but if you check a placeholder's Type property # for == msoPicture, it's false, because its placeholderiness # overrides its picturiness. Why isn't there a # HasPicture property? return 1 if defined $placeholder->{PictureFormat}; return; } sub extract_picture { my ( $shape, $picture_filename ) = @_; # Select the shape containing the picture. $shape->Select; # Export the selected shape range as a PNG graphic. We'll store # the filename for later. $ppt->ActiveWindow->Selection->ShapeRange->Export ( $picture_filename, $ole_const->{ppShapeFormatPNG} ); return [ 'picture' => $picture_filename ]; } sub _type_picture { my ( $picture_filename, $slide_type ) = @_; my $selection = $word->Selection; # Insert inlined picture (a simple ->Shapes will float the picture +). $selection->InlineShapes->AddPicture ( $picture_filename, { LinkToFile => $ole_const->{msoFalse}, SaveWithDocument => $ole_const->{msoTrue}, } ); $selection->{Style} = get_style_for( $slide_type, 'picture' ); # Add a newline so we don't end up typing on the same line as the +picture. $selection->TypeParagraph(); } ################################ Tables ############################## +######## sub contains_table { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTable}; return; } sub extract_table { my ( $shape ) = @_; my $table; # Extract data from table cell-by-cell, store in simple AoA. ROW: for my $i ( 1 .. $shape->Table->Rows->Count ) { CELL: for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count ) { my $cell = $shape->Table->Rows($i)->Cells($j)->Shape; my @cell_text; PARAGRAPH: for my $paragraph ( in $cell->TextFrame->TextRange->Paragr +aphs ) { push @cell_text, extract_text( $paragraph ); } # Convert to Perl 0-based arrays. $table->[ $i - 1 ][ $j - 1 ] = \@cell_text; } } return [ 'table' => $table ]; } sub _type_table { my ( $table, $slide_type ) = @_; my $selection = $word->Selection; # Collate table size and placement (range) parameters. my $number_of_rows = scalar @{ $table }; my $number_of_columns = scalar @{ $table->[0] }; my $end_of_doc = $selection->End; my $range = $word->ActiveDocument->Range ( { Start => $end_of_doc, End => $end_of_doc } ); # Insert table of calculated size at given range. my $table_obj = $selection->Tables->Add ( $range, $number_of_rows, $number_of_columns ); # Add data cell-by-cell. for my $i ( 0 .. $#{ $table } ) { for my $j ( 0 .. $#{ $table->[$i] } ) { # Use Perl 0-based arrays. $table_obj->Cell( $i + 1, $j + 1 )->Select; for my $paragraph ( @{ $table->[$i][$j] } ) { type_paragraph( $paragraph, $slide_type ); # It'd be nice to supress the newline at the end # of each cell, but I can't think of a nice way to do +this. } } } # Set selection outside the end of the table (end of document, act +ually). $selection->EndKey( $ole_const->{wdStory} ); } ###################################################################### +######## __END__ =head1 NAME ppt2doc.pl - Perl script to convert PowerPoint presentations to Word d +ocuments =head1 SYNOPSIS perl ppt2doc.pl PPTFILE [WORDFILE] =head1 ABSTRACT Convert PowerPoint presentation to Word document =head1 DESCRIPTION Converts PowerPoint presentation C<PPTFILE> to Word document C<WORDFIL +E>. If C<WORDFILE> is not specified, the data will be output to a file of the same name with the C<.ppt> extension exchanged for <.doc>. Pictures in PowerPoint placeholders will be extracted to PNG files nam +ed C<pptname_slidenumber_picturenumber.png>, where C<pptname> is the name + of the presentation, less its extension, C<slidenumber> is the number of +the slide on which the picture was found, and C<picturenumber> is the numb +er of the picture on that particular slide. The script is obviously tuned for converting my lecture slides to lect +ure notes, but the code shows how you can handle tables, paragraphs, title +s and images using OLE. =head1 SEE ALSO L<perl> L<Win32::OLE> L<Win32::OLE::Const> L<http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vba +pp10/html/pptocObjectModelApplication.asp> - Microsoft Office XP PowerPoint object model. =head1 AUTHOR Steve Cook, E<lt>steve@steve.gb.comE<gt> =head1 COPYRIGHT AND LICENSE Copyright 2005 by Steve Cook This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut

In reply to Convert PowerPoint Presentation to Word Document with Win32::OLE by polypompholyx

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others romping around the Monastery: (20)
    As of 2015-07-01 17:24 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









      Results (13 votes), past polls