In lieu of embedded-files-related examples, quite a few others distribution provides might be considered "suitable" to study, especially considering "Internals" and "Reference" (warning: 53 Mb PDF).
use strict;
use warnings;
use feature 'say';
use Data::Dump;
use Encode 'decode';
use CAM::PDF;
my $doc = CAM::PDF-> new( 'test.pdf' ) or die;
dd get_embedded_files( $doc );
# Get a hashref. Keys (filenames) may happen to be encoded as
# UTF16-BE, with prepended BOM. Sort it out yourself, if names
# are relevant at all.
sub get_embedded_files {
my $doc = shift;
my $names_dict = $doc-> getValue( $doc-> getRootDict-> { Names })
+or return {};
my $files_tree = $names_dict-> { EmbeddedFiles }
+or return {};
my @agenda = $files_tree;
my $ret = {};
# Hardly ever more than single leaf, but...
while ( @agenda ) {
my $item = $doc-> getValue( shift @agenda );
if ( $item-> { Kids }) {
my $kids = $doc-> getValue( $item-> { Kids });
push @agenda, @$kids
}
else {
my $nodes = $doc-> getValue( $item-> { Names });
my @names = map { $doc-> getValue( $_ )} @$nodes;
while ( @names ) {
my ( $k, $v ) = splice @names, 0, 2;
my $ef_node = $v-> { EF };
my $ef_dict = $doc-> getValue( $ef_node );
my $any_num = ( values %$ef_dict )[ 0 ]-> { value };
my $obj_node = $doc-> dereference( $any_num );
$ret-> { $k } = $doc-> decodeOne( $obj_node-> { value
+}, 0 );
}
}
}
return $ret
}
If it looks unpleasant and complex, then it is. "getValue" everywhere may lead to slight temporary madness in larger projects.
I only tested it against couple of my files. If anything is broken, here's a backup brute force solution (since files are probably simple and overhead tiny) -- just enumerate all objects, one of them will be yours.
use strict;
use warnings;
use feature 'say';
use CAM::PDF;
use XML::LibXML;
my $doc = CAM::PDF-> new( 'test.pdf' ) or die;
$doc-> cacheObjects;
while ( my ( $k, $v ) = each %{ $doc-> { objcache }}) {
next unless $v-> { value }{ type } eq 'dictionary' and
$v-> { value }{ value }{ StreamData };
my $str = $doc-> decodeOne( $v-> { value }, 0 );
my $dom = eval { XML::LibXML-> load_xml( string => $str )};
next unless $dom;
# Process expected XML, skip possible others (Adobe XMP, etc.)
# ...
}