I've previously written a rendering class that does just that:
package PDF::ToText;
use 5.006;
use warnings;
use strict;
use CAM::PDF;
use CAM::PDF::GS;
use base qw(CAM::PDF::GS);
=head1 NAME
PDF::ToText - CAM::PDF renderer to extract PDF Text and position information
=head1 VERSION
Version 0.01
=cut
# Package version; same value as the "Version 0.01" line in the POD above.
our $VERSION = '0.01';
=head1 SYNOPSIS
use CAM::PDF;
use PDF::ToText;
my $pdf = CAM::PDF->new($filename);
my $contentTree = $pdf->getPageContentTree(1);
$contentTree->render("PDF::ToText");
=head1 SUBROUTINES/METHODS
=head2 renderText
=cut
# Private helper: map a point to device coordinates by chaining the two
# transforms available on the renderer object.
#
# Arguments: the coordinates to convert (forwarded untouched to
#            textToUser).
# Returns:   whatever userToDevice yields for the textToUser result,
#            collected into a list.
sub _textToDevice {
    my ($self, @text_point) = @_;

    # textToUser is evaluated in list context and its result is handed
    # straight to userToDevice; the final list is stored in a named array
    # so the value returned in scalar context is the element count.
    my @device_point = $self->userToDevice( $self->textToUser(@text_point) );

    return @device_point;
}
# Renderer callback: report one text segment together with two opposite
# corners of its bounding box.
#
# Arguments:
#   $string - the text of the segment
#   $width  - the segment's width; used as the x extent passed to
#             _textToDevice
#
# Output: one line on the currently selected output handle (STDOUT unless
#         changed), holding the bottom-left x/y and top-right x/y as
#         produced by _textToDevice, followed by the text itself.
# Returns: nothing (bare return).
sub renderText {
    my ($self, $string, $width) = @_;

    # Only the bottom-left and top-right corners are printed, so only those
    # two are transformed.  The box height comes from $self->{Tfs}
    # (presumably the current text font size -- confirm against CAM::PDF::GS).
    my @bottom_left = $self->_textToDevice(0, 0);
    my @top_right   = $self->_textToDevice($width, $self->{Tfs});

    # This statement must stay on one logical line: the pasted original was
    # split inside the variable name ("$s" / "+tring"), which parses as
    # "$s + tring" and fails to compile under "use strict".
    printf "%7.2f %7.2f %7.2f %7.2f %s\n", @bottom_left, @top_right, $string;

    return;
}
It's a drop-in replacement for CAM::PDF::PageText.
In its current state, it dumps text coordinates to STDOUT, but it can easily be amended to collect them in a global variable or whatever (CAM::PDF doesn't currently support the passing of handles).