# Convert the PDF to XML by running pdftohtml and reading its standard output
# through a pipe.  The command is handed to the shell as a single string, so
# the file name is wrapped in double quotes (unless the caller already quoted
# it) to keep paths containing spaces together as one argument.
my $pdf_arg = $PDF_FILE =~ /\A".*"\z/s ? $PDF_FILE : qq{"$PDF_FILE"};
open(my $XML, "-|", "pdftohtml.exe -xml -zoom 1.4 -stdout $pdf_arg")
    or die "$!\n$^E";

# We are only interested in the text for the "ROUTE TO:" and "SORT GROUP:"
# sections.  Set the twig_handlers to extract the nodes of interest; all other
# nodes will be ignored.  The XPath queries provide an extra 1/20 inch padding
# on all sides to take font and rendering variations into account.
my $t = XML::Twig->new(
    twig_handlers => {
        '//text[(@top >= 180 and @top <= 190) and (@left >= 100 and @left <= 111)]' => \&RouteTo,
        '//text[(@top >= 215 and @top <= 225) and (@left >= 260 and @left <= 270)]' => \&InvoiceSort,
    },
    comments   => 'drop',      # remove any comments
    empty_tags => 'normal',    # empty tags are written as <tag/>
);

# Stream the converter output through the twig; the handlers above fire as
# matching <text> elements are completed.
$t->parse($XML);
$t->purge;    # release whatever the twig still holds in memory

# Closing a pipe waits for the child process.  close() returns false either on
# a system error ($! set) or when the child exited with a non-zero status
# ($! is 0 and $? holds the status).  Report it instead of silently ignoring
# it, but only warn so an already-parsed document is not thrown away.
close $XML
    or warn $!
        ? "Error closing pdftohtml.exe pipe: $!\n"
        : "pdftohtml.exe exited with status " . ($? >> 8) . "\n";