http://www.perlmonks.org?node_id=535462
Category: HTML utility
Author/Contact Info: JefferySchmitz@mac.com
Description: This script uses the HTML::TableExtract module that Mojotoad wrote. It is very handy for stripping out just the error messages in NetIQ reports so they can be emailed out. Matt helped me with this a lot, so I have to give him props for this one.
#!/usr/bin/perl -w

use strict;
use warnings;

use LWP::Simple;
use HTML::TableExtract;

# Main driver: read the saved main report, find every row whose second
# column says "failed", follow the link in its third column, and print
# the rows of the linked failure report.

# Path of the saved main report on local disk.
my $report_file = 'Report.htm';

# Whole main report as a single string.
my $html_report;
# replace this with LWP::Simple get() or somesuch
# for fetching main report
{
  # Three-argument open with a lexical handle: the file name can never be
  # misread as an open mode, and the handle cannot clash with other globals.
  open(my $report_fh, '<', $report_file) or die "oof: $!\n";
  local $/;    # slurp mode: read the whole file in one go
  $html_report = <$report_fh>;
  close($report_fh);
}

foreach my $row (rows_from_main_report($html_report)) {
  # Cells may be undefined (short rows, possibly empty cells); fall back
  # to '' so the matches below do not warn.
  my $status    = defined $row->[1] ? $row->[1] : '';
  my $link_cell = defined $row->[2] ? $row->[2] : '';
  next unless $status =~ /failed/i;

  # Pull the href value out of the cell's raw HTML. Accepts double-quoted,
  # single-quoted and unquoted attribute values, in any letter case; only
  # the alternative that matched yields a defined capture.
  my ($link) = grep { defined $_ }
    $link_cell =~ /href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']+))/i;
  unless (defined $link && length $link) {
    print STDERR "no link from row ($link_cell)\n";
    next;
  }

  my $html = get($link);
  unless ($html) {
    print STDERR "no html from link $link\n";
    next;
  }

  # Print the link once, as a heading for the rows of its failure report.
  print "$link\n";
  foreach my $fail_row (rows_from_fail_report($html)) {
    # do whatever here
    print join(' : ', map { defined $_ ? $_ : '' } @$fail_row), "\n";
  }
}

# Extract the rows of the main report table.
#
# Argument: the main report's HTML as a string. Dies with
# "HTML string required" when it is missing or false.
#
# The table is selected by its column headers (computer, data, time).
# keep_html is switched on so the cells keep their markup, which lets the
# caller dig the href out of a cell.
#
# Returns the rows of the first matching table (whatever the table
# object's rows method yields), or nothing when no table matches.
sub rows_from_main_report {
  my $html = shift || die "HTML string required\n";
  my $te = HTML::TableExtract->new(
    headers   => [qw(computer data time)],
    keep_html => 1,
  );
  # Parse the HTML that was passed in, not a variable from the outer file scope.
  $te->parse($html);
  my $ts = $te->first_table_state_found;
  # No table with those headers: hand back nothing instead of dying on
  # a method call against an undefined value.
  return unless $ts;
  return $ts->rows;
}

# Extract the rows of a failure report table.
#
# Argument: the failure report's HTML as a string. Dies with
# "HTML string required" when it is missing or false.
#
# The table is selected by its column headers
# (job, date, client, class, schedule, master, desc).
#
# Returns the rows of the first matching table (whatever the table
# object's rows method yields), or nothing when no table matches.
sub rows_from_fail_report {
  my $html = shift || die "HTML string required\n";
  my $te = HTML::TableExtract->new(
    headers => [qw(job date client class schedule master desc)],
  );
  $te->parse($html);
  my $ts = $te->first_table_state_found;
  # A linked page without the expected table must not abort the whole
  # run; report "no rows" instead of calling a method on undef.
  return unless $ts;
  return $ts->rows;
}