Does anyone know how to skip the entire content between two HTML tags? I'm using HTML::Parser, and I have figured out how to use the start and end handler methods in my subclass to match on certain tags and manipulate them. If there is text within those tags that I want to keep or modify, I print it to an output file. So I thought that if I matched on a style tag and simply returned (rather than printing the text to my output file), the text wouldn't be there. But now I understand that the text within the tags is processed separately, by the text handler method. What I'd like to do is match on the <style></style> tags and NOT print the text between them.
#!c:/perl/bin/perl
#This version allows the use of a filename specified on the command line
use strict;
use warnings;
package HTMLStrip;
use base "HTML::Parser";
#system("cls");
# Destination file for the text extracted from the HTML input.
my $output = "c:/perl/bin/parseOutput.txt";

# Opening with '>' creates the file or truncates an existing one, so a
# separate existence check and unlink are not needed.  PARSETEXT remains a
# package filehandle because the handler subs in this file print to it by name.
open PARSETEXT, '>', $output
    or die "Cannot open $output for writing: $!";

# Direct method call instead of the ambiguous indirect-object "new HTMLStrip".
my $p = HTMLStrip->new;

# Feed the input (files named on the command line, or STDIN) to the parser
# line-by-line, rather than the whole file at once.
while (my $line = <>) {
    $p->parse($line);
}

# Flush and parse any remaining unparsed HTML.
$p->eof;

# Buffered write errors only surface at close, so check it.
close PARSETEXT
    or die "Cannot close $output: $!";
# Text handler: called by HTML::Parser for each run of character data.
#
# Parameters:
#   $self     - the parser object
#   $text     - the raw text chunk
#   $is_cdata - true when the chunk is the literal body of a CDATA-content
#               element such as <style> or <script> (third argument passed to
#               the version-2 style text() method by HTML::Parser 3.x)
#
# Writes each non-blank, trimmed chunk to PARSETEXT followed by a newline.
sub text {
    my ($self, $text, $is_cdata) = @_;

    # Style sheets and scripts are not document text: drop them entirely.
    # This replaces the earlier "next unless" attempt, which could never work:
    # there is no enclosing loop inside this sub, and a string equal to '<!--'
    # can never also equal '-->'.
    return if $is_cdata;

    $text =~ s/#.*//;     # drop everything from the first '#' to end of line
    $text =~ s/^\s+//;    # leading whitespace
    $text =~ s/\s+$//;    # trailing whitespace (including the line ending)

    # Print non-blank chunks only.
    return unless length $text;
    print PARSETEXT $text . "\n";
    return;
}
# Start-tag handler: called by the parser for every opening HTML tag.
#
# Parameters:
#   $self - the parser object
#   $tag  - the tag name
#   $attr - hash reference of the tag's attributes
# (Any further arguments supplied by the parser are not used.)
#
# Only table-related tags produce output on PARSETEXT:
#   <table> - a banner line
#   <tr>    - a newline, so each row starts on its own line
#   <td>    - a tab, followed by "OK" / "ALARM" when the cell's class
#             attribute contains alarmClear / alarmSet
sub start {
    my ($self, $tag, $attr) = @_;

    if ($tag =~ /^table$/) {
        # The banner literal was previously split across two lines, which made
        # it end in a stray "+n" instead of a newline.
        print PARSETEXT "\n************* BEGIN TABLE ****************\n";
    }
    elsif ($tag =~ /^tr$/) {
        print PARSETEXT "\n";
    }
    elsif ($tag =~ /^td$/) {
        print PARSETEXT "\t";

        my $class = $attr->{'class'};
        if (defined $class) {
            # Alarm cells are empty in the markup; their state is carried
            # only by the class attribute, so spell it out in the output.
            print PARSETEXT "OK"    if $class =~ /alarmClear/;
            print PARSETEXT "ALARM" if $class =~ /alarmSet/;
        }
    }
    return;
}
# End-tag handler: called by the parser for every closing HTML tag.
#
# Parameters:
#   $self - the parser object
#   $tag  - the tag name
# (Any further arguments supplied by the parser are not used.)
#
# Only </table> produces output: a banner line on PARSETEXT.
sub end {
    my ($self, $tag) = @_;

    if ($tag =~ /^table$/) {
        # The terminating semicolon was previously stranded on its own line
        # behind a "+", which made the statement a syntax error.
        print PARSETEXT "\n************* END TABLE ****************\n";
    }
    return;
}
############################################################
############################################################
HTML FILE CONTENTS
<!DOCTYPE html PUBLIC
"-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>System Status</title>
<style type="text/css">
<!--
/* colors to use:
"CSI Blue" 004683
"CSI Red" 7e2d41
"CSI Grey" e5e5e5
"Liz Green" 00CC33
*/
body {
background-color: #FFFFFF;
color: #004683;
}
dt {
font-weight: bold;
}
#main {
float: left;
margin: 1em;
}
#menu {
float: left;
margin: 1em;
font-family: sans-serif;
background-color: #7e2d41;
}
#menu ul {
list-style: none;
margin: 0;
padding: 0;
}
#menu li {
margin: 1em;
}
#menu li a {
color: #fff;
text-decoration: none;
}
#menu li a {
width: auto;
}
div.messages {
/*width: 70%;*/
padding: 1em;
margin-top: 1em;
margin-bottom: 1em;
font-size: x-large;
border: thin solid black;
}
#messages {
/*width: 70%;*/
padding: 1em;
margin-top: 1em;
margin-bottom: 1em;
font-size: x-large;
border: thin solid black;
}
/* TODO trash all these? */
.tallrow {
padding-bottom: 6em;
/*width: 70%;*/
}
.input {
/*width: 45%;*/
float: right;
}
.info {
float: right;
}
/* TODO end of area to trash? */
table.bigdata {
text-align: center;
}
div.boxer {
padding: 1em;
background-color: #e5e5e5;
border: thin solid black;
}
td.label {
text-align: right;
padding-right: 1em;
}
td.labelInvalid {
text-align: right;
padding-right: 1em;
color: red;
}
td.lineitem {
text-align: left;
}
th {
padding-right: 1em;
}
tr.spacer {
height: 1em;
}
.section {
text-align: right;
}
td.alarmSet {
background-color: red;
color:white;
}
td.alarmSet:after { content: "ALARM"; }
td.alarmClear { background-color: #00CC33; }
td.alarmClear:after { content: "OK"; }
-->
</style>
</head>
<body onLoad="setTimeout('reload()',60000)" >
<div id="menu"><ul><li><a href="index.cgi">System Status</a></li><li><a href="netconf.cgi">Local Network</a></li><li><a href="rf.cgi">RF Configuration</a></li><li><a href="programfilter.cgi">Program a Filter</a></li><li><a href="remotenetconf.cgi">Remote Network</a></li><li><a href="snmpconf.cgi">SNMP Configuration</a></li><li><a href="status.cgi">System Health</a></li><li><a href="install.cgi">Install Software</a></li><li><a href="reboot.cgi">Reboot</a></li></ul></div><div id="main" onLoad=setTimeout('reload()',60000)><h1>System Status</h1>
<script type="text/javascript">
function reload()
{
window.location.reload();
}
</script>
<table class="bigdata">
<tr>
<td class="label"><b>Timestamp</b></td>
<td colspan="4" class="lineitem"> 2010:03:09 - 18:26:45 </td>
</tr>
<tr>
<td class="label"><b>System Uptime</b></td>
<!-- I'd like to use colspan="0" but IE sucks. -->
<td colspan="4" class="lineitem">
16 days 3 hours 5 minutes
</td>
</tr>
<tr>
<td class="label"><b>Software Version</b></td>
<td colspan="4" class="lineitem"> 2.2.4 REL </td>
</tr>
<tr>
<td class="label"><b>Serial Number</b></td>
<td colspan="4" class="lineitem"> CFB90357-000000 </td>
</tr>
<tr>
<td class="label"><b>Model Number</b></td>
<td colspan="4" class="lineitem"> DSP85-C/P </td>
</tr>
<tr>
<td class="label"><b>Item Number</b></td>
<td colspan="4" class="lineitem"> CS10-377-403 </td>
</tr>
<tr>
<td class="label"><b>Location</b></td>
<td colspan="4" class="lineitem"> Unknown </td>
</tr>
<tr class="spacer" />
<tr>
<th></th>
<th colspan="2"> Band 1 (CELL) </th>
<th colspan="2"> Band 2 (PCS) </th>
</tr>
<tr> <th align="right">Active Filter</th> <td colspan="2"><tt>cgA0-0</tt></td><td colspan="2"><tt>pgA0B4B5F0C5-0</tt></td>
</tr>
<tr class="spacer" />
<tr>
<th style="text-align:right">Power</th>
<th> Down Link </th> <th> Up Link</th> <th> Down Link </th> <th> Up Link</th>
</tr>
<tr>
<td class="label">
In-band Input<sup><a target="_blank" href="help.html#1">?</a></sup> (dBm)
</td>
<td>-35.6</td><td>≤ -66.0</td><td>-43.4</td><td>≤ -66.0</td>
</tr>
<tr>
<td class="label">
Measured Output<sup><a target="_blank" href="help.html#2">?</a></sup> (dBm)
</td>
<td>18.6</td><td>≤ 4.0</td><td>22.8</td><td>≤ 4.0</td>
</tr>
<tr>
<td class="label">
Composite Input<sup><a target="_blank" href="help.html#4">?</a></sup> (dBm)
</td>
<td>-33.9</td><td>-43.4</td><td>-23.7</td><td>≤ -53.0</td>
</tr>
<tr class="spacer" />
<tr class="section">
<th>Gain Control</th>
</tr>
<tr>
<td class="label">AGC Mode</td>
<td colspan="2">On</td><td colspan="2">On</td>
</tr>
<tr>
<td class="label">AGC Attenuation (dB)</td>
<td>0.0</td><td>0.0</td><td>0.0</td><td>0.0</td>
</tr>
<tr>
<td class="label">System Gain</td>
<td>53.5</td><td>70.0</td><td>66.0</td><td>70.0</td>
</tr>
<tr class="spacer" />
<tr class="section">
<th>RF Alarms</th>
</tr>
<tr>
<td class="label">Over Range</td>
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
</tr>
<tr>
<td class="label">Oscillation</td>
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
</tr>
<tr>
<td class="label">VSWR</td>
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
</tr>
<tr>
<td class="label">Out of Band Overdrive</td>
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
</tr>
<tr>
<td class="label">Low Signal</td>
<td class="alarmClear" />
<td />
<td class="alarmClear" />
<td />
</tr>
<tr>
<td class="label">No Signal</td>
<td class="alarmClear" />
<td />
<td class="alarmClear" />
<td />
</tr>
<tr class="spacer" />
<tr class="section">
<th>System Alarms</th>
</tr>
<tr>
<td class="label">Synthesizer Lock</td>
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
<td class="alarmClear" />
</tr>
<tr>
<td class="label">Voltage</td>
<td colspan="2" class="alarmClear" />
<td colspan="2" class="alarmClear" />
</tr>
<tr>
<td class="label">Temperature</td>
<td colspan="2" class="alarmClear" />
<td colspan="2" class="alarmClear" />
</tr>
<tr>
<td class="label">Software</td>
<td colspan="2" class="alarmClear" />
<td colspan="2" class="alarmClear" />
</tr>
<tr>
<td class="label">Hardware</td>
<td colspan="2" class="alarmClear" />
<td colspan="2" class="alarmClear" />
</tr>
</table>
</div></body>
</html>
Regards,
Scott