<?xml version="1.0" encoding="windows-1252"?>
<node id="861233" title="Re: XML cleanup - regex or ?" created="2010-09-22 05:08:44" updated="2010-09-22 05:08:44">
<type id="11">
note</type>
<author id="323803">
murugu</author>
<data>
<field name="doctext">
&lt;p&gt;I don't know whether I understood the question correctly. Since you mentioned that each line in the file is a single XML statement, I used [cpan://XML::Twig] to check the elements and attributes by processing the file line by line.
&lt;p&gt;The code below prints the line number of every line where a discrepancy is found. You can tweak it to accommodate the changes you need.
&lt;code&gt;
#!/usr/bin/perl
# Scan XML that holds one element per line and print the line number of
# every monitored element that lacks its required attribute.
use strict;
use warnings;
use XML::Twig;

# Maps each monitored element name to the attribute it must carry.
my %elem_att = qw(cat meow dog bark);

# Cheap pre-filter so that only lines which may hold a monitored element
# are handed to the XML parser. Names are quoted so that any regex
# metacharacters in them are matched literally.
my $reg = join '|', map { quotemeta } keys %elem_att;

while (my $line = &lt;DATA&gt;) {
	next unless $line =~ m/&lt;(?:$reg)/;
	my $line_num = $.;

	# Parse the line as a standalone element. A malformed line is reported
	# and skipped instead of aborting the whole scan.
	my $elt = eval { XML::Twig::Elt-&gt;parse($line) };
	unless ($elt) {
		warn "Cannot parse line number $line_num: $@";
		next;
	}

	# The pre-filter is only a prefix match (it also accepts 'catalog', for
	# example), so confirm that the parsed name really is a monitored one.
	my $element = $elt-&gt;name;
	next unless exists $elem_att{$element};

	my $att = $elem_att{$element};
	unless ($elt-&gt;att_exists($att)) {
		print "Attribute $att is not found at line number $line_num\n";
	}
}


__DATA__
&lt;root&gt;
&lt;a/&gt;
&lt;b/&gt;
&lt;cat tail='text' meow='text'/&gt;
&lt;cat tail='text'/&gt;
&lt;cat tail='text'/&gt;
&lt;dog tail='text' bark='text'/&gt;
&lt;dog tail='text'/&gt;
&lt;/root&gt;
&lt;/code&gt;&lt;!-- Node text goes above. Div tags should contain sig only --&gt;
&lt;div class="pmsig"&gt;&lt;div class="pmsig-323803"&gt;
&lt;p&gt;Regards,&lt;br&gt;Murugesan Kandasamy&lt;br&gt;use perl for(;;);
&lt;/div&gt;&lt;/div&gt;</field>
<field name="root_node">
861077</field>
<field name="parent_node">
861077</field>
</data>
</node>
