G'day sannag,
Welcome to the Monastery.
Your input is basically CSV, with tabs instead of commas, so you can use
Text::CSV to parse it.
If you have Text::CSV_XS installed,
Text::CSV will use that and it should run faster.
For creating the XML, I've used various XML::LibXML methods from
XML::LibXML::Document,
XML::LibXML::Node and
XML::LibXML::Element.
Here's the script:
#!/usr/bin/env perl
use strict;
use warnings;
use autodie qw{:all};
use Text::CSV;
use XML::LibXML;
my $tsv_file_in = 'pm_1169537_input.tsv';
my $xml_file_out = 'pm_1169537_output.xml';
my $dom = generate_xml($tsv_file_in);
output_xml($dom, $xml_file_out);
sub generate_xml {
my ($tsv_file_in) = @_;
open my $in_fh, '<', $tsv_file_in;
my $csv = Text::CSV::->new({sep_char => "\t"})
or die 'Text::CSV::->new() FAILED: ', Text::CSV::->error_diag(
+);
my $dom = XML::LibXML::Document::->new(qw{1.0 UTF-8});
my $root = $dom->createElement('XML_FILE');
$dom->setDocumentElement($root);
my @headers = map { uc } @{$csv->getline($in_fh)};
my %header_index_for = map { $headers[$_] => $_ } 0 .. $#headers;
my ($last_pin, $report) = ('', undef);
while (my $row = $csv->getline($in_fh)) {
next if @$row == 1 && ! length $row->[0];
if ($row->[$header_index_for{PIN}] ne $last_pin) {
$last_pin = $row->[$header_index_for{PIN}];
$report = XML::LibXML::Element->new('REPORT');
$report->setAttribute(TYPE => 'AB');
$root->addChild($report);
for (qw{REASON1 REASON2 REASON3}) {
my $reason = XML::LibXML::Element->new($_);
$reason->appendText($row->[$header_index_for{$_}]);
$report->addChild($reason);
}
my $person = XML::LibXML::Element->new('PERSON');
$report->addChild($person);
for (qw{PIN NAME ZIP}) {
my $person_datum = XML::LibXML::Element->new($_);
$person_datum->appendText($row->[$header_index_for{$_}
+]);
$person->addChild($person_datum);
}
}
my $charge = XML::LibXML::Element->new('CHARGE');
$report->addChild($charge);
for (qw{DATE TIME}) {
my $charge_datum = XML::LibXML::Element->new($_);
$charge_datum->appendText($row->[$header_index_for{$_}]);
$charge->addChild($charge_datum);
}
}
return $dom;
}
sub output_xml {
my ($dom, $xml_file_out) = @_;
open my $out_fh, '>', $xml_file_out;
print $out_fh $dom->toString(1);
return;
}
The input, pm_1169537_input.tsv, is exactly what you posted.
The output, pm_1169537_output.xml, looks exactly as you wanted (except for the erroneous end tag, </REASON>, which now appears correctly as </REASON1>):
<?xml version="1.0" encoding="UTF-8"?>
<XML_FILE>
<REPORT TYPE="AB">
<REASON1>data1</REASON1>
<REASON2>data2</REASON2>
<REASON3>data3</REASON3>
<PERSON>
<PIN>Pin 1</PIN>
<NAME>data5</NAME>
<ZIP>data6</ZIP>
</PERSON>
<CHARGE>
<DATE>data7</DATE>
<TIME>data8</TIME>
</CHARGE>
<CHARGE>
<DATE>data9</DATE>
<TIME>data10</TIME>
</CHARGE>
<CHARGE>
<DATE>data11</DATE>
<TIME>data12</TIME>
</CHARGE>
</REPORT>
</XML_FILE>
Actually, you didn't show the XML declaration:
modify to suit if want something different.
Update:
Oops! Spotted a bug in my code. Two instances of
$headers[$header_index_for{PIN}]
now corrected to
$row->[$header_index_for{PIN}].