Here's a way to do it by subclassing HTML::Parser. Even though the subclassing interface is not well documented, I like the fine-grained control this technique allows.
use strict;
use warnings;
# Sample input: a flat sequence of headings and paragraphs with no wrappers.
my $document = q{
<h1>blah</h1>
<p>blah<p>
<h2>blah</h2>
<p>blah</p>
<h3>blah</h3>
<p>blah</p>
<h2>blah blah blah</h2>
<p>blah</p>
<h1>blah</h1>
<h2>blah</h2>
<h2>blah</h2>};

# Run the whole document through the sectioning parser, signal end of
# input, then print the rewritten markup it accumulated.
my $sectioner = Markdent_Parser->new();
$sectioner->parse($document);
$sectioner->eof;

my $rewritten = $sectioner->out;
print $rewritten;
package Markdent_Parser;
use parent qw(HTML::Parser);

# Markdent_Parser - copies HTML through while wrapping each <h2> section
# in a <div>.
#
# A section is opened by an <h2> start tag and closed by the next <h2>, by
# the next <h1>, or (in out()) by the end of the input.
#
# The object is created with a bare ->new(), so HTML::Parser falls back to
# its version 2 callback methods; the subs below are those callbacks.
#
# State kept in the object hash:
#   out   - markup emitted so far
#   in_h2 - true while the <div> of an <h2> section is still open

# Start-tag callback.  $tag is the tag name as reported by HTML::Parser and
# $text is the original markup of the tag; $attr/$attrseq are unused.
sub start {
    my ($self, $tag, $attr, $attrseq, $text) = @_;

    if ($tag eq 'h1' and $self->{in_h2}) {
        # An <h1> ends whatever <h2> section is currently open.
        $self->{out} .= "</div>\n\n";
        $self->{in_h2} = 0;
    }
    elsif ($tag eq 'h2') {
        # A new <h2> closes the previous section (if any) and opens a new one.
        if ($self->{in_h2}) {
            $self->{out} .= "</div>\n";
        }
        $self->{out} .= "\n<div>\n";
        $self->{in_h2} = 1;
    }

    # The tag itself is always copied through unchanged.
    $self->{out} .= $text;
    return;
}

# Text callback: copy character data through unchanged.
sub text {
    my ($self, $text) = @_;
    $self->{out} .= $text;
    return;
}

# End-tag callback: copy the original end-tag markup through unchanged.
sub end {
    my ($self, $tag, $text) = @_;
    $self->{out} .= $text;
    return;
}

# Comment callback.  The version 2 API passes the comment body without its
# delimiters, so they are restored here.  Without this override comments
# would be silently dropped from the output.
sub comment {
    my ($self, $comment) = @_;
    $self->{out} .= "<!--$comment-->";
    return;
}

# Declaration callback (e.g. DOCTYPE).  The version 2 API strips the
# leading "<!" and trailing ">", so they are restored here.
sub declaration {
    my ($self, $decl) = @_;
    $self->{out} .= "<!$decl>";
    return;
}

# Processing-instruction callback: $text is the original markup.
sub process {
    my ($self, $content, $text) = @_;
    $self->{out} .= $text;
    return;
}

# Return the rewritten markup, closing a still-open <h2> section.
#
# The closing </div> is added to a copy rather than to the stored buffer,
# so calling out() more than once does not append duplicate </div> tags.
# Returns an empty string (not undef) when nothing has been parsed.
sub out {
    my ($self) = @_;

    my $html = defined $self->{out} ? $self->{out} : '';
    if ($self->{in_h2}) {
        $html .= "\n</div>";
    }
    return $html;
}

1;
The output is as described by your algorithm rather than as shown by your example (I fixed the typo in your input also).
<h1>blah</h1>
<p>blah<p>
<div>
<h2>blah</h2>
<p>blah</p>
<h3>blah</h3>
<p>blah</p>
</div>
<div>
<h2>blah blah blah</h2>
<p>blah</p>
</div>
<h1>blah</h1>
<div>
<h2>blah</h2>
</div>
<div>
<h2>blah</h2>
</div>