HTML::Parser fun

FreakyGreenLeaky has asked for the wisdom of the Perl Monks concerning the following question:

Apologies for the long post. I'm stumped and our usual (paid) Perl freelance goto guy can't get to it.

I'm using HTML::Parser to extract links and associated text and it works, except for a few issues. It's these few nigglies I need help with.

The script currently outputs:

 0. [refresh] ................................. [web.asite.com/tmpgifs
+/zz]
 1. [www.gone.com] ............................ [linktext]
 2. [www.gone2.com] ........................... [linktext2]
 3. [www.gone3.com] ........................... [linktext3 ]
 4. [a] ....................................... [bold word]
 5. [www.linkfromimage.com1] .................. [text next to image]
 6. [href] .................................... [www.linkfromimage.com
+2]
 7. [destinations/western-cape/map.aspx] ...... [abc]
 8. [destinations/free-state/map.aspx] ........ [Free State Map]
 9. [destinations/free-state/map2.aspx] ....... [Free State Map2]
10. [a] ....................................... [email link text]
[download]

But I need:

 0. [refresh] ................................. [web.asite.com/tmpgifs
+/zz]
 1. [www.gone.com] ............................ [linktext]
 2. [www.gone2.com] ........................... [linktext2]
 3. [www.gone3.com] ........................... [linktext3 ]
 4. [www.gone3.com] ........................... [bold word]
 5. [www.linkfromimage.com1] .................. [text next to image]
 6. [www.linkfromimage.com2] .................. []
 7. [destinations/western-cape/map.aspx] ...... [abc]
 8. [destinations/free-state/map.aspx] ........ [Free State Map]
 9. [destinations/free-state/map2.aspx] ....... [Free State Map2]
[download]

ie, #4 and #6 must change, and #10 must go bye-bye.

Any pointers on what I'm doing wrong would be appreciated.

#!/usr/bin/perl
#
# Usage:
# get_links_test {file.html}
#
#


use strict;
use warnings;
use HTML::Parser;

#----------------------------------------
# support variables for get_links().
my @get_links_result;
my $get_links_txt_hdlr_text_maxsz;
my $get_links_txt_hdlr_text_tallysz;
my %get_links_txt_hdlr_text_inside;
my @get_links_txt_hdlr_tags;

sub strdef {
    my ($str) = @_;

    return '' if not defined $str;
    return $str;
}

#----------------------------------------
# HTML::Parser support routine for get_links().
# NB: Caller must check if inside tag.
sub get_links_txt_stashit {
    my ( $inside, $element, $txt ) = @_;

    $txt = get_text_cleanit($txt);

    if ( ( defined $txt and $txt ne '' ) and $txt =~ m/\S+/ ) {

        # if this is <A>, and previous element was <HREF>,
        # then pop it off the stack and change it...
        # I'm sure there's a better way to achieve this.
        if ( $element eq 'a' ) {
            my $trec = pop @get_links_result;
            if ( defined $trec->{href} ) {

                # override some detail.
                $element = clean_url( $trec->{href} );
            }
            else {

                # else push it back.
                push @get_links_result, $trec;
            }
        }

        my $rec = {};
        $rec->{$element} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    $inside->{$element} = 0;
}

sub get_text_cleanit {
    my ($txt) = @_;

    $txt =~ s/(?:^[[:punct:]]+|\s+|\n|\r)/ /gxm;
    $txt =~ s/(?:^\s+|^[[:punct:]]+|\s+$|[[:punct:]]+$)//gxm;

    return $txt;
}

sub clean_url {
    my $url = shift;

    $url =~ s#^http[s]*://##gxmi;
    $url =~ s#^ftp://##gxmi;
    $url =~ s#^gopher://##gxmi;
    $url =~ s#^news://##gxmi;
    $url =~ s#^file://##gxmi;
    $url =~ s#/$##gxm;

    return $url;
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_txt_hdlr {

    my ( $self, $txt ) = @_;

    # todo: something about doing this in a loop bothers me...
    # somehow, we need to know what the current event html tag is bein
+g
    # processed, then we can do away with this loop...  'tagname' appe
+ars to
    # be for start/end handlers only.

    foreach my $tag (@get_links_txt_hdlr_tags) {

        # skip these since they're handled by get_links_start_tag_hdlr
+().
        next
          if $tag eq 'img'
          or $tag eq 'href'
          or $tag eq 'meta';

        get_links_txt_stashit( \%get_links_txt_hdlr_text_inside, $tag,
+ $txt )
          if $get_links_txt_hdlr_text_inside{$tag};    # process if in
+side tag.

        # exit early if got enough text.
        if (    $get_links_txt_hdlr_text_maxsz
            and $get_links_txt_hdlr_text_tallysz >
            $get_links_txt_hdlr_text_maxsz )
        {
            $self->eof;
            last;
        }
    }
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_start_tag_hdlr {
    my ( $self, $tag, $attr ) = @_;

    $get_links_txt_hdlr_text_inside{$tag} = 1;

    if ( $tag eq 'a' and defined $attr->{href} ) {
        my $txt = get_text_cleanit( $attr->{href} );
        return if not defined $txt or $txt eq '';

        # skip links to email, javascript, etc. - this is not working.
+..
        if (
            $attr->{href} =~

            # [:;] to cope with erroneous HTML
            m/mailto[:;]|^javascript[:;]|^file[:;]|^news[:;]/i
          )
        {
            return;
        }

        my $rec = {};
        $rec->{href} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    if ( $tag eq 'area' and defined $attr->{href} ) {
        my $rec   = {};
        my $txt   = get_text_cleanit( $attr->{href} );
        my $title = get_text_cleanit( strdef( $attr->{title} ) );
        my $alt   = get_text_cleanit( strdef( $attr->{alt} ) );
        return if not defined $txt or $txt eq '';
        my $content = strdef($title);
        $content = $alt if $content eq '';
        $rec->{$txt} = $content;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    elsif ( $tag eq 'meta'
        and defined $attr->{'http-equiv'}
        and lc( $attr->{'http-equiv'} ) eq 'refresh'
        and defined $attr->{content} )
    {
        my $tempurl = $attr->{content};
        $tempurl =~ s#^\s*\d+\s*\;\s*url\s*\=\s*[']*(?:http[s]*://)*##
+gi;
        $tempurl =~ s/[']+$//;
        my $txt = clean_url($tempurl);
        return if not defined $txt or $txt eq '';
        my $rec = {};
        $rec->{refresh} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_end_tag_hdlr {
    my ( $tag, $text ) = @_;

    $get_links_txt_hdlr_text_inside{$tag} = 0;
}

# new get_links()
sub get_links_new {
    my ($fn) = @_;

    my $html_parser = HTML::Parser->new( api_version => 3 );

    @get_links_txt_hdlr_tags = split '\s+',
      'title h1 h2 h3 h4 h5 h6 a b em i '
      . 'u caption th li dt dd blockquote '
      . 'pre area img meta';
    undef %get_links_txt_hdlr_text_inside;
    @get_links_result = ();

    $html_parser->handler(
        start => \&get_links_start_tag_hdlr,
        "self, tagname, attr"
    );
    $html_parser->handler(
        end => \&get_links_end_tag_hdlr,
        "tagname, dtext"
    );

    $html_parser->handler( text => \&get_links_txt_hdlr, "self,dtext" 
+);
    $html_parser->report_tags( split '\s+', 'a area meta' );
    $html_parser->ignore_elements( split '\s+', 'script style comments
+' );

    eval { $html_parser->parse_file($fn); };

    return @get_links_result;
}




#-----------------------------------------------------------------
die "Usage:  $0 {file.html}"
    if not defined $ARGV[0];

my @res = get_links_new( $ARGV[0] );

my $n = 0;
for my $hash (@res) {
    for my $tag ( keys %$hash ) {
        printf( "%2d%s",
            $n++,
            ". [$tag] " . '.' x ( 40 - length($tag) ) . " [$hash->{$ta
+g}]\n" );
    }
}




#############################################

HTML file:

<html>
<meta name='description' content='this is some meta content'>
<meta name='keywords' content='cars bikes sales call wheels engine fas
+t'>
<meta HTTP-EQUIV="Refresh" CONTENT="300;URL='http://web.asite.com/tmpg
+ifs/zz/'">

<title>This is a title</title>

<p align=left>this is some P aligned text</p>
sfgsdfg
sssss sjkfhsfh
<div>this is some DIV wrapped text</div>
<table><tr><td>now we have some text in a td/table</td></tr></table>
this is bold text:  <b>www.abc1.com</b>
you can find it here: http://www.abc2.com/
you can find it here: www.abc3.com
you can find it here: henry@bbbbabc3.com
you can find it here: www.abc4.com you can find it here: www.abc5.com 
+vv
www.abc.pl.net abc.com abc.pl.com/?a=1
123.com
www.aaaa.info

<a href=http://www.gone.com>linktext</a>
<a href=http://www.gone2.com>linktext2</a>
<a href=http://www.gone3.com>linktext3 (<b>bold word</b>)</a>

mailto:bob@somwhere.com

<img src=/xxx.png alt='alt text for img'>

<a href=www.linkfromimage.com1>text next to image<img src=/xxx.png alt
+='alt text for img'></a>
<a href=www.linkfromimage.com2><img src=/xxx.png alt='alt text for img
+'></a>


<area title='abc' shape="rect" coords="121,380,172,400" href="/destina
+tions/western-cape/map.aspx">
<area shape="rect" coords="262,214,301,241" href="/destinations/free-s
+tate/map.aspx" title="Free State Map">
<area shape="rect" coords="262,214,301,241" href="/destinations/free-s
+tate/map2.aspx" alt="Free State Map2">

<a href='mailto:deepheat@bbb.com'>email link text</a>

</html>
[download]

Back to Seekers of Perl Wisdom