This is a title

0. [refresh] ................................. [web.asite.com/tmpgifs/zz] 1. [www.gone.com] ............................ [linktext] 2. [www.gone2.com] ........................... [linktext2] 3. [www.gone3.com] ........................... [linktext3 ] 4. [a] ....................................... [bold word] 5. [www.linkfromimage.com1] .................. [text next to image] 6. [href] .................................... [www.linkfromimage.com2] 7. [destinations/western-cape/map.aspx] ...... [abc] 8. [destinations/free-state/map.aspx] ........ [Free State Map] 9. [destinations/free-state/map2.aspx] ....... [Free State Map2] 10. [a] ....................................... [email link text] ##

##

 0. [refresh] ................................. [web.asite.com/tmpgifs/zz]
 1. [www.gone.com] ............................ [linktext]
 2. [www.gone2.com] ........................... [linktext2]
 3. [www.gone3.com] ........................... [linktext3 ]
 4. [www.gone3.com] ........................... [bold word]
 5. [www.linkfromimage.com1] .................. [text next to image]
 6. [www.linkfromimage.com2] .................. []
 7. [destinations/western-cape/map.aspx] ...... [abc]
 8. [destinations/free-state/map.aspx] ........ [Free State Map]
 9. [destinations/free-state/map2.aspx] ....... [Free State Map2]

##

##

#!/usr/bin/perl
#
# Usage:
# get_links_test {file.html}
#
#


use strict;
use warnings;
use HTML::Parser;

#----------------------------------------
# support variables for get_links().
my @get_links_result;
my $get_links_txt_hdlr_text_maxsz;
my $get_links_txt_hdlr_text_tallysz;
my %get_links_txt_hdlr_text_inside;
my @get_links_txt_hdlr_tags;

sub strdef {
    my ($str) = @_;

    return '' if not defined $str;
    return $str;
}

#----------------------------------------
# HTML::Parser support routine for get_links().
# NB: Caller must check if inside tag.
sub get_links_txt_stashit {
    my ( $inside, $element, $txt ) = @_;

    $txt = get_text_cleanit($txt);

    if ( ( defined $txt and $txt ne '' ) and $txt =~ m/\S+/ ) {

        # if this is , and previous element was ,
        # then pop it off the stack and change it...
        # I'm sure there's a better way to achieve this.
        if ( $element eq 'a' ) {
            my $trec = pop @get_links_result;
            if ( defined $trec->{href} ) {

                # override some detail.
                $element = clean_url( $trec->{href} );
            }
            else {

                # else push it back.
                push @get_links_result, $trec;
            }
        }

        my $rec = {};
        $rec->{$element} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    $inside->{$element} = 0;
}

sub get_text_cleanit {
    my ($txt) = @_;

    $txt =~ s/(?:^[[:punct:]]+|\s+|\n|\r)/ /gxm;
    $txt =~ s/(?:^\s+|^[[:punct:]]+|\s+$|[[:punct:]]+$)//gxm;

    return $txt;
}

sub clean_url {
    my $url = shift;

    $url =~ s#^http[s]*://##gxmi;
    $url =~ s#^ftp://##gxmi;
    $url =~ s#^gopher://##gxmi;
    $url =~ s#^news://##gxmi;
    $url =~ s#^file://##gxmi;
    $url =~ s#/$##gxm;

    return $url;
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_txt_hdlr {

    my ( $self, $txt ) = @_;

    # todo: something about doing this in a loop bothers me...
    # somehow, we need to know what the current event html tag is being
    # processed, then we can do away with this loop...  'tagname' appears to
    # be for start/end handlers only.

    foreach my $tag (@get_links_txt_hdlr_tags) {

        # skip these since they're handled by get_links_start_tag_hdlr().
        next
          if $tag eq 'img'
          or $tag eq 'href'
          or $tag eq 'meta';

        get_links_txt_stashit( \%get_links_txt_hdlr_text_inside, $tag, $txt )
          if $get_links_txt_hdlr_text_inside{$tag};    # process if inside tag.

        # exit early if got enough text.
        if (    $get_links_txt_hdlr_text_maxsz
            and $get_links_txt_hdlr_text_tallysz >
            $get_links_txt_hdlr_text_maxsz )
        {
            $self->eof;
            last;
        }
    }
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_start_tag_hdlr {
    my ( $self, $tag, $attr ) = @_;

    $get_links_txt_hdlr_text_inside{$tag} = 1;

    if ( $tag eq 'a' and defined $attr->{href} ) {
        my $txt = get_text_cleanit( $attr->{href} );
        return if not defined $txt or $txt eq '';

        # skip links to email, javascript, etc. - this is not working...
        if (
            $attr->{href} =~

            # [:;] to cope with erroneous HTML
            m/mailto[:;]|^javascript[:;]|^file[:;]|^news[:;]/i
          )
        {
            return;
        }

        my $rec = {};
        $rec->{href} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    if ( $tag eq 'area' and defined $attr->{href} ) {
        my $rec   = {};
        my $txt   = get_text_cleanit( $attr->{href} );
        my $title = get_text_cleanit( strdef( $attr->{title} ) );
        my $alt   = get_text_cleanit( strdef( $attr->{alt} ) );
        return if not defined $txt or $txt eq '';
        my $content = strdef($title);
        $content = $alt if $content eq '';
        $rec->{$txt} = $content;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
    elsif ( $tag eq 'meta'
        and defined $attr->{'http-equiv'}
        and lc( $attr->{'http-equiv'} ) eq 'refresh'
        and defined $attr->{content} )
    {
        my $tempurl = $attr->{content};
        $tempurl =~ s#^\s*\d+\s*\;\s*url\s*\=\s*[']*(?:http[s]*://)*##gi;
        $tempurl =~ s/[']+$//;
        my $txt = clean_url($tempurl);
        return if not defined $txt or $txt eq '';
        my $rec = {};
        $rec->{refresh} = $txt;
        push @get_links_result, $rec;
        $get_links_txt_hdlr_text_tallysz += length $txt;
    }
}

#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_end_tag_hdlr {
    my ( $tag, $text ) = @_;

    $get_links_txt_hdlr_text_inside{$tag} = 0;
}

# new get_links()
sub get_links_new {
    my ($fn) = @_;

    my $html_parser = HTML::Parser->new( api_version => 3 );

    @get_links_txt_hdlr_tags = split '\s+',
      'title h1 h2 h3 h4 h5 h6 a b em i '
      . 'u caption th li dt dd blockquote '
      . 'pre area img meta';
    undef %get_links_txt_hdlr_text_inside;
    @get_links_result = ();

    $html_parser->handler(
        start => \&get_links_start_tag_hdlr,
        "self, tagname, attr"
    );
    $html_parser->handler(
        end => \&get_links_end_tag_hdlr,
        "tagname, dtext"
    );

    $html_parser->handler( text => \&get_links_txt_hdlr, "self,dtext" );
    $html_parser->report_tags( split '\s+', 'a area meta' );
    $html_parser->ignore_elements( split '\s+', 'script style comments' );

    eval { $html_parser->parse_file($fn); };

    return @get_links_result;
}




#-----------------------------------------------------------------
die "Usage:  $0 {file.html}"
    if not defined $ARGV[0];

my @res = get_links_new( $ARGV[0] );

my $n = 0;
for my $hash (@res) {
    for my $tag ( keys %$hash ) {
        printf( "%2d%s",
            $n++,
            ". [$tag] " . '.' x ( 40 - length($tag) ) . " [$hash->{$tag}]\n" );
    }
}




#############################################

HTML file:






This is a title

this is some P aligned text
sfgsdfg
sssss sjkfhsfh
this is some DIV wrapped text
now we have some text in a td/table
this is bold text:  www.abc1.com
you can find it here: http://www.abc2.com/
you can find it here: www.abc3.com
you can find it here: henry@bbbbabc3.com
you can find it here: www.abc4.com you can find it here: www.abc5.com vv
www.abc.pl.net abc.com abc.pl.com/?a=1
123.com
www.aaaa.info

linktext
linktext2
linktext3 (bold word)

mailto:bob@somwhere.com



text next to image







email link text