##
0. [refresh] ................................. [web.asite.com/tmpgifs/zz]
1. [www.gone.com] ............................ [linktext]
2. [www.gone2.com] ........................... [linktext2]
3. [www.gone3.com] ........................... [linktext3 ]
4. [www.gone3.com] ........................... [bold word]
5. [www.linkfromimage.com1] .................. [text next to image]
6. [www.linkfromimage.com2] .................. []
7. [destinations/western-cape/map.aspx] ...... [abc]
8. [destinations/free-state/map.aspx] ........ [Free State Map]
9. [destinations/free-state/map2.aspx] ....... [Free State Map2]
##
##
#!/usr/bin/perl
#
# Usage:
# get_links_test {file.html}
#
#
use strict;
use warnings;
use HTML::Parser;
#----------------------------------------
# support variables for get_links().
my @get_links_result;
my $get_links_txt_hdlr_text_maxsz;
my $get_links_txt_hdlr_text_tallysz;
my %get_links_txt_hdlr_text_inside;
my @get_links_txt_hdlr_tags;
sub strdef {
my ($str) = @_;
return '' if not defined $str;
return $str;
}
#----------------------------------------
# HTML::Parser support routine for get_links().
# NB: Caller must check if inside tag.
sub get_links_txt_stashit {
my ( $inside, $element, $txt ) = @_;
$txt = get_text_cleanit($txt);
if ( ( defined $txt and $txt ne '' ) and $txt =~ m/\S+/ ) {
# if this is , and previous element was ,
# then pop it off the stack and change it...
# I'm sure there's a better way to achieve this.
if ( $element eq 'a' ) {
my $trec = pop @get_links_result;
if ( defined $trec->{href} ) {
# override some detail.
$element = clean_url( $trec->{href} );
}
else {
# else push it back.
push @get_links_result, $trec;
}
}
my $rec = {};
$rec->{$element} = $txt;
push @get_links_result, $rec;
$get_links_txt_hdlr_text_tallysz += length $txt;
}
$inside->{$element} = 0;
}
sub get_text_cleanit {
my ($txt) = @_;
$txt =~ s/(?:^[[:punct:]]+|\s+|\n|\r)/ /gxm;
$txt =~ s/(?:^\s+|^[[:punct:]]+|\s+$|[[:punct:]]+$)//gxm;
return $txt;
}
sub clean_url {
my $url = shift;
$url =~ s#^http[s]*://##gxmi;
$url =~ s#^ftp://##gxmi;
$url =~ s#^gopher://##gxmi;
$url =~ s#^news://##gxmi;
$url =~ s#^file://##gxmi;
$url =~ s#/$##gxm;
return $url;
}
#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_txt_hdlr {
my ( $self, $txt ) = @_;
# todo: something about doing this in a loop bothers me...
# somehow, we need to know what the current event html tag is being
# processed, then we can do away with this loop... 'tagname' appears to
# be for start/end handlers only.
foreach my $tag (@get_links_txt_hdlr_tags) {
# skip these since they're handled by get_links_start_tag_hdlr().
next
if $tag eq 'img'
or $tag eq 'href'
or $tag eq 'meta';
get_links_txt_stashit( \%get_links_txt_hdlr_text_inside, $tag, $txt )
if $get_links_txt_hdlr_text_inside{$tag}; # process if inside tag.
# exit early if got enough text.
if ( $get_links_txt_hdlr_text_maxsz
and $get_links_txt_hdlr_text_tallysz >
$get_links_txt_hdlr_text_maxsz )
{
$self->eof;
last;
}
}
}
#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_start_tag_hdlr {
my ( $self, $tag, $attr ) = @_;
$get_links_txt_hdlr_text_inside{$tag} = 1;
if ( $tag eq 'a' and defined $attr->{href} ) {
my $txt = get_text_cleanit( $attr->{href} );
return if not defined $txt or $txt eq '';
# skip links to email, javascript, etc. - this is not working...
if (
$attr->{href} =~
# [:;] to cope with erroneous HTML
m/mailto[:;]|^javascript[:;]|^file[:;]|^news[:;]/i
)
{
return;
}
my $rec = {};
$rec->{href} = $txt;
push @get_links_result, $rec;
$get_links_txt_hdlr_text_tallysz += length $txt;
}
if ( $tag eq 'area' and defined $attr->{href} ) {
my $rec = {};
my $txt = get_text_cleanit( $attr->{href} );
my $title = get_text_cleanit( strdef( $attr->{title} ) );
my $alt = get_text_cleanit( strdef( $attr->{alt} ) );
return if not defined $txt or $txt eq '';
my $content = strdef($title);
$content = $alt if $content eq '';
$rec->{$txt} = $content;
push @get_links_result, $rec;
$get_links_txt_hdlr_text_tallysz += length $txt;
}
elsif ( $tag eq 'meta'
and defined $attr->{'http-equiv'}
and lc( $attr->{'http-equiv'} ) eq 'refresh'
and defined $attr->{content} )
{
my $tempurl = $attr->{content};
$tempurl =~ s#^\s*\d+\s*\;\s*url\s*\=\s*[']*(?:http[s]*://)*##gi;
$tempurl =~ s/[']+$//;
my $txt = clean_url($tempurl);
return if not defined $txt or $txt eq '';
my $rec = {};
$rec->{refresh} = $txt;
push @get_links_result, $rec;
$get_links_txt_hdlr_text_tallysz += length $txt;
}
}
#----------------------------------------
# HTML::Parser support routine for get_links().
sub get_links_end_tag_hdlr {
my ( $tag, $text ) = @_;
$get_links_txt_hdlr_text_inside{$tag} = 0;
}
# new get_links()
sub get_links_new {
my ($fn) = @_;
my $html_parser = HTML::Parser->new( api_version => 3 );
@get_links_txt_hdlr_tags = split '\s+',
'title h1 h2 h3 h4 h5 h6 a b em i '
. 'u caption th li dt dd blockquote '
. 'pre area img meta';
undef %get_links_txt_hdlr_text_inside;
@get_links_result = ();
$html_parser->handler(
start => \&get_links_start_tag_hdlr,
"self, tagname, attr"
);
$html_parser->handler(
end => \&get_links_end_tag_hdlr,
"tagname, dtext"
);
$html_parser->handler( text => \&get_links_txt_hdlr, "self,dtext" );
$html_parser->report_tags( split '\s+', 'a area meta' );
$html_parser->ignore_elements( split '\s+', 'script style comments' );
eval { $html_parser->parse_file($fn); };
return @get_links_result;
}
#-----------------------------------------------------------------
die "Usage: $0 {file.html}"
if not defined $ARGV[0];
my @res = get_links_new( $ARGV[0] );
my $n = 0;
for my $hash (@res) {
for my $tag ( keys %$hash ) {
printf( "%2d%s",
$n++,
". [$tag] " . '.' x ( 40 - length($tag) ) . " [$hash->{$tag}]\n" );
}
}
#############################################
HTML file:
This is a title
this is some P aligned text
sfgsdfg
sssss sjkfhsfh
this is some DIV wrapped text
now we have some text in a td/table
this is bold text: www.abc1.com
you can find it here: http://www.abc2.com/
you can find it here: www.abc3.com
you can find it here: henry@bbbbabc3.com
you can find it here: www.abc4.com you can find it here: www.abc5.com vv
www.abc.pl.net abc.com abc.pl.com/?a=1
123.com
www.aaaa.info
linktext
linktext2
linktext3 (bold word)
mailto:bob@somwhere.com
text next to image
email link text