0. [refresh] ................................. [web.asite.com/tmpgifs/zz] 1. [www.gone.com] ............................ [linktext] 2. [www.gone2.com] ........................... [linktext2] 3. [www.gone3.com] ........................... [linktext3 ] 4. [a] ....................................... [bold word] 5. [www.linkfromimage.com1] .................. [text next to image] 6. [href] .................................... [www.linkfromimage.com2] 7. [destinations/western-cape/map.aspx] ...... [abc] 8. [destinations/free-state/map.aspx] ........ [Free State Map] 9. [destinations/free-state/map2.aspx] ....... [Free State Map2] 10. [a] ....................................... [email link text] #### 0. [refresh] ................................. [web.asite.com/tmpgifs/zz] 1. [www.gone.com] ............................ [linktext] 2. [www.gone2.com] ........................... [linktext2] 3. [www.gone3.com] ........................... [linktext3 ] 4. [www.gone3.com] ........................... [bold word] 5. [www.linkfromimage.com1] .................. [text next to image] 6. [www.linkfromimage.com2] .................. [] 7. [destinations/western-cape/map.aspx] ...... [abc] 8. [destinations/free-state/map.aspx] ........ [Free State Map] 9. [destinations/free-state/map2.aspx] ....... [Free State Map2] #### #!/usr/bin/perl # # Usage: # get_links_test {file.html} # # use strict; use warnings; use HTML::Parser; #---------------------------------------- # support variables for get_links(). my @get_links_result; my $get_links_txt_hdlr_text_maxsz; my $get_links_txt_hdlr_text_tallysz; my %get_links_txt_hdlr_text_inside; my @get_links_txt_hdlr_tags; sub strdef { my ($str) = @_; return '' if not defined $str; return $str; } #---------------------------------------- # HTML::Parser support routine for get_links(). # NB: Caller must check if inside tag. sub get_links_txt_stashit { my ( $inside, $element, $txt ) = @_; $txt = get_text_cleanit($txt); if ( ( defined $txt and $txt ne '' ) and $txt =~ m/\S+/ ) { # if this is , and previous element was , # then pop it off the stack and change it... # I'm sure there's a better way to achieve this. if ( $element eq 'a' ) { my $trec = pop @get_links_result; if ( defined $trec->{href} ) { # override some detail. $element = clean_url( $trec->{href} ); } else { # else push it back. push @get_links_result, $trec; } } my $rec = {}; $rec->{$element} = $txt; push @get_links_result, $rec; $get_links_txt_hdlr_text_tallysz += length $txt; } $inside->{$element} = 0; } sub get_text_cleanit { my ($txt) = @_; $txt =~ s/(?:^[[:punct:]]+|\s+|\n|\r)/ /gxm; $txt =~ s/(?:^\s+|^[[:punct:]]+|\s+$|[[:punct:]]+$)//gxm; return $txt; } sub clean_url { my $url = shift; $url =~ s#^http[s]*://##gxmi; $url =~ s#^ftp://##gxmi; $url =~ s#^gopher://##gxmi; $url =~ s#^news://##gxmi; $url =~ s#^file://##gxmi; $url =~ s#/$##gxm; return $url; } #---------------------------------------- # HTML::Parser support routine for get_links(). sub get_links_txt_hdlr { my ( $self, $txt ) = @_; # todo: something about doing this in a loop bothers me... # somehow, we need to know what the current event html tag is being # processed, then we can do away with this loop... 'tagname' appears to # be for start/end handlers only. foreach my $tag (@get_links_txt_hdlr_tags) { # skip these since they're handled by get_links_start_tag_hdlr(). next if $tag eq 'img' or $tag eq 'href' or $tag eq 'meta'; get_links_txt_stashit( \%get_links_txt_hdlr_text_inside, $tag, $txt ) if $get_links_txt_hdlr_text_inside{$tag}; # process if inside tag. # exit early if got enough text. if ( $get_links_txt_hdlr_text_maxsz and $get_links_txt_hdlr_text_tallysz > $get_links_txt_hdlr_text_maxsz ) { $self->eof; last; } } } #---------------------------------------- # HTML::Parser support routine for get_links(). sub get_links_start_tag_hdlr { my ( $self, $tag, $attr ) = @_; $get_links_txt_hdlr_text_inside{$tag} = 1; if ( $tag eq 'a' and defined $attr->{href} ) { my $txt = get_text_cleanit( $attr->{href} ); return if not defined $txt or $txt eq ''; # skip links to email, javascript, etc. - this is not working... if ( $attr->{href} =~ # [:;] to cope with erroneous HTML m/mailto[:;]|^javascript[:;]|^file[:;]|^news[:;]/i ) { return; } my $rec = {}; $rec->{href} = $txt; push @get_links_result, $rec; $get_links_txt_hdlr_text_tallysz += length $txt; } if ( $tag eq 'area' and defined $attr->{href} ) { my $rec = {}; my $txt = get_text_cleanit( $attr->{href} ); my $title = get_text_cleanit( strdef( $attr->{title} ) ); my $alt = get_text_cleanit( strdef( $attr->{alt} ) ); return if not defined $txt or $txt eq ''; my $content = strdef($title); $content = $alt if $content eq ''; $rec->{$txt} = $content; push @get_links_result, $rec; $get_links_txt_hdlr_text_tallysz += length $txt; } elsif ( $tag eq 'meta' and defined $attr->{'http-equiv'} and lc( $attr->{'http-equiv'} ) eq 'refresh' and defined $attr->{content} ) { my $tempurl = $attr->{content}; $tempurl =~ s#^\s*\d+\s*\;\s*url\s*\=\s*[']*(?:http[s]*://)*##gi; $tempurl =~ s/[']+$//; my $txt = clean_url($tempurl); return if not defined $txt or $txt eq ''; my $rec = {}; $rec->{refresh} = $txt; push @get_links_result, $rec; $get_links_txt_hdlr_text_tallysz += length $txt; } } #---------------------------------------- # HTML::Parser support routine for get_links(). sub get_links_end_tag_hdlr { my ( $tag, $text ) = @_; $get_links_txt_hdlr_text_inside{$tag} = 0; } # new get_links() sub get_links_new { my ($fn) = @_; my $html_parser = HTML::Parser->new( api_version => 3 ); @get_links_txt_hdlr_tags = split '\s+', 'title h1 h2 h3 h4 h5 h6 a b em i ' . 'u caption th li dt dd blockquote ' . 'pre area img meta'; undef %get_links_txt_hdlr_text_inside; @get_links_result = (); $html_parser->handler( start => \&get_links_start_tag_hdlr, "self, tagname, attr" ); $html_parser->handler( end => \&get_links_end_tag_hdlr, "tagname, dtext" ); $html_parser->handler( text => \&get_links_txt_hdlr, "self,dtext" ); $html_parser->report_tags( split '\s+', 'a area meta' ); $html_parser->ignore_elements( split '\s+', 'script style comments' ); eval { $html_parser->parse_file($fn); }; return @get_links_result; } #----------------------------------------------------------------- die "Usage: $0 {file.html}" if not defined $ARGV[0]; my @res = get_links_new( $ARGV[0] ); my $n = 0; for my $hash (@res) { for my $tag ( keys %$hash ) { printf( "%2d%s", $n++, ". [$tag] " . '.' x ( 40 - length($tag) ) . " [$hash->{$tag}]\n" ); } } ############################################# HTML file: This is a title

this is some P aligned text

sfgsdfg sssss sjkfhsfh
this is some DIV wrapped text
now we have some text in a td/table
this is bold text: www.abc1.com you can find it here: http://www.abc2.com/ you can find it here: www.abc3.com you can find it here: henry@bbbbabc3.com you can find it here: www.abc4.com you can find it here: www.abc5.com vv www.abc.pl.net abc.com abc.pl.com/?a=1 123.com www.aaaa.info
linktext linktext2 linktext3 (bold word) mailto:bob@somwhere.com alt text for img text next to imagealt text for img alt text for img Free State Map2 email link text