http://www.perlmonks.org?node_id=191458
Category: HTML Utility
Author/Contact Info /msg podmaster
Description: Finally, a better link extractor, in a module, HTML::LinkExtractor (does the things people wished HTML::LinkExtor did )

See pod for description and documentation.

Use pod2html with a patched version Pod::Html which correctly interprets <a href="">f</a> in verbatim blocks (my mail to perl5 porters).

update:
I do have a HTML::TokeParser::Simple version of this ;D

and later i fixed a typo

UPDATE: Mon Aug 26 11:09:37 2002 GMT
I just put it up on CPAN (version 0.04). Enjoy HTML::LinkExtractor

package HTML::LinkExtractor;

use strict;

use HTML::TokeParser 2; # looks like a good number
use URI 1;              # same here
use Carp qw( croak );

use vars qw( $VERSION );
$VERSION = '0.03';

## The html tags which might have URLs
# the master list of tagolas and required attributes (to constitute a 
+link)
use vars qw( %TAGS );
%TAGS = (
              a => [qw( href )],
         applet => [qw( archive code codebase src )],
           area => [qw( href )],
           base => [qw( href )],
        bgsound => [qw( src )],
     blockquote => [qw( cite )],
           body => [qw( background )],
            del => [qw( cite )],
            div => [qw( src )], # IE likes it, but don't know where it
+'s documented
          embed => [qw( pluginspage pluginurl src )],
           form => [qw( action )],
          frame => [qw( src longdesc  )],
         iframe => [qw( src )],
         ilayer => [qw( background src )],
            img => [qw( dynsrc longdesc lowsrc src usemap )],
          input => [qw( dynsrc lowsrc src )],
            ins => [qw( cite )],
        isindex => [qw( action )], # real oddball
          layer => [qw( src )],
           link => [qw( src href )],
         object => [qw( archive classid code codebase data usemap )],
              q => [qw( cite )],
         script => [qw( src  )], # HTML::Tagset has 'for' ~ it's WRONG
+!
          sound => [qw( src )],
          table => [qw( background )],
             td => [qw( background )],
             th => [qw( background )],
             tr => [qw( background )],
  ## the exotic case
           meta => undef,
);

## tags which contain <.*?> STUFF TO GET </\w+>
use vars qw( @TAGS_IN_NEED );
@TAGS_IN_NEED = qw(
    a
    blockquote
    del
    ins
    q
);

use vars qw( @VALID_URL_ATTRIBUTES );
@VALID_URL_ATTRIBUTES = qw(
        action
        archive
        background
        cite
        classid
        code
        codebase
        data
        dynsrc
        href
        longdesc
        lowsrc
        pluginspage
        pluginurl
        src
        usemap
);


sub new {
    my($class, $cb, $base) = @_;
    my $self = {};

    $self->{_cb} = $cb if defined $cb;
    $self->{_base} = URI->new($base) if defined $base;

    return bless $self, $class;
}

## $p=HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents);

sub parse {
    my( $this, $hmmm ) = @_;
    my $tp = new HTML::TokeParser( $hmmm );

    unless($tp) {
        croak qq[ Couldn't create a HTML::TokeParser object: $!];
    }

    $this->{_tp} = $tp;

    $this->_parsola();
    return();
}

sub _parsola {
    my $self = shift;
    my $IS_WE_OPEN = 0;

## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
    my @TEXT = ();
    $self->{_LINKS} = [];


#  ["S",  $tag, $attr, $attrseq, $text]
#  ["E",  $tag, $text]
#  ["T",  $text, $is_data]
#  ["C",  $text]
#  ["D",  $text]
#  ["PI", $token0, $text]

    while (my $T = $self->{_tp}->get_token() ) {
        my $NewLink;
## Start tag?
        if($$T[0] eq "S") {

            next unless exists $TAGS{$$T[1]};

## Do we have a tag for which we want to capture text?
            my $UNIQUE = 0;
            $UNIQUE = grep { /^\Q$$T[1]\E$/i } @TAGS_IN_NEED;

## then check to see if we got things besides META :)
            if(defined $TAGS{ $$T[1] }) {
                for my $tag(@{$TAGS{$$T[1]}}) {
## and we check if they do have one with a value
                    if(exists $$T[2]{ $tag }) {
                        $NewLink = $$T[2];
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
                        if($UNIQUE) {
                            push @TEXT, $NewLink;
                            $NewLink->{_TEXT} = "";
                        }
                    }
                }
            }elsif($$T[1] eq 'meta') {
                $NewLink = $$T[2];
            }

    ## In case we got nested tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $$T[-1];
            }

## Text?
        }elsif($$T[0] eq "T") {
            $TEXT[-1]->{_TEXT} .= $$T[1] if @TEXT;
## Declaration?
        }elsif($$T[0] eq "D") {
## We look at declarations, to get anly custom .dtd's (tis linky)
            if( $$T[1] =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) {
+ #"
                $NewLink = { raw => $$T[1], url => $1};
            }
## End tag?
        }elsif($$T[0] eq "E"){
## these be ignored (maybe not in between <a...></a> tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $$T[-1];
                my $pop = pop @TEXT;
                $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
                $self->{_cb}->($pop) if exists $self->{_cb};
            }
        }

        if(defined $NewLink) {
            $$NewLink{tag}=$$T[1];

            my $base = $self->{_base};

            for my $attr( @VALID_URL_ATTRIBUTES ) {
                $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba
+se )
                    if exists $$NewLink{$attr};
            }

            if(exists $self->{_cb}) {
                $self->{_cb}->( $NewLink ) unless @TEXT;
            } else {
                push @{$self->{_LINKS}}, $NewLink;
            }
        }
    }## endof while (my $token = $p->get_token)

    undef $self->{_tp};
    return();
}

sub links {
    my $self = shift;
    ## just like HTML::LinkExtor's
    return $self->{_LINKS};
}

# Preloaded methods go here.

1;

package main;

unless(caller()) {

    my $p = new HTML::LinkExtractor(
        sub {
            print Dumper(shift);
        },
    );

    my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.

1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht
+ml">
3 <base href="http://perl.org">
4 <a href="http://www.perlmonks.org">Perlmonks.org</a>
<p>

5 <a href="#BUTTER"  href="#SCOTCH">
    hello there
6 <img src="#AND" src="#PEANUTS">
7    <a href="#butter"> now </a>
</a>

8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q>
9 <blockquote CITE="http://www.stonehenge.com/merlyn/">
    Just Another Perl Hacker,
</blockquote>
    };

    $p->parse(\$INPUT);

    $p = new HTML::LinkExtractor();
    $p->parse(\$INPUT);

    use Data::Dumper;
    print scalar(@{$p->links()})." we GOT\n";
    print Dumper( $p->links() );    
    
}

__END__


=head1 NAME

HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> fr
+om an HTML document

=head1 DESCRIPTION

HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>,
except that besides getting the URL, you also get the link-text.

Example (please run the examples):

    use HTML::LinkExtractor;
    use Data::Dumper;
    
    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
    my $p = new HTML::LinkExtractor();
    
    $p->parse(\$input);
    
    print Dumper($p->links);
    __END__
    # the above example will yield
    $VAR1 = [
              {
                '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!!
+ </a>',
                'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR
+I::http'),
                'tag' => 'a'
              }
            ];

C<HTML::LinkExtractor> will also correctly extract nexted link-type ta
+gs.

=head1 SYNOPSIS

    perl LinkExtractor.pm
    
    ## or

    use HTML::LinkExtractor;
    use LWP::Simple qw( get );

    my $base = 'http://search.cpan.org';
    my $html = get($base.'/recent');
    my $p = new HTML::LinkExtractor();

    $p->parse(\$html);

    print qq{<base href="$base">\n};

    for my $Link( @{ $p->links } ) {
    ## new modules are linked  by /author/NAME/Dist
        if( $$Link{href}=~ m{^\/author\/\w+} ) {
            print $$Link{_TEXT}."\n";
        }
    }

    undef $p;
    __END__


=head1 METHODS

=head2 C<new>

Just like HTML::LinkExtor's new, it accepts 2 argument,
a callback ( a sub reference, as in C<sub{}>, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
 after the closing tag is encountered )

and a base URL (it's up to you to make sure it's valid)
which is used to convert all relative URI's to absolute ones.

    $ALinkP{href} = URI->new_abs( $ALink{href}, $base );

=head2 C<parse>

Each time you call C<parse>, you should pass it a
C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>

Each time you call C<parse> a new C<HTML::TokeParser> object
is created and stored in C<$this-E<gt>{_tp}>.

You shouldn't need to mess with the TokeParser object.

=head2 C<links>

Only after you call C<parse> will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)

    $VAR1 = [ { type => 'img', src => 'image.png' }, ];


=head1 WHAT'S A LINK-type tag

Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.

Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)

Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>


=head2 HOW CAN THAT BE?

I took at look at C<%HTML::Tagset::linkElements> and the following URL
+'s

    http://www.blooberry.com/indexdot/html/tagindex/all.htm

    http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/area.htm

    http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
    http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm

    http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
    http://www.blooberry.com/indexdot/html/tagpages/d/div.htm

    http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
    http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm

    http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm

    http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
    http://www.blooberry.com/indexdot/html/tagpages/l/link.htm

    http://www.blooberry.com/indexdot/html/tagpages/o/object.htm

    http://www.blooberry.com/indexdot/html/tagpages/q/q.htm

    http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
    http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm

    And the special cases 

    <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
    http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
    
    and

    <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.
+html">
    http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm


=head1 AUTHOR

podmaster (see CPAN) aka crazyinsomniac@yahoo.com

=head1 SEE ALSO

L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>.

=cut