package HTML::LinkExtractor; use strict; use HTML::TokeParser 2; # looks like a good number use URI 1; # same here use Carp qw( croak ); use vars qw( $VERSION ); $VERSION = '0.03'; ## The html tags which might have URLs # the master list of tagolas and required attributes (to constitute a link) use vars qw( %TAGS ); %TAGS = ( a => [qw( href )], applet => [qw( archive code codebase src )], area => [qw( href )], base => [qw( href )], bgsound => [qw( src )], blockquote => [qw( cite )], body => [qw( background )], del => [qw( cite )], div => [qw( src )], # IE likes it, but don't know where it's documented embed => [qw( pluginspage pluginurl src )], form => [qw( action )], frame => [qw( src longdesc )], iframe => [qw( src )], ilayer => [qw( background src )], img => [qw( dynsrc longdesc lowsrc src usemap )], input => [qw( dynsrc lowsrc src )], ins => [qw( cite )], isindex => [qw( action )], # real oddball layer => [qw( src )], link => [qw( src href )], object => [qw( archive classid code codebase data usemap )], q => [qw( cite )], script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG! sound => [qw( src )], table => [qw( background )], td => [qw( background )], th => [qw( background )], tr => [qw( background )], ## the exotic case meta => undef, ); ## tags which contain <.*?> STUFF TO GET use vars qw( @TAGS_IN_NEED ); @TAGS_IN_NEED = qw( a blockquote del ins q ); use vars qw( @VALID_URL_ATTRIBUTES ); @VALID_URL_ATTRIBUTES = qw( action archive background cite classid code codebase data dynsrc href longdesc lowsrc pluginspage pluginurl src usemap ); sub new { my($class, $cb, $base) = @_; my $self = {}; $self->{_cb} = $cb if defined $cb; $self->{_base} = URI->new($base) if defined $base; return bless $self, $class; } ## $p=HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); sub parse { my( $this, $hmmm ) = @_; my $tp = new HTML::TokeParser( $hmmm ); unless($tp) { croak qq[ Couldn't create a HTML::TokeParser object: $!]; } $this->{_tp} = $tp; $this->_parsola(); return(); } sub _parsola { my $self = shift; my $IS_WE_OPEN = 0; ## a stack of links for keeping track of TEXT ## which is all of "text" my @TEXT = (); $self->{_LINKS} = []; # ["S", $tag, $attr, $attrseq, $text] # ["E", $tag, $text] # ["T", $text, $is_data] # ["C", $text] # ["D", $text] # ["PI", $token0, $text] while (my $T = $self->{_tp}->get_token() ) { my $NewLink; ## Start tag? if($$T[0] eq "S") { next unless exists $TAGS{$$T[1]}; ## Do we have a tag for which we want to capture text? my $UNIQUE = 0; $UNIQUE = grep { /^\Q$$T[1]\E$/i } @TAGS_IN_NEED; ## then check to see if we got things besides META :) if(defined $TAGS{ $$T[1] }) { for my $tag(@{$TAGS{$$T[1]}}) { ## and we check if they do have one with a value if(exists $$T[2]{ $tag }) { $NewLink = $$T[2]; ## TAGS_IN_NEED are tags in deed (start capturing the STUFF) if($UNIQUE) { push @TEXT, $NewLink; $NewLink->{_TEXT} = ""; } } } }elsif($$T[1] eq 'meta') { $NewLink = $$T[2]; } ## In case we got nested tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; } ## Text? }elsif($$T[0] eq "T") { $TEXT[-1]->{_TEXT} .= $$T[1] if @TEXT; ## Declaration? }elsif($$T[0] eq "D") { ## We look at declarations, to get anly custom .dtd's (tis linky) if( $$T[1] =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) { #" $NewLink = { raw => $$T[1], url => $1}; } ## End tag? }elsif($$T[0] eq "E"){ ## these be ignored (maybe not in between tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; my $pop = pop @TEXT; $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT; $self->{_cb}->($pop) if exists $self->{_cb}; } } if(defined $NewLink) { $$NewLink{tag}=$$T[1]; my $base = $self->{_base}; for my $attr( @VALID_URL_ATTRIBUTES ) { $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $base ) if exists $$NewLink{$attr}; } if(exists $self->{_cb}) { $self->{_cb}->( $NewLink ) unless @TEXT; } else { push @{$self->{_LINKS}}, $NewLink; } } }## endof while (my $token = $p->get_token) undef $self->{_tp}; return(); } sub links { my $self = shift; ## just like HTML::LinkExtor's return $self->{_LINKS}; } # Preloaded methods go here. 1; package main; unless(caller()) { my $p = new HTML::LinkExtractor( sub { print Dumper(shift); }, ); my $INPUT = q{ COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS. 1 2 3 4 Perlmonks.org

5 hello there 6 7 now 8 To be or not to be. 9

Just Another Perl Hacker,

}; $p->parse(\$INPUT); $p = new HTML::LinkExtractor(); $p->parse(\$INPUT); use Data::Dumper; print scalar(@{$p->links()})." we GOT\n"; print Dumper( $p->links() ); } __END__ =head1 NAME HTML::LinkExtractor - Extract I> from an HTML document =head1 DESCRIPTION HTML::LinkExtractor is used for extracting links from HTML. It is very similar to L, except that besides getting the URL, you also get the link-text. Example (please run the examples): use HTML::LinkExtractor; use Data::Dumper; my $input = q{If I am a LINK!!! }; my $p = new HTML::LinkExtractor(); $p->parse(\$input); print Dumper($p->links); __END__ # the above example will yield $VAR1 = [ { '_TEXT' => ' I am a LINK!!! ', 'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'), 'tag' => 'a' } ]; C will also correctly extract nexted link-type tags. =head1 SYNOPSIS perl LinkExtractor.pm ## or use HTML::LinkExtractor; use LWP::Simple qw( get ); my $base = 'http://search.cpan.org'; my $html = get($base.'/recent'); my $p = new HTML::LinkExtractor(); $p->parse(\$html); print qq{\n}; for my $Link( @{ $p->links } ) { ## new modules are linked by /author/NAME/Dist if( $$Link{href}=~ m{^\/author\/\w+} ) { print $$Link{_TEXT}."\n"; } } undef $p; __END__ =head1 METHODS =head2 C Just like HTML::LinkExtor's new, it accepts 2 argument, a callback ( a sub reference, as in C, or C<\&sub>) which is to be called each time a new LINK is encountered ( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means after the closing tag is encountered ) and a base URL (it's up to you to make sure it's valid) which is used to convert all relative URI's to absolute ones. $ALinkP{href} = URI->new_abs( $ALink{href}, $base ); =head2 C Each time you call C, you should pass it a C<$filename> a C<*FILEHANDLE> or a C<\$FileContent> Each time you call C a new C object is created and stored in C<$this-E{_tp}>. You shouldn't need to mess with the TokeParser object. =head2 C Only after you call C will this method return anything. This method returns a reference to an ArrayOfHashes, which basically looks like (Data::Dumper output) $VAR1 = [ { type => 'img', src => 'image.png' }, ]; =head1 WHAT'S A LINK-type tag Take a look at C<%HTML::LinkExtractor::TAGS> to see what I consider to be link-type-tag. Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see all the possible tag attributes which can contain URI's (the links!!) Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see the tags for which the C<'_TEXT'> attribute is provided, like Ca href="#"E TEST E/aE> =head2 HOW CAN THAT BE? I took at look at C<%HTML::Tagset::linkElements> and the following URL's http://www.blooberry.com/indexdot/html/tagindex/all.htm http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm http://www.blooberry.com/indexdot/html/tagpages/a/area.htm http://www.blooberry.com/indexdot/html/tagpages/b/base.htm http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm http://www.blooberry.com/indexdot/html/tagpages/d/del.htm http://www.blooberry.com/indexdot/html/tagpages/d/div.htm http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm http://www.blooberry.com/indexdot/html/tagpages/i/image.htm http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm http://www.blooberry.com/indexdot/html/tagpages/l/link.htm http://www.blooberry.com/indexdot/html/tagpages/o/object.htm http://www.blooberry.com/indexdot/html/tagpages/q/q.htm http://www.blooberry.com/indexdot/html/tagpages/s/script.htm http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm And the special cases http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm and http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm =head1 AUTHOR podmaster (see CPAN) aka crazyinsomniac@yahoo.com =head1 SEE ALSO L, L, L. =cut