HTML::LinkExtractor

Category:	HTML Utility
Author/Contact Info	/msg podmaster
Description:	Finally, a better link extractor, in a module, HTML::LinkExtractor (does the things people wished HTML::LinkExtor did ) See pod for description and documentation. Use pod2html with a patched version Pod::Html which correctly interprets `<a href="">f</a>` in verbatim blocks (my mail to perl5 porters). update: I do have a HTML::TokeParser::Simple version of this ;D and later i fixed a typo UPDATE: Mon Aug 26 11:09:37 2002 GMT I just put it up on CPAN (version 0.04). Enjoy HTML::LinkExtractor
package HTML::LinkExtractor; use strict; use HTML::TokeParser 2; # looks like a good number use URI 1; # same here use Carp qw( croak ); use vars qw( $VERSION ); $VERSION = '0.03'; ## The html tags which might have URLs # the master list of tagolas and required attributes (to constitute a +link) use vars qw( %TAGS ); %TAGS = ( a => [qw( href )], applet => [qw( archive code codebase src )], area => [qw( href )], base => [qw( href )], bgsound => [qw( src )], blockquote => [qw( cite )], body => [qw( background )], del => [qw( cite )], div => [qw( src )], # IE likes it, but don't know where it +'s documented embed => [qw( pluginspage pluginurl src )], form => [qw( action )], frame => [qw( src longdesc )], iframe => [qw( src )], ilayer => [qw( background src )], img => [qw( dynsrc longdesc lowsrc src usemap )], input => [qw( dynsrc lowsrc src )], ins => [qw( cite )], isindex => [qw( action )], # real oddball layer => [qw( src )], link => [qw( src href )], object => [qw( archive classid code codebase data usemap )], q => [qw( cite )], script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG +! sound => [qw( src )], table => [qw( background )], td => [qw( background )], th => [qw( background )], tr => [qw( background )], ## the exotic case meta => undef, ); ## tags which contain <.?> STUFF TO GET </\w+> use vars qw( @TAGS_IN_NEED ); @TAGS_IN_NEED = qw( a blockquote del ins q ); use vars qw( @VALID_URL_ATTRIBUTES ); @VALID_URL_ATTRIBUTES = qw( action archive background cite classid code codebase data dynsrc href longdesc lowsrc pluginspage pluginurl src usemap ); sub new { my($class, $cb, $base) = @_; my $self = {}; $self->{_cb} = $cb if defined $cb; $self->{_base} = URI->new($base) if defined $base; return bless $self, $class; } ## $p=HTML::TokeParser->new($filename \|\| FILEHANDLE \|\|\$filecontents); sub parse { my( $this, $hmmm ) = @_; my $tp = new HTML::TokeParser( $hmmm ); unless($tp) { croak qq[ Couldn't create a HTML::TokeParser object: $!]; } $this->{_tp} = $tp; $this->_parsola(); return(); } sub _parsola { my $self = shift; my $IS_WE_OPEN = 0; ## a stack of links for keeping track of TEXT ## which is all of "<a href>text</a>" my @TEXT = (); $self->{_LINKS} = []; # ["S", $tag, $attr, $attrseq, $text] # ["E", $tag, $text] # ["T", $text, $is_data] # ["C", $text] # ["D", $text] # ["PI", $token0, $text] while (my $T = $self->{_tp}->get_token() ) { my $NewLink; ## Start tag? if($$T[0] eq "S") { next unless exists $TAGS{$$T[1]}; ## Do we have a tag for which we want to capture text? my $UNIQUE = 0; $UNIQUE = grep { /^\Q$$T[1]\E$/i } @TAGS_IN_NEED; ## then check to see if we got things besides META :) if(defined $TAGS{ $$T[1] }) { for my $tag(@{$TAGS{$$T[1]}}) { ## and we check if they do have one with a value if(exists $$T[2]{ $tag }) { $NewLink = $$T[2]; ## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>) if($UNIQUE) { push @TEXT, $NewLink; $NewLink->{_TEXT} = ""; } } } }elsif($$T[1] eq 'meta') { $NewLink = $$T[2]; } ## In case we got nested tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; } ## Text? }elsif($$T[0] eq "T") { $TEXT[-1]->{_TEXT} .= $$T[1] if @TEXT; ## Declaration? }elsif($$T[0] eq "D") { ## We look at declarations, to get anly custom .dtd's (tis linky) if( $$T[1] =~ m{ SYSTEM \s \" ( http://. ) \" > $ }ix ) { + #" $NewLink = { raw => $$T[1], url => $1}; } ## End tag? }elsif($$T[0] eq "E"){ ## these be ignored (maybe not in between <a...></a> tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; my $pop = pop @TEXT; $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT; $self->{_cb}->($pop) if exists $self->{_cb}; } } if(defined $NewLink) { $$NewLink{tag}=$$T[1]; my $base = $self->{_base}; for my $attr( @VALID_URL_ATTRIBUTES ) { $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba +se ) if exists $$NewLink{$attr}; } if(exists $self->{_cb}) { $self->{_cb}->( $NewLink ) unless @TEXT; } else { push @{$self->{_LINKS}}, $NewLink; } } }## endof while (my $token = $p->get_token) undef $self->{_tp}; return(); } sub links { my $self = shift; ## just like HTML::LinkExtor's return $self->{_LINKS}; } # Preloaded methods go here. 1; package main; unless(caller()) { my $p = new HTML::LinkExtractor( sub { print Dumper(shift); }, ); my $INPUT = q{ COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS. 1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> 2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht +ml"> 3 <base href="http://perl.org"> 4 <a href="http://www.perlmonks.org">Perlmonks.org</a> <p> 5 <a href="#BUTTER" href="#SCOTCH"> hello there 6 <img src="#AND" src="#PEANUTS"> 7 <a href="#butter"> now </a> </a> 8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q> 9 <blockquote CITE="http://www.stonehenge.com/merlyn/"> Just Another Perl Hacker, </blockquote> }; $p->parse(\$INPUT); $p = new HTML::LinkExtractor(); $p->parse(\$INPUT); use Data::Dumper; print scalar(@{$p->links()})." we GOT\n"; print Dumper( $p->links() ); } __END__ =head1 NAME HTML::LinkExtractor - Extract I<L<links\|/"WHAT'S A LINK-type tag">> fr +om an HTML document =head1 DESCRIPTION HTML::LinkExtractor is used for extracting links from HTML. It is very similar to L<HTML::LinkExtor\|HTML::LinkExtor>, except that besides getting the URL, you also get the link-text. Example (please run the examples): use HTML::LinkExtractor; use Data::Dumper; my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>}; my $p = new HTML::LinkExtractor(); $p->parse(\$input); print Dumper($p->links); __END__ # the above example will yield $VAR1 = [ { '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! + </a>', 'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR +I::http'), 'tag' => 'a' } ]; C<HTML::LinkExtractor> will also correctly extract nexted link-type ta +gs. =head1 SYNOPSIS perl LinkExtractor.pm ## or use HTML::LinkExtractor; use LWP::Simple qw( get ); my $base = 'http://search.cpan.org'; my $html = get($base.'/recent'); my $p = new HTML::LinkExtractor(); $p->parse(\$html); print qq{<base href="$base">\n}; for my $Link( @{ $p->links } ) { ## new modules are linked by /author/NAME/Dist if( $$Link{href}=~ m{^\/author\/\w+} ) { print $$Link{_TEXT}."\n"; } } undef $p; __END__ =head1 METHODS =head2 C<new> Just like HTML::LinkExtor's new, it accepts 2 argument, a callback ( a sub reference, as in C<sub{}>, or C<\&sub>) which is to be called each time a new LINK is encountered ( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means after the closing tag is encountered ) and a base URL (it's up to you to make sure it's valid) which is used to convert all relative URI's to absolute ones. $ALinkP{href} = URI->new_abs( $ALink{href}, $base ); =head2 C<parse> Each time you call C<parse>, you should pass it a C<$filename> a C<*FILEHANDLE> or a C<\$FileContent> Each time you call C<parse> a new C<HTML::TokeParser> object is created and stored in C<$this-E<gt>{_tp}>. You shouldn't need to mess with the TokeParser object. =head2 C<links> Only after you call C<parse> will this method return anything. This method returns a reference to an ArrayOfHashes, which basically looks like (Data::Dumper output) $VAR1 = [ { type => 'img', src => 'image.png' }, ]; =head1 WHAT'S A LINK-type tag Take a look at C<%HTML::LinkExtractor::TAGS> to see what I consider to be link-type-tag. Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see all the possible tag attributes which can contain URI's (the links!!) Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see the tags for which the C<'_TEXT'> attribute is provided, like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>> =head2 HOW CAN THAT BE? I took at look at C<%HTML::Tagset::linkElements> and the following URL +'s http://www.blooberry.com/indexdot/html/tagindex/all.htm http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm http://www.blooberry.com/indexdot/html/tagpages/a/area.htm http://www.blooberry.com/indexdot/html/tagpages/b/base.htm http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm http://www.blooberry.com/indexdot/html/tagpages/d/del.htm http://www.blooberry.com/indexdot/html/tagpages/d/div.htm http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm http://www.blooberry.com/indexdot/html/tagpages/i/image.htm http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm http://www.blooberry.com/indexdot/html/tagpages/l/link.htm http://www.blooberry.com/indexdot/html/tagpages/o/object.htm http://www.blooberry.com/indexdot/html/tagpages/q/q.htm http://www.blooberry.com/indexdot/html/tagpages/s/script.htm http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm And the special cases <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm and <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo. +html"> http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm =head1 AUTHOR podmaster (see CPAN) aka crazyinsomniac@yahoo.com =head1 SEE ALSO L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>. =cut

Back to Code Catacombs