Beefy Boxes and Bandwidth Generously Provided by pair Networks
Think about Loose Coupling
 
PerlMonks  

Re: HTML::LinkExtractor

by PodMaster (Abbot)
on Aug 21, 2002 at 01:33 UTC ( #191637=note: print w/ replies, xml ) Need Help??


in reply to HTML::LinkExtractor

Someone asked for it, so here it is, the HTML::TokeParser::Simple version

package HTML::LinkExtractor; use strict; use HTML::TokeParser::Simple 1; use URI 1; use Carp qw( croak ); use vars qw( $VERSION ); $VERSION = '0.03'; ## The html tags which might have URLs # the master list of tagolas and required attributes (to constitute a +link) use vars qw( %TAGS ); %TAGS = ( a => [qw( href )], applet => [qw( archive code codebase src )], area => [qw( href )], base => [qw( href )], bgsound => [qw( src )], blockquote => [qw( cite )], body => [qw( background )], del => [qw( cite )], div => [qw( src )], # IE likes it, but don't know where it +'s documented embed => [qw( pluginspage pluginurl src )], form => [qw( action )], frame => [qw( src longdesc )], iframe => [qw( src )], ilayer => [qw( background src )], img => [qw( dynsrc longdesc lowsrc src usemap )], input => [qw( dynsrc lowsrc src )], ins => [qw( cite )], isindex => [qw( action )], # real oddball layer => [qw( src )], link => [qw( src href )], object => [qw( archive classid code codebase data usemap )], q => [qw( cite )], script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG +! sound => [qw( src )], table => [qw( background )], td => [qw( background )], th => [qw( background )], tr => [qw( background )], ## the exotic case meta => undef, ); ## tags which contain <.*?> STUFF TO GET </\w+> use vars qw( @TAGS_IN_NEED ); @TAGS_IN_NEED = qw( a blockquote del ins q ); use vars qw( @VALID_URL_ATTRIBUTES ); @VALID_URL_ATTRIBUTES = qw( action archive background cite classid code codebase data dynsrc href longdesc lowsrc pluginspage pluginurl src usemap ); sub new { my($class, $cb, $base) = @_; my $self = {}; $self->{_cb} = $cb if defined $cb; $self->{_base} = URI->new($base) if defined $base; return bless $self, $class; } ## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filec +ontents); sub parse { my( $this, $hmmm ) = @_; my $tp = new HTML::TokeParser::Simple( $hmmm ); unless($tp) { croak qq[ Couldn't create a HTML::TokeParser::Simple object: $ +!]; } $this->{_tp} = $tp; $this->_parsola(); return(); } sub _parsola { my $self = shift; my $IS_WE_OPEN = 0; ## a stack of links for keeping track of TEXT ## which is all of "<a href>text</a>" my @TEXT = (); $self->{_LINKS} = []; # ["S", $tag, $attr, $attrseq, $text] # ["E", $tag, $text] # ["T", $text, $is_data] # ["C", $text] # ["D", $text] # ["PI", $token0, $text] while (my $T = $self->{_tp}->get_token() ) { my $NewLink; my $Tag = $T->return_tag; ## Start tag? if($T->is_start_tag) { next unless exists $TAGS{$Tag}; ## Do we have a tag for which we want to capture text? my $UNIQUE = 0; $UNIQUE = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED; ## then check to see if we got things besides META :) if(defined $TAGS{ $Tag }) { for my $Btag(@{$TAGS{$Tag}}) { ## and we check if they do have one with a value if(exists $T->return_attr()->{ $Btag }) { $NewLink = $T->return_attr(); ## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>) if($UNIQUE) { push @TEXT, $NewLink; $NewLink->{_TEXT} = ""; } } } }elsif($Tag eq 'meta') { $NewLink = $T->return_attr(); } ## In case we got nested tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $T->return_text; } ## Text? }elsif($T->is_text) { $TEXT[-1]->{_TEXT} .= $T->return_text if @TEXT; ## Declaration? }elsif($T->is_declaration) { ## We look at declarations, to get anly custom .dtd's (tis linky) my $text - $T->return_text; if( $text =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) { +#" $NewLink = { raw => $text, url => $1}; } ## End tag? }elsif($T->is_end_tag){ ## these be ignored (maybe not in between <a...></a> tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $T->return_text; my $pop = pop @TEXT; $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT; $self->{_cb}->($pop) if exists $self->{_cb}; } } if(defined $NewLink) { $$NewLink{tag} = $Tag; my $base = $self->{_base}; for my $attr( @VALID_URL_ATTRIBUTES ) { $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba +se ) if exists $$NewLink{$attr}; } if(exists $self->{_cb}) { $self->{_cb}->( $NewLink ) unless @TEXT; } else { push @{$self->{_LINKS}}, $NewLink; } } }## endof while (my $token = $p->get_token) undef $self->{_tp}; return(); } sub links { my $self = shift; ## just like HTML::LinkExtor's return $self->{_LINKS}; } # Preloaded methods go here. 1; package main; unless(caller()) { my $p = new HTML::LinkExtractor( sub { print Dumper(shift); }, ); my $INPUT = q{ COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS. 1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> 2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht +ml"> 3 <base href="http://perl.org"> 4 <a href="http://www.perlmonks.org">Perlmonks.org</a> <p> 5 <a href="#BUTTER" href="#SCOTCH"> hello there 6 <img src="#AND" src="#PEANUTS"> 7 <a href="#butter"> now </a> </a> 8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q> 9 <blockquote CITE="http://www.stonehenge.com/merlyn/"> Just Another Perl Hacker, </blockquote> }; $p->parse(\$INPUT); $p = new HTML::LinkExtractor(); $p->parse(\$INPUT); use Data::Dumper; print scalar(@{$p->links()})." we GOT\n"; print Dumper( $p->links() ); } __END__ =head1 NAME HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> fr +om an HTML document =head1 DESCRIPTION HTML::LinkExtractor is used for extracting links from HTML. It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>, except that besides getting the URL, you also get the link-text. Example (please run the examples): use HTML::LinkExtractor; use Data::Dumper; my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>}; my $p = new HTML::LinkExtractor(); $p->parse(\$input); print Dumper($p->links); __END__ # the above example will yield $VAR1 = [ { '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! + </a>', 'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR +I::http'), 'tag' => 'a' } ]; C<HTML::LinkExtractor> will also correctly extract nexted link-type ta +gs. =head1 SYNOPSIS perl LinkExtractor.pm ## or use HTML::LinkExtractor; use LWP::Simple qw( get ); my $base = 'http://search.cpan.org'; my $html = get($base.'/recent'); my $p = new HTML::LinkExtractor(); $p->parse(\$html); print qq{<base href="$base">\n}; for my $Link( @{ $p->links } ) { ## new modules are linked by /author/NAME/Dist if( $$Link{href}=~ m{^\/author\/\w+} ) { print $$Link{_TEXT}."\n"; } } undef $p; __END__ =head1 METHODS =head2 C<new> Just like HTML::LinkExtor's new, it accepts 2 argument, a callback ( a sub reference, as in C<sub{}>, or C<\&sub>) which is to be called each time a new LINK is encountered ( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means after the closing tag is encountered ) and a base URL (it's up to you to make sure it's valid) which is used to convert all relative URI's to absolute ones. $ALinkP{href} = URI->new_abs( $ALink{href}, $base ); =head2 C<parse> Each time you call C<parse>, you should pass it a C<$filename> a C<*FILEHANDLE> or a C<\$FileContent> Each time you call C<parse> a new C<HTML::TokeParser::Simple> object is created and stored in C<$this-E<gt>{_tp}>. You shouldn't need to mess with the TokeParser object. =head2 C<links> Only after you call C<parse> will this method return anything. This method returns a reference to an ArrayOfHashes, which basically looks like (Data::Dumper output) $VAR1 = [ { type => 'img', src => 'image.png' }, ]; =head1 WHAT'S A LINK-type tag Take a look at C<%HTML::LinkExtractor::TAGS> to see what I consider to be link-type-tag. Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see all the possible tag attributes which can contain URI's (the links!!) Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see the tags for which the C<'_TEXT'> attribute is provided, like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>> =head2 HOW CAN THAT BE? I took at look at C<%HTML::Tagset::linkElements> and the following URL +'s http://www.blooberry.com/indexdot/html/tagindex/all.htm http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm http://www.blooberry.com/indexdot/html/tagpages/a/area.htm http://www.blooberry.com/indexdot/html/tagpages/b/base.htm http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm http://www.blooberry.com/indexdot/html/tagpages/d/del.htm http://www.blooberry.com/indexdot/html/tagpages/d/div.htm http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm http://www.blooberry.com/indexdot/html/tagpages/i/image.htm http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm http://www.blooberry.com/indexdot/html/tagpages/l/link.htm http://www.blooberry.com/indexdot/html/tagpages/o/object.htm http://www.blooberry.com/indexdot/html/tagpages/q/q.htm http://www.blooberry.com/indexdot/html/tagpages/s/script.htm http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm And the special cases <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm and <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo. +html"> http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm =head1 AUTHOR podmaster (see CPAN) aka crazyinsomniac@yahoo.com =head1 SEE ALSO L<HTML::LinkExtor>, L<HTML::TokeParser::Simple>, L<HTML::Tagset>. =cut

____________________________________________________
** The Third rule of perl club is a statement of fact: pod is sexy.


Comment on Re: HTML::LinkExtractor
Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://191637]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others romping around the Monastery: (3)
As of 2015-07-06 00:41 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (68 votes), past polls