Beefy Boxes and Bandwidth Generously Provided by pair Networks
Welcome to the Monastery
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??
package HTML::LinkExtractor; use strict; use HTML::TokeParser 2; # looks like a good number use URI 1; # same here use Carp qw( croak ); use vars qw( $VERSION ); $VERSION = '0.03'; ## The html tags which might have URLs # the master list of tagolas and required attributes (to constitute a +link) use vars qw( %TAGS ); %TAGS = ( a => [qw( href )], applet => [qw( archive code codebase src )], area => [qw( href )], base => [qw( href )], bgsound => [qw( src )], blockquote => [qw( cite )], body => [qw( background )], del => [qw( cite )], div => [qw( src )], # IE likes it, but don't know where it +'s documented embed => [qw( pluginspage pluginurl src )], form => [qw( action )], frame => [qw( src longdesc )], iframe => [qw( src )], ilayer => [qw( background src )], img => [qw( dynsrc longdesc lowsrc src usemap )], input => [qw( dynsrc lowsrc src )], ins => [qw( cite )], isindex => [qw( action )], # real oddball layer => [qw( src )], link => [qw( src href )], object => [qw( archive classid code codebase data usemap )], q => [qw( cite )], script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG +! sound => [qw( src )], table => [qw( background )], td => [qw( background )], th => [qw( background )], tr => [qw( background )], ## the exotic case meta => undef, ); ## tags which contain <.*?> STUFF TO GET </\w+> use vars qw( @TAGS_IN_NEED ); @TAGS_IN_NEED = qw( a blockquote del ins q ); use vars qw( @VALID_URL_ATTRIBUTES ); @VALID_URL_ATTRIBUTES = qw( action archive background cite classid code codebase data dynsrc href longdesc lowsrc pluginspage pluginurl src usemap ); sub new { my($class, $cb, $base) = @_; my $self = {}; $self->{_cb} = $cb if defined $cb; $self->{_base} = URI->new($base) if defined $base; return bless $self, $class; } ## $p=HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); sub parse { my( $this, $hmmm ) = @_; my $tp = new HTML::TokeParser( $hmmm ); unless($tp) { croak qq[ Couldn't create a HTML::TokeParser object: $!]; } $this->{_tp} = $tp; $this->_parsola(); return(); } sub _parsola { my $self = shift; my $IS_WE_OPEN = 0; ## a stack of links for keeping track of TEXT ## which is all of "<a href>text</a>" my @TEXT = (); $self->{_LINKS} = []; # ["S", $tag, $attr, $attrseq, $text] # ["E", $tag, $text] # ["T", $text, $is_data] # ["C", $text] # ["D", $text] # ["PI", $token0, $text] while (my $T = $self->{_tp}->get_token() ) { my $NewLink; ## Start tag? if($$T[0] eq "S") { next unless exists $TAGS{$$T[1]}; ## Do we have a tag for which we want to capture text? my $UNIQUE = 0; $UNIQUE = grep { /^\Q$$T[1]\E$/i } @TAGS_IN_NEED; ## then check to see if we got things besides META :) if(defined $TAGS{ $$T[1] }) { for my $tag(@{$TAGS{$$T[1]}}) { ## and we check if they do have one with a value if(exists $$T[2]{ $tag }) { $NewLink = $$T[2]; ## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>) if($UNIQUE) { push @TEXT, $NewLink; $NewLink->{_TEXT} = ""; } } } }elsif($$T[1] eq 'meta') { $NewLink = $$T[2]; } ## In case we got nested tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; } ## Text? }elsif($$T[0] eq "T") { $TEXT[-1]->{_TEXT} .= $$T[1] if @TEXT; ## Declaration? }elsif($$T[0] eq "D") { ## We look at declarations, to get anly custom .dtd's (tis linky) if( $$T[1] =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) { + #" $NewLink = { raw => $$T[1], url => $1}; } ## End tag? }elsif($$T[0] eq "E"){ ## these be ignored (maybe not in between <a...></a> tags if(@TEXT) { $TEXT[-1]->{_TEXT} .= $$T[-1]; my $pop = pop @TEXT; $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT; $self->{_cb}->($pop) if exists $self->{_cb}; } } if(defined $NewLink) { $$NewLink{tag}=$$T[1]; my $base = $self->{_base}; for my $attr( @VALID_URL_ATTRIBUTES ) { $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba +se ) if exists $$NewLink{$attr}; } if(exists $self->{_cb}) { $self->{_cb}->( $NewLink ) unless @TEXT; } else { push @{$self->{_LINKS}}, $NewLink; } } }## endof while (my $token = $p->get_token) undef $self->{_tp}; return(); } sub links { my $self = shift; ## just like HTML::LinkExtor's return $self->{_LINKS}; } # Preloaded methods go here. 1; package main; unless(caller()) { my $p = new HTML::LinkExtractor( sub { print Dumper(shift); }, ); my $INPUT = q{ COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS. 1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> 2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht +ml"> 3 <base href="http://perl.org"> 4 <a href="http://www.perlmonks.org">Perlmonks.org</a> <p> 5 <a href="#BUTTER" href="#SCOTCH"> hello there 6 <img src="#AND" src="#PEANUTS"> 7 <a href="#butter"> now </a> </a> 8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q> 9 <blockquote CITE="http://www.stonehenge.com/merlyn/"> Just Another Perl Hacker, </blockquote> }; $p->parse(\$INPUT); $p = new HTML::LinkExtractor(); $p->parse(\$INPUT); use Data::Dumper; print scalar(@{$p->links()})." we GOT\n"; print Dumper( $p->links() ); } __END__ =head1 NAME HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> fr +om an HTML document =head1 DESCRIPTION HTML::LinkExtractor is used for extracting links from HTML. It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>, except that besides getting the URL, you also get the link-text. Example (please run the examples): use HTML::LinkExtractor; use Data::Dumper; my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>}; my $p = new HTML::LinkExtractor(); $p->parse(\$input); print Dumper($p->links); __END__ # the above example will yield $VAR1 = [ { '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! + </a>', 'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR +I::http'), 'tag' => 'a' } ]; C<HTML::LinkExtractor> will also correctly extract nexted link-type ta +gs. =head1 SYNOPSIS perl LinkExtractor.pm ## or use HTML::LinkExtractor; use LWP::Simple qw( get ); my $base = 'http://search.cpan.org'; my $html = get($base.'/recent'); my $p = new HTML::LinkExtractor(); $p->parse(\$html); print qq{<base href="$base">\n}; for my $Link( @{ $p->links } ) { ## new modules are linked by /author/NAME/Dist if( $$Link{href}=~ m{^\/author\/\w+} ) { print $$Link{_TEXT}."\n"; } } undef $p; __END__ =head1 METHODS =head2 C<new> Just like HTML::LinkExtor's new, it accepts 2 argument, a callback ( a sub reference, as in C<sub{}>, or C<\&sub>) which is to be called each time a new LINK is encountered ( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means after the closing tag is encountered ) and a base URL (it's up to you to make sure it's valid) which is used to convert all relative URI's to absolute ones. $ALinkP{href} = URI->new_abs( $ALink{href}, $base ); =head2 C<parse> Each time you call C<parse>, you should pass it a C<$filename> a C<*FILEHANDLE> or a C<\$FileContent> Each time you call C<parse> a new C<HTML::TokeParser> object is created and stored in C<$this-E<gt>{_tp}>. You shouldn't need to mess with the TokeParser object. =head2 C<links> Only after you call C<parse> will this method return anything. This method returns a reference to an ArrayOfHashes, which basically looks like (Data::Dumper output) $VAR1 = [ { type => 'img', src => 'image.png' }, ]; =head1 WHAT'S A LINK-type tag Take a look at C<%HTML::LinkExtractor::TAGS> to see what I consider to be link-type-tag. Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see all the possible tag attributes which can contain URI's (the links!!) Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see the tags for which the C<'_TEXT'> attribute is provided, like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>> =head2 HOW CAN THAT BE? I took at look at C<%HTML::Tagset::linkElements> and the following URL +'s http://www.blooberry.com/indexdot/html/tagindex/all.htm http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm http://www.blooberry.com/indexdot/html/tagpages/a/area.htm http://www.blooberry.com/indexdot/html/tagpages/b/base.htm http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm http://www.blooberry.com/indexdot/html/tagpages/d/del.htm http://www.blooberry.com/indexdot/html/tagpages/d/div.htm http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm http://www.blooberry.com/indexdot/html/tagpages/i/image.htm http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm http://www.blooberry.com/indexdot/html/tagpages/l/link.htm http://www.blooberry.com/indexdot/html/tagpages/o/object.htm http://www.blooberry.com/indexdot/html/tagpages/q/q.htm http://www.blooberry.com/indexdot/html/tagpages/s/script.htm http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm And the special cases <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm and <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo. +html"> http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm =head1 AUTHOR podmaster (see CPAN) aka crazyinsomniac@yahoo.com =head1 SEE ALSO L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>. =cut

In reply to HTML::LinkExtractor by PodMaster

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others chilling in the Monastery: (6)
    As of 2015-07-08 01:14 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









      Results (93 votes), past polls