tags
if(@TEXT) {
$TEXT[-1]->{_TEXT} .= $T->return_text;
my $pop = pop @TEXT;
$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
$self->{_cb}->($pop) if exists $self->{_cb};
}
}
if(defined $NewLink) {
$$NewLink{tag} = $Tag;
my $base = $self->{_base};
for my $attr( @VALID_URL_ATTRIBUTES ) {
$$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $base )
if exists $$NewLink{$attr};
}
if(exists $self->{_cb}) {
$self->{_cb}->( $NewLink ) unless @TEXT;
} else {
push @{$self->{_LINKS}}, $NewLink;
}
}
}## endof while (my $token = $p->get_token)
undef $self->{_tp};
return();
}
# links - accessor for the link records collected while parsing.
#   Returns the value stored under $self->{_LINKS}. The parsing code pushes
#   one hash reference per link onto that array, but only when no callback
#   was registered on the object.
sub links {
    my ($self) = @_;

    ## mirrors the method of the same name in HTML::LinkExtor
    return $self->{_LINKS};
}
# Preloaded methods go here.
1;
package main;
# Self-test / demo. Runs only when this file is executed directly
# (caller() is empty), not when it is loaded with use/require.
unless(caller()) {
    # 'use' takes effect at compile time; it is placed first so it is obvious
    # that Dumper is available to everything below.
    use Data::Dumper;

    # First pass: callback mode, each link record is dumped as it is reported.
    # Direct method call instead of the ambiguous indirect-object form.
    my $p = HTML::LinkExtractor->new(
        sub {
            print Dumper(shift);
        },
    );

    # NOTE(review): as visible here the sample text carries no link markup,
    # so the "9 elements" claim cannot be confirmed from this view -- verify
    # the fixture.
    my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
1
2
3
4 Perlmonks.org
5
hello there
6
7 now
8 To be or not to be.
9
Just Another Perl Hacker,
};
    $p->parse(\$INPUT);

    # Second pass: no callback, so links are accumulated and read via links().
    $p = HTML::LinkExtractor->new();
    $p->parse(\$INPUT);

    # links() may hand back undef when nothing was collected; fall back to an
    # empty list so the count does not die on an undefined array reference.
    my $found = $p->links() || [];
    print scalar(@{$found})." we GOT\n";
    print Dumper( $p->links() );
}
__END__
=head1 NAME
HTML::LinkExtractor - Extract I<links> from an HTML document
=head1 DESCRIPTION
HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L<HTML::LinkExtor>,
except that besides getting the URL, you also get the link-text.
Example (please run the examples):
use HTML::LinkExtractor;
use Data::Dumper;
my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
my $p = new HTML::LinkExtractor();
$p->parse(\$input);
print Dumper($p->links);
__END__
# the above example will yield
$VAR1 = [
{
'_TEXT' => ' I am a LINK!!! ',
'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'),
'tag' => 'a'
}
];
C<HTML::LinkExtractor> will also correctly extract nested link-type tags.
=head1 SYNOPSIS
perl LinkExtractor.pm
## or
use HTML::LinkExtractor;
use LWP::Simple qw( get );
my $base = 'http://search.cpan.org';
my $html = get($base.'/recent');
my $p = new HTML::LinkExtractor();
$p->parse(\$html);
print qq{\n};
for my $Link( @{ $p->links } ) {
## new modules are linked by /author/NAME/Dist
if( $$Link{href}=~ m{^\/author\/\w+} ) {
print $$Link{_TEXT}."\n";
}
}
undef $p;
__END__
=head1 METHODS
=head2 C<new>
Just like HTML::LinkExtor's new, it accepts 2 arguments,
a callback ( a sub reference, as in C<sub {}>, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
after the closing tag is encountered )
and a base URL (it's up to you to make sure it's valid)
which is used to convert all relative URI's to absolute ones.
$ALinkP{href} = URI->new_abs( $ALink{href}, $base );
=head2 C<parse>
Each time you call C<parse>, you should pass it a
C<$filename>, a C<*FILEHANDLE> or a C<\$FileContent>.
Each time you call C<parse> a new C<HTML::TokeParser> object
is created and stored in C<$this-E<gt>{_tp}>.
You shouldn't need to mess with the TokeParser object.
=head2 C<links>
Only after you call C<parse> will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)
$VAR1 = [ { tag => 'img', src => 'image.png' }, ];
=head1 WHAT'S A LINK-type tag
Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.
Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)
Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>
=head2 HOW CAN THAT BE?
I took a look at C<%HTML::Tagset::linkElements> and the following URL's
http://www.blooberry.com/indexdot/html/tagindex/all.htm
http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
http://www.blooberry.com/indexdot/html/tagpages/a/area.htm
http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm
http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
http://www.blooberry.com/indexdot/html/tagpages/d/div.htm
http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm
http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
http://www.blooberry.com/indexdot/html/tagpages/l/link.htm
http://www.blooberry.com/indexdot/html/tagpages/o/object.htm
http://www.blooberry.com/indexdot/html/tagpages/q/q.htm
http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm
And the special cases
http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
and
http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm
=head1 AUTHOR
podmaster (see CPAN) aka crazyinsomniac@yahoo.com
=head1 SEE ALSO
L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>.
=cut