#!/usr/bin/perl
use strict;
use warnings;
=head1 NAME
linkextor -- extract particular links from HTML documents
=head1 SYNOPSIS
F<linkextor>
S<B<[ -b baseurl ]>>
S<B<[ -f filter ]>>
S<B<[ files and urls ]>>
=head1 DESCRIPTION
C<linkextor> prints links found in given HTML documents, filtered by a
+ny given expressions. If no files are specified on the command line,
+the input document is read from STDIN. You can specify URLs on the co
+mmand line; they will be downloaded and parsed transparently.
By default, no links are filtered, so the output will include referenc
+es to every external resource found in the file such as stylesheets,
+external scripts, images, other documents, etc.
After considering criteria, all matching links will be printed to STDO
+UT. You could pipe them to C<wget -i -> to mass download them, or to
+C<while read url ; do ... ; done> or C<xargs> to process them further
+, or do anything else you can do with a list of links.
=head1 ARGUMENTS
=over 4
=item B<-h>, B<--help>
See a synopsis.
=item B<--man>
Browse the manpage.
=back
=head1 OPTIONS
=over 4
=item B<-b>, B<--base>
Sets base URL to use for relative links in the file(s).
This only applies to local files and input read from STDIN. When parsi
+ng documents retrieved from a URL, the source URL is always the base
+URL assumed for that document.
=item B<-f>, B<--filter>
Defines a filter.
If no filter has been specified, B<all> external links will be returne
+d. This includes links to CSS and Javascript files specified in C<E<l
+t>linkE<gt>> elements, image links specified in C<E<lt>imgE<gt>> elem
+ents, and so on.
If one or more filters have been specified, only links conforming to a
+ny of the specified criteria will be output.
Filters take the form C<tagname:attribute:regex>. C<tagname> specifies
+ which tag this filter will allow. C<attribute> specifies which attri
+bute of the allowed tags will be considered. C<regex> specifies a pat
+tern which candidate links must match. You can leave any of the crite
+ria empty. For empty criteria on the right-hand side of the specifica
+tion, you can even omit the colon.
Eg, C<-f 'a:href:\.mp3$'> will only extract links from the C<E<lt>href
+E<gt>> attributes in the document's C<E<lt>aE<gt>> tags which end wit
+h C<.mp3>. Since C<E<lt>aE<gt>> tags can only contain links in their
+C<E<lt>hrefE<gt>> attribute you can shorten that to C<-f 'a::\.mp3$'>
+. If you wanted any and all links from C<E<lt>aE<gt>> tags, you could
+ use C<-f a::>, where both the attribute and regex component are empt
+y, or just C<-f a>: because both components to the right are empty, y
+ou can leave out the colons entirely. Likewise, you could use C<-f im
+g> to extract all images.
=back
=head1 BUGS
None currently known.
=head1 SEE ALSO
=over 4
=item * L<perlre>
=item * L<xargs(1)>
=item * L<wget(1)>
=back
=head1 COPYRIGHT
(c)2005 Aristotle Pagaltzis
=head1 LICENCE
This script is free software; you can redistribute it and/or modify it
+ under the same terms as Perl itself.
=cut
use Getopt::Long;
use Pod::Usage;
use HTML::LinkExtor;
use URI;
use LWP::Simple;
my %filter;
sub if_valid_link {
my ( $tag, $attr, $value ) = @_;
return $value if not %filter;
my $attr_filter = $filter{ $tag } || $filter{ '' } || return;
my $patterns = $attr_filter->{ $attr } || $attr_filter->{ '' } ||
+return;
return $value if not @$patterns;
return $value if grep $value =~ /$_/, @$patterns;
return;
}
sub push_links {
my ( $p, $base, $link ) = @_;
$base = './' if not defined $base;
for( $p->links ) {
my $tag = shift @$_;
my ( $attr, $value );
push @$link, if_valid_link( $tag, $attr, URI->new_abs( $value,
+ $base )->as_string )
while ( $attr, $value ) = splice @$_, 0, 2;
}
}
GetOptions(
'h|help' => sub { pod2usage( -verbose => 1 ) },
'man' => sub { pod2usage( -verbose => 2 ) },
"b|base=s" => ( \my $opt_base ),
"f|filter=s" => ( \my @opt_filter ),
) or pod2usage();
for( @opt_filter ) {
my ( $tag, $attr, $pattern ) = split m!:!, $_, 3;
$attr = '' unless defined $attr;
push @{ $filter{ $tag }{ $attr } }, defined( $pattern ) ? qr/$patt
+ern/ : ();
}
my $p = HTML::LinkExtor->new();
my @link;
if ( @ARGV ) {
for( @ARGV ) {
if( -r ) {
$p->parse_file( $_ )
or warn( "Couldn't parse $_\n" ), next;
push_links( $p, $opt_base, \@link );
}
else {
( my $html = get $_ )
or warn( "Could neither open nor download $_\n" ), nex
+t;
$p->parse( $html )
or warn( "Couldn't parse $_\n" ), next;
push_links( $p, $_, \@link );
}
}
}
else {
$p->parse_file( \*STDIN );
push_links( $p, $opt_base, \@link );
}
{
local $\ = local $, = "\n";
print @link;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.