http://www.perlmonks.org?node_id=408617
Category: Web Stuff
Author/Contact Info: Marcelo Magallon
Description: Small script to extract all links from a given URL. I whipped this up some time ago, and I use it to extract links from web pages (e.g. wget $(lnkxtor URL '\.tar\.gz$') and the like). Comments about further development directions and improvements are much welcome!
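A few example invocations, assuming the script is saved as lnkxtor somewhere on your PATH (the example.com URLs are placeholders):

# print every link on a page
lnkxtor http://www.example.com/

# print only links matching a regex, handy with command substitution
wget $(lnkxtor http://www.example.com/files/ '\.tar\.gz$')

# -i extracts image sources (img/src) instead of anchors
lnkxtor -i http://www.example.com/gallery/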
#!/usr/bin/perl

use strict;
use warnings;
use HTML::TreeBuilder;
use LWP::Simple;
use URI;
use Getopt::Std;

my %opts;

# -i: extract image sources (img/src) instead of anchor hrefs (a/href)
getopts('i', \%opts);

my ($tag, $href) = exists $opts{i} ? ('img', 'src') : ('a', 'href');

if (@ARGV < 1 or @ARGV > 2)
{
    die "usage: $0 [-i] URL [regex]\n";
}

my ($url, $regex) = @ARGV;
my $uri = URI->new($url);
my $tree;

# with no regex argument, match every link
$regex ||= '.';

if (-f $url)
{
    # the argument names a local file: parse it directly
    $tree = HTML::TreeBuilder->new_from_file($url);
}
else
{
    # otherwise treat it as a remote URL and fetch it first
    my $content = get($uri);
    die "Could not fetch $uri\n" unless defined $content;
    $tree = HTML::TreeBuilder->new_from_content($content);
}

die "Could not parse $url\n" unless defined $tree;

# find every matching element and resolve its link attribute
# relative to the URL given on the command line
foreach my $link ($tree->look_down(_tag => $tag, $href => qr{$regex}))
{
    my $link_url = URI->new_abs($link->attr($href), $uri);
    print $link_url->as_string, "\n";
}

# HTML::TreeBuilder trees hold circular references; the docs
# recommend deleting them explicitly when you are done
$tree->delete;
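One improvement direction: the script resolves relative links against the URL given on the command line, which goes wrong when a page sets <base href> (or the server sends a Content-Base header). A minimal sketch of the remote-fetch side using LWP::UserAgent instead of LWP::Simple, whose response object knows the correct base URI (the example.com URL is a placeholder):

#!/usr/bin/perl

use strict;
use warnings;
use LWP::UserAgent;
use HTML::TreeBuilder;
use URI;

my $url = 'http://www.example.com/';    # placeholder
my $ua  = LWP::UserAgent->new;
my $res = $ua->get($url);
die $res->status_line, "\n" unless $res->is_success;

my $tree = HTML::TreeBuilder->new_from_content($res->decoded_content);

foreach my $link ($tree->look_down(_tag => 'a', href => qr{.}))
{
    # $res->base honours <base href> and Content-Base, falling
    # back to the request URL when neither is present
    print URI->new_abs($link->attr('href'), $res->base)->as_string, "\n";
}

$tree->delete;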