Category: | Web Stuff |
Author/Contact Info | Marcelo Magallon |
Description: | Small script to extract all links from a given URL. I whipped this up quickly some time ago, and I use it for extracting links from various web pages (e.g. wget $(lnkxtor URL '\.tar\.gz$') and the like). Comments about further development directions and improvements are very welcome! |
#!/usr/bin/perl
# lnkxtor - extract all links (or image sources) from a URL or local HTML file.
#
# Usage: lnkxtor [-i] URL-or-file [regex]
#   -i     extract <img src> attributes instead of <a href>
#   regex  only print targets whose attribute value matches (default: all)
#
# Prints one absolute URL per line, resolved against the given base URL,
# e.g.  wget $(lnkxtor URL '\.tar\.gz$')

use strict;
use warnings;

use HTML::TreeBuilder;
use LWP::Simple;
use URI;
use Getopt::Std;

my %opts;
getopts('i', \%opts) or die "Usage: $0 [-i] URL [regex]\n";

# -i switches from anchor hrefs to image sources.
my ($tag, $attr) = exists $opts{i} ? ('img', 'src') : ('a', 'href');

die "Usage: $0 [-i] URL [regex]\n" if @ARGV < 1 or @ARGV > 2;

my ($url, $regex) = @ARGV;

# Defined-or, not ||=: a regex of '0' is valid and must not be replaced.
$regex //= '.';

my $uri = URI->new($url);

# Accept either a local HTML file or a remote URL.
my $tree;
if (-f $url) {
    $tree = HTML::TreeBuilder->new_from_file($url);
}
else {
    my $content = get($uri);
    die "Failed to fetch $url\n" unless defined $content;
    $tree = HTML::TreeBuilder->new_from_content($content);
}
die "Failed to parse $url\n" unless defined $tree;

# Find every matching element and print its target as an absolute URL,
# resolved relative to the page's own URI (handles relative links).
foreach my $link ($tree->look_down(_tag => $tag, $attr => qr{$regex})) {
    my $link_url = URI->new_abs($link->attr($attr), $uri);
    print $link_url->as_string, "\n";
}

# HTML::TreeBuilder trees are self-referential; delete() is required to
# free them (per the module's documentation).
$tree->delete;
Back to
Code Catacombs