This perl script parses a HTML document, extracts the URL of all images and other resources in it, prints them one per line, optionally qualifying relative URLs using a base URL.
I am currently downloading websites using this script and the wgetas script I have recently posted. This works the following way: first I download all the HTML pages using wgetas, then I extract the URLs to images from them with reshtml, then download all the images with wgetas. This utility, however, is not tied to wgetas, you can use it on HTML files aquired from any source, and download the images found with any downloader, such as wget.
See the embedded POD for details on how to use this script.
#!perl
=head1 NAME
reshtml - extract image (and other resource) urls from a HTML
=head1 SYNOPSIS
B<reshtml > [B<-b> I<BASEURL>] [I<FILENAME>...]
=head1 DESCRIPTION
Parses a HTML document, extract the URL of all images and other resour
+ces in
it, and print them one per line.
Currently url for images, default style sheets, and favicons are colle
+cted.
Scripts, optional style sheets, random hyperlinks and random header li
+nks,
applets, netscape low-res image previews, refresh targets, frames and
+iframes
are ignored, though some of this could change in the future.
The HTML documents are read from the files whose names are given on co
+mmand
line, or from STDIN if no name is given.
Repeated URLs are printed only once, though no effort is done to recog
+nize
equivalent URLs.
=head1 OPTIONS
=over
=item B<-b> I<BASEURL>
Qualify relative urls using I<BASEURL> as the base.
Note that a base url given in the HTML document (with the B<base> tag)
is always used this way, no matter whether you give this switch or not
+.
If no base URL is known, but relative URLs are found, they are output
as is but with a warning. Use B<-b .> to silence this warning.
=item B<-i> I<LISTFILE>
Read I<LISTFILE> for a list of URLs and download filenames. The filen
+ames
from second column give the name of HTML files to read and parse, the
URLs in the first column are used as the base URL only. This option
excludes giving filenames or base urls from the command line.
The listfile has the same format as the listfile for L<wgetas(1)>, mak
+ing
it easier to process HTML files you have downloaded with that utility.
=item B<-P> I<PREFIXDIR>
Interpret filenames as relative to directory I<PREFIXDIR>. This is mo
+st
useful with B<-i>, but can be used otherwise too.
=item B<-v>
Print names of files as they're parsed.
=back
=cut
use warnings; use 5.012; use strict;
use IO::Handle;
use Getopt::Long;
use URI;
use File::Spec::Functions "catfile";
use XML::Twig;
our $VERSION = 0.001;
our $gbase = undef;
our $listfname = undef;
our $dirprefix = ".";
our $verbose;
our($versionopt, $helpopt);
Getopt::Long::Configure qw"bundling gnu_compat prefix_pattern=(--|-)";
GetOptions(
"b|base=s" => \$gbase,
"i|listfile=s" => \$listfname,
"P|directory-prefix|prefix=s" => \$dirprefix,
"v|verbose" => \$verbose,
"V|version" => \$versionopt,
"h|help" => \$helpopt,
);
if ($versionopt) {
die "reshtml $VERSION\n";
} elsif ($helpopt) {
my $helpstr = q(Usage: reshtml [-b BASEURL] [FILENAME...]
Parses a HTML document, extract the list of images and other resou
+rces
necessary to render it, qualifies URLs to absolute URL using BASEU
+RL as the
base, prints URLs one per line.
);
$helpstr =~ s/\n\s*/\n/g;
die $helpstr;
}
my $base;
my $relwarn;
my $ifname;
my %foundurl;
my $foundurl = sub {
my($u) = @_;
$u = URI->new($u);
if (length($base)) {
$u = $u->abs($base);
} elsif (!$u->scheme) { # relative URL
$relwarn++ or warn "warning: relative URL found and base addre
+ss unknown, in file $ifname";
}
# we could canonicalize the URL for hashing purposes, but let's ig
+nore that
if (!$foundurl{$u}++) {
print $u, "\n";
}
};
my %twhnd;
$twhnd{"base"} = sub {
my($tw, $e) = @_;
$base = $e->att("href");
1;
};
$twhnd{"link"} = $twhnd{"a"} = $twhnd{"area"} = sub {
my($tw, $e) = @_;
my $rs = $e->att("rel") // "";
my %rw;
for my $rw (split /[ \t\n\f\r]+/, $rs) { # seriously, HTML5 define
+s everything precisely
$rw =~ y/A-Z/a-z/;
$rw{$rw}++;
}
if (length(my $u = $e->att("href")) && (
$rw{"icon"} || $rw{"stylesheet"} && !$rw{"alternate"}
)) {
&$foundurl($u);
}
1;
};
$twhnd{"img"} = $twhnd{"object"} = sub {
my($tw, $e) = @_;
for my $an ("src", "data") {
if (length(my $u = $e->att($an))) {
&$foundurl($u);
}
}
1;
};
my $do_file = sub {
my($fname, $stdin, $lbase) = @_;
$ifname = $fname;
$verbose and print STDERR "#$fname\n";
$fname = catfile($dirprefix, $fname);
$relwarn = 0;
$base = $lbase // $gbase;
my $twig = XML::Twig->new(twig_handlers => \%twhnd);
if (!$stdin) {
$twig->parsefile_html($fname);
} else {
$twig->parse_html(\*STDIN);
}
STDOUT->flush;
};
if (defined($listfname)) {
@ARGV and die "error: both -i option and input filenames are given
+";
open my $LIST, "<", $listfname or
die "error opening list file: ($listfname) $!";
my $_;
while (<$LIST>) {
/\S/ or next;
my($lbase, $fname, $rest) = split " ";
length($rest) and die "error: invalid spec in listfile (too ma
+ny words): $_ -";
if (!defined($fname)) {
$lbase =~ m"/([^/]+)/*$" or
die "error: cannot find suffix or base uri: $lbase";
$fname = $1;
}
&$do_file($fname, 0, $lbase);
}
} elsif (@ARGV) {
for my $fname (@ARGV) {
&$do_file($fname);
}
} else {
&$do_file("-", 1);
}
__END__