<?xml version="1.0" encoding="windows-1252"?>
<node id="177026" title="goose.pl" created="2002-06-25 00:11:37" updated="2005-08-11 08:34:26">
<type id="1748">
sourcecode</type>
<author id="21569">
elusion</author>
<data>
<field name="doctext">
&lt;code&gt;#!/usr/bin/perl -w
use strict;

# goose.pl

################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002.  This may be modified and distributed
# on the same terms as Perl.
################

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko/20020310 ';
my %proxy = (
  host =&gt; '',    # http://host.dom:port
  id   =&gt; '',    # ntdom\userid
  pass =&gt; '',    # empty quotes if no proxy auth
);

use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua-&gt;agent($browser);
$ua-&gt;proxy(http =&gt; "$proxy{host}")
	if defined $proxy{host}; 

use Getopt::Long;
my ($images, $compact, $path, $filename, $depth, $offsite, $help) = (1, 1, "./"); # initialize and provide some defaults
GetOptions(  'images!'	=&gt; \$images,
		    'offsite!'	=&gt; \$offsite,
		    'help'		=&gt; \$help,
		    'depth=i'	=&gt; \$depth,
		    'path=s'	=&gt; \$path,
		    'filename=s' =&gt; \$filename,
		    'compact!'	=&gt; \$compact);

if ($help) {
	print &lt;&lt;EOH;
goose.pl -- a utility to grab site from the web
---------------------------------
usage: goose.pl [options] location
options:
	--images:
	--noimages:
		whether or not to download images
		defaults to yes
	--offsite:
	--nooffsite:
		whether or not to follow offsite links
		defaults to no
	--help
		prints this help message
	--depth
		the link depth to follow
		defaults to 0
	--path
		where to save the files to
	--filename
		the name of the first file. future names are
		made using the increment operator (++)
	--compact
	--nocompact
		whether or not to remove certain html tags
		defaults to use
EOH
	exit;	
}

my $href = shift or die "Must provide a location to be goosed. Use option --help for usage information.";

$path .= "/"
	if $path !~ /[\/\\]$/;

my %links;	# holds location and filename
$|++; 		# unbuffer output; allows progress to be shown
mkdir $path
	if not -e $path;

#------------ setup ends here

{
	my $count = 0;
	sub update_progress {
		print "\rGoosing $href... got $count files";
		$count++
	}
}

#feedit: html (ref), hash with tags to cut and options (hash ref)
#receive: compacted version
#effect: compacts ref
sub compact {
	my ($html, %tags) = @_;
	
	for my $tag (keys %tags) {
		# if it's not closed
		if (not $tags{$tag}-&gt;{closed})
		{
			$$html =~ s! &lt; \Q$tag\E .*? &gt; !!xgis;
		}
		
		# if it's closed and we should remove content
		elsif (not $tags{$tag}-&gt;{content})
		{
			1 while remove_tag($tag, $html);
		}
		
		# if it's not closed and we should leave content
		elsif ($tags{$tag}-&gt;{content})
		{
			$$html =~ s! &lt; /? \Q$tag\E .*? &gt; !!xgis;
		}
	}
	return $$html;
}

#feedit: tag name, html (ref)
#receive: success indicator
#effect: remove's tag from html
sub remove_tag {
	my ($tag, $html) = @_;
	my $pre = "";
	my ($open, $content, $close);
	
	while (1) {
		$$html =~ m! (?&lt;= \Q$pre\E )  (&lt; \Q$tag\E [^&gt;]* &gt;) (.*?) (&lt;/ \Q$tag\E \s*? &gt;) !xi
			|| return 0;
	
		($open, $content, $close) = ($1, $2, $3);
		
		# if it has a nested tag of the same kind
		last if $content !~ /&lt;\Q$tag\E/;
		
		$pre .= $open;
	}
	
	$$html =~ s/\Q$open$content$close\E//;
	
	return 1;
}


#feedit: location, depth, images?, offsite?, path
#receive: number of files
#effect: save files
sub goose {
	my %options = @_;
	$options{location} || die "must give a location to goose";
	$options{master} ||= ($options{location} =~ m!(http://[^/]+)! &amp;&amp; $1);

	$links{$options{location}} ||= $filename++ . ".html";

	my $count = 1; # number of files saved
	my $page = get_page( $options{location} );
	return 0 if not $page; # if get_page() fails
	
	if ($options{compact}) {
		compact( \$page, style =&gt; { closed =&gt; 1 },
					    meta =&gt; { closed =&gt; 0 },
					    link   =&gt; { closed =&gt; 0} )
	}

	update_progress();
	
	# receive just the files that still need to be goosed
	my @files = extract_links( \$page,
						$options{images},
						$options{offsite},
						$options{depth},
						$options{master},
						$options{location} );
	# get images if specified
	if ($options{images}) {
		my @images = extract_images( \$page, $options{master}, $options{location} );

		for my $image (@images) {
			$count += get_image($image);
		}
	}
		
	open FILE, "&gt;$options{path}$links{$options{location}}" || die "couldn't open $links{$options{location}}";
	print FILE $page;
	close FILE;

	for my $file (@files) {
		$count +=
		     goose( location =&gt; $file,
				depth	=&gt; $options{depth} - 1,
				images	=&gt; $options{images},
				offsite	=&gt; $options{offsite},
				master	=&gt; $options{master},
				path		=&gt; $options{path},
				compact	=&gt; $options{compact});
	}
	
	return $count;
}

#feedit: the name of the page to be retrieved
#receive: the html
#effect: none
sub get_page {
	my ($location, $tryagain) = @_;
	$tryagain = 1 if not defined $tryagain;
	
	my $request = HTTP::Request-&gt;new(GET =&gt; $location);
	$request-&gt;proxy_authorization_basic( $proxy{id}, $proxy{pass} )
		if defined $proxy{id};
	my $result = $ua-&gt;request($request);
	
	return $result-&gt;content # if everything went right
		if $result-&gt;is_success;
	
	return get_page($location, 0) #try once more (default once)
		if $tryagain;
	return ""; # cop out
}

#feedit: the name of the image to be retrieved
#receive: the number of files saved
#effect: saves the files
sub get_image {
	my ($location, $tryagain) = @_;
	$tryagain = 1 if not defined $tryagain;

	my $request = HTTP::Request-&gt;new(GET =&gt; $location);
	$request-&gt;proxy_authorization_basic( $proxy{id}, $proxy{pass} )
		if defined $proxy{id};
	my $result = $ua-&gt;request($request);

	# if everything went right
	if ($result-&gt;is_success) {
		open FILE, "&gt;" . $path . $links{$location} || die "couldn't open $links{$location}";
		binmode(FILE);
		print FILE $result-&gt;content;
		close FILE;
		update_progress();
		return 1;
	}
	
	return get_image($location, 0) #try once more (default once)
		if $tryagain;
	return 0; # cop out
}

#feedit: the html (ref), images?, offsite?, depth
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
	my ($html, $images, $offsite, $depth, $master, $parent_location) = @_;
	my @pages; # ones that still need to be goosed
	
	$$html =~ s{ ( &lt;a \s+ [^&gt;]+ &gt; ) }
			 { new_link($1, "href", $depth, $offsite, $master, $parent_location, \@pages, "html") }xgei;
	
	return @pages;
}

#feedit: html with img tags, master, parent location
#receive: array of images do download
#effect: none
sub extract_images {
	my ($html, $master, $parent) = @_;
	my @images;

	$$html =~ s{ ( &lt;img \s+ [^&gt;]+ &gt; ) }
				 {new_link($1, "src", 1, 1, $master, $parent, \@images)}xgei;
	
	return @images
}

#feedit: link, type, depth, offsite, master, parent location, pages (ref), $ext (opt)
#receive: a new link (possibly)
#effect: adds an entry to @pages if necessary
sub new_link {
	my ($tag, $type, $depth, $offsite, $master, $parent_location, $pages, $ext) = @_;
	  
	my $link = get_attribute($tag, $type) || return $tag;
	# if ext is provided, get the ext of the file being saved
	$ext ||= ($link =~ m!/[^/]+ \. (\w+)  (?: \? [\w%&amp;;\s#=]+ )?$!x &amp;&amp; $1);

	# if it's a mailto: link
	return $tag
		if $link =~ /^mailto:/;
	# if it's javascript
	return $tag
		if $link =~ /^javascript:/;
	
	my $fullpath = fullpath( $link, $parent_location );
	$fullpath =~ m!^(https?://[^/]+)!;
	my $root = $1;

	# if it's already been goosed or is queued up
	return set_attribute($tag, $type, $links{$fullpath})
		if defined $links{$fullpath};
	# if we're done goosing
	return set_attribute($tag, $type, $fullpath)
		if not $depth;
	# if it's an offsite link and we don't want it
	return $tag
		if lc($root) ne lc($master)
		  &amp;&amp; not $offsite;
	
	# else queue it up
	push @$pages, $fullpath;
	$links{$fullpath} = $filename++ . "." . $ext;
	
	return set_attribute($tag, $type, $links{$fullpath})
}

#feedit: location, location where found
#receive: full location
#effect: none
sub fullpath {
	my ($relative, $found) = @_;
	
	#if it's not a filename and doesn't have a /
	$found .= "/"
		if $found !~ m!https?://.*/.*!i;
	
	return $relative
		if $relative =~ m!^https?://!i;
	$found =~ s![^/]+$!!; # remove filename at end
	return $1 . $relative    # /foo/bar
		if $relative =~ s/^\///
		   &amp;&amp; $found =~ m!^(https?://.+?/)!i;
	$found =~ s![^/]+/$!! # for relative urls (../whatever)
		while $relative =~ s!^\.\./!!;
	1 while $relative =~ s!^\./!!; # for urls relative to the current directory ( ./whatever)
	return $found . $relative;
}

#feedit: a tag, attribute
#receive: value of attribute
#effect: none
sub get_attribute {
	my ($tag, $attribute) = @_;
	
	# normal tag
	return $2
		if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
	
	# tag with no quotes
	return $1
		if $tag =~ /\Q$attribute\E=([^\s&gt;]+)/;
	
	# else
	return 0;
}

#feedit: tag, attribute, value
#receive: tag
#effect: none
sub set_attribute {
	my ($tag, $attribute, $value) = @_;

		# normal tag
	return $tag
		if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$value$2/xi;
	
	# tag with no quotes
	return $tag
		if $tag =~ s/(\Q$attribute\E=)[^\s&gt;]+/$1$value/i;
	
	# else
	die "\nERROR: couldn't set attribute '$attribute' to '$value' for '$tag'\n";
}

update_progress();
my $count = goose( location =&gt; $href,
				depth    =&gt; $depth,
				offsite   =&gt; $offsite,
				images  =&gt; $images,
				path	     =&gt; $path,
				compact =&gt; $compact) || die " it didn't work!";
print "\rGoosing $href... Done",
	" " x (length($count) + 6),
	"\n$count files received\n";&lt;/code&gt;</field>
<field name="codedescription">
This is a script I wrote to grab sites from the web to stick on my hard drive and/or handheld device. It follows links to a certain depth, can download images, follow offsite links, and remove some unwanted html.&lt;p&gt;

Code reviews/critiques are welcomed and requested.&lt;p&gt;

&lt;b&gt;Updated: Jun. 28, 2002&lt;/b&gt;
&lt;ul&gt;
&lt;li&gt;Squashed 2 bugs
&lt;li&gt;UserAgent support
&lt;li&gt;Proxy support
&lt;li&gt;Progress report
&lt;/ul&gt;</field>
<field name="codecategory">
Web Stuff</field>
<field name="codeauthor">
Matthew Diephouse [elusion]&lt;br&gt;
Contact: matt --at-- diephouse --dot-- com&lt;br&gt;
where "--at--" is @ and "--dot--" is .</field>
</data>
</node>
