#!/usr/bin/perl -w
use strict;

# goose.pl
################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002. This may be modified and distributed
# on the same terms as Perl.
################

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko/20020310 ';
my %proxy = (
    host => '',   # http://host.dom:port
    id   => '',   # ntdom\userid
    pass => '',   # empty quotes if no proxy auth
);

use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent($browser);
$ua->proxy(http => "$proxy{host}") if defined $proxy{host};

use Getopt::Long;
my ($images, $compact, $path, $filename, $depth, $offsite, $help)
    = (1, 1, "./");   # initialize and provide some defaults
GetOptions( 'images!'    => \$images,
            'offsite!'   => \$offsite,
            'help'       => \$help,
            'depth=i'    => \$depth,
            'path=s'     => \$path,
            'filename=s' => \$filename,
            'compact!'   => \$compact);

if ($help) {
    print <<EOH;
goose.pl -- a utility to grab a site from the web
---------------------------------
usage: goose.pl [options] location
options:
    --images: --noimages:
        whether or not to download images
        defaults to yes
    --offsite: --nooffsite:
        whether or not to follow offsite links
        defaults to no
    --help
        prints this help message
    --depth
        the link depth to follow
        defaults to 0
    --path
        where to save the files to
    --filename
        the name of the first file. future names are
        made using the increment operator (++)
    --compact --nocompact
        whether or not to remove certain html tags
        defaults to yes
EOH
    exit;
}

my $href = shift
    or die "Must provide a location to be goosed. Use option --help for usage information.";
$path .= "/" if $path !~ /[\/\\]$/;
my %links;   # holds location and filename
$|++;        # unbuffer output; allows progress to be shown
mkdir $path if not -e $path;

#------------ setup ends here

{
    my $count = 0;
    sub update_progress { print "\rGoosing $href... got $count files"; $count++ }
}

#feedit:  html (ref), hash with tags to cut and options (hash ref)
#receive: compacted version
#effect:  compacts ref
sub compact {
    my ($html, %tags) = @_;
    for my $tag (keys %tags) {
        # if it's not closed
        if (not $tags{$tag}->{closed}) {
            $$html =~ s! < \Q$tag\E .*? > !!xgis;
        }
        # if it's closed and we should remove content
        elsif (not $tags{$tag}->{content}) {
            1 while remove_tag($tag, $html);
        }
        # if it's closed and we should leave content
        elsif ($tags{$tag}->{content}) {
            $$html =~ s! < /? \Q$tag\E .*? > !!xgis;
        }
    }
    return $$html;
}

#feedit:  tag name, html (ref)
#receive: success indicator
#effect:  removes tag from html
sub remove_tag {
    my ($tag, $html) = @_;
    my $pre = "";
    my ($open, $content, $close);
    while (1) {
        $$html =~ m! (?<= \Q$pre\E ) (< \Q$tag\E [^>]* >) (.*?) (</ \Q$tag\E \s*? >) !xi
            || return 0;
        ($open, $content, $close) = ($1, $2, $3);
        # if it has a nested tag of the same kind
        last if $content !~ /<\Q$tag\E/;
        $pre .= $open;
    }
    $$html =~ s/\Q$open$content$close\E//;
    return 1;
}

#feedit:  location, depth, images?, offsite?, path
#receive: number of files
#effect:  save files
sub goose {
    my %options = @_;
    $options{location} || die "must give a location to goose";
    $options{master} ||= ($options{location} =~ m!(http://[^/]+)! && $1);
    $links{$options{location}} ||= $filename++ . ".html";

    my $count = 1;   # number of files saved
    my $page = get_page( $options{location} );
    return 0 if not $page;   # if get_page() fails

    if ($options{compact}) {
        compact( \$page, style => { closed => 1 },
                         meta  => { closed => 0 },
                         link  => { closed => 0 } )
    }
    update_progress();

    # receive just the files that still need to be goosed
    my @files = extract_links( \$page, $options{images}, $options{offsite},
                               $options{depth}, $options{master}, $options{location} );

    # get images if specified
    if ($options{images}) {
        my @images = extract_images( \$page, $options{master}, $options{location} );
        for my $image (@images) {
            $count += get_image($image);
        }
    }

    open FILE, ">$options{path}$links{$options{location}}"
        or die "couldn't open $links{$options{location}}";
    print FILE $page;
    close FILE;

    for my $file (@files) {
        $count += goose( location => $file,
                         depth    => $options{depth} - 1,
                         images   => $options{images},
                         offsite  => $options{offsite},
                         master   => $options{master},
                         path     => $options{path},
                         compact  => $options{compact});
    }
    return $count;
}

#feedit:  the name of the page to be retrieved
#receive: the html
#effect:  none
sub get_page {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);

    return $result->content          # if everything went right
        if $result->is_success;
    return get_page($location, 0)    # try once more (default once)
        if $tryagain;
    return "";                       # cop out
}

#feedit:  the name of the image to be retrieved
#receive: the number of files saved
#effect:  saves the files
sub get_image {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);

    # if everything went right
    if ($result->is_success) {
        open FILE, ">" . $path . $links{$location}
            or die "couldn't open $links{$location}";
        binmode(FILE);
        print FILE $result->content;
        close FILE;
        update_progress();
        return 1;
    }
    return get_image($location, 0)   # try once more (default once)
        if $tryagain;
    return 0;                        # cop out
}

#feedit:  the html (ref), images?, offsite?, depth
#receive: the names of pages yet to be goosed
#effect:  change links of html
sub extract_links {
    my ($html, $images, $offsite, $depth, $master, $parent_location) = @_;
    my @pages;   # ones that still need to be goosed
    $$html =~ s{ ( <a \s+ [^>]+ > ) }
               { new_link($1, "href", $depth, $offsite, $master,
                          $parent_location, \@pages, "html") }xgei;
    return @pages;
}

#feedit:  html with img tags, master, parent location
#receive: array of images to download
#effect:  none
sub extract_images {
    my ($html, $master, $parent) = @_;
    my @images;
    $$html =~ s{ ( <img \s+ [^>]+ > ) }
               { new_link($1, "src", 1, 1, $master, $parent, \@images) }xgei;
    return @images
}

#feedit:  link, type, depth, offsite, master, parent location, pages (ref), $ext (opt)
#receive: a new link (possibly)
#effect:  adds an entry to @pages if necessary
sub new_link {
    my ($tag, $type, $depth, $offsite, $master, $parent_location, $pages, $ext) = @_;
    my $link = get_attribute($tag, $type) || return $tag;

    # if ext isn't provided, get the ext of the file being saved
    $ext ||= ($link =~ m!/[^/]+ \. (\w+) (?: \? [\w%&;\s#=]+ )?$!x && $1);

    # if it's a mailto: link
    return $tag if $link =~ /^mailto:/;
    # if it's javascript
    return $tag if $link =~ /^javascript:/;

    my $fullpath = fullpath( $link, $parent_location );
    $fullpath =~ m!^(https?://[^/]+)!;
    my $root = $1;

    # if it's already been goosed or is queued up
    return set_attribute($tag, $type, $links{$fullpath})
        if defined $links{$fullpath};
    # if we're done goosing
    return set_attribute($tag, $type, $fullpath)
        if not $depth;
    # if it's an offsite link and we don't want it
    return $tag
        if lc($root) ne lc($master) && not $offsite;

    # else queue it up
    push @$pages, $fullpath;
    $links{$fullpath} = $filename++ . "." . $ext;
    return set_attribute($tag, $type, $links{$fullpath})
}

#feedit:  location, location where found
#receive: full location
#effect:  none
sub fullpath {
    my ($relative, $found) = @_;

    # if it's not a filename and doesn't have a /
    $found .= "/" if $found !~ m!https?://.*/.*!i;

    return $relative if $relative =~ m!^https?://!i;
    $found =~ s![^/]+$!!;            # remove filename at end
    return $1 . $relative            # /foo/bar
        if $relative =~ s/^\/// && $found =~ m!^(https?://.+?/)!i;
    $found =~ s![^/]+/$!!            # for relative urls (../whatever)
        while $relative =~ s!^\.\./!!;
    1 while $relative =~ s!^\./!!;   # for urls relative to the current directory ( ./whatever)
    return $found . $relative;
}

#feedit:  a tag, attribute
#receive: value of attribute
#effect:  none
sub get_attribute {
    my ($tag, $attribute) = @_;
    # normal tag
    return $2 if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
    # tag with no quotes
    return $1 if $tag =~ /\Q$attribute\E=([^\s>]+)/;
    # else
    return 0;
}

#feedit:  tag, attribute, value
#receive: tag
#effect:  none
sub set_attribute {
    my ($tag, $attribute, $value) = @_;
    # normal tag
    return $tag
        if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$value$2/xi;
    # tag with no quotes
    return $tag
        if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i;
    # else
    die "\nERROR: couldn't set attribute '$attribute' to '$value' for '$tag'\n";
}

update_progress();
my $count = goose( location => $href,
                   depth    => $depth,
                   offsite  => $offsite,
                   images   => $images,
                   path     => $path,
                   compact  => $compact)
    || die " it didn't work!";
print "\rGoosing $href... Done", " " x (length($count) + 6),
      "\n$count files received\n";
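For reference, a hypothetical invocation might look like the line below; the URL, directory, and starting file name are made-up examples for illustration, not anything taken from the post:

    perl goose.pl --depth=1 --noimages --path=./mirror --filename=index http://www.example.com/

This would save http://www.example.com/ as index.html under ./mirror, skip images, and follow the page's links one level deep; later files are named with Perl's string auto-increment (indey.html, indez.html, and so on), as described in the script's --help text.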

In reply to goose.pl by elusion
