Beefy Boxes and Bandwidth Generously Provided by pair Networks
more useful options
 
PerlMonks  

Re: Remote Directory listing

by ambrus (Abbot)
on Jul 08, 2012 at 16:47 UTC ( #980595=note: print w/ replies, xml ) Need Help??


in reply to Remote Directory listing

I have a script that mirrors some files recursively from a directory. This gets the list of filenames from a webserver-generated directory listing. I show the complete listing here, but the relevant part of the code that parses the listing is outside the readmore tags.

This script does not try to parse the last modified times from the directory listing. Instead I send a request to download every file, and arrange with HTTP magic that files that were not modified since the last complete download are not downloaded again.

#!perl use warnings; use strict; # the host names, directory names, usernames below are changed our $maindir = "/home/cmuna/a/hnwfb"; our $dldir = "/home/cmuna/export/hnwfb"; if ("jfhep" eq getpwuid($<)) { $maindir = "/home/jfhep/a/hnwfb"; $dldir = "dl"; } our $vardir = "var"; our $baseurl = "http://vphwec.example.com:780/hnwfb/"; our $http_netloc = "vphwec.example.com:780"; our $http_realm = "hnwfbmaster"; our $http_user = "hnwfb"; our $http_passfile = "secret/hnwfbhtpass"; our $maxcount_dir = 1024; our $maxcount_file = 16*1024; our $maxtotalsize = 1024*1024*1024; use BSD::Resource (); use 5.010; use Fcntl (); use IO::Handle (); use LWP (); use Date::Manip::Date (); use Sys::Hostname ();
use XML::Twig (); use Time::HiRes ();
chdir $maindir or die "error chdir main"; -O "." or die "error own main dir"; BSD::Resource::setrlimit(BSD::Resource::RLIMIT_AS(), 128*1024*1024, BS +D::Resource::RLIM_INFINITY()); BSD::Resource::setrlimit(BSD::Resource::RLIMIT_DATA(), 48*1024*1024, B +SD::Resource::RLIM_INFINITY()); BSD::Resource::setpriority(BSD::Resource::PRIO_PROCESS(), 0, 3) or die + "error setpriority"; open our $LOCKH, "<", $vardir . "/mirrorhnwfb.lock" or die "error open + lockfile"; flock $LOCKH, (Fcntl::LOCK_EX()|Fcntl::LOCK_NB()) or die "error lockin +g lockfile: $!"; sub time_monotonic { Time::HiRes::clock_gettime(Time::HiRes::CLOCK_MONOTONIC()); } open our $LOG, ">>", $vardir . "/mirrorhnwfb.log" or die "open logfile +"; autoflush $LOG; sub wrlog { chomp(my $t = join("", @_)); $t .= "\n"; print STDERR $t; print $LOG $t; } my $DATE = Date::Manip::Date->new; $DATE->config(setdate => "zone,UTC"); wrlog "Starting mirrorhnwfb on " . Sys::Hostname::hostname() . " at " +. $DATE->new_date("now")->printf("%O %Z") . "."; eval { sub escape { my($t) = @_; $t =~ s/([^\!\$\'-\*\,-9A-Za-z])/sprintf"%%%02X",ord($1)/ge; $t; } sub unescape { my($t) = @_; $t =~ s/%(\w\w)/chr(hex($1))/ge; $t; } our $LWP = LWP::UserAgent->new( max_redirect => 7, agent => q"mirrorhnwfb (1.0, jfhep@igluxz.example.com)", timeout => 300, ); our $http_pass; { open my $HTTP_PASS, "<", $http_passfile or die "error opening passfile +"; chomp($http_pass = <$HTTP_PASS>); } $LWP->credentials($http_netloc, $http_realm, $http_user, $http_pass); { wrlog "getting allowmirror"; my $resp = $LWP->head($baseurl . "/.allowmirror"); if (!$resp->is_success) { die "allowmirror http error: " . $resp->status_line; } wrlog "allowmirror ok."; } our @dir = ("/"); our %keepd = ("/" => 1); our %keepf; our $count_dir = 0; our @file; our @mkdir; our $count_file = 0; sub getdir { my($dir) = @_; my $edir = escape($dir); $dir =~ m"\0" and die "invalid dir pathname 0 " . $edir; $dir =~ m"\A\/" or die "invalid dir pathname 1 " . $edir; $dir =~ m"\/\z" or die "invalid dir pathname 2 " . $edir; $dir =~ m"/\." and die "invalid dir pathname 3 " . $edir; if (1024 < length($dir)) { wrlog "skipping directory with too long pathname " . $edir; return; } my $dirs = $dir; $dirs =~ s"\/+\z""; $keepd{$dirs} = 1; push @mkdir, $dir;
wrlog "getting directory " . $edir; my $resp = $LWP->get($baseurl . $edir); if ($resp->is_success) { my $twig = XML::Twig->new; if (!$twig->safe_parse($resp->content)) { wrlog "error xml parsing directory listing of " . $edir . +" as xml: " . $@; return; } my($etitle) = $twig->findnodes("//title"); if (!$etitle || $etitle->text !~ /\A\s*Index\b/i) { wrlog "direcotry listing has wrong title " . $edir; return; } for my $ea ($twig->findnodes("//a")) { my $href = $ea->att("href"); my $n = unescape($href); if (defined($n) && $n !~ m"\A[\?\/]") { #wrlog "found link from directory " . $edir . " : " . +escape($n); my $isdir = $n =~ s"\/+\z""; my $abs = $dir . $n; my $eabs = escape($abs);
if ($n =~ m"[\/\0]") { wrlog "skipping that href for illegal characters i +n name " . $eabs; } elsif (256 <= length($n)) { wrlog "skipping that href for too long name compon +ent " . $eabs; } elsif ($n =~ m"\A[\.\#\,\ ]" || $n =~ m"~\z" || $n =~ m"[^\ \!\$-\.0-\[\]-\~\x80-\xff]" ) { wrlog "skipping that href for don't like name " . +$eabs; } elsif ($isdir) { if ($maxcount_dir - 1 < $count_dir++) { wrlog "skipping subdirectory because there are + too many total dirs " . $eabs; } else { #wrlog "accepting subdirectory " . $eabs; push @dir, $abs . "/"; } } else { if ($maxcount_file < $count_file++) { wrlog "skipping file becuase there are too man +y total files " . $eabs; } else { #wrlog "accepting file " . $eabs; push @file, $abs; $keepf{$abs} = 1; } } } } #wrlog "finished parsing directory " . $edir; } else { die "error getting directory " . $edir . " : " . $resp->status +_line; } } while (@dir) { getdir(shift @dir); } wrlog "finished getting directory indexes"; sub checkname { my($name) = @_; $name =~ m"\0" and die "error: invalid name 0 " . escape($name); $name =~ m"\/\.\.(?:/|\z)" and die "error: invalid name 2 " . esca +pe($name); } our %mtime; { open my $MTIME, "<", $vardir . "/cur.mtime" or die "error opening cur. +mtime: $!"; while (<$MTIME>) { /\S/ or next; /\A#/ and next; my($ename, $emtime) = split " ", $_; my $name = unescape($ename); $name =~ m"\0" and die "error: invalid name 0 in cur.mtime " . $en +ame; $name =~ m"\A\/" or die "error: invalid name 1 in cur.mtime" . $en +ame; $name =~ m"\/\.\.(?:/|\z)" and die "error: invalid name 2 in cur.m +time " . $ename; my $mtime = unescape($emtime); $mtime{$name} = $mtime; } close $MTIME or die "error closing cur.mtime: $!"; } our $NMTIME; { open $NMTIME, ">", $vardir . "/new.mtime" or die "error opening new.mt +ime for write: $!"; for my $name (sort keys %mtime) { checkname($name); my $ename = escape($name); $name =~ s"\/+\z""; if (-e($dldir . "/" . $name)) { print $NMTIME $ename . " " . escape($mtime{$name}) . "\n"; } } print $NMTIME "\n"; flush $NMTIME or die "error flusing 0 new.mtime: $!"; rename $vardir . "/old.mtime", $vardir . "/old2.mtime" or warn "warnin +g renaming old.mtime to old2.mtime: $!"; rename $vardir . "/cur.mtime", $vardir . "/old.mtime" or die "error re +naming cur.mtime to old.mtime: $!"; rename $vardir . "/new.mtime", $vardir . "/cur.mtime" or die "error re +naming new.mtime to cur.mtime: $!"; } our $cleancnt = 0; our $totalsize = 0; sub cleandir { my($dir) = @_; $cleancnt++; checkname($dir); $dir =~ m"\A\/" or die "error: invalid 1 dirname to clear " . esca +pe($dir); $dir =~ s"/+\z""; my $adir = $dldir . $dir; my $DIR; if (!opendir $DIR, $adir) { wrlog "warning: cannot opendir directory to clean " . escape($ +adir) . " : " . $!; return; } while (my $file = readdir $DIR) { "." eq $file || ".." eq $file and next; my $afile = $adir . "/" . $file; my $pfile = $dir . "/" . $file; if ($file =~ m"^\.") { wrlog "in cleanup, skipping dotfile " . $afile; next; } my $isdir = -d $afile; if ($isdir) { cleandir($pfile); if (!$keepd{$pfile}) { wrlog "deleting old directory " . $pfile; $keepf{$pfile} and wrlog "incidentally, that is now a +non-dir file"; rmdir $afile or wrlog "warning: could not delete old directory " . + escape($afile) . " : $!"; } } else { if (!$keepf{$pfile}) { wrlog "deleting old file " . $pfile; $keepd{$pfile} and wrlog "incidentally, that is now a +directory"; unlink $afile or wrlog "warning: could not delete old file " . esca +pe($afile) . " : $!"; } else { $totalsize += abs(-s($afile)); } } } closedir $DIR or wrlog "warning: error closedir directory to clean + " . escape($adir); } wrlog "starting to clean up old files"; cleandir("/"); wrlog "finished cleanup of old files (recursed to " . $cleancnt . " di +rs)"; wrlog "total size of old files is " . $totalsize . " bytes"; { for my $dir (@mkdir) { my $adir = $dldir . "/" . $dir; if (!mkdir($adir) && !$!{EEXIST}) { wrlog "warning: could not mkdir " . escape($dir) . " : " . $!; } } } sub getfile { my($file) = @_; checkname($file); my $efile = escape($file); $file =~ m"\A\/" or die "error: invalid filename 1 to get " . $efi +le; my $afile = $dldir . "/" . $file; my $mtime = $mtime{$file}; wrlog "getting file " . $efile . " : " . ($mtime ? "mtime " . $mti +me : "new"); open my $PART, ">", $vardir . "/part" or die "error opening part f +ile: $!"; binmode $PART or die "error binmoding part"; my $ondata = sub { my($buf, $_resp, $_prot) = @_; if ($maxtotalsize < ($totalsize += length($buf))) { die "error: out of quota"; } print $PART $buf or die "error writing data to part file: $!"; }; my $resp = $LWP->get( $baseurl . $efile, ":content_cb" => $ondata, ($mtime ? ("If-Modified-Since" => $mtime) : ()), ); 0 <= (my $filesize = tell $PART) or die "error: tell part: $!"; close $PART or die "error closing part file: $!"; if (my $err = $resp->header("X-Died")) { die $err; } elsif (304 == $resp->code) { wrlog "got not modified " . $efile; } elsif (!$resp->is_success) { wrlog "error downloading " . $efile . " : " . $resp->status_li +ne; } else { my $nmtime = $resp->header("Last-Modified"); my $enmtime = escape($nmtime); rmdir $afile or 1; rename $vardir . "/part", $afile or die "error renaming part t +o " . escape($afile) . " : $!"; print $NMTIME $efile . " " . $enmtime . "\n"; flush $NMTIME or die "error flusing 0 new.mtime: $!"; wrlog "success downloading " . $efile . " : mtime " . $enmtime + . " size " . $filesize; } } for my $file (@file) { getfile($file); } close $NMTIME or die "error closing new.mtime: $!"; wrlog "finished all work. total size is " . $totalsize . " bytes"; }; if (my $e = $@) { wrlog $e; } wrlog "Exiting mirrorhnwfb at " . $DATE->new_date("now")->printf("%O %Z") . "."; __END__


Comment on Re: Remote Directory listing
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://980595]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others surveying the Monastery: (9)
As of 2014-08-20 07:29 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The best computer themed movie is:











    Results (107 votes), past polls