Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??

I have a script that mirrors some files recursively from a directory. This gets the list of filenames from a webserver-generated directory listing. I show the complete listing here, but the relevant part of the code that parses the listing is outside the readmore tags.

This script does not try to parse the last modified times from the directory listing. Instead I send a request to download every file, and arrange with HTTP magic that files that were not modified since the last complete download are not downloaded again.

#!perl use warnings; use strict; # the host names, directory names, usernames below are changed our $maindir = "/home/cmuna/a/hnwfb"; our $dldir = "/home/cmuna/export/hnwfb"; if ("jfhep" eq getpwuid($<)) { $maindir = "/home/jfhep/a/hnwfb"; $dldir = "dl"; } our $vardir = "var"; our $baseurl = "http://vphwec.example.com:780/hnwfb/"; our $http_netloc = "vphwec.example.com:780"; our $http_realm = "hnwfbmaster"; our $http_user = "hnwfb"; our $http_passfile = "secret/hnwfbhtpass"; our $maxcount_dir = 1024; our $maxcount_file = 16*1024; our $maxtotalsize = 1024*1024*1024; use BSD::Resource (); use 5.010; use Fcntl (); use IO::Handle (); use LWP (); use Date::Manip::Date (); use Sys::Hostname ();
use XML::Twig (); use Time::HiRes ();
chdir $maindir or die "error chdir main"; -O "." or die "error own main dir"; BSD::Resource::setrlimit(BSD::Resource::RLIMIT_AS(), 128*1024*1024, BS +D::Resource::RLIM_INFINITY()); BSD::Resource::setrlimit(BSD::Resource::RLIMIT_DATA(), 48*1024*1024, B +SD::Resource::RLIM_INFINITY()); BSD::Resource::setpriority(BSD::Resource::PRIO_PROCESS(), 0, 3) or die + "error setpriority"; open our $LOCKH, "<", $vardir . "/mirrorhnwfb.lock" or die "error open + lockfile"; flock $LOCKH, (Fcntl::LOCK_EX()|Fcntl::LOCK_NB()) or die "error lockin +g lockfile: $!"; sub time_monotonic { Time::HiRes::clock_gettime(Time::HiRes::CLOCK_MONOTONIC()); } open our $LOG, ">>", $vardir . "/mirrorhnwfb.log" or die "open logfile +"; autoflush $LOG; sub wrlog { chomp(my $t = join("", @_)); $t .= "\n"; print STDERR $t; print $LOG $t; } my $DATE = Date::Manip::Date->new; $DATE->config(setdate => "zone,UTC"); wrlog "Starting mirrorhnwfb on " . Sys::Hostname::hostname() . " at " +. $DATE->new_date("now")->printf("%O %Z") . "."; eval { sub escape { my($t) = @_; $t =~ s/([^\!\$\'-\*\,-9A-Za-z])/sprintf"%%%02X",ord($1)/ge; $t; } sub unescape { my($t) = @_; $t =~ s/%(\w\w)/chr(hex($1))/ge; $t; } our $LWP = LWP::UserAgent->new( max_redirect => 7, agent => q"mirrorhnwfb (1.0, jfhep@igluxz.example.com)", timeout => 300, ); our $http_pass; { open my $HTTP_PASS, "<", $http_passfile or die "error opening passfile +"; chomp($http_pass = <$HTTP_PASS>); } $LWP->credentials($http_netloc, $http_realm, $http_user, $http_pass); { wrlog "getting allowmirror"; my $resp = $LWP->head($baseurl . "/.allowmirror"); if (!$resp->is_success) { die "allowmirror http error: " . $resp->status_line; } wrlog "allowmirror ok."; } our @dir = ("/"); our %keepd = ("/" => 1); our %keepf; our $count_dir = 0; our @file; our @mkdir; our $count_file = 0; sub getdir { my($dir) = @_; my $edir = escape($dir); $dir =~ m"\0" and die "invalid dir pathname 0 " . $edir; $dir =~ m"\A\/" or die "invalid dir pathname 1 " . $edir; $dir =~ m"\/\z" or die "invalid dir pathname 2 " . $edir; $dir =~ m"/\." and die "invalid dir pathname 3 " . $edir; if (1024 < length($dir)) { wrlog "skipping directory with too long pathname " . $edir; return; } my $dirs = $dir; $dirs =~ s"\/+\z""; $keepd{$dirs} = 1; push @mkdir, $dir;
wrlog "getting directory " . $edir; my $resp = $LWP->get($baseurl . $edir); if ($resp->is_success) { my $twig = XML::Twig->new; if (!$twig->safe_parse($resp->content)) { wrlog "error xml parsing directory listing of " . $edir . +" as xml: " . $@; return; } my($etitle) = $twig->findnodes("//title"); if (!$etitle || $etitle->text !~ /\A\s*Index\b/i) { wrlog "direcotry listing has wrong title " . $edir; return; } for my $ea ($twig->findnodes("//a")) { my $href = $ea->att("href"); my $n = unescape($href); if (defined($n) && $n !~ m"\A[\?\/]") { #wrlog "found link from directory " . $edir . " : " . +escape($n); my $isdir = $n =~ s"\/+\z""; my $abs = $dir . $n; my $eabs = escape($abs);
if ($n =~ m"[\/\0]") { wrlog "skipping that href for illegal characters i +n name " . $eabs; } elsif (256 <= length($n)) { wrlog "skipping that href for too long name compon +ent " . $eabs; } elsif ($n =~ m"\A[\.\#\,\ ]" || $n =~ m"~\z" || $n =~ m"[^\ \!\$-\.0-\[\]-\~\x80-\xff]" ) { wrlog "skipping that href for don't like name " . +$eabs; } elsif ($isdir) { if ($maxcount_dir - 1 < $count_dir++) { wrlog "skipping subdirectory because there are + too many total dirs " . $eabs; } else { #wrlog "accepting subdirectory " . $eabs; push @dir, $abs . "/"; } } else { if ($maxcount_file < $count_file++) { wrlog "skipping file becuase there are too man +y total files " . $eabs; } else { #wrlog "accepting file " . $eabs; push @file, $abs; $keepf{$abs} = 1; } } } } #wrlog "finished parsing directory " . $edir; } else { die "error getting directory " . $edir . " : " . $resp->status +_line; } } while (@dir) { getdir(shift @dir); } wrlog "finished getting directory indexes"; sub checkname { my($name) = @_; $name =~ m"\0" and die "error: invalid name 0 " . escape($name); $name =~ m"\/\.\.(?:/|\z)" and die "error: invalid name 2 " . esca +pe($name); } our %mtime; { open my $MTIME, "<", $vardir . "/cur.mtime" or die "error opening cur. +mtime: $!"; while (<$MTIME>) { /\S/ or next; /\A#/ and next; my($ename, $emtime) = split " ", $_; my $name = unescape($ename); $name =~ m"\0" and die "error: invalid name 0 in cur.mtime " . $en +ame; $name =~ m"\A\/" or die "error: invalid name 1 in cur.mtime" . $en +ame; $name =~ m"\/\.\.(?:/|\z)" and die "error: invalid name 2 in cur.m +time " . $ename; my $mtime = unescape($emtime); $mtime{$name} = $mtime; } close $MTIME or die "error closing cur.mtime: $!"; } our $NMTIME; { open $NMTIME, ">", $vardir . "/new.mtime" or die "error opening new.mt +ime for write: $!"; for my $name (sort keys %mtime) { checkname($name); my $ename = escape($name); $name =~ s"\/+\z""; if (-e($dldir . "/" . $name)) { print $NMTIME $ename . " " . escape($mtime{$name}) . "\n"; } } print $NMTIME "\n"; flush $NMTIME or die "error flusing 0 new.mtime: $!"; rename $vardir . "/old.mtime", $vardir . "/old2.mtime" or warn "warnin +g renaming old.mtime to old2.mtime: $!"; rename $vardir . "/cur.mtime", $vardir . "/old.mtime" or die "error re +naming cur.mtime to old.mtime: $!"; rename $vardir . "/new.mtime", $vardir . "/cur.mtime" or die "error re +naming new.mtime to cur.mtime: $!"; } our $cleancnt = 0; our $totalsize = 0; sub cleandir { my($dir) = @_; $cleancnt++; checkname($dir); $dir =~ m"\A\/" or die "error: invalid 1 dirname to clear " . esca +pe($dir); $dir =~ s"/+\z""; my $adir = $dldir . $dir; my $DIR; if (!opendir $DIR, $adir) { wrlog "warning: cannot opendir directory to clean " . escape($ +adir) . " : " . $!; return; } while (my $file = readdir $DIR) { "." eq $file || ".." eq $file and next; my $afile = $adir . "/" . $file; my $pfile = $dir . "/" . $file; if ($file =~ m"^\.") { wrlog "in cleanup, skipping dotfile " . $afile; next; } my $isdir = -d $afile; if ($isdir) { cleandir($pfile); if (!$keepd{$pfile}) { wrlog "deleting old directory " . $pfile; $keepf{$pfile} and wrlog "incidentally, that is now a +non-dir file"; rmdir $afile or wrlog "warning: could not delete old directory " . + escape($afile) . " : $!"; } } else { if (!$keepf{$pfile}) { wrlog "deleting old file " . $pfile; $keepd{$pfile} and wrlog "incidentally, that is now a +directory"; unlink $afile or wrlog "warning: could not delete old file " . esca +pe($afile) . " : $!"; } else { $totalsize += abs(-s($afile)); } } } closedir $DIR or wrlog "warning: error closedir directory to clean + " . escape($adir); } wrlog "starting to clean up old files"; cleandir("/"); wrlog "finished cleanup of old files (recursed to " . $cleancnt . " di +rs)"; wrlog "total size of old files is " . $totalsize . " bytes"; { for my $dir (@mkdir) { my $adir = $dldir . "/" . $dir; if (!mkdir($adir) && !$!{EEXIST}) { wrlog "warning: could not mkdir " . escape($dir) . " : " . $!; } } } sub getfile { my($file) = @_; checkname($file); my $efile = escape($file); $file =~ m"\A\/" or die "error: invalid filename 1 to get " . $efi +le; my $afile = $dldir . "/" . $file; my $mtime = $mtime{$file}; wrlog "getting file " . $efile . " : " . ($mtime ? "mtime " . $mti +me : "new"); open my $PART, ">", $vardir . "/part" or die "error opening part f +ile: $!"; binmode $PART or die "error binmoding part"; my $ondata = sub { my($buf, $_resp, $_prot) = @_; if ($maxtotalsize < ($totalsize += length($buf))) { die "error: out of quota"; } print $PART $buf or die "error writing data to part file: $!"; }; my $resp = $LWP->get( $baseurl . $efile, ":content_cb" => $ondata, ($mtime ? ("If-Modified-Since" => $mtime) : ()), ); 0 <= (my $filesize = tell $PART) or die "error: tell part: $!"; close $PART or die "error closing part file: $!"; if (my $err = $resp->header("X-Died")) { die $err; } elsif (304 == $resp->code) { wrlog "got not modified " . $efile; } elsif (!$resp->is_success) { wrlog "error downloading " . $efile . " : " . $resp->status_li +ne; } else { my $nmtime = $resp->header("Last-Modified"); my $enmtime = escape($nmtime); rmdir $afile or 1; rename $vardir . "/part", $afile or die "error renaming part t +o " . escape($afile) . " : $!"; print $NMTIME $efile . " " . $enmtime . "\n"; flush $NMTIME or die "error flusing 0 new.mtime: $!"; wrlog "success downloading " . $efile . " : mtime " . $enmtime + . " size " . $filesize; } } for my $file (@file) { getfile($file); } close $NMTIME or die "error closing new.mtime: $!"; wrlog "finished all work. total size is " . $totalsize . " bytes"; }; if (my $e = $@) { wrlog $e; } wrlog "Exiting mirrorhnwfb at " . $DATE->new_date("now")->printf("%O %Z") . "."; __END__

In reply to Re: Remote Directory listing by ambrus
in thread Remote Directory listing by johnfl68

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others scrutinizing the Monastery: (2)
As of 2024-04-19 22:16 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found