in reply to Re: No child processes - system limit? in thread No child processes - system limit?
At the suggestion of moritz, I ran the script with strace, the relevant bits of which are as follows:
select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0}
+)
read(3, "e_type\":\"all_birthday\",\"ancestor"..., 4096) = 4096
mremap(0xb7a16000, 3756032, 3760128, MREMAP_MAYMOVE) = 0xb7a16000
time(NULL) = 1270142654
select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0}
+)
read(3, "us\":\"active\",\"last_modified\":\"20"..., 4096) = 4096
mremap(0xb7a16000, 3760128, 3764224, MREMAP_MAYMOVE) = 0xb7a16000
time(NULL) = 1270142654
select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0}
+)
read(3, ",\"text\":\"RUSSELL Nee Viney Nigel"..., 4096) = 4096
mremap(0xb7a16000, 3764224, 3768320, MREMAP_MAYMOVE) = 0xb7a16000
time(NULL) = 1270142654
select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0}
+)
read(3, "8\",\"featured\":0,\"sub_type\":\"memo"..., 836) = 836
time(NULL) = 1270142654
time(NULL) = 1270142654
mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO
+US, -1, 0) = 0xb767e000
munmap(0xb6a62000, 4059136) = 0
mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO
+US, -1, 0) = 0xb6e88000
munmap(0xb5d2a000, 4059136) = 0
mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO
+US, -1, 0) = 0xb6af0000
munmap(0xb767e000, 3768320) = 0
munmap(0xb7a16000, 3768320) = 0
write(1, "\n", 1) = 1
write(1, " - objects 225001 .. 230000", 27) = 27
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID
+|SIGCHLD, child_tidptr=0xb7f7db78) = 14233
time(NULL) = 1270142654
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {0xe5c500, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({1, 0}, 0xbfb45ea4) = ? ERESTART_RESTARTBLOCK
+(To be restarted)
--- SIGCHLD (Child exited) @ 0 (0) ---
sigreturn() = ? (mask now [])
time(NULL) = 1270142654
rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0
waitpid(14232, 0xbfb45be8, WNOHANG) = 0
waitpid(14224, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) =
+ 14224
waitpid(14233, 0xbfb45be8, WNOHANG) = 0
waitpid(14225, 0xbfb45be8, WNOHANG) = 0
waitpid(14228, 0xbfb45be8, WNOHANG) = 0
waitpid(14229, 0xbfb45be8, WNOHANG) = 0
waitpid(14226, 0xbfb45be8, WNOHANG) = 0
waitpid(14230, 0xbfb45be8, WNOHANG) = 0
waitpid(14231, 0xbfb45be8, WNOHANG) = 0
waitpid(14227, 0xbfb45be8, WNOHANG) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0
rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0
write(1, "\n", 1) = 1
Here is where the parent process makes the request:
select(8, [3], NULL, NULL, {0, 0}) = 0 (Timeout)
time(NULL) = 1270142654
select(8, [3], [3], NULL, {180, 0}) = 1 (out [3], left {180, 0
+})
write(3, "GET /ia_object/_search?searchTyp"..., 246) = 246
time(NULL) = 1270142654
select(8, [3], NULL, NULL, {180, 0}) = ? ERESTARTNOHAND (To be
+restarted)
--- SIGCHLD (Child exited) @ 0 (0) ---
sigreturn() = ? (mask now [])
rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0
waitpid(14232, 0xbfb45be8, WNOHANG) = 0
waitpid(14233, 0xbfb45be8, WNOHANG) = 0
waitpid(14225, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) =
+ 14225
waitpid(14228, 0xbfb45be8, WNOHANG) = 0
waitpid(14229, 0xbfb45be8, WNOHANG) = 0
waitpid(14226, 0xbfb45be8, WNOHANG) = 0
waitpid(14230, 0xbfb45be8, WNOHANG) = 0
waitpid(14231, 0xbfb45be8, WNOHANG) = 0
waitpid(14227, 0xbfb45be8, WNOHANG) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0
rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0
time(NULL) = 1270142662
time(NULL) = 1270142662
select(8, [3], NULL, NULL, {172, 0}) = ? ERESTARTNOHAND (To be
+restarted)
--- SIGCHLD (Child exited) @ 0 (0) ---
sigreturn() = ? (mask now [])
rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0
waitpid(14232, 0xbfb45be8, WNOHANG) = 0
waitpid(14233, 0xbfb45be8, WNOHANG) = 0
waitpid(14225, 0xbfb45be8, WNOHANG) = -1 ECHILD (No child proc
+esses)
waitpid(14228, 0xbfb45be8, WNOHANG) = 0
waitpid(14229, 0xbfb45be8, WNOHANG) = 0
waitpid(14226, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) =
+ 14226
waitpid(14230, 0xbfb45be8, WNOHANG) = 0
waitpid(14231, 0xbfb45be8, WNOHANG) = 0
waitpid(14227, 0xbfb45be8, WNOHANG) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0
rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0
close(3) = 0
time(NULL) = 1270142665
time(NULL) = 1270142665
stat64("/opt/apache/sites/IA/perl/HTTP/Headers/Util.pmc", 0xbfb45d
+9c) = -1 ENOENT (No such file or directory)
stat64("/opt/apache/sites/IA/perl/HTTP/Headers/Util.pm", 0xbfb45cb
+0) = -1 ENOENT (No such file or directory)
stat64("/opt/apache/sites/Burro/HTTP/Headers/Util.pmc", 0xbfb45d9c
+) = -1 ENOENT (No such file or directory)
stat64("/opt/apache/sites/Burro/HTTP/Headers/Util.pm", 0xbfb45cb0)
+ = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/5.8.9/i686-linux/HTTP/Headers/Util.pmc
+", 0xbfb45d9c) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/5.8.9/i686-linux/HTTP/Headers/Util.pm"
+, 0xbfb45cb0) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/5.8.9/HTTP/Headers/Util.pmc", 0xbfb45d
+9c) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/5.8.9/HTTP/Headers/Util.pm", 0xbfb45cb
+0) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/i686-linux/HTTP/Header
+s/Util.pmc", 0xbfb45d9c) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/i686-linux/HTTP/Header
+s/Util.pm", 0xbfb45cb0) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pmc"
+, 0xbfb45d9c) = -1 ENOENT (No such file or directory)
stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pm",
+ {st_mode=S_IFREG|0444, st_size=4887, ...}) = 0
open("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pm", O
+_RDONLY|O_LARGEFILE) = 3
ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbfb45ad8) = -1 ENOTTY (I
+nappropriate ioctl for device)
_llseek(3, 0, [0], SEEK_CUR) = 0
read(3, "package HTTP::Headers::Util;\n\nus"..., 4096) = 4096
_llseek(3, 1712, [1712], SEEK_SET) = 0
_llseek(3, 0, [1712], SEEK_CUR) = 0
close(3) = 0
munmap(0xb6af0000, 3768320) = 0
munmap(0xb6e88000, 3768320) = 0
At this stage, my code catches the "select failed: no child processes" error in an eval, issues a warning, then sleeps before retrying:
write(2, "\nSystem is busy - trying again\n", 31) = 31
time(NULL) = 1270142665
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {0xe5c500, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({3, 0}, 0xbfb45ea4) = ? ERESTART_RESTARTBLOCK
+(To be restarted)
I'm not sure what most of this means, but is the value of $! being set to "no child processes" by one of my waitpid calls, which is interfering with the code in LWP::Protocol::http? Would it help if I localised $! in my reaper sub?
Re^3: No child processes - system limit?
by ikegami (Pope) on Apr 01, 2010 at 18:09 UTC
|
Would it help if I localised $! in my reaper sub?
I believe so. That's exactly where I was going with my question.
| [reply] |
Re^3: No child processes - system limit?
by almut (Canon) on Apr 01, 2010 at 19:15 UTC
|
select(8, [3], NULL, NULL, {172, 0}) = ? ERESTARTNOHAND (To be rest
+arted)
--- SIGCHLD (Child exited) @ 0 (0) ---
sigreturn() = ? (mask now [])
rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0
waitpid(14232, 0xbfb45be8, WNOHANG) = 0
waitpid(14233, 0xbfb45be8, WNOHANG) = 0
waitpid(14225, 0xbfb45be8, WNOHANG) = -1 ECHILD (No child processe
+s)
...
My interpretation of this would be (as you already figured) that $!
is being modified in the signal handler before the interrupted select call
gets a chance to be restarted, i.e. the redo SELECT doesn't execute
because of that very modification of $!.
(Note that because of Perl's deferred (aka safe) signal handling,
the sigreturn() (which is being called at the end of the "real" system/C-level
signal handler) happens immediately, before the Perl signal handler
runs all the waitpid calls. Still, they do run before the
next Perl opcode executes (which means this is presumably before the if ($!{EINTR} || $!{EAGAIN}) check).)
What I find a little surprising is that the ECHILD does occur at
all, because your $Children{$pid} should've been set to zero in
the previous call to the signal handler
waitpid(14225, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) = 142
+25
where the waitpid did return 14225 (i.e. $res > 0). In other words, you shouldn't be
calling waitpid(14225,...) again thereafter, because the 14225 is no longer supposed to be in the hash... (update: err wait, this is nonsense of course, as you're iterating over the keys, not the values. OTOH, this brings up the question of what would happen if you did set the values to the PIDs, too, and then iterated over the values instead (as you seem to be getting that panic when deleting the keys...))
Maybe you could try to figure out why this is — in addition
to trying to localize $! as a workaround, of course.
| [reply] [d/l] [select] |
|
local'ising $! seems to have sorted out that issue, revealing the real error that is happening on the remote process.
Re your other point, yes - deleting keys in the hash causes a panic, but I'll change the loop to only call waitpid on those keys that have true values, which should help.
thanks
| [reply] [d/l] [select] |
|
|