Use unshare() instead of clone()
It turns out that using clone() to start a child process is unsafe in a multithreaded program. It can cause the initialisation of a build child process to hang in setgroups(), as seen several times in the build farm: The reason is that Glibc thinks that the other threads of the parent exist in the child, so in setxid_mark_thread() it tries to get a futex that has been acquired by another thread just before the clone(). With fork(), Glibc runs pthread_atfork() handlers that take care of this (in particular, __reclaim_stacks()). But clone() doesn't do that. Fortunately, we can use fork()+unshare() instead of clone() to set up private namespaces. See also https://www.mail-archive.com/lxc-devel@lists.linuxcontainers.org/msg03434.html.
This commit is contained in:
		
							parent
							
								
									fefd3650d4
								
							
						
					
					
						commit
						524f89f139
					
				
					 1 changed files with 35 additions and 55 deletions
				
			
		| 
						 | 
				
			
			@ -1575,13 +1575,6 @@ void chmod_(const Path & path, mode_t mode)
 | 
			
		|||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int childEntry(void * arg)
 | 
			
		||||
{
 | 
			
		||||
    ((DerivationGoal *) arg)->initChild();
 | 
			
		||||
    return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void DerivationGoal::startBuilder()
 | 
			
		||||
{
 | 
			
		||||
    startNest(nest, lvlInfo, format(
 | 
			
		||||
| 
						 | 
				
			
			@ -1897,48 +1890,10 @@ void DerivationGoal::startBuilder()
 | 
			
		|||
    /* Create a pipe to get the output of the builder. */
 | 
			
		||||
    builderOut.create();
 | 
			
		||||
 | 
			
		||||
    /* Fork a child to build the package.  Note that while we
 | 
			
		||||
       currently use forks to run and wait for the children, it
 | 
			
		||||
       shouldn't be hard to use threads for this on systems where
 | 
			
		||||
       fork() is unavailable or inefficient.
 | 
			
		||||
 | 
			
		||||
       If we're building in a chroot, then also set up private
 | 
			
		||||
       namespaces for the build:
 | 
			
		||||
 | 
			
		||||
       - The PID namespace causes the build to start as PID 1.
 | 
			
		||||
         Processes outside of the chroot are not visible to those on
 | 
			
		||||
         the inside, but processes inside the chroot are visible from
 | 
			
		||||
         the outside (though with different PIDs).
 | 
			
		||||
 | 
			
		||||
       - The private mount namespace ensures that all the bind mounts
 | 
			
		||||
         we do will only show up in this process and its children, and
 | 
			
		||||
         will disappear automatically when we're done.
 | 
			
		||||
 | 
			
		||||
       - The private network namespace ensures that the builder cannot
 | 
			
		||||
         talk to the outside world (or vice versa).  It only has a
 | 
			
		||||
         private loopback interface.
 | 
			
		||||
 | 
			
		||||
       - The IPC namespace prevents the builder from communicating
 | 
			
		||||
         with outside processes using SysV IPC mechanisms (shared
 | 
			
		||||
         memory, message queues, semaphores).  It also ensures that
 | 
			
		||||
         all IPC objects are destroyed when the builder exits.
 | 
			
		||||
 | 
			
		||||
       - The UTS namespace ensures that builders see a hostname of
 | 
			
		||||
         localhost rather than the actual hostname.
 | 
			
		||||
    */
 | 
			
		||||
#if CHROOT_ENABLED
 | 
			
		||||
    if (useChroot) {
 | 
			
		||||
        char stack[32 * 1024];
 | 
			
		||||
        pid = clone(childEntry, stack + sizeof(stack) - 8,
 | 
			
		||||
            CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | SIGCHLD, this);
 | 
			
		||||
    } else
 | 
			
		||||
#endif
 | 
			
		||||
    {
 | 
			
		||||
        pid = fork();
 | 
			
		||||
        if (pid == 0) initChild();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (pid == -1) throw SysError("unable to fork");
 | 
			
		||||
    /* Fork a child to build the package. */
 | 
			
		||||
    pid = startProcess([&]() {
 | 
			
		||||
        initChild();
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    /* parent */
 | 
			
		||||
    pid.setSeparatePG(true);
 | 
			
		||||
| 
						 | 
				
			
			@ -1965,14 +1920,41 @@ void DerivationGoal::initChild()
 | 
			
		|||
 | 
			
		||||
    try { /* child */
 | 
			
		||||
 | 
			
		||||
        _writeToStderr = 0;
 | 
			
		||||
 | 
			
		||||
        restoreAffinity();
 | 
			
		||||
 | 
			
		||||
        commonChildInit(builderOut);
 | 
			
		||||
 | 
			
		||||
#if CHROOT_ENABLED
 | 
			
		||||
        if (useChroot) {
 | 
			
		||||
 | 
			
		||||
            /* Set up private namespaces for the build:
 | 
			
		||||
 | 
			
		||||
               - The PID namespace causes the build to start as PID 1.
 | 
			
		||||
                 Processes outside of the chroot are not visible to
 | 
			
		||||
                 those on the inside, but processes inside the chroot
 | 
			
		||||
                 are visible from the outside (though with different
 | 
			
		||||
                 PIDs).
 | 
			
		||||
 | 
			
		||||
               - The private mount namespace ensures that all the bind
 | 
			
		||||
                 mounts we do will only show up in this process and
 | 
			
		||||
                 its children, and will disappear automatically when
 | 
			
		||||
                 we're done.
 | 
			
		||||
 | 
			
		||||
               - The private network namespace ensures that the
 | 
			
		||||
                 builder cannot talk to the outside world (or vice
 | 
			
		||||
                 versa).  It only has a private loopback interface.
 | 
			
		||||
 | 
			
		||||
               - The IPC namespace prevents the builder from
 | 
			
		||||
                 communicating with outside processes using SysV IPC
 | 
			
		||||
                 mechanisms (shared memory, message queues,
 | 
			
		||||
                 semaphores).  It also ensures that all IPC objects
 | 
			
		||||
                 are destroyed when the builder exits.
 | 
			
		||||
 | 
			
		||||
               - The UTS namespace ensures that builders see a
 | 
			
		||||
                 hostname of localhost rather than the actual
 | 
			
		||||
                 hostname.
 | 
			
		||||
            */
 | 
			
		||||
            if (unshare(CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS) == -1)
 | 
			
		||||
                throw SysError("setting up private namespaces");
 | 
			
		||||
 | 
			
		||||
            /* Initialise the loopback interface. */
 | 
			
		||||
            AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
 | 
			
		||||
            if (fd == -1) throw SysError("cannot open IP socket");
 | 
			
		||||
| 
						 | 
				
			
			@ -2169,8 +2151,6 @@ void DerivationGoal::initChild()
 | 
			
		|||
        writeToStderr("while setting up the build environment: " + string(e.what()) + "\n");
 | 
			
		||||
        _exit(1);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    abort(); /* never reached */
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue