Source file src/syscall/exec_linux.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build linux
     6  
     7  package syscall
     8  
     9  import (
    10  	"internal/itoa"
    11  	"runtime"
    12  	"unsafe"
    13  )
    14  
// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
// Each entry is rendered as one "ContainerID HostID Size" line when the
// mappings are written to a uid_map or gid_map file.
// See user_namespaces(7).
type SysProcIDMap struct {
	ContainerID int // Container ID.
	HostID      int // Host ID.
	Size        int // Size.
}
    22  
// SysProcAttr holds optional, Linux-specific attributes that are applied
// to a child process between fork and exec.
type SysProcAttr struct {
	Chroot     string      // Chroot.
	Credential *Credential // Credential; if non-nil, its groups, Gid and Uid are set in the child.
	// Ptrace tells the child to call ptrace(PTRACE_TRACEME).
	// Call runtime.LockOSThread before starting a process with this set,
	// and don't call UnlockOSThread until done with PtraceSyscall calls.
	Ptrace bool
	Setsid bool // Create session.
	// Setpgid sets the process group ID of the child to Pgid,
	// or, if Pgid == 0, to the new child's process ID.
	Setpgid bool
	// Setctty sets the controlling terminal of the child to
	// file descriptor Ctty. Ctty must be a descriptor number
	// in the child process: an index into ProcAttr.Files.
	// This is only meaningful if Setsid is true.
	Setctty bool
	Noctty  bool // Detach fd 0 from controlling terminal
	Ctty    int  // Controlling TTY fd
	// Foreground places the child process group in the foreground.
	// This implies Setpgid. The Ctty field must be set to
	// the descriptor of the controlling TTY.
	// Unlike Setctty, in this case Ctty must be a descriptor
	// number in the parent process.
	Foreground bool
	Pgid       int // Child's process group ID if Setpgid.
	// Pdeathsig, if non-zero, is a signal that the kernel will send to
	// the child process when the creating thread dies. Note that the signal
	// is sent on thread termination, which may happen before process termination.
	// There are more details at https://go.dev/issue/27505.
	Pdeathsig    Signal
	Cloneflags   uintptr        // Flags for clone calls (Linux only)
	Unshareflags uintptr        // Flags for unshare calls (Linux only)
	UidMappings  []SysProcIDMap // User ID mappings for user namespaces.
	GidMappings  []SysProcIDMap // Group ID mappings for user namespaces.
	// GidMappingsEnableSetgroups enables the setgroups syscall in the child:
	// "allow" is written to the child's setgroups file when it is true and
	// "deny" when it is false, in which case the setgroups syscall is
	// disabled for the child process.
	// This parameter is a no-op if GidMappings == nil. Otherwise, for
	// unprivileged users this should be set to false for the mappings to work.
	GidMappingsEnableSetgroups bool
	AmbientCaps                []uintptr // Ambient capabilities (Linux only)
}
    64  
// NUL-terminated byte strings whose addresses are handed to the mount
// system call as source ("none") and target ("/") when the child marks
// the root mount MS_REC|MS_PRIVATE after unsharing the mount namespace.
// They are package-level arrays, so the child needs no allocation to use them.
var (
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}
)
    69  
// Implemented in runtime package.
// runtime_BeforeFork is called in the parent immediately before the clone
// system call is issued.
func runtime_BeforeFork()
// runtime_AfterFork is called in the parent once forkAndExecInChild1 has
// returned, provided runtime_BeforeFork was called.
func runtime_AfterFork()
// runtime_AfterForkInChild is called in the child after the process group
// has been set up; per its call site it restores the signal mask.
func runtime_AfterForkInChild()
    74  
    75  // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
    76  // If a dup or exec fails, write the errno error to pipe.
    77  // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
    78  // In the child, this function must not acquire any locks, because
    79  // they might have been locked at the time of the fork. This means
    80  // no rescheduling, no malloc calls, and no new stack segments.
    81  // For the same reason compiler does not race instrument it.
    82  // The calls to RawSyscall are okay because they are assembly
    83  // functions that do not grow the stack.
    84  //
    85  //go:norace
    86  func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
    87  	// Set up and fork. This returns immediately in the parent or
    88  	// if there's an error.
    89  	r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
    90  	if locked {
    91  		runtime_AfterFork()
    92  	}
    93  	if err1 != 0 {
    94  		return 0, err1
    95  	}
    96  
    97  	// parent; return PID
    98  	pid = int(r1)
    99  
   100  	if sys.UidMappings != nil || sys.GidMappings != nil {
   101  		Close(p[0])
   102  		var err2 Errno
   103  		// uid/gid mappings will be written after fork and unshare(2) for user
   104  		// namespaces.
   105  		if sys.Unshareflags&CLONE_NEWUSER == 0 {
   106  			if err := writeUidGidMappings(pid, sys); err != nil {
   107  				err2 = err.(Errno)
   108  			}
   109  		}
   110  		RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   111  		Close(p[1])
   112  	}
   113  
   114  	return pid, 0
   115  }
   116  
// _LINUX_CAPABILITY_VERSION_3 is the version value stored in
// capHeader.version before the capget and capset system calls are made.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
   118  
// capHeader is the header argument passed by pointer to the capget and
// capset system calls.
// NOTE(review): presumably mirrors the user capability header struct in
// linux/capability.h; confirm the layout against the kernel header.
type capHeader struct {
	version uint32 // set to _LINUX_CAPABILITY_VERSION_3 by forkAndExecInChild1
	pid     int32 // never assigned in this file, so it stays zero
}
   123  
// capData holds one 32-bit word of each capability set as read by capget
// and written by capset. Individual bits are selected with capToMask.
type capData struct {
	effective   uint32
	permitted   uint32 // forkAndExecInChild1 ORs each ambient capability in here
	inheritable uint32 // forkAndExecInChild1 ORs each ambient capability in here
}
// caps bundles the header and data arguments of the capget and capset
// system calls. The data array is indexed with capToIndex.
type caps struct {
	hdr  capHeader
	data [2]capData
}
   133  
   134  // See CAP_TO_INDEX in linux/capability.h:
   135  func capToIndex(cap uintptr) uintptr { return cap >> 5 }
   136  
   137  // See CAP_TO_MASK in linux/capability.h:
   138  func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
   139  
   140  // forkAndExecInChild1 implements the body of forkAndExecInChild up to
   141  // the parent's post-fork path. This is a separate function so we can
   142  // separate the child's and parent's stack frames if we're using
   143  // vfork.
   144  //
   145  // This is go:noinline because the point is to keep the stack frames
   146  // of this and forkAndExecInChild separate.
   147  //
   148  //go:noinline
   149  //go:norace
   150  func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
   151  	// Defined in linux/prctl.h starting with Linux 4.3.
   152  	const (
   153  		PR_CAP_AMBIENT       = 0x2f
   154  		PR_CAP_AMBIENT_RAISE = 0x2
   155  	)
   156  
   157  	// vfork requires that the child not touch any of the parent's
   158  	// active stack frames. Hence, the child does all post-fork
   159  	// processing in this stack frame and never returns, while the
   160  	// parent returns immediately from this frame and does all
   161  	// post-fork processing in the outer frame.
   162  	// Declare all variables at top in case any
   163  	// declarations require heap allocation (e.g., err1).
   164  	var (
   165  		err2                      Errno
   166  		nextfd                    int
   167  		i                         int
   168  		caps                      caps
   169  		fd1                       uintptr
   170  		puid, psetgroups, pgid    []byte
   171  		uidmap, setgroups, gidmap []byte
   172  	)
   173  
   174  	if sys.UidMappings != nil {
   175  		puid = []byte("/proc/self/uid_map\000")
   176  		uidmap = formatIDMappings(sys.UidMappings)
   177  	}
   178  
   179  	if sys.GidMappings != nil {
   180  		psetgroups = []byte("/proc/self/setgroups\000")
   181  		pgid = []byte("/proc/self/gid_map\000")
   182  
   183  		if sys.GidMappingsEnableSetgroups {
   184  			setgroups = []byte("allow\000")
   185  		} else {
   186  			setgroups = []byte("deny\000")
   187  		}
   188  		gidmap = formatIDMappings(sys.GidMappings)
   189  	}
   190  
   191  	// Record parent PID so child can test if it has died.
   192  	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
   193  
   194  	// Guard against side effects of shuffling fds below.
   195  	// Make sure that nextfd is beyond any currently open files so
   196  	// that we can't run the risk of overwriting any of them.
   197  	fd := make([]int, len(attr.Files))
   198  	nextfd = len(attr.Files)
   199  	for i, ufd := range attr.Files {
   200  		if nextfd < int(ufd) {
   201  			nextfd = int(ufd)
   202  		}
   203  		fd[i] = int(ufd)
   204  	}
   205  	nextfd++
   206  
   207  	// Allocate another pipe for parent to child communication for
   208  	// synchronizing writing of User ID/Group ID mappings.
   209  	if sys.UidMappings != nil || sys.GidMappings != nil {
   210  		if err := forkExecPipe(p[:]); err != nil {
   211  			err1 = err.(Errno)
   212  			return
   213  		}
   214  	}
   215  
   216  	// About to call fork.
   217  	// No more allocation or calls of non-assembly functions.
   218  	runtime_BeforeFork()
   219  	locked = true
   220  	switch {
   221  	case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
   222  		r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
   223  	case runtime.GOARCH == "s390x":
   224  		r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
   225  	default:
   226  		r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
   227  	}
   228  	if err1 != 0 || r1 != 0 {
   229  		// If we're in the parent, we must return immediately
   230  		// so we're not in the same stack frame as the child.
   231  		// This can at most use the return PC, which the child
   232  		// will not modify, and the results of
   233  		// rawVforkSyscall, which must have been written after
   234  		// the child was replaced.
   235  		return
   236  	}
   237  
   238  	// Fork succeeded, now in child.
   239  
   240  	// Enable the "keep capabilities" flag to set ambient capabilities later.
   241  	if len(sys.AmbientCaps) > 0 {
   242  		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
   243  		if err1 != 0 {
   244  			goto childerror
   245  		}
   246  	}
   247  
   248  	// Wait for User ID/Group ID mappings to be written.
   249  	if sys.UidMappings != nil || sys.GidMappings != nil {
   250  		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
   251  			goto childerror
   252  		}
   253  		r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   254  		if err1 != 0 {
   255  			goto childerror
   256  		}
   257  		if r1 != unsafe.Sizeof(err2) {
   258  			err1 = EINVAL
   259  			goto childerror
   260  		}
   261  		if err2 != 0 {
   262  			err1 = err2
   263  			goto childerror
   264  		}
   265  	}
   266  
   267  	// Session ID
   268  	if sys.Setsid {
   269  		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
   270  		if err1 != 0 {
   271  			goto childerror
   272  		}
   273  	}
   274  
   275  	// Set process group
   276  	if sys.Setpgid || sys.Foreground {
   277  		// Place child in process group.
   278  		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
   279  		if err1 != 0 {
   280  			goto childerror
   281  		}
   282  	}
   283  
   284  	if sys.Foreground {
   285  		pgrp := int32(sys.Pgid)
   286  		if pgrp == 0 {
   287  			r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
   288  
   289  			pgrp = int32(r1)
   290  		}
   291  
   292  		// Place process group in foreground.
   293  		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
   294  		if err1 != 0 {
   295  			goto childerror
   296  		}
   297  	}
   298  
   299  	// Restore the signal mask. We do this after TIOCSPGRP to avoid
   300  	// having the kernel send a SIGTTOU signal to the process group.
   301  	runtime_AfterForkInChild()
   302  
   303  	// Unshare
   304  	if sys.Unshareflags != 0 {
   305  		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
   306  		if err1 != 0 {
   307  			goto childerror
   308  		}
   309  
   310  		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
   311  			dirfd := int(_AT_FDCWD)
   312  			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
   313  				goto childerror
   314  			}
   315  			r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
   316  			if err1 != 0 {
   317  				goto childerror
   318  			}
   319  			if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
   320  				goto childerror
   321  			}
   322  
   323  			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
   324  				goto childerror
   325  			}
   326  			r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
   327  			if err1 != 0 {
   328  				goto childerror
   329  			}
   330  			if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
   331  				goto childerror
   332  			}
   333  		}
   334  
   335  		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
   336  			dirfd := int(_AT_FDCWD)
   337  			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
   338  				goto childerror
   339  			}
   340  			r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
   341  			if err1 != 0 {
   342  				goto childerror
   343  			}
   344  			if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
   345  				goto childerror
   346  			}
   347  		}
   348  
   349  		// The unshare system call in Linux doesn't unshare mount points
   350  		// mounted with --shared. Systemd mounts / with --shared. For a
   351  		// long discussion of the pros and cons of this see debian bug 739593.
   352  		// The Go model of unsharing is more like Plan 9, where you ask
   353  		// to unshare and the namespaces are unconditionally unshared.
   354  		// To make this model work we must further mark / as MS_PRIVATE.
   355  		// This is what the standard unshare command does.
   356  		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
   357  			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
   358  			if err1 != 0 {
   359  				goto childerror
   360  			}
   361  		}
   362  	}
   363  
   364  	// Chroot
   365  	if chroot != nil {
   366  		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
   367  		if err1 != 0 {
   368  			goto childerror
   369  		}
   370  	}
   371  
   372  	// User and groups
   373  	if cred := sys.Credential; cred != nil {
   374  		ngroups := uintptr(len(cred.Groups))
   375  		groups := uintptr(0)
   376  		if ngroups > 0 {
   377  			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
   378  		}
   379  		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
   380  			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
   381  			if err1 != 0 {
   382  				goto childerror
   383  			}
   384  		}
   385  		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
   386  		if err1 != 0 {
   387  			goto childerror
   388  		}
   389  		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
   390  		if err1 != 0 {
   391  			goto childerror
   392  		}
   393  	}
   394  
   395  	if len(sys.AmbientCaps) != 0 {
   396  		// Ambient capabilities were added in the 4.3 kernel,
   397  		// so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
   398  		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
   399  
   400  		if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
   401  			goto childerror
   402  		}
   403  
   404  		for _, c := range sys.AmbientCaps {
   405  			// Add the c capability to the permitted and inheritable capability mask,
   406  			// otherwise we will not be able to add it to the ambient capability mask.
   407  			caps.data[capToIndex(c)].permitted |= capToMask(c)
   408  			caps.data[capToIndex(c)].inheritable |= capToMask(c)
   409  		}
   410  
   411  		if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
   412  			goto childerror
   413  		}
   414  
   415  		for _, c := range sys.AmbientCaps {
   416  			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
   417  			if err1 != 0 {
   418  				goto childerror
   419  			}
   420  		}
   421  	}
   422  
   423  	// Chdir
   424  	if dir != nil {
   425  		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
   426  		if err1 != 0 {
   427  			goto childerror
   428  		}
   429  	}
   430  
   431  	// Parent death signal
   432  	if sys.Pdeathsig != 0 {
   433  		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
   434  		if err1 != 0 {
   435  			goto childerror
   436  		}
   437  
   438  		// Signal self if parent is already dead. This might cause a
   439  		// duplicate signal in rare cases, but it won't matter when
   440  		// using SIGKILL.
   441  		r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
   442  		if r1 != ppid {
   443  			pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
   444  			_, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
   445  			if err1 != 0 {
   446  				goto childerror
   447  			}
   448  		}
   449  	}
   450  
   451  	// Pass 1: look for fd[i] < i and move those up above len(fd)
   452  	// so that pass 2 won't stomp on an fd it needs later.
   453  	if pipe < nextfd {
   454  		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
   455  		if err1 != 0 {
   456  			goto childerror
   457  		}
   458  		pipe = nextfd
   459  		nextfd++
   460  	}
   461  	for i = 0; i < len(fd); i++ {
   462  		if fd[i] >= 0 && fd[i] < int(i) {
   463  			if nextfd == pipe { // don't stomp on pipe
   464  				nextfd++
   465  			}
   466  			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
   467  			if err1 != 0 {
   468  				goto childerror
   469  			}
   470  			fd[i] = nextfd
   471  			nextfd++
   472  		}
   473  	}
   474  
   475  	// Pass 2: dup fd[i] down onto i.
   476  	for i = 0; i < len(fd); i++ {
   477  		if fd[i] == -1 {
   478  			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   479  			continue
   480  		}
   481  		if fd[i] == int(i) {
   482  			// dup2(i, i) won't clear close-on-exec flag on Linux,
   483  			// probably not elsewhere either.
   484  			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
   485  			if err1 != 0 {
   486  				goto childerror
   487  			}
   488  			continue
   489  		}
   490  		// The new fd is created NOT close-on-exec,
   491  		// which is exactly what we want.
   492  		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
   493  		if err1 != 0 {
   494  			goto childerror
   495  		}
   496  	}
   497  
   498  	// By convention, we don't close-on-exec the fds we are
   499  	// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
   500  	// Programs that know they inherit fds >= 3 will need
   501  	// to set them close-on-exec.
   502  	for i = len(fd); i < 3; i++ {
   503  		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   504  	}
   505  
   506  	// Detach fd 0 from tty
   507  	if sys.Noctty {
   508  		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
   509  		if err1 != 0 {
   510  			goto childerror
   511  		}
   512  	}
   513  
   514  	// Set the controlling TTY to Ctty
   515  	if sys.Setctty {
   516  		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
   517  		if err1 != 0 {
   518  			goto childerror
   519  		}
   520  	}
   521  
   522  	// Enable tracing if requested.
   523  	// Do this right before exec so that we don't unnecessarily trace the runtime
   524  	// setting up after the fork. See issue #21428.
   525  	if sys.Ptrace {
   526  		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
   527  		if err1 != 0 {
   528  			goto childerror
   529  		}
   530  	}
   531  
   532  	// Time to exec.
   533  	_, _, err1 = RawSyscall(SYS_EXECVE,
   534  		uintptr(unsafe.Pointer(argv0)),
   535  		uintptr(unsafe.Pointer(&argv[0])),
   536  		uintptr(unsafe.Pointer(&envv[0])))
   537  
   538  childerror:
   539  	// send error code on pipe
   540  	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
   541  	for {
   542  		RawSyscall(SYS_EXIT, 253, 0, 0)
   543  	}
   544  }
   545  
   546  // Try to open a pipe with O_CLOEXEC set on both file descriptors.
   547  func forkExecPipe(p []int) (err error) {
   548  	return Pipe2(p, O_CLOEXEC)
   549  }
   550  
   551  func formatIDMappings(idMap []SysProcIDMap) []byte {
   552  	var data []byte
   553  	for _, im := range idMap {
   554  		data = append(data, []byte(itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n")...)
   555  	}
   556  	return data
   557  }
   558  
   559  // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
   560  func writeIDMappings(path string, idMap []SysProcIDMap) error {
   561  	fd, err := Open(path, O_RDWR, 0)
   562  	if err != nil {
   563  		return err
   564  	}
   565  
   566  	if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
   567  		Close(fd)
   568  		return err
   569  	}
   570  
   571  	if err := Close(fd); err != nil {
   572  		return err
   573  	}
   574  
   575  	return nil
   576  }
   577  
   578  // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
   579  // and "allow" if enable is true.
   580  // This is needed since kernel 3.19, because you can't write gid_map without
   581  // disabling setgroups() system call.
   582  func writeSetgroups(pid int, enable bool) error {
   583  	sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
   584  	fd, err := Open(sgf, O_RDWR, 0)
   585  	if err != nil {
   586  		return err
   587  	}
   588  
   589  	var data []byte
   590  	if enable {
   591  		data = []byte("allow")
   592  	} else {
   593  		data = []byte("deny")
   594  	}
   595  
   596  	if _, err := Write(fd, data); err != nil {
   597  		Close(fd)
   598  		return err
   599  	}
   600  
   601  	return Close(fd)
   602  }
   603  
   604  // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
   605  // for a process and it is called from the parent process.
   606  func writeUidGidMappings(pid int, sys *SysProcAttr) error {
   607  	if sys.UidMappings != nil {
   608  		uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
   609  		if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
   610  			return err
   611  		}
   612  	}
   613  
   614  	if sys.GidMappings != nil {
   615  		// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
   616  		if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
   617  			return err
   618  		}
   619  		gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
   620  		if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
   621  			return err
   622  		}
   623  	}
   624  
   625  	return nil
   626  }
   627  

View as plain text