Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// Flag bits accepted by the Linux clone family of system calls. Each flag
// is a single bit; the hexadecimal value is repeated in the trailing
// comment for cross-reference. The short descriptions follow clone(2).
const (
	CLONE_VM             = 1 << 8  // 0x00000100: share the address space with the parent
	CLONE_FS             = 1 << 9  // 0x00000200: share filesystem information
	CLONE_FILES          = 1 << 10 // 0x00000400: share the file descriptor table
	CLONE_SIGHAND        = 1 << 11 // 0x00000800: share signal handlers
	CLONE_PIDFD          = 1 << 12 // 0x00001000: hand a pidfd for the child to the parent
	CLONE_PTRACE         = 1 << 13 // 0x00002000: let tracing continue in the child
	CLONE_VFORK          = 1 << 14 // 0x00004000: suspend the parent until the child execs or exits
	CLONE_PARENT         = 1 << 15 // 0x00008000: child gets the same parent as the caller
	CLONE_THREAD         = 1 << 16 // 0x00010000: child joins the caller's thread group
	CLONE_NEWNS          = 1 << 17 // 0x00020000: new mount namespace
	CLONE_SYSVSEM        = 1 << 18 // 0x00040000: share System V semaphore undo state
	CLONE_SETTLS         = 1 << 19 // 0x00080000: set up thread-local storage for the child
	CLONE_PARENT_SETTID  = 1 << 20 // 0x00100000: store the child TID in the parent
	CLONE_CHILD_CLEARTID = 1 << 21 // 0x00200000: clear the child TID in the child on exit
	CLONE_DETACHED       = 1 << 22 // 0x00400000: unused, ignored
	CLONE_UNTRACED       = 1 << 23 // 0x00800000: tracer cannot force CLONE_PTRACE on the child
	CLONE_CHILD_SETTID   = 1 << 24 // 0x01000000: store the child TID in the child
	CLONE_NEWCGROUP      = 1 << 25 // 0x02000000: new cgroup namespace
	CLONE_NEWUTS         = 1 << 26 // 0x04000000: new UTS namespace
	CLONE_NEWIPC         = 1 << 27 // 0x08000000: new IPC namespace
	CLONE_NEWUSER        = 1 << 28 // 0x10000000: new user namespace
	CLONE_NEWPID         = 1 << 29 // 0x20000000: new PID namespace
	CLONE_NEWNET         = 1 << 30 // 0x40000000: new network namespace
	CLONE_IO             = 1 << 31 // 0x80000000: share the I/O context

	// Flags that do not fit in 32 bits; they can only be passed through
	// the 64-bit flags field of clone3.

	CLONE_CLEAR_SIGHAND = 1 << 32 // 0x100000000: reset signal handlers to their defaults
	CLONE_INTO_CGROUP   = 1 << 33 // 0x200000000: place the child into the cgroup given by a descriptor

	// This bit lies in the low byte that the plain clone call uses for the
	// exit signal number, so it is only meaningful for unshare and clone3.

	CLONE_NEWTIME = 1 << 7 // 0x00000080: new time namespace
)
53
54
55
// SysProcIDMap is one line of a uid_map or gid_map file: formatIDMappings
// renders each entry as "ContainerID HostID Size\n".
type SysProcIDMap struct {
	// ContainerID is the first column of the mapping line.
	ContainerID int

	// HostID is the second column of the mapping line.
	HostID int

	// Size is the third column of the mapping line.
	Size int
}
61
62 type SysProcAttr struct {
63 Chroot string
64 Credential *Credential
65
66
67
68 Ptrace bool
69 Setsid bool
70
71
72 Setpgid bool
73
74
75
76
77 Setctty bool
78 Noctty bool
79 Ctty int
80
81
82
83
84
85 Foreground bool
86 Pgid int
87
88
89
90
91 Pdeathsig Signal
92 Cloneflags uintptr
93 Unshareflags uintptr
94 UidMappings []SysProcIDMap
95 GidMappings []SysProcIDMap
96
97
98
99
100 GidMappingsEnableSetgroups bool
101 AmbientCaps []uintptr
102 UseCgroupFD bool
103 CgroupFD int
104 }
105
// none is the NUL-terminated string "none", passed as the mount source when
// the child remounts "/" after unsharing the mount namespace.
var none = [...]byte{'n', 'o', 'n', 'e', 0}

// slash is the NUL-terminated string "/", the target of that same mount call.
var slash = [...]byte{'/', 0}
110
111
// The three functions below are declared without bodies in this file.
// NOTE(review): presumably they are supplied by the runtime package;
// confirm the linkage there.

// runtime_BeforeFork is called by forkAndExecInChild1 immediately before
// the clone system call.
func runtime_BeforeFork()

// runtime_AfterFork is called by forkAndExecInChild in the parent once
// forkAndExecInChild1 has returned, but only if runtime_BeforeFork ran.
func runtime_AfterFork()

// runtime_AfterForkInChild is called by forkAndExecInChild1 in the child,
// after the session and process-group setup and before unshare.
func runtime_AfterForkInChild()
115
116
117
118
119
120
121
122
123
124
125
126
127 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
128
129
130 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
131 if locked {
132 runtime_AfterFork()
133 }
134 if err1 != 0 {
135 return 0, err1
136 }
137
138
139 pid = int(r1)
140
141 if sys.UidMappings != nil || sys.GidMappings != nil {
142 Close(p[0])
143 var err2 Errno
144
145
146 if sys.Unshareflags&CLONE_NEWUSER == 0 {
147 if err := writeUidGidMappings(pid, sys); err != nil {
148 err2 = err.(Errno)
149 }
150 }
151 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
152 Close(p[1])
153 }
154
155 return pid, 0
156 }
157
// _LINUX_CAPABILITY_VERSION_3 is the version value stored in
// capHeader.version before the capget and capset calls.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

// capHeader is the header argument handed to the capget and capset system
// calls.
type capHeader struct {
	// version is set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	version uint32

	// pid is left at zero by the code in this file.
	pid int32
}
164
// capData holds one 32-bit word of each capability set exchanged with the
// capget and capset system calls.
type capData struct {
	// effective is read by capget and written back unchanged.
	effective uint32

	// permitted gains a bit for every requested ambient capability.
	permitted uint32

	// inheritable gains a bit for every requested ambient capability.
	inheritable uint32
}
170 type caps struct {
171 hdr capHeader
172 data [2]capData
173 }
174
175
// capToIndex returns which 32-bit word of a capability set holds the bit
// for capability number cap.
func capToIndex(cap uintptr) uintptr {
	return cap / 32
}
177
178
// capToMask returns the single-bit mask for capability number cap within
// the 32-bit word selected by capToIndex.
func capToMask(cap uintptr) uint32 {
	return uint32(1) << (cap % 32)
}
180
181
// cloneArgs is the argument block handed to the clone3 system call:
// forkAndExecInChild1 passes a pointer to it together with its size.
// Only flags, exitSignal and cgroup are filled in by this file; the other
// fields stay zero.
type cloneArgs struct {
	// flags carries the CLONE_* bits.
	flags uint64

	pidFD     uint64
	childTID  uint64
	parentTID uint64

	// exitSignal is set to SIGCHLD by forkAndExecInChild1.
	exitSignal uint64

	stack      uint64
	stackSize  uint64
	tls        uint64
	setTID     uint64
	setTIDSize uint64

	// cgroup is the cgroup descriptor used with CLONE_INTO_CGROUP.
	cgroup uint64
}
195
196
197
198
199
200
201
202
203
204
205
206 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
207
208 const (
209 PR_CAP_AMBIENT = 0x2f
210 PR_CAP_AMBIENT_RAISE = 0x2
211 )
212
213
214
215
216
217
218
219
220 var (
221 err2 Errno
222 nextfd int
223 i int
224 caps caps
225 fd1, flags uintptr
226 puid, psetgroups, pgid []byte
227 uidmap, setgroups, gidmap []byte
228 clone3 *cloneArgs
229 )
230
231 rlim, rlimOK := origRlimitNofile.Load().(Rlimit)
232
233 if sys.UidMappings != nil {
234 puid = []byte("/proc/self/uid_map\000")
235 uidmap = formatIDMappings(sys.UidMappings)
236 }
237
238 if sys.GidMappings != nil {
239 psetgroups = []byte("/proc/self/setgroups\000")
240 pgid = []byte("/proc/self/gid_map\000")
241
242 if sys.GidMappingsEnableSetgroups {
243 setgroups = []byte("allow\000")
244 } else {
245 setgroups = []byte("deny\000")
246 }
247 gidmap = formatIDMappings(sys.GidMappings)
248 }
249
250
251 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
252
253
254
255
256 fd := make([]int, len(attr.Files))
257 nextfd = len(attr.Files)
258 for i, ufd := range attr.Files {
259 if nextfd < int(ufd) {
260 nextfd = int(ufd)
261 }
262 fd[i] = int(ufd)
263 }
264 nextfd++
265
266
267
268 if sys.UidMappings != nil || sys.GidMappings != nil {
269 if err := forkExecPipe(p[:]); err != nil {
270 err1 = err.(Errno)
271 return
272 }
273 }
274
275 flags = sys.Cloneflags
276 if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
277 flags |= CLONE_VFORK | CLONE_VM
278 }
279
280 if sys.UseCgroupFD {
281 clone3 = &cloneArgs{
282 flags: uint64(flags) | CLONE_INTO_CGROUP,
283 exitSignal: uint64(SIGCHLD),
284 cgroup: uint64(sys.CgroupFD),
285 }
286 }
287
288
289
290 runtime_BeforeFork()
291 locked = true
292 if clone3 != nil {
293 r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
294 } else {
295 flags |= uintptr(SIGCHLD)
296 if runtime.GOARCH == "s390x" {
297
298 r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
299 } else {
300 r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
301 }
302 }
303 if err1 != 0 || r1 != 0 {
304
305
306
307
308
309
310 return
311 }
312
313
314
315
316 if len(sys.AmbientCaps) > 0 {
317 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
318 if err1 != 0 {
319 goto childerror
320 }
321 }
322
323
324 if sys.UidMappings != nil || sys.GidMappings != nil {
325 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
326 goto childerror
327 }
328 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
329 if err1 != 0 {
330 goto childerror
331 }
332 if r1 != unsafe.Sizeof(err2) {
333 err1 = EINVAL
334 goto childerror
335 }
336 if err2 != 0 {
337 err1 = err2
338 goto childerror
339 }
340 }
341
342
343 if sys.Setsid {
344 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
345 if err1 != 0 {
346 goto childerror
347 }
348 }
349
350
351 if sys.Setpgid || sys.Foreground {
352
353 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
354 if err1 != 0 {
355 goto childerror
356 }
357 }
358
359 if sys.Foreground {
360 pgrp := int32(sys.Pgid)
361 if pgrp == 0 {
362 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
363
364 pgrp = int32(r1)
365 }
366
367
368 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
369 if err1 != 0 {
370 goto childerror
371 }
372 }
373
374
375
376 runtime_AfterForkInChild()
377
378
379 if sys.Unshareflags != 0 {
380 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
381 if err1 != 0 {
382 goto childerror
383 }
384
385 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
386 dirfd := int(_AT_FDCWD)
387 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
388 goto childerror
389 }
390 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
391 if err1 != 0 {
392 goto childerror
393 }
394 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
395 goto childerror
396 }
397
398 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
399 goto childerror
400 }
401 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
402 if err1 != 0 {
403 goto childerror
404 }
405 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
406 goto childerror
407 }
408 }
409
410 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
411 dirfd := int(_AT_FDCWD)
412 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
413 goto childerror
414 }
415 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
416 if err1 != 0 {
417 goto childerror
418 }
419 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
420 goto childerror
421 }
422 }
423
424
425
426
427
428
429
430
431 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
432 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
433 if err1 != 0 {
434 goto childerror
435 }
436 }
437 }
438
439
440 if chroot != nil {
441 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
442 if err1 != 0 {
443 goto childerror
444 }
445 }
446
447
448 if cred := sys.Credential; cred != nil {
449 ngroups := uintptr(len(cred.Groups))
450 groups := uintptr(0)
451 if ngroups > 0 {
452 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
453 }
454 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
455 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
456 if err1 != 0 {
457 goto childerror
458 }
459 }
460 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
461 if err1 != 0 {
462 goto childerror
463 }
464 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
465 if err1 != 0 {
466 goto childerror
467 }
468 }
469
470 if len(sys.AmbientCaps) != 0 {
471
472
473 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
474
475 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
476 goto childerror
477 }
478
479 for _, c := range sys.AmbientCaps {
480
481
482 caps.data[capToIndex(c)].permitted |= capToMask(c)
483 caps.data[capToIndex(c)].inheritable |= capToMask(c)
484 }
485
486 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
487 goto childerror
488 }
489
490 for _, c := range sys.AmbientCaps {
491 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
492 if err1 != 0 {
493 goto childerror
494 }
495 }
496 }
497
498
499 if dir != nil {
500 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
501 if err1 != 0 {
502 goto childerror
503 }
504 }
505
506
507 if sys.Pdeathsig != 0 {
508 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
509 if err1 != 0 {
510 goto childerror
511 }
512
513
514
515
516 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
517 if r1 != ppid {
518 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
519 _, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
520 if err1 != 0 {
521 goto childerror
522 }
523 }
524 }
525
526
527
528 if pipe < nextfd {
529 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
530 if err1 != 0 {
531 goto childerror
532 }
533 pipe = nextfd
534 nextfd++
535 }
536 for i = 0; i < len(fd); i++ {
537 if fd[i] >= 0 && fd[i] < i {
538 if nextfd == pipe {
539 nextfd++
540 }
541 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
542 if err1 != 0 {
543 goto childerror
544 }
545 fd[i] = nextfd
546 nextfd++
547 }
548 }
549
550
551 for i = 0; i < len(fd); i++ {
552 if fd[i] == -1 {
553 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
554 continue
555 }
556 if fd[i] == i {
557
558
559 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
560 if err1 != 0 {
561 goto childerror
562 }
563 continue
564 }
565
566
567 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
568 if err1 != 0 {
569 goto childerror
570 }
571 }
572
573
574
575
576
577 for i = len(fd); i < 3; i++ {
578 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
579 }
580
581
582 if sys.Noctty {
583 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
584 if err1 != 0 {
585 goto childerror
586 }
587 }
588
589
590 if sys.Setctty {
591 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
592 if err1 != 0 {
593 goto childerror
594 }
595 }
596
597
598 if rlimOK && rlim.Cur != 0 {
599 rawSetrlimit(RLIMIT_NOFILE, &rlim)
600 }
601
602
603
604
605 if sys.Ptrace {
606 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
607 if err1 != 0 {
608 goto childerror
609 }
610 }
611
612
613 _, _, err1 = RawSyscall(SYS_EXECVE,
614 uintptr(unsafe.Pointer(argv0)),
615 uintptr(unsafe.Pointer(&argv[0])),
616 uintptr(unsafe.Pointer(&envv[0])))
617
618 childerror:
619
620 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
621 for {
622 RawSyscall(SYS_EXIT, 253, 0, 0)
623 }
624 }
625
626
627 func forkExecPipe(p []int) (err error) {
628 return Pipe2(p, O_CLOEXEC)
629 }
630
631 func formatIDMappings(idMap []SysProcIDMap) []byte {
632 var data []byte
633 for _, im := range idMap {
634 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
635 }
636 return data
637 }
638
639
640 func writeIDMappings(path string, idMap []SysProcIDMap) error {
641 fd, err := Open(path, O_RDWR, 0)
642 if err != nil {
643 return err
644 }
645
646 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
647 Close(fd)
648 return err
649 }
650
651 if err := Close(fd); err != nil {
652 return err
653 }
654
655 return nil
656 }
657
658
659
660
661
662 func writeSetgroups(pid int, enable bool) error {
663 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
664 fd, err := Open(sgf, O_RDWR, 0)
665 if err != nil {
666 return err
667 }
668
669 var data []byte
670 if enable {
671 data = []byte("allow")
672 } else {
673 data = []byte("deny")
674 }
675
676 if _, err := Write(fd, data); err != nil {
677 Close(fd)
678 return err
679 }
680
681 return Close(fd)
682 }
683
684
685
686 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
687 if sys.UidMappings != nil {
688 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
689 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
690 return err
691 }
692 }
693
694 if sys.GidMappings != nil {
695
696 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
697 return err
698 }
699 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
700 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
701 return err
702 }
703 }
704
705 return nil
706 }
707
View as plain text