Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// Linux clone flags, written as single-bit shifts. All values are untyped
// constants; the trailing comment on each line gives the hexadecimal value.
const (
	CLONE_VM             = 1 << 8  // 0x00000100
	CLONE_FS             = 1 << 9  // 0x00000200
	CLONE_FILES          = 1 << 10 // 0x00000400
	CLONE_SIGHAND        = 1 << 11 // 0x00000800
	CLONE_PIDFD          = 1 << 12 // 0x00001000
	CLONE_PTRACE         = 1 << 13 // 0x00002000
	CLONE_VFORK          = 1 << 14 // 0x00004000
	CLONE_PARENT         = 1 << 15 // 0x00008000
	CLONE_THREAD         = 1 << 16 // 0x00010000
	CLONE_NEWNS          = 1 << 17 // 0x00020000
	CLONE_SYSVSEM        = 1 << 18 // 0x00040000
	CLONE_SETTLS         = 1 << 19 // 0x00080000
	CLONE_PARENT_SETTID  = 1 << 20 // 0x00100000
	CLONE_CHILD_CLEARTID = 1 << 21 // 0x00200000
	CLONE_DETACHED       = 1 << 22 // 0x00400000
	CLONE_UNTRACED       = 1 << 23 // 0x00800000
	CLONE_CHILD_SETTID   = 1 << 24 // 0x01000000
	CLONE_NEWCGROUP      = 1 << 25 // 0x02000000
	CLONE_NEWUTS         = 1 << 26 // 0x04000000
	CLONE_NEWIPC         = 1 << 27 // 0x08000000
	CLONE_NEWUSER        = 1 << 28 // 0x10000000
	CLONE_NEWPID         = 1 << 29 // 0x20000000
	CLONE_NEWNET         = 1 << 30 // 0x40000000
	CLONE_IO             = 1 << 31 // 0x80000000

	// These two values need more than 32 bits. In this file
	// CLONE_INTO_CGROUP is only ever placed in the 64-bit flags field of
	// cloneArgs.
	CLONE_CLEAR_SIGHAND = 1 << 32 // 0x100000000
	CLONE_INTO_CGROUP   = 1 << 33 // 0x200000000

	// This value lies in the low byte, the same bits into which the SYS_CLONE
	// path of forkAndExecInChild1 ORs SIGCHLD.
	CLONE_NEWTIME = 1 << 7 // 0x00000080
)
53
54
55
// SysProcIDMap is one ID-mapping entry. formatIDMappings renders each entry
// as a single "ContainerID HostID Size" line, which is what gets written to a
// process's uid_map or gid_map file.
type SysProcIDMap struct {
	ContainerID, HostID, Size int
}
61
// SysProcAttr holds the Linux-specific attributes that forkAndExecInChild1
// applies in the child process between the clone and the execve.
62 type SysProcAttr struct {
63 Chroot string // NOTE(review): not read directly in this file; presumably the caller derives forkAndExecInChild's chroot argument from it — confirm.
64 Credential *Credential // If non-nil, the child calls setgroups (unless suppressed), then setgid and setuid with its values.
65 // Ptrace makes the child issue ptrace(PTRACE_TRACEME) just before execve.
66 // Setsid makes the child call setsid.
67
68 Ptrace bool
69 Setsid bool
70 // Setpgid makes the child call setpgid(0, Pgid). The same call is made
71 // when Foreground is set.
72 Setpgid bool
73 // Setctty makes the child issue ioctl(Ctty, TIOCSCTTY, 1); that ioctl runs
74 // after the descriptors from attr.Files have been moved into place.
75 // Noctty makes the child issue ioctl(0, TIOCNOTTY).
76 // Ctty is the descriptor number used by the TIOCSCTTY and TIOCSPGRP ioctls.
77 Setctty bool
78 Noctty bool
79 Ctty int
80 // Foreground makes the child issue ioctl(Ctty, TIOCSPGRP) with Pgid, or
81 // with the child's own PID when Pgid is 0. That ioctl runs before the
82 // descriptors from attr.Files are rearranged, so Ctty is then interpreted
83 // with the descriptor numbering inherited from the parent.
84
85 Foreground bool
86 Pgid int // Process group ID passed to setpgid when Setpgid or Foreground is set.
87 // Pdeathsig, if non-zero, is installed with prctl(PR_SET_PDEATHSIG). If
88 // getppid then no longer returns the PID recorded before the clone, the
89 // child sends the signal to itself.
90
91 Pdeathsig Signal
92 Cloneflags uintptr // Flags passed to clone (or placed in cloneArgs.flags for clone3).
93 Unshareflags uintptr // If non-zero, the child calls unshare with these flags.
94 UidMappings []SysProcIDMap // If non-nil, written to the child's uid_map.
95 GidMappings []SysProcIDMap // If non-nil, written to the child's gid_map.
96 // GidMappingsEnableSetgroups selects "allow" (true) or "deny" (false) as
97 // the content written to the child's setgroups file before gid_map. When
98 // GidMappings is set, this is false and Credential.Groups is empty, the
99 // child skips the setgroups system call.
100 GidMappingsEnableSetgroups bool
101 AmbientCaps []uintptr // Capabilities added to the permitted and inheritable sets and then raised via prctl(PR_CAP_AMBIENT).
102 UseCgroupFD bool // If set, the child is created with clone3 and CLONE_INTO_CGROUP.
103 CgroupFD int // Descriptor stored in cloneArgs.cgroup when UseCgroupFD is set.
104 }
105
// NUL-terminated byte strings whose addresses forkAndExecInChild1 passes to
// the mount system call as source ("none") and target ("/").
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
110
111
// These three functions are declared without bodies, so they are implemented
// outside this file (NOTE(review): presumably by the runtime package, going by
// the names — confirm). In this file runtime_BeforeFork is called right before
// the clone, runtime_AfterFork in the parent once forkAndExecInChild1 has
// returned with locked set, and runtime_AfterForkInChild in the child.
112 func runtime_BeforeFork()
113 func runtime_AfterFork()
114 func runtime_AfterForkInChild()
115
116
117
118
119
120
121
122
123
124
125
126
127 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
128
129
130 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
131 if locked {
132 runtime_AfterFork()
133 }
134 if err1 != 0 {
135 return 0, err1
136 }
137
138
139 pid = int(r1)
140
141 if sys.UidMappings != nil || sys.GidMappings != nil {
142 Close(p[0])
143 var err2 Errno
144
145
146 if sys.Unshareflags&CLONE_NEWUSER == 0 {
147 if err := writeUidGidMappings(pid, sys); err != nil {
148 err2 = err.(Errno)
149 }
150 }
151 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
152 Close(p[1])
153 }
154
155 return pid, 0
156 }
157
// _LINUX_CAPABILITY_VERSION_3 is the value forkAndExecInChild1 stores in
// capHeader.version before calling capget and capset.
158 const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
159
// capHeader is the header whose address is passed as the first argument of
// the capget and capset system calls.
160 type capHeader struct {
161 version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by forkAndExecInChild1.
162 pid int32 // Never assigned in this file, so it stays 0.
163 }
164
// capData is one element of the data array exchanged with capget and capset.
// forkAndExecInChild1 ORs capability bits into permitted and inheritable; it
// never modifies effective.
type capData struct {
	effective, permitted, inheritable uint32
}
// caps bundles the header and data arguments that forkAndExecInChild1 passes
// to capget and capset.
170 type caps struct {
171 hdr capHeader
172 data [2]capData // Indexed by capToIndex, so capability numbers 0 through 63 are representable.
173 }
174
175
// capToIndex returns the index of the 32-bit word that holds capability cap,
// that is cap divided by 32.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
177
178
// capToMask returns the single-bit mask for capability cap within its 32-bit
// word: bit number cap modulo 32.
func capToMask(cap uintptr) uint32 {
	bit := cap % 32
	return uint32(1) << bit
}
180
181
// cloneArgs is the argument block for the clone3 system call:
// forkAndExecInChild1 passes its address and unsafe.Sizeof of it to
// _SYS_clone3. Only flags, exitSignal and cgroup are set in this file; the
// remaining fields stay zero.
//
// NOTE(review): the field order and 64-bit widths presumably mirror the
// kernel's struct clone_args — confirm before reordering anything.
type cloneArgs struct {
	flags, pidFD        uint64
	childTID, parentTID uint64
	exitSignal          uint64
	stack, stackSize    uint64
	tls                 uint64
	setTID, setTIDSize  uint64
	cgroup              uint64
}
195
196
197
198
199
200
201
202
203
204
205
206 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
207
208 const (
209 PR_CAP_AMBIENT = 0x2f
210 PR_CAP_AMBIENT_RAISE = 0x2
211 )
212
213
214
215
216
217
218
219
220 var (
221 err2 Errno
222 nextfd int
223 i int
224 caps caps
225 fd1, flags uintptr
226 puid, psetgroups, pgid []byte
227 uidmap, setgroups, gidmap []byte
228 clone3 *cloneArgs
229 )
230
231 if sys.UidMappings != nil {
232 puid = []byte("/proc/self/uid_map\000")
233 uidmap = formatIDMappings(sys.UidMappings)
234 }
235
236 if sys.GidMappings != nil {
237 psetgroups = []byte("/proc/self/setgroups\000")
238 pgid = []byte("/proc/self/gid_map\000")
239
240 if sys.GidMappingsEnableSetgroups {
241 setgroups = []byte("allow\000")
242 } else {
243 setgroups = []byte("deny\000")
244 }
245 gidmap = formatIDMappings(sys.GidMappings)
246 }
247
248
249 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
250
251
252
253
254 fd := make([]int, len(attr.Files))
255 nextfd = len(attr.Files)
256 for i, ufd := range attr.Files {
257 if nextfd < int(ufd) {
258 nextfd = int(ufd)
259 }
260 fd[i] = int(ufd)
261 }
262 nextfd++
263
264
265
266 if sys.UidMappings != nil || sys.GidMappings != nil {
267 if err := forkExecPipe(p[:]); err != nil {
268 err1 = err.(Errno)
269 return
270 }
271 }
272
273 flags = sys.Cloneflags
274 if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
275 flags |= CLONE_VFORK | CLONE_VM
276 }
277
278 if sys.UseCgroupFD {
279 clone3 = &cloneArgs{
280 flags: uint64(flags) | CLONE_INTO_CGROUP,
281 exitSignal: uint64(SIGCHLD),
282 cgroup: uint64(sys.CgroupFD),
283 }
284 }
285
286
287
288 runtime_BeforeFork()
289 locked = true
290 if clone3 != nil {
291 r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
292 } else {
293 flags |= uintptr(SIGCHLD)
294 if runtime.GOARCH == "s390x" {
295
296 r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
297 } else {
298 r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
299 }
300 }
301 if err1 != 0 || r1 != 0 {
302
303
304
305
306
307
308 return
309 }
310
311
312
313
314 if len(sys.AmbientCaps) > 0 {
315 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
316 if err1 != 0 {
317 goto childerror
318 }
319 }
320
321
322 if sys.UidMappings != nil || sys.GidMappings != nil {
323 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
324 goto childerror
325 }
326 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
327 if err1 != 0 {
328 goto childerror
329 }
330 if r1 != unsafe.Sizeof(err2) {
331 err1 = EINVAL
332 goto childerror
333 }
334 if err2 != 0 {
335 err1 = err2
336 goto childerror
337 }
338 }
339
340
341 if sys.Setsid {
342 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
343 if err1 != 0 {
344 goto childerror
345 }
346 }
347
348
349 if sys.Setpgid || sys.Foreground {
350
351 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
352 if err1 != 0 {
353 goto childerror
354 }
355 }
356
357 if sys.Foreground {
358 pgrp := int32(sys.Pgid)
359 if pgrp == 0 {
360 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
361
362 pgrp = int32(r1)
363 }
364
365
366 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
367 if err1 != 0 {
368 goto childerror
369 }
370 }
371
372
373
374 runtime_AfterForkInChild()
375
376
377 if sys.Unshareflags != 0 {
378 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
379 if err1 != 0 {
380 goto childerror
381 }
382
383 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
384 dirfd := int(_AT_FDCWD)
385 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
386 goto childerror
387 }
388 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
389 if err1 != 0 {
390 goto childerror
391 }
392 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
393 goto childerror
394 }
395
396 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
397 goto childerror
398 }
399 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
400 if err1 != 0 {
401 goto childerror
402 }
403 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
404 goto childerror
405 }
406 }
407
408 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
409 dirfd := int(_AT_FDCWD)
410 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
411 goto childerror
412 }
413 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
414 if err1 != 0 {
415 goto childerror
416 }
417 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
418 goto childerror
419 }
420 }
421
422
423
424
425
426
427
428
429 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
430 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
431 if err1 != 0 {
432 goto childerror
433 }
434 }
435 }
436
437
438 if chroot != nil {
439 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
440 if err1 != 0 {
441 goto childerror
442 }
443 }
444
445
446 if cred := sys.Credential; cred != nil {
447 ngroups := uintptr(len(cred.Groups))
448 groups := uintptr(0)
449 if ngroups > 0 {
450 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
451 }
452 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
453 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
454 if err1 != 0 {
455 goto childerror
456 }
457 }
458 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
459 if err1 != 0 {
460 goto childerror
461 }
462 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
463 if err1 != 0 {
464 goto childerror
465 }
466 }
467
468 if len(sys.AmbientCaps) != 0 {
469
470
471 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
472
473 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
474 goto childerror
475 }
476
477 for _, c := range sys.AmbientCaps {
478
479
480 caps.data[capToIndex(c)].permitted |= capToMask(c)
481 caps.data[capToIndex(c)].inheritable |= capToMask(c)
482 }
483
484 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
485 goto childerror
486 }
487
488 for _, c := range sys.AmbientCaps {
489 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
490 if err1 != 0 {
491 goto childerror
492 }
493 }
494 }
495
496
497 if dir != nil {
498 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
499 if err1 != 0 {
500 goto childerror
501 }
502 }
503
504
505 if sys.Pdeathsig != 0 {
506 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
507 if err1 != 0 {
508 goto childerror
509 }
510
511
512
513
514 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
515 if r1 != ppid {
516 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
517 _, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
518 if err1 != 0 {
519 goto childerror
520 }
521 }
522 }
523
524
525
526 if pipe < nextfd {
527 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
528 if err1 != 0 {
529 goto childerror
530 }
531 pipe = nextfd
532 nextfd++
533 }
534 for i = 0; i < len(fd); i++ {
535 if fd[i] >= 0 && fd[i] < i {
536 if nextfd == pipe {
537 nextfd++
538 }
539 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
540 if err1 != 0 {
541 goto childerror
542 }
543 fd[i] = nextfd
544 nextfd++
545 }
546 }
547
548
549 for i = 0; i < len(fd); i++ {
550 if fd[i] == -1 {
551 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
552 continue
553 }
554 if fd[i] == i {
555
556
557 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
558 if err1 != 0 {
559 goto childerror
560 }
561 continue
562 }
563
564
565 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
566 if err1 != 0 {
567 goto childerror
568 }
569 }
570
571
572
573
574
575 for i = len(fd); i < 3; i++ {
576 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
577 }
578
579
580 if sys.Noctty {
581 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
582 if err1 != 0 {
583 goto childerror
584 }
585 }
586
587
588 if sys.Setctty {
589 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
590 if err1 != 0 {
591 goto childerror
592 }
593 }
594
595
596
597
598 if sys.Ptrace {
599 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
600 if err1 != 0 {
601 goto childerror
602 }
603 }
604
605
606 _, _, err1 = RawSyscall(SYS_EXECVE,
607 uintptr(unsafe.Pointer(argv0)),
608 uintptr(unsafe.Pointer(&argv[0])),
609 uintptr(unsafe.Pointer(&envv[0])))
610
611 childerror:
612
613 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
614 for {
615 RawSyscall(SYS_EXIT, 253, 0, 0)
616 }
617 }
618
619
620 func forkExecPipe(p []int) (err error) {
621 return Pipe2(p, O_CLOEXEC)
622 }
623
624 func formatIDMappings(idMap []SysProcIDMap) []byte {
625 var data []byte
626 for _, im := range idMap {
627 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
628 }
629 return data
630 }
631
632
633 func writeIDMappings(path string, idMap []SysProcIDMap) error {
634 fd, err := Open(path, O_RDWR, 0)
635 if err != nil {
636 return err
637 }
638
639 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
640 Close(fd)
641 return err
642 }
643
644 if err := Close(fd); err != nil {
645 return err
646 }
647
648 return nil
649 }
650
651
652
653
654
655 func writeSetgroups(pid int, enable bool) error {
656 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
657 fd, err := Open(sgf, O_RDWR, 0)
658 if err != nil {
659 return err
660 }
661
662 var data []byte
663 if enable {
664 data = []byte("allow")
665 } else {
666 data = []byte("deny")
667 }
668
669 if _, err := Write(fd, data); err != nil {
670 Close(fd)
671 return err
672 }
673
674 return Close(fd)
675 }
676
677
678
679 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
680 if sys.UidMappings != nil {
681 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
682 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
683 return err
684 }
685 }
686
687 if sys.GidMappings != nil {
688
689 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
690 return err
691 }
692 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
693 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
694 return err
695 }
696 }
697
698 return nil
699 }
700
View as plain text