Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// SysProcIDMap describes one ID range of a user-namespace UID or GID
// mapping. formatIDMappings renders each entry as the text line
// "ContainerID HostID Size\n", which is what gets written to the
// uid_map and gid_map files under /proc.
type SysProcIDMap struct {
	// ContainerID is the first column of the rendered line
	// (presumably the first ID of the range as seen inside the namespace).
	ContainerID int

	// HostID is the second column of the rendered line
	// (presumably the first ID of the range as seen outside the namespace).
	HostID int

	// Size is the third column of the rendered line
	// (presumably the number of consecutive IDs covered by the range).
	Size int
}
22
23 type SysProcAttr struct {
24 Chroot string
25 Credential *Credential
26
27
28
29 Ptrace bool
30 Setsid bool
31
32
33 Setpgid bool
34
35
36
37
38 Setctty bool
39 Noctty bool
40 Ctty int
41
42
43
44
45
46 Foreground bool
47 Pgid int
48
49
50
51
52 Pdeathsig Signal
53 Cloneflags uintptr
54 Unshareflags uintptr
55 UidMappings []SysProcIDMap
56 GidMappings []SysProcIDMap
57
58
59
60
61 GidMappingsEnableSetgroups bool
62 AmbientCaps []uintptr
63 }
64
// NUL-terminated byte strings handed to the mount system call in
// forkAndExecInChild1 as its first two arguments when a new mount
// namespace is unshared.
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
69
70
// runtime_BeforeFork, runtime_AfterFork and runtime_AfterForkInChild are
// declared here without bodies; their implementations are supplied from
// outside this file (presumably by package runtime — confirm).
//
// Within this file, runtime_BeforeFork is called by forkAndExecInChild1
// immediately before the clone system call, runtime_AfterFork is called by
// forkAndExecInChild in the parent once forkAndExecInChild1 has returned with
// locked == true, and runtime_AfterForkInChild is called in the child after
// session, process-group and foreground setup and before unshare.
71 func runtime_BeforeFork()
72 func runtime_AfterFork()
73 func runtime_AfterForkInChild()
74
75
76
77
78
79
80
81
82
83
84
85
86 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
87
88
89 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
90 if locked {
91 runtime_AfterFork()
92 }
93 if err1 != 0 {
94 return 0, err1
95 }
96
97
98 pid = int(r1)
99
100 if sys.UidMappings != nil || sys.GidMappings != nil {
101 Close(p[0])
102 var err2 Errno
103
104
105 if sys.Unshareflags&CLONE_NEWUSER == 0 {
106 if err := writeUidGidMappings(pid, sys); err != nil {
107 err2 = err.(Errno)
108 }
109 }
110 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
111 Close(p[1])
112 }
113
114 return pid, 0
115 }
116
// _LINUX_CAPABILITY_VERSION_3 is the version value stored in
// capHeader.version before the capget/capset system calls.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

// capHeader is the first argument block handed to capget/capset.
type capHeader struct {
	version uint32
	pid     int32
}

// capData is one 32-bit slice of the three capability sets handed to
// capget/capset.
type capData struct {
	effective, permitted, inheritable uint32
}

// caps bundles the header with two capData elements; capToIndex selects
// the element and capToMask the bit for a given capability number.
type caps struct {
	hdr  capHeader
	data [2]capData
}
133
134
// capToIndex returns the index of the 32-bit capData word that holds the
// bit for capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
136
137
// capToMask returns the single-bit mask for capability number cap within
// its 32-bit capData word.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := cap % bitsPerWord
	return uint32(1) << bit
}
139
140
141
142
143
144
145
146
147
148
149
150 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
151
152 const (
153 PR_CAP_AMBIENT = 0x2f
154 PR_CAP_AMBIENT_RAISE = 0x2
155 )
156
157
158
159
160
161
162
163
164 var (
165 err2 Errno
166 nextfd int
167 i int
168 caps caps
169 fd1 uintptr
170 puid, psetgroups, pgid []byte
171 uidmap, setgroups, gidmap []byte
172 )
173
174 if sys.UidMappings != nil {
175 puid = []byte("/proc/self/uid_map\000")
176 uidmap = formatIDMappings(sys.UidMappings)
177 }
178
179 if sys.GidMappings != nil {
180 psetgroups = []byte("/proc/self/setgroups\000")
181 pgid = []byte("/proc/self/gid_map\000")
182
183 if sys.GidMappingsEnableSetgroups {
184 setgroups = []byte("allow\000")
185 } else {
186 setgroups = []byte("deny\000")
187 }
188 gidmap = formatIDMappings(sys.GidMappings)
189 }
190
191
192 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
193
194
195
196
197 fd := make([]int, len(attr.Files))
198 nextfd = len(attr.Files)
199 for i, ufd := range attr.Files {
200 if nextfd < int(ufd) {
201 nextfd = int(ufd)
202 }
203 fd[i] = int(ufd)
204 }
205 nextfd++
206
207
208
209 if sys.UidMappings != nil || sys.GidMappings != nil {
210 if err := forkExecPipe(p[:]); err != nil {
211 err1 = err.(Errno)
212 return
213 }
214 }
215
216
217
218 runtime_BeforeFork()
219 locked = true
220 switch {
221 case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
222 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
223 case runtime.GOARCH == "s390x":
224 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
225 default:
226 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
227 }
228 if err1 != 0 || r1 != 0 {
229
230
231
232
233
234
235 return
236 }
237
238
239
240
241 if len(sys.AmbientCaps) > 0 {
242 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
243 if err1 != 0 {
244 goto childerror
245 }
246 }
247
248
249 if sys.UidMappings != nil || sys.GidMappings != nil {
250 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
251 goto childerror
252 }
253 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
254 if err1 != 0 {
255 goto childerror
256 }
257 if r1 != unsafe.Sizeof(err2) {
258 err1 = EINVAL
259 goto childerror
260 }
261 if err2 != 0 {
262 err1 = err2
263 goto childerror
264 }
265 }
266
267
268 if sys.Setsid {
269 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
270 if err1 != 0 {
271 goto childerror
272 }
273 }
274
275
276 if sys.Setpgid || sys.Foreground {
277
278 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
279 if err1 != 0 {
280 goto childerror
281 }
282 }
283
284 if sys.Foreground {
285 pgrp := int32(sys.Pgid)
286 if pgrp == 0 {
287 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
288
289 pgrp = int32(r1)
290 }
291
292
293 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
294 if err1 != 0 {
295 goto childerror
296 }
297 }
298
299
300
301 runtime_AfterForkInChild()
302
303
304 if sys.Unshareflags != 0 {
305 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
306 if err1 != 0 {
307 goto childerror
308 }
309
310 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
311 dirfd := int(_AT_FDCWD)
312 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
313 goto childerror
314 }
315 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
316 if err1 != 0 {
317 goto childerror
318 }
319 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
320 goto childerror
321 }
322
323 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
324 goto childerror
325 }
326 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
327 if err1 != 0 {
328 goto childerror
329 }
330 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
331 goto childerror
332 }
333 }
334
335 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
336 dirfd := int(_AT_FDCWD)
337 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
338 goto childerror
339 }
340 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
341 if err1 != 0 {
342 goto childerror
343 }
344 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
345 goto childerror
346 }
347 }
348
349
350
351
352
353
354
355
356 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
357 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
358 if err1 != 0 {
359 goto childerror
360 }
361 }
362 }
363
364
365 if chroot != nil {
366 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
367 if err1 != 0 {
368 goto childerror
369 }
370 }
371
372
373 if cred := sys.Credential; cred != nil {
374 ngroups := uintptr(len(cred.Groups))
375 groups := uintptr(0)
376 if ngroups > 0 {
377 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
378 }
379 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
380 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
381 if err1 != 0 {
382 goto childerror
383 }
384 }
385 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
386 if err1 != 0 {
387 goto childerror
388 }
389 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
390 if err1 != 0 {
391 goto childerror
392 }
393 }
394
395 if len(sys.AmbientCaps) != 0 {
396
397
398 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
399
400 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
401 goto childerror
402 }
403
404 for _, c := range sys.AmbientCaps {
405
406
407 caps.data[capToIndex(c)].permitted |= capToMask(c)
408 caps.data[capToIndex(c)].inheritable |= capToMask(c)
409 }
410
411 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
412 goto childerror
413 }
414
415 for _, c := range sys.AmbientCaps {
416 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
417 if err1 != 0 {
418 goto childerror
419 }
420 }
421 }
422
423
424 if dir != nil {
425 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
426 if err1 != 0 {
427 goto childerror
428 }
429 }
430
431
432 if sys.Pdeathsig != 0 {
433 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
434 if err1 != 0 {
435 goto childerror
436 }
437
438
439
440
441 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
442 if r1 != ppid {
443 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
444 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
445 if err1 != 0 {
446 goto childerror
447 }
448 }
449 }
450
451
452
453 if pipe < nextfd {
454 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
455 if err1 != 0 {
456 goto childerror
457 }
458 pipe = nextfd
459 nextfd++
460 }
461 for i = 0; i < len(fd); i++ {
462 if fd[i] >= 0 && fd[i] < int(i) {
463 if nextfd == pipe {
464 nextfd++
465 }
466 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
467 if err1 != 0 {
468 goto childerror
469 }
470 fd[i] = nextfd
471 nextfd++
472 }
473 }
474
475
476 for i = 0; i < len(fd); i++ {
477 if fd[i] == -1 {
478 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
479 continue
480 }
481 if fd[i] == int(i) {
482
483
484 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
485 if err1 != 0 {
486 goto childerror
487 }
488 continue
489 }
490
491
492 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
493 if err1 != 0 {
494 goto childerror
495 }
496 }
497
498
499
500
501
502 for i = len(fd); i < 3; i++ {
503 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
504 }
505
506
507 if sys.Noctty {
508 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
509 if err1 != 0 {
510 goto childerror
511 }
512 }
513
514
515 if sys.Setctty {
516 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
517 if err1 != 0 {
518 goto childerror
519 }
520 }
521
522
523
524
525 if sys.Ptrace {
526 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
527 if err1 != 0 {
528 goto childerror
529 }
530 }
531
532
533 _, _, err1 = RawSyscall(SYS_EXECVE,
534 uintptr(unsafe.Pointer(argv0)),
535 uintptr(unsafe.Pointer(&argv[0])),
536 uintptr(unsafe.Pointer(&envv[0])))
537
538 childerror:
539
540 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
541 for {
542 RawSyscall(SYS_EXIT, 253, 0, 0)
543 }
544 }
545
546
547 func forkExecPipe(p []int) (err error) {
548 return Pipe2(p, O_CLOEXEC)
549 }
550
551 func formatIDMappings(idMap []SysProcIDMap) []byte {
552 var data []byte
553 for _, im := range idMap {
554 data = append(data, []byte(itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n")...)
555 }
556 return data
557 }
558
559
560 func writeIDMappings(path string, idMap []SysProcIDMap) error {
561 fd, err := Open(path, O_RDWR, 0)
562 if err != nil {
563 return err
564 }
565
566 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
567 Close(fd)
568 return err
569 }
570
571 if err := Close(fd); err != nil {
572 return err
573 }
574
575 return nil
576 }
577
578
579
580
581
582 func writeSetgroups(pid int, enable bool) error {
583 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
584 fd, err := Open(sgf, O_RDWR, 0)
585 if err != nil {
586 return err
587 }
588
589 var data []byte
590 if enable {
591 data = []byte("allow")
592 } else {
593 data = []byte("deny")
594 }
595
596 if _, err := Write(fd, data); err != nil {
597 Close(fd)
598 return err
599 }
600
601 return Close(fd)
602 }
603
604
605
606 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
607 if sys.UidMappings != nil {
608 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
609 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
610 return err
611 }
612 }
613
614 if sys.GidMappings != nil {
615
616 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
617 return err
618 }
619 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
620 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
621 return err
622 }
623 }
624
625 return nil
626 }
627
View as plain text