/*	$NetBSD: linux_sched.c,v 1.83 2024/10/03 12:56:49 hannken Exp $	*/

/*-
 * Copyright (c) 1999, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module. Try to deal with scheduler related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.83 2024/10/03 12:56:49 hannken Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

/* Unlike Linux, dynamically calculate CPU mask size */
#define	LINUX_CPU_MASK_SIZE (sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT))
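
/*
 * Worked example of the size computation above, assuming LONG_BIT == 64:
 * ncpu == 1..64 rounds up to one long (8 bytes), ncpu == 65..128 to two
 * longs (16 bytes), and so on, whereas Linux uses a fixed-size mask.
 */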

#if DEBUG_LINUX
#define DPRINTF(x, ...) uprintf(x, __VA_ARGS__)
#else
#define DPRINTF(x, ...)
#endif

static void
linux_child_return(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = l->l_emuldata;
	void *ctp = led->led_child_tidptr;
	int error;

	if (ctp) {
		if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, ctp, p->p_pid, error);
	}
	child_return(arg);
}

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct linux_emuldata *led;
	int flags, sig, error;

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return EINVAL;

	/*
	 * Thread group implies shared signals. Shared signals
	 * imply shared VM. This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return EINVAL;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return EINVAL;

	/*
	 * The thread group flavor is implemented totally differently.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
		return linux_clone_nptl(l, uap, retval);

	flags = 0;
	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return EINVAL;
	sig = linux_to_native_signo[sig];

	if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
		led = l->l_emuldata;
		led->led_child_tidptr = SCARG(uap, child_tidptr);
	}

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  So, we pass a stack size of 0, so that the code
	 * that makes this adjustment is a noop.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    linux_child_return, NULL, retval)) != 0) {
		DPRINTF("%s: fork1: error %d\n", __func__, error);
		return error;
	}

	return 0;
}


int
linux_sys_clone3(struct lwp *l, const struct linux_sys_clone3_args *uap, register_t *retval)
{
	struct linux_user_clone3_args cl_args;
	struct linux_sys_clone_args clone_args;
	int error;

	if (SCARG(uap, size) != sizeof(cl_args)) {
		DPRINTF("%s: Invalid size %zu, expected %zu\n", __func__,
		    (size_t)SCARG(uap, size), sizeof(cl_args));
		return EINVAL;
	}

	error = copyin(SCARG(uap, cl_args), &cl_args, SCARG(uap, size));
	if (error) {
		DPRINTF("%s: Copyin failed: %d\n", __func__, error);
		return error;
	}

	DPRINTF("%s: Flags: %#jx\n", __func__, (intmax_t)cl_args.flags);

	/* Reject unsupported or disallowed flags */
	if (cl_args.flags & LINUX_CLONE_UNIMPLEMENTED_FLAGS) {
		DPRINTF("%s: Unsupported flags for clone3: %#" PRIx64 "\n",
		    __func__, cl_args.flags & LINUX_CLONE_UNIMPLEMENTED_FLAGS);
		return EOPNOTSUPP;
	}
	if (cl_args.flags & ~LINUX_CLONE_ALLOWED_FLAGS) {
		DPRINTF("%s: Disallowed flags for clone3: %#" PRIx64 "\n",
		    __func__, cl_args.flags & ~LINUX_CLONE_ALLOWED_FLAGS);
		return EINVAL;
	}

#if 0
	// XXX: this is wrong, exit_signal is the signal to deliver to the
	// process upon exit.
	if ((cl_args.exit_signal & ~(uint64_t)LINUX_CLONE_CSIGNAL) != 0) {
		DPRINTF("%s: Invalid exit_signal for clone3: %#" PRIx64 "\n",
		    __func__,
		    cl_args.exit_signal & ~(uint64_t)LINUX_CLONE_CSIGNAL);
		return EINVAL;
	}
#endif

	if (cl_args.stack == 0 && cl_args.stack_size != 0) {
		DPRINTF("%s: Stack is NULL but stack size is not 0\n",
		    __func__);
		return EINVAL;
	}
	if (cl_args.stack != 0 && cl_args.stack_size == 0) {
		DPRINTF("%s: Stack is not NULL but stack size is 0\n",
		    __func__);
		return EINVAL;
	}

	int flags = cl_args.flags & LINUX_CLONE_ALLOWED_FLAGS;
#if 0
	int sig = cl_args.exit_signal & LINUX_CLONE_CSIGNAL;
#endif
	// XXX: Pidfd member handling
	// XXX: we don't have cgroups
	// XXX: what to do with tid_set and tid_set_size
	// XXX: clone3 has stacksize, instead implement clone as a clone3
	// wrapper.
	SCARG(&clone_args, flags) = flags;
	SCARG(&clone_args, stack) = (void *)(uintptr_t)cl_args.stack;
	SCARG(&clone_args, parent_tidptr) =
	    (void *)(intptr_t)cl_args.parent_tid;
	SCARG(&clone_args, tls) =
	    (void *)(intptr_t)cl_args.tls;
	SCARG(&clone_args, child_tidptr) =
	    (void *)(intptr_t)cl_args.child_tid;

	return linux_sys_clone(l, &clone_args, retval);
}

static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct lwp *l2;
	struct linux_emuldata *led;
	void *parent_tidptr, *tls, *child_tidptr;
	vaddr_t uaddr;
	lwpid_t lid;
	int flags, error;

	p = l->l_proc;
	flags = SCARG(uap, flags);
	parent_tidptr = SCARG(uap, parent_tidptr);
	tls = SCARG(uap, tls);
	child_tidptr = SCARG(uap, child_tidptr);

	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		return ENOMEM;
	}

	error = lwp_create(l, p, uaddr, LWP_DETACHED,
	    SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class,
	    &l->l_sigmask, &l->l_sigstk);
	if (__predict_false(error)) {
		DPRINTF("%s: lwp_create error=%d\n", __func__, error);
		uvm_uarea_free(uaddr);
		return error;
	}
	lid = l2->l_lid;

	/* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
	if (flags & LINUX_CLONE_CHILD_CLEARTID) {
		led = l2->l_emuldata;
		led->led_clear_tid = child_tidptr;
	}

	/* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
	if (flags & LINUX_CLONE_PARENT_SETTID) {
		if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_PARENT_SETTID "
			    "failed (parent_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, parent_tidptr, lid, error);
	}

	/* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
	if (flags & LINUX_CLONE_CHILD_SETTID) {
		if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, child_tidptr, lid, error);
	}

	if (flags & LINUX_CLONE_SETTLS) {
		error = LINUX_LWP_SETPRIVATE(l2, tls);
		if (error) {
			DPRINTF("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
			    error);
			lwp_exit(l2);
			return error;
		}
	}

	/* Set the new LWP running. */
	lwp_start(l2, 0);

	retval[0] = lid;
	retval[1] = 0;
	return 0;
}

/*
 * Linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99
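
/*
 * Sketch of the linear rescaling done below, assuming NetBSD's
 * SCHED_PRI_MIN == 0 and SCHED_PRI_MAX == 63: Linux rtprio 1 maps to
 * native priority 0, Linux rtprio 99 to native 63, and e.g. Linux
 * rtprio 50 to (50 - 1) * 63 / 98 == 31 with integer arithmetic.
 */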

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

		memset(linux_params, 0, sizeof(*linux_params));

		DPRINTF("%s: native: policy %d, priority %d\n",
		    __func__, native_policy, prio);

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
		DPRINTF("%s: linux: policy %d, priority %d\n", __func__,
		    linux_policy != NULL ? *linux_policy : -1,
		    linux_params->sched_priority);
	}

	return 0;
}

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
	DPRINTF("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority);

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
	DPRINTF("%s: linux: policy %d, priority %d\n",
	    __func__, policy, lp.sched_priority);

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
	DPRINTF("%s: linux: policy %d, priority %d\n",
	    __func__, SCARG(uap, policy), lp.sched_priority);

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
	DPRINTF("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority);

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

	lwp_exit(l);
	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

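	/*
	 * Record the address to clear on LWP exit; this is the same
	 * mechanism used for LINUX_CLONE_CHILD_CLEARTID in
	 * linux_clone_nptl() above.
	 */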
	led = (struct linux_emuldata *)l->l_emuldata;
	led->led_clear_tid = SCARG(uap, tid);
	*retval = l->l_lid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

	*retval = l->l_lid;
	return 0;
}

/*
 * The affinity syscalls assume that the layout of our CPU kcpuset is
 * the same as Linux's: a linear bitmask.
 */
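/*
 * Illustrative layout, assuming LONG_BIT == 64: CPU n occupies bit
 * (n % 64) of mask word (n / 64), so a first mask word of 0x5
 * selects CPUs 0 and 2.
 */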
int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct proc *p;
	struct lwp *t;
	kcpuset_t *kcset;
	size_t size;
	cpuid_t i;
	int error;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	if (SCARG(uap, pid) == 0) {
		p = curproc;
		mutex_enter(p->p_lock);
		t = curlwp;
	} else {
		t = lwp_find2(-1, SCARG(uap, pid));
		if (__predict_false(t == NULL)) {
			return ESRCH;
		}
		p = t->l_proc;
		KASSERT(mutex_owned(p->p_lock));
	}

	/* Check permissions */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, p, NULL, NULL, NULL)) {
		mutex_exit(p->p_lock);
		return EPERM;
	}

	kcpuset_create(&kcset, true);
	lwp_lock(t);
	if (t->l_affinity != NULL)
		kcpuset_copy(kcset, t->l_affinity);
	else {
		/*
		 * All available CPUs should be masked when affinity has not
		 * been set.
		 */
		kcpuset_zero(kcset);
		for (i = 0; i < ncpu; i++)
			kcpuset_set(kcset, i);
	}
	lwp_unlock(t);
	mutex_exit(p->p_lock);
	error = kcpuset_copyout(kcset, (cpuset_t *)SCARG(uap, mask), size);
	kcpuset_unuse(kcset, NULL);
	*retval = size;
	return error;
}

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct sys__sched_setaffinity_args ssa;
	size_t size;
	pid_t pid;
	lwpid_t lid;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	lid = SCARG(uap, pid);
	if (lid != 0) {
		/* Get the canonical PID for the process. */
		mutex_enter(&proc_lock);
		struct proc *p = proc_find_lwpid(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(&proc_lock);
			return ESRCH;
		}
		pid = p->p_pid;
		mutex_exit(&proc_lock);
	} else {
		pid = curproc->p_pid;
		lid = curlwp->l_lid;
	}

	SCARG(&ssa, pid) = pid;
	SCARG(&ssa, lid) = lid;
	SCARG(&ssa, size) = size;
	SCARG(&ssa, cpuset) = (cpuset_t *)SCARG(uap, mask);

	return sys__sched_setaffinity(l, &ssa, retval);
}
780