xref: /netbsd-src/sys/compat/linux/common/linux_sched.c (revision 54c71dee8ce8ff710b7e2b5a511b77d6cae19a0e)
1 /*	$NetBSD: linux_sched.c,v 1.62 2010/07/01 02:38:29 rmind Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center; by Matthias Scheler.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Linux compatibility module. Try to deal with scheduler related syscalls.
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.62 2010/07/01 02:38:29 rmind Exp $");
39 
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/types.h>
51 
52 #include <sys/cpu.h>
53 
54 #include <compat/linux/common/linux_types.h>
55 #include <compat/linux/common/linux_signal.h>
56 #include <compat/linux/common/linux_machdep.h> /* For LINUX_NPTL */
57 #include <compat/linux/common/linux_emuldata.h>
58 #include <compat/linux/common/linux_ipc.h>
59 #include <compat/linux/common/linux_sem.h>
60 #include <compat/linux/common/linux_exec.h>
61 
62 #include <compat/linux/linux_syscallargs.h>
63 
64 #include <compat/linux/common/linux_sched.h>
65 
66 int
67 linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
68 {
69 	/* {
70 		syscallarg(int) flags;
71 		syscallarg(void *) stack;
72 #ifdef LINUX_NPTL
73 		syscallarg(void *) parent_tidptr;
74 		syscallarg(void *) child_tidptr;
75 #endif
76 	} */
77 	int flags, sig;
78 	int error;
79 	struct proc *p;
80 #ifdef LINUX_NPTL
81 	struct linux_emuldata *led;
82 #endif
83 
84 	/*
85 	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
86 	 */
87 	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
88 		return (EINVAL);
89 
90 	/*
91 	 * Thread group implies shared signals. Shared signals
92 	 * imply shared VM. This matches what Linux kernel does.
93 	 */
94 	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
95 	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
96 		return (EINVAL);
97 	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
98 	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
99 		return (EINVAL);
100 
101 	flags = 0;
102 
103 	if (SCARG(uap, flags) & LINUX_CLONE_VM)
104 		flags |= FORK_SHAREVM;
105 	if (SCARG(uap, flags) & LINUX_CLONE_FS)
106 		flags |= FORK_SHARECWD;
107 	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
108 		flags |= FORK_SHAREFILES;
109 	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
110 		flags |= FORK_SHARESIGS;
111 	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
112 		flags |= FORK_PPWAIT;
113 
114 	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
115 	if (sig < 0 || sig >= LINUX__NSIG)
116 		return (EINVAL);
117 	sig = linux_to_native_signo[sig];
118 
119 #ifdef LINUX_NPTL
120 	led = (struct linux_emuldata *)l->l_proc->p_emuldata;
121 
122 	led->parent_tidptr = SCARG(uap, parent_tidptr);
123 	led->child_tidptr = SCARG(uap, child_tidptr);
124 	led->clone_flags = SCARG(uap, flags);
125 #endif /* LINUX_NPTL */
126 
127 	/*
128 	 * Note that Linux does not provide a portable way of specifying
129 	 * the stack area; the caller must know if the stack grows up
130 	 * or down.  So, we pass a stack size of 0, so that the code
131 	 * that makes this adjustment is a noop.
132 	 */
133 	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
134 	    NULL, NULL, retval, &p)) != 0)
135 		return error;
136 
137 #ifdef LINUX_NPTL
138 	if ((SCARG(uap, flags) & LINUX_CLONE_SETTLS) != 0)
139 		return linux_init_thread_area(l, LIST_FIRST(&p->p_lwps));
140 #endif /* LINUX_NPTL */
141 
142 	return 0;
143 }
144 
145 /*
146  * linux realtime priority
147  *
148  * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
149  *
150  * - SCHED_OTHER tasks don't have realtime priorities.
151  *   in particular, sched_param::sched_priority is always 0.
152  */
153 
154 #define	LINUX_SCHED_RTPRIO_MIN	1
155 #define	LINUX_SCHED_RTPRIO_MAX	99
156 
157 static int
158 sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
159     int *native_policy, struct sched_param *native_params)
160 {
161 
162 	switch (linux_policy) {
163 	case LINUX_SCHED_OTHER:
164 		if (native_policy != NULL) {
165 			*native_policy = SCHED_OTHER;
166 		}
167 		break;
168 
169 	case LINUX_SCHED_FIFO:
170 		if (native_policy != NULL) {
171 			*native_policy = SCHED_FIFO;
172 		}
173 		break;
174 
175 	case LINUX_SCHED_RR:
176 		if (native_policy != NULL) {
177 			*native_policy = SCHED_RR;
178 		}
179 		break;
180 
181 	default:
182 		return EINVAL;
183 	}
184 
185 	if (linux_params != NULL) {
186 		int prio = linux_params->sched_priority;
187 
188 		KASSERT(native_params != NULL);
189 
190 		if (linux_policy == LINUX_SCHED_OTHER) {
191 			if (prio != 0) {
192 				return EINVAL;
193 			}
194 			native_params->sched_priority = PRI_NONE; /* XXX */
195 		} else {
196 			if (prio < LINUX_SCHED_RTPRIO_MIN ||
197 			    prio > LINUX_SCHED_RTPRIO_MAX) {
198 				return EINVAL;
199 			}
200 			native_params->sched_priority =
201 			    (prio - LINUX_SCHED_RTPRIO_MIN)
202 			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
203 			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
204 			    + SCHED_PRI_MIN;
205 		}
206 	}
207 
208 	return 0;
209 }
210 
211 static int
212 sched_native2linux(int native_policy, struct sched_param *native_params,
213     int *linux_policy, struct linux_sched_param *linux_params)
214 {
215 
216 	switch (native_policy) {
217 	case SCHED_OTHER:
218 		if (linux_policy != NULL) {
219 			*linux_policy = LINUX_SCHED_OTHER;
220 		}
221 		break;
222 
223 	case SCHED_FIFO:
224 		if (linux_policy != NULL) {
225 			*linux_policy = LINUX_SCHED_FIFO;
226 		}
227 		break;
228 
229 	case SCHED_RR:
230 		if (linux_policy != NULL) {
231 			*linux_policy = LINUX_SCHED_RR;
232 		}
233 		break;
234 
235 	default:
236 		panic("%s: unknown policy %d\n", __func__, native_policy);
237 	}
238 
239 	if (native_params != NULL) {
240 		int prio = native_params->sched_priority;
241 
242 		KASSERT(prio >= SCHED_PRI_MIN);
243 		KASSERT(prio <= SCHED_PRI_MAX);
244 		KASSERT(linux_params != NULL);
245 
246 #ifdef DEBUG_LINUX
247 		printf("native2linux: native: policy %d, priority %d\n",
248 		    native_policy, prio);
249 #endif
250 
251 		if (native_policy == SCHED_OTHER) {
252 			linux_params->sched_priority = 0;
253 		} else {
254 			linux_params->sched_priority =
255 			    (prio - SCHED_PRI_MIN)
256 			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
257 			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
258 			    + LINUX_SCHED_RTPRIO_MIN;
259 		}
260 #ifdef DEBUG_LINUX
261 		printf("native2linux: linux: policy %d, priority %d\n",
262 		    -1, linux_params->sched_priority);
263 #endif
264 	}
265 
266 	return 0;
267 }
268 
269 int
270 linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
271 {
272 	/* {
273 		syscallarg(linux_pid_t) pid;
274 		syscallarg(const struct linux_sched_param *) sp;
275 	} */
276 	int error, policy;
277 	struct linux_sched_param lp;
278 	struct sched_param sp;
279 
280 	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
281 		error = EINVAL;
282 		goto out;
283 	}
284 
285 	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
286 	if (error)
287 		goto out;
288 
289 	/* We need the current policy in Linux terms. */
290 	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
291 	if (error)
292 		goto out;
293 	error = sched_native2linux(policy, NULL, &policy, NULL);
294 	if (error)
295 		goto out;
296 
297 	error = sched_linux2native(policy, &lp, &policy, &sp);
298 	if (error)
299 		goto out;
300 
301 	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
302 	if (error)
303 		goto out;
304 
305  out:
306 	return error;
307 }
308 
309 int
310 linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
311 {
312 	/* {
313 		syscallarg(linux_pid_t) pid;
314 		syscallarg(struct linux_sched_param *) sp;
315 	} */
316 	struct linux_sched_param lp;
317 	struct sched_param sp;
318 	int error, policy;
319 
320 	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
321 		error = EINVAL;
322 		goto out;
323 	}
324 
325 	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
326 	if (error)
327 		goto out;
328 #ifdef DEBUG_LINUX
329 	printf("getparam: native: policy %d, priority %d\n",
330 	    policy, sp.sched_priority);
331 #endif
332 
333 	error = sched_native2linux(policy, &sp, NULL, &lp);
334 	if (error)
335 		goto out;
336 #ifdef DEBUG_LINUX
337 	printf("getparam: linux: policy %d, priority %d\n",
338 	    policy, lp.sched_priority);
339 #endif
340 
341 	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
342 	if (error)
343 		goto out;
344 
345  out:
346 	return error;
347 }
348 
349 int
350 linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
351 {
352 	/* {
353 		syscallarg(linux_pid_t) pid;
354 		syscallarg(int) policy;
355 		syscallarg(cont struct linux_sched_param *) sp;
356 	} */
357 	int error, policy;
358 	struct linux_sched_param lp;
359 	struct sched_param sp;
360 
361 	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
362 		error = EINVAL;
363 		goto out;
364 	}
365 
366 	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
367 	if (error)
368 		goto out;
369 #ifdef DEBUG_LINUX
370 	printf("setscheduler: linux: policy %d, priority %d\n",
371 	    SCARG(uap, policy), lp.sched_priority);
372 #endif
373 
374 	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
375 	if (error)
376 		goto out;
377 #ifdef DEBUG_LINUX
378 	printf("setscheduler: native: policy %d, priority %d\n",
379 	    policy, sp.sched_priority);
380 #endif
381 
382 	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
383 	if (error)
384 		goto out;
385 
386  out:
387 	return error;
388 }
389 
390 int
391 linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
392 {
393 	/* {
394 		syscallarg(linux_pid_t) pid;
395 	} */
396 	int error, policy;
397 
398 	*retval = -1;
399 
400 	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
401 	if (error)
402 		goto out;
403 
404 	error = sched_native2linux(policy, NULL, &policy, NULL);
405 	if (error)
406 		goto out;
407 
408 	*retval = policy;
409 
410  out:
411 	return error;
412 }
413 
414 int
415 linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
416 {
417 
418 	yield();
419 	return 0;
420 }
421 
422 int
423 linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
424 {
425 	/* {
426 		syscallarg(int) policy;
427 	} */
428 
429 	switch (SCARG(uap, policy)) {
430 	case LINUX_SCHED_OTHER:
431 		*retval = 0;
432 		break;
433 	case LINUX_SCHED_FIFO:
434 	case LINUX_SCHED_RR:
435 		*retval = LINUX_SCHED_RTPRIO_MAX;
436 		break;
437 	default:
438 		return EINVAL;
439 	}
440 
441 	return 0;
442 }
443 
444 int
445 linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
446 {
447 	/* {
448 		syscallarg(int) policy;
449 	} */
450 
451 	switch (SCARG(uap, policy)) {
452 	case LINUX_SCHED_OTHER:
453 		*retval = 0;
454 		break;
455 	case LINUX_SCHED_FIFO:
456 	case LINUX_SCHED_RR:
457 		*retval = LINUX_SCHED_RTPRIO_MIN;
458 		break;
459 	default:
460 		return EINVAL;
461 	}
462 
463 	return 0;
464 }
465 
466 #ifndef __m68k__
467 /* Present on everything but m68k */
468 int
469 linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
470 {
471 #ifdef LINUX_NPTL
472 	/* {
473 		syscallarg(int) error_code;
474 	} */
475 	struct proc *p = l->l_proc;
476 	struct linux_emuldata *led = p->p_emuldata;
477 	struct linux_emuldata *e;
478 
479 	if (led->s->flags & LINUX_LES_USE_NPTL) {
480 
481 #ifdef DEBUG_LINUX
482 		printf("%s:%d, led->s->refs = %d\n", __func__, __LINE__,
483 		    led->s->refs);
484 #endif
485 
486 		/*
487 		 * The calling thread is supposed to kill all threads
488 		 * in the same thread group (i.e. all threads created
489 		 * via clone(2) with CLONE_THREAD flag set).
490 		 *
491 		 * If there is only one thread, things are quite simple
492 		 */
493 		if (led->s->refs == 1)
494 			return sys_exit(l, (const void *)uap, retval);
495 
496 #ifdef DEBUG_LINUX
497 		printf("%s:%d\n", __func__, __LINE__);
498 #endif
499 
500 		mutex_enter(proc_lock);
501 		led->s->flags |= LINUX_LES_INEXITGROUP;
502 		led->s->xstat = W_EXITCODE(SCARG(uap, error_code), 0);
503 
504 		/*
505 		 * Kill all threads in the group. The emulation exit hook takes
506 		 * care of hiding the zombies and reporting the exit code
507 		 * properly.
508 		 */
509       		LIST_FOREACH(e, &led->s->threads, threads) {
510 			if (e->proc == p)
511 				continue;
512 
513 #ifdef DEBUG_LINUX
514 			printf("%s: kill PID %d\n", __func__, e->proc->p_pid);
515 #endif
516 			psignal(e->proc, SIGKILL);
517 		}
518 
519 		/* Now, kill ourselves */
520 		psignal(p, SIGKILL);
521 		mutex_exit(proc_lock);
522 
523 		return 0;
524 
525 	}
526 #endif /* LINUX_NPTL */
527 
528 	return sys_exit(l, (const void *)uap, retval);
529 }
530 #endif /* !__m68k__ */
531 
532 #ifdef LINUX_NPTL
533 int
534 linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
535 {
536 	/* {
537 		syscallarg(int *) tidptr;
538 	} */
539 	struct linux_emuldata *led;
540 
541 	led = (struct linux_emuldata *)l->l_proc->p_emuldata;
542 	led->clear_tid = SCARG(uap, tid);
543 
544 	led->s->flags |= LINUX_LES_USE_NPTL;
545 
546 	*retval = l->l_proc->p_pid;
547 
548 	return 0;
549 }
550 
551 /* ARGUSED1 */
552 int
553 linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
554 {
555 	/* The Linux kernel does it exactly that way */
556 	*retval = l->l_proc->p_pid;
557 	return 0;
558 }
559 
560 #ifdef LINUX_NPTL
561 /* ARGUSED1 */
562 int
563 linux_sys_getpid(struct lwp *l, const void *v, register_t *retval)
564 {
565 	struct linux_emuldata *led = l->l_proc->p_emuldata;
566 
567 	if (led->s->flags & LINUX_LES_USE_NPTL) {
568 		/* The Linux kernel does it exactly that way */
569 		*retval = led->s->group_pid;
570 	} else {
571 		*retval = l->l_proc->p_pid;
572 	}
573 
574 	return 0;
575 }
576 
577 /* ARGUSED1 */
578 int
579 linux_sys_getppid(struct lwp *l, const void *v, register_t *retval)
580 {
581 	struct proc *p = l->l_proc;
582 	struct linux_emuldata *led = p->p_emuldata;
583 	struct proc *glp;
584 	struct proc *pp;
585 
586 	mutex_enter(proc_lock);
587 	if (led->s->flags & LINUX_LES_USE_NPTL) {
588 
589 		/* Find the thread group leader's parent */
590 		glp = proc_find(led->s->group_pid);
591 		if (glp == NULL) {
592 			/* Maybe panic... */
593 			printf("linux_sys_getppid: missing group leader PID"
594 			    " %d\n", led->s->group_pid);
595 			mutex_exit(proc_lock);
596 			return -1;
597 		}
598 		pp = glp->p_pptr;
599 
600 		/* If this is a Linux process too, return thread group PID */
601 		if (pp->p_emul == p->p_emul) {
602 			struct linux_emuldata *pled;
603 
604 			pled = pp->p_emuldata;
605 			*retval = pled->s->group_pid;
606 		} else {
607 			*retval = pp->p_pid;
608 		}
609 
610 	} else {
611 		*retval = p->p_pptr->p_pid;
612 	}
613 	mutex_exit(proc_lock);
614 
615 	return 0;
616 }
617 #endif /* LINUX_NPTL */
618 
619 int
620 linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
621 {
622 	/* {
623 		syscallarg(pid_t) pid;
624 		syscallarg(unsigned int) len;
625 		syscallarg(unsigned long *) mask;
626 	} */
627 	int error, size, nb = ncpu;
628 	unsigned long *c, *data;
629 	proc_t *p;
630 
631 	/* Unlike Linux, dynamically calculate cpu mask size */
632 	size = sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT);
633 	if (SCARG(uap, len) < size)
634 		return EINVAL;
635 
636 	/* XXX: Pointless check.  TODO: Actually implement this. */
637 	mutex_enter(proc_lock);
638 	p = proc_find(SCARG(uap, pid));
639 	mutex_exit(proc_lock);
640 	if (p == NULL) {
641 		return ESRCH;
642 	}
643 
644 	/*
645 	 * return the actual number of CPU, tag all of them as available
646 	 * The result is a mask, the first CPU being in the least significant
647 	 * bit.
648 	 */
649 	data = kmem_zalloc(size, KM_SLEEP);
650 	c = data;
651 	while (nb > LONG_BIT) {
652 		*c++ = ~0UL;
653 		nb -= LONG_BIT;
654 	}
655 	if (nb)
656 		*c = (1 << ncpu) - 1;
657 
658 	error = copyout(data, SCARG(uap, mask), size);
659 	kmem_free(data, size);
660 
661 	*retval = size;
662 	return error;
663 
664 }
665 
666 int
667 linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
668 {
669 	/* {
670 		syscallarg(pid_t) pid;
671 		syscallarg(unsigned int) len;
672 		syscallarg(unsigned long *) mask;
673 	} */
674 	proc_t *p;
675 
676 	/* XXX: Pointless check.  TODO: Actually implement this. */
677 	mutex_enter(proc_lock);
678 	p = proc_find(SCARG(uap, pid));
679 	mutex_exit(proc_lock);
680 	if (p == NULL) {
681 		return ESRCH;
682 	}
683 
684 	/* Let's ignore it */
685 #ifdef DEBUG_LINUX
686 	printf("linux_sys_sched_setaffinity\n");
687 #endif
688 	return 0;
689 };
690 #endif /* LINUX_NPTL */
691