xref: /netbsd-src/sys/kern/sys_sched.c (revision f76667381c54a32e48cabd18c4681a437b432444)
1 /*	$NetBSD: sys_sched.c,v 1.39 2012/01/29 22:55:40 rmind Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * System calls relating to the scheduler.
31  *
32  * Lock order:
33  *
34  *	cpu_lock ->
35  *	    proc_lock ->
36  *		proc_t::p_lock ->
37  *		    lwp_t::lwp_lock
38  *
39  * TODO:
40  *  - Handle pthread_setschedprio() as defined by POSIX;
41  *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
42  */
43 
44 #include <sys/cdefs.h>
45 __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.39 2012/01/29 22:55:40 rmind Exp $");
46 
47 #include <sys/param.h>
48 
49 #include <sys/cpu.h>
50 #include <sys/kauth.h>
51 #include <sys/kmem.h>
52 #include <sys/lwp.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/pset.h>
56 #include <sys/sa.h>
57 #include <sys/savar.h>
58 #include <sys/sched.h>
59 #include <sys/syscallargs.h>
60 #include <sys/sysctl.h>
61 #include <sys/systm.h>
62 #include <sys/types.h>
63 #include <sys/unistd.h>
64 
65 #include "opt_sa.h"
66 
67 static struct sysctllog *sched_sysctl_log;
68 static kauth_listener_t sched_listener;
69 
70 /*
71  * Convert user priority or the in-kernel priority or convert the current
72  * priority to the appropriate range according to the policy change.
73  */
74 static pri_t
75 convert_pri(lwp_t *l, int policy, pri_t pri)
76 {
77 
78 	/* Convert user priority to the in-kernel */
79 	if (pri != PRI_NONE) {
80 		/* Only for real-time threads */
81 		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
82 		KASSERT(policy != SCHED_OTHER);
83 		return PRI_USER_RT + pri;
84 	}
85 
86 	/* Neither policy, nor priority change */
87 	if (l->l_class == policy)
88 		return l->l_priority;
89 
90 	/* Time-sharing -> real-time */
91 	if (l->l_class == SCHED_OTHER) {
92 		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
93 		return PRI_USER_RT;
94 	}
95 
96 	/* Real-time -> time-sharing */
97 	if (policy == SCHED_OTHER) {
98 		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
99 		return l->l_priority - PRI_USER_RT;
100 	}
101 
102 	/* Real-time -> real-time */
103 	return l->l_priority;
104 }
105 
/*
 * do_sched_setparam:
 *
 *	Set the scheduling class and/or priority for the LWPs of a process.
 *	If lid is zero, all LWPs of the process are affected, otherwise only
 *	the LWP with the matching ID.  If pid is zero, the calling process
 *	is used.  policy may be SCHED_NONE to keep each LWP's current class;
 *	params->sched_priority may be PRI_NONE to keep/convert the priority.
 *
 *	=> Returns 0 on success, otherwise an errno value (ESRCH if no
 *	   matching LWP was found).
 */
int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;		/* count of LWPs matched by lid */
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(proc_lock);
			return ESRCH;
		}
		/* Take p_lock before dropping proc_lock (see lock order) */
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		/* SCHED_NONE means "keep this LWP's current class" */
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	/* No LWP matched the given lid: report ESRCH */
	return (lcnt == 0) ? ESRCH : error;
}
193 
194 /*
195  * Set scheduling parameters.
196  */
197 int
198 sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
199     register_t *retval)
200 {
201 	/* {
202 		syscallarg(pid_t) pid;
203 		syscallarg(lwpid_t) lid;
204 		syscallarg(int) policy;
205 		syscallarg(const struct sched_param *) params;
206 	} */
207 	struct sched_param params;
208 	int error;
209 
210 	/* Get the parameters from the user-space */
211 	error = copyin(SCARG(uap, params), &params, sizeof(params));
212 	if (error)
213 		goto out;
214 
215 	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
216 	    SCARG(uap, policy), &params);
217 out:
218 	return error;
219 }
220 
/*
 * do_sched_getparam:
 *
 *	Fetch the scheduling class and user-visible priority of an LWP.
 *	Either of the result pointers (policy, params) may be NULL when
 *	the caller is not interested in that value.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	/* Look up the LWP; on success the owner's p_lock is held */
	t = lwp_find2(pid, lid);
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;

	/* Convert the in-kernel priority back to the user-visible range */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);
	return error;
}
266 
267 /*
268  * Get scheduling parameters.
269  */
270 int
271 sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
272     register_t *retval)
273 {
274 	/* {
275 		syscallarg(pid_t) pid;
276 		syscallarg(lwpid_t) lid;
277 		syscallarg(int *) policy;
278 		syscallarg(struct sched_param *) params;
279 	} */
280 	struct sched_param params;
281 	int error, policy;
282 
283 	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
284 	    &params);
285 	if (error)
286 		goto out;
287 
288 	error = copyout(&params, SCARG(uap, params), sizeof(params));
289 	if (error == 0 && SCARG(uap, policy) != NULL)
290 		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
291 out:
292 	return error;
293 }
294 
295 /*
296  * Allocate the CPU set, and get it from userspace.
297  */
298 static int
299 genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
300 {
301 	kcpuset_t *kset;
302 	int error;
303 
304 	kcpuset_create(&kset, false);
305 	error = kcpuset_copyin(sset, kset, size);
306 	if (error) {
307 		kcpuset_unuse(kset, NULL);
308 	} else {
309 		*dset = kset;
310 	}
311 	return error;
312 }
313 
314 /*
315  * Set affinity.
316  */
317 int
318 sys__sched_setaffinity(struct lwp *l,
319     const struct sys__sched_setaffinity_args *uap, register_t *retval)
320 {
321 	/* {
322 		syscallarg(pid_t) pid;
323 		syscallarg(lwpid_t) lid;
324 		syscallarg(size_t) size;
325 		syscallarg(const cpuset_t *) cpuset;
326 	} */
327 	kcpuset_t *kcset, *kcpulst = NULL;
328 	struct cpu_info *ici, *ci;
329 	struct proc *p;
330 	struct lwp *t;
331 	CPU_INFO_ITERATOR cii;
332 	bool alloff;
333 	lwpid_t lid;
334 	u_int lcnt;
335 	int error;
336 
337 	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
338 	if (error)
339 		return error;
340 
341 	/*
342 	 * Traverse _each_ CPU to:
343 	 *  - Check that CPUs in the mask have no assigned processor set.
344 	 *  - Check that at least one CPU from the mask is online.
345 	 *  - Find the first target CPU to migrate.
346 	 *
347 	 * To avoid the race with CPU online/offline calls and processor sets,
348 	 * cpu_lock will be locked for the entire operation.
349 	 */
350 	ci = NULL;
351 	alloff = false;
352 	mutex_enter(&cpu_lock);
353 	for (CPU_INFO_FOREACH(cii, ici)) {
354 		struct schedstate_percpu *ispc;
355 
356 		if (!kcpuset_isset(kcset, cpu_index(ici))) {
357 			continue;
358 		}
359 
360 		ispc = &ici->ci_schedstate;
361 		/* Check that CPU is not in the processor-set */
362 		if (ispc->spc_psid != PS_NONE) {
363 			error = EPERM;
364 			goto out;
365 		}
366 		/* Skip offline CPUs */
367 		if (ispc->spc_flags & SPCF_OFFLINE) {
368 			alloff = true;
369 			continue;
370 		}
371 		/* Target CPU to migrate */
372 		if (ci == NULL) {
373 			ci = ici;
374 		}
375 	}
376 	if (ci == NULL) {
377 		if (alloff) {
378 			/* All CPUs in the set are offline */
379 			error = EPERM;
380 			goto out;
381 		}
382 		/* Empty set */
383 		kcpuset_unuse(kcset, &kcpulst);
384 		kcset = NULL;
385 	}
386 
387 	if (SCARG(uap, pid) != 0) {
388 		/* Find the process */
389 		mutex_enter(proc_lock);
390 		p = proc_find(SCARG(uap, pid));
391 		if (p == NULL) {
392 			mutex_exit(proc_lock);
393 			error = ESRCH;
394 			goto out;
395 		}
396 		mutex_enter(p->p_lock);
397 		mutex_exit(proc_lock);
398 		/* Disallow modification of system processes. */
399 		if ((p->p_flag & PK_SYSTEM) != 0) {
400 			mutex_exit(p->p_lock);
401 			error = EPERM;
402 			goto out;
403 		}
404 	} else {
405 		/* Use the calling process */
406 		p = l->l_proc;
407 		mutex_enter(p->p_lock);
408 	}
409 
410 	/*
411 	 * Check the permission.
412 	 */
413 	error = kauth_authorize_process(l->l_cred,
414 	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
415 	if (error != 0) {
416 		mutex_exit(p->p_lock);
417 		goto out;
418 	}
419 
420 #ifdef KERN_SA
421 	/* Changing the affinity of a SA process is not supported */
422 	if ((p->p_sflag & (PS_SA | PS_WEXIT)) != 0 || p->p_sa != NULL) {
423 		mutex_exit(p->p_lock);
424 		error = EINVAL;
425 		goto out;
426 	}
427 #endif
428 
429 	/* Iterate through LWP(s). */
430 	lcnt = 0;
431 	lid = SCARG(uap, lid);
432 	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
433 		if (lid && lid != t->l_lid) {
434 			continue;
435 		}
436 		lwp_lock(t);
437 		/* No affinity for zombie LWPs. */
438 		if (t->l_stat == LSZOMB) {
439 			lwp_unlock(t);
440 			continue;
441 		}
442 		/* First, release existing affinity, if any. */
443 		if (t->l_affinity) {
444 			kcpuset_unuse(t->l_affinity, &kcpulst);
445 		}
446 		if (kcset) {
447 			/*
448 			 * Hold a reference on affinity mask, assign mask to
449 			 * LWP and migrate it to another CPU (unlocks LWP).
450 			 */
451 			kcpuset_use(kcset);
452 			t->l_affinity = kcset;
453 			lwp_migrate(t, ci);
454 		} else {
455 			/* Old affinity mask is released, just clear. */
456 			t->l_affinity = NULL;
457 			lwp_unlock(t);
458 		}
459 		lcnt++;
460 	}
461 	mutex_exit(p->p_lock);
462 	if (lcnt == 0) {
463 		error = ESRCH;
464 	}
465 out:
466 	mutex_exit(&cpu_lock);
467 
468 	/*
469 	 * Drop the initial reference (LWPs, if any, have the ownership now),
470 	 * and destroy whatever is in the G/C list, if filled.
471 	 */
472 	if (kcset) {
473 		kcpuset_unuse(kcset, &kcpulst);
474 	}
475 	if (kcpulst) {
476 		kcpuset_destroy(kcpulst);
477 	}
478 	return error;
479 }
480 
481 /*
482  * Get affinity.
483  */
484 int
485 sys__sched_getaffinity(struct lwp *l,
486     const struct sys__sched_getaffinity_args *uap, register_t *retval)
487 {
488 	/* {
489 		syscallarg(pid_t) pid;
490 		syscallarg(lwpid_t) lid;
491 		syscallarg(size_t) size;
492 		syscallarg(cpuset_t *) cpuset;
493 	} */
494 	struct lwp *t;
495 	kcpuset_t *kcset;
496 	int error;
497 
498 	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
499 	if (error)
500 		return error;
501 
502 	/* Locks the LWP */
503 	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
504 	if (t == NULL) {
505 		error = ESRCH;
506 		goto out;
507 	}
508 	/* Check the permission */
509 	if (kauth_authorize_process(l->l_cred,
510 	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
511 		mutex_exit(t->l_proc->p_lock);
512 		error = EPERM;
513 		goto out;
514 	}
515 	lwp_lock(t);
516 	if (t->l_affinity) {
517 		kcpuset_copy(kcset, t->l_affinity);
518 	} else {
519 		kcpuset_zero(kcset);
520 	}
521 	lwp_unlock(t);
522 	mutex_exit(t->l_proc->p_lock);
523 
524 	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
525 out:
526 	kcpuset_unuse(kcset, NULL);
527 	return error;
528 }
529 
530 /*
531  * Yield.
532  */
533 int
534 sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
535 {
536 
537 	yield();
538 #ifdef KERN_SA
539 	if (l->l_flag & LW_SA) {
540 		sa_preempt(l);
541 	}
542 #endif
543 	return 0;
544 }
545 
546 /*
547  * Sysctl nodes and initialization.
548  */
549 static void
550 sysctl_sched_setup(struct sysctllog **clog)
551 {
552 	const struct sysctlnode *node = NULL;
553 
554 	sysctl_createv(clog, 0, NULL, NULL,
555 		CTLFLAG_PERMANENT,
556 		CTLTYPE_NODE, "kern", NULL,
557 		NULL, 0, NULL, 0,
558 		CTL_KERN, CTL_EOL);
559 	sysctl_createv(clog, 0, NULL, NULL,
560 		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
561 		CTLTYPE_INT, "posix_sched",
562 		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
563 			     "Process Scheduling option to which the "
564 			     "system attempts to conform"),
565 		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
566 		CTL_KERN, CTL_CREATE, CTL_EOL);
567 	sysctl_createv(clog, 0, NULL, &node,
568 		CTLFLAG_PERMANENT,
569 		CTLTYPE_NODE, "sched",
570 		SYSCTL_DESCR("Scheduler options"),
571 		NULL, 0, NULL, 0,
572 		CTL_KERN, CTL_CREATE, CTL_EOL);
573 
574 	if (node == NULL)
575 		return;
576 
577 	sysctl_createv(clog, 0, &node, NULL,
578 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
579 		CTLTYPE_INT, "pri_min",
580 		SYSCTL_DESCR("Minimal POSIX real-time priority"),
581 		NULL, SCHED_PRI_MIN, NULL, 0,
582 		CTL_CREATE, CTL_EOL);
583 	sysctl_createv(clog, 0, &node, NULL,
584 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
585 		CTLTYPE_INT, "pri_max",
586 		SYSCTL_DESCR("Maximal POSIX real-time priority"),
587 		NULL, SCHED_PRI_MAX, NULL, 0,
588 		CTL_CREATE, CTL_EOL);
589 }
590 
/*
 * sched_listener_cb:
 *
 *	kauth(9) process-scope listener for scheduler actions.  Allows
 *	unprivileged requests where appropriate; otherwise defers the
 *	decision to the security model.
 */
static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	switch (action) {
	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
		/* Reading parameters: allowed for same-uid processes */
		if (kauth_cred_uidmatch(cred, p->p_cred))
			result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred)) {
			struct lwp *l;
			int policy;
			pri_t priority;

			l = arg1;
			policy = (int)(unsigned long)arg2;
			priority = (pri_t)(unsigned long)arg3;

			/*
			 * Allow a same-uid caller only if it is neither
			 * switching the LWP into a different real-time
			 * class nor raising the priority; anything else
			 * is deferred to the security model.
			 */
			if ((policy == l->l_class ||
			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
			    priority <= l->l_priority)
				result = KAUTH_RESULT_ALLOW;
		}

		break;

	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
		/* Reading the affinity mask is always permitted */
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
		/* Privileged; we let the secmodel handle this. */
		break;

	default:
		break;
	}

	return result;
}
639 
/*
 * sched_init:
 *
 *	Module initialization: create the sysctl nodes and register the
 *	kauth listener for scheduler-related process actions.
 */
void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}
649