xref: /netbsd-src/sys/kern/sys_sched.c (revision 313c6c94c424eed90c7b7e494aa83308a0a5d0ce)
1 /*	$NetBSD: sys_sched.c,v 1.33 2009/03/03 21:55:06 rmind Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * System calls relating to the scheduler.
31  *
32  * Lock order:
33  *
34  *	cpu_lock ->
35  *	    proc_lock ->
36  *		proc_t::p_lock ->
37  *		    lwp_t::lwp_lock
38  *
39  * TODO:
40  *  - Handle pthread_setschedprio() as defined by POSIX;
41  *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
42  */
43 
44 #include <sys/cdefs.h>
45 __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.33 2009/03/03 21:55:06 rmind Exp $");
46 
47 #include <sys/param.h>
48 
49 #include <sys/cpu.h>
50 #include <sys/kauth.h>
51 #include <sys/kmem.h>
52 #include <sys/lwp.h>
53 #include <sys/mutex.h>
54 #include <sys/proc.h>
55 #include <sys/pset.h>
56 #include <sys/sa.h>
57 #include <sys/savar.h>
58 #include <sys/sched.h>
59 #include <sys/syscallargs.h>
60 #include <sys/sysctl.h>
61 #include <sys/systm.h>
62 #include <sys/types.h>
63 #include <sys/unistd.h>
64 
65 #include "opt_sa.h"
66 
67 /*
68  * Convert user priority or the in-kernel priority or convert the current
69  * priority to the appropriate range according to the policy change.
70  */
71 static pri_t
72 convert_pri(lwp_t *l, int policy, pri_t pri)
73 {
74 
75 	/* Convert user priority to the in-kernel */
76 	if (pri != PRI_NONE) {
77 		/* Only for real-time threads */
78 		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
79 		KASSERT(policy != SCHED_OTHER);
80 		return PRI_USER_RT + pri;
81 	}
82 
83 	/* Neither policy, nor priority change */
84 	if (l->l_class == policy)
85 		return l->l_priority;
86 
87 	/* Time-sharing -> real-time */
88 	if (l->l_class == SCHED_OTHER) {
89 		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
90 		return PRI_USER_RT;
91 	}
92 
93 	/* Real-time -> time-sharing */
94 	if (policy == SCHED_OTHER) {
95 		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
96 		return l->l_priority - PRI_USER_RT;
97 	}
98 
99 	/* Real-time -> real-time */
100 	return l->l_priority;
101 }
102 
/*
 * do_sched_setparam:
 *
 *	Common guts of the setparam syscall: set the scheduling class
 *	and/or user priority for the LWPs of process 'pid' (0 means the
 *	calling process).  A non-zero 'lid' restricts the change to the
 *	LWP with that ID, otherwise every LWP of the process is changed.
 *	policy == SCHED_NONE keeps each LWP's current class; a priority
 *	of PRI_NONE keeps/derives the priority (see convert_pri()).
 *
 *	Returns 0 on success; EINVAL for a bad class/priority combination,
 *	ESRCH if no process/LWP matched, EPERM for system processes, or
 *	a kauth(9) authorization error.
 */
103 int
104 do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
105     const struct sched_param *params)
106 {
107 	struct proc *p;
108 	struct lwp *t;
109 	pri_t pri;
110 	u_int lcnt;
111 	int error;
112 
113 	error = 0;
114 
115 	pri = params->sched_priority;
116 
117 	/* If no parameters specified, just return (this should not happen) */
118 	if (pri == PRI_NONE && policy == SCHED_NONE)
119 		return 0;
120 
121 	/* Validate scheduling class */
122 	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
123 		return EINVAL;
124 
125 	/* Validate priority */
126 	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
127 		return EINVAL;
128 
129 	if (pid != 0) {
130 		/* Find the process */
131 		mutex_enter(proc_lock);
132 		p = p_find(pid, PFIND_LOCKED);
133 		if (p == NULL) {
134 			mutex_exit(proc_lock);
135 			return ESRCH;
136 		}
		/*
		 * Take p->p_lock before dropping proc_lock (per the lock
		 * order in the file header) so 'p' stays valid.
		 */
137 		mutex_enter(p->p_lock);
138 		mutex_exit(proc_lock);
139 		/* Disallow modification of system processes */
140 		if ((p->p_flag & PK_SYSTEM) != 0) {
141 			mutex_exit(p->p_lock);
142 			return EPERM;
143 		}
144 	} else {
145 		/* Use the calling process */
146 		p = curlwp->l_proc;
147 		mutex_enter(p->p_lock);
148 	}
149 
150 	/* Find the LWP(s) */
151 	lcnt = 0;
152 	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
153 		pri_t kpri;
154 		int lpolicy;
155 
156 		if (lid && lid != t->l_lid)
157 			continue;
158 
159 		lcnt++;
160 		lwp_lock(t);
161 		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;
162 
163 		/* Disallow setting of priority for SCHED_OTHER threads */
164 		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
165 			lwp_unlock(t);
166 			error = EINVAL;
167 			break;
168 		}
169 
170 		/* Convert priority, if needed */
171 		kpri = convert_pri(t, lpolicy, pri);
172 
173 		/* Check the permission */
174 		error = kauth_authorize_process(kauth_cred_get(),
175 		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
176 		    KAUTH_ARG(kpri));
177 		if (error) {
			/*
			 * NOTE(review): breaking out mid-loop leaves LWPs
			 * visited earlier already modified (partial update).
			 */
178 			lwp_unlock(t);
179 			break;
180 		}
181 
182 		/* Set the scheduling class, change the priority */
183 		t->l_class = lpolicy;
184 		lwp_changepri(t, kpri);
185 		lwp_unlock(t);
186 	}
187 	mutex_exit(p->p_lock);
	/* ESRCH if no LWP matched; otherwise the first error hit (or 0). */
188 	return (lcnt == 0) ? ESRCH : error;
189 }
190 
191 /*
192  * Set scheduling parameters.
193  */
194 int
195 sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
196     register_t *retval)
197 {
198 	/* {
199 		syscallarg(pid_t) pid;
200 		syscallarg(lwpid_t) lid;
201 		syscallarg(int) policy;
202 		syscallarg(const struct sched_param *) params;
203 	} */
204 	struct sched_param params;
205 	int error;
206 
207 	/* Get the parameters from the user-space */
208 	error = copyin(SCARG(uap, params), &params, sizeof(params));
209 	if (error)
210 		goto out;
211 
212 	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
213 	    SCARG(uap, policy), &params);
214 out:
215 	return error;
216 }
217 
/*
 * do_sched_getparam:
 *
 *	Fetch the scheduling class and priority of the LWP identified by
 *	'pid'/'lid' into *policy and *params (either pointer may be NULL).
 *	The in-kernel priority is converted back to a user-visible value
 *	for the LWP's class.  Returns 0 on success, ESRCH if the LWP was
 *	not found, or a kauth(9) authorization error.
 */
218 int
219 do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
220     struct sched_param *params)
221 {
222 	struct sched_param lparams;
223 	struct lwp *t;
224 	int error, lpolicy;
225 
226 	/* Locks the LWP */
227 	t = lwp_find2(pid, lid);
228 	if (t == NULL)
229 		return ESRCH;
230 
231 	/* Check the permission */
232 	error = kauth_authorize_process(kauth_cred_get(),
233 	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
234 	if (error != 0) {
		/* Drop the process lock taken by lwp_find2() on this path too. */
235 		mutex_exit(t->l_proc->p_lock);
236 		return error;
237 	}
238 
239 	lwp_lock(t);
240 	lparams.sched_priority = t->l_priority;
241 	lpolicy = t->l_class;
242 
	/* Convert the in-kernel priority back to the user-visible range. */
243 	switch (lpolicy) {
244 	case SCHED_OTHER:
		/*
		 * NOTE(review): time-sharing priority is reported relative
		 * to PRI_USER -- verify this matches the range userland
		 * expects for SCHED_OTHER threads.
		 */
245 		lparams.sched_priority -= PRI_USER;
246 		break;
247 	case SCHED_RR:
248 	case SCHED_FIFO:
249 		lparams.sched_priority -= PRI_USER_RT;
250 		break;
251 	}
252 
253 	if (policy != NULL)
254 		*policy = lpolicy;
255 
256 	if (params != NULL)
257 		*params = lparams;
258 
259 	lwp_unlock(t);
260 	mutex_exit(t->l_proc->p_lock);
261 	return error;
262 }
263 
264 /*
265  * Get scheduling parameters.
266  */
267 int
268 sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
269     register_t *retval)
270 {
271 	/* {
272 		syscallarg(pid_t) pid;
273 		syscallarg(lwpid_t) lid;
274 		syscallarg(int *) policy;
275 		syscallarg(struct sched_param *) params;
276 	} */
277 	struct sched_param params;
278 	int error, policy;
279 
280 	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
281 	    &params);
282 	if (error)
283 		goto out;
284 
285 	error = copyout(&params, SCARG(uap, params), sizeof(params));
286 	if (error == 0 && SCARG(uap, policy) != NULL)
287 		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
288 out:
289 	return error;
290 }
291 
292 /*
293  * Allocate the CPU set, and get it from userspace.
294  */
295 static int
296 genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
297 {
298 	int error;
299 
300 	*dset = kcpuset_create();
301 	error = kcpuset_copyin(sset, *dset, size);
302 	if (error != 0)
303 		kcpuset_unuse(*dset, NULL);
304 	return error;
305 }
306 
307 /*
308  * Set affinity.
309  */
/*
 * sys__sched_setaffinity:
 *
 *	Set the CPU affinity of the LWP(s) of process 'pid' (0 means the
 *	calling process; a non-zero 'lid' selects a single LWP).  A
 *	non-empty set pins the LWPs to the CPUs in the set and migrates
 *	them to the first online CPU found in it; an empty set clears
 *	the affinity.  Fails with EPERM if any CPU in the set belongs to
 *	a processor-set or if every CPU in the set is offline.
 */
310 int
311 sys__sched_setaffinity(struct lwp *l,
312     const struct sys__sched_setaffinity_args *uap, register_t *retval)
313 {
314 	/* {
315 		syscallarg(pid_t) pid;
316 		syscallarg(lwpid_t) lid;
317 		syscallarg(size_t) size;
318 		syscallarg(const cpuset_t *) cpuset;
319 	} */
	/*
	 * 'cpulst' collects kcpusets released via kcpuset_unuse() below;
	 * they are destroyed in one go after the locks are dropped.
	 */
320 	kcpuset_t *cpuset, *cpulst = NULL;
321 	struct cpu_info *ici, *ci;
322 	struct proc *p;
323 	struct lwp *t;
324 	CPU_INFO_ITERATOR cii;
325 	bool alloff;
326 	lwpid_t lid;
327 	u_int lcnt;
328 	int error;
329 
330 	error = genkcpuset(&cpuset, SCARG(uap, cpuset), SCARG(uap, size));
331 	if (error)
332 		return error;
333 
334 	/*
335 	 * Traverse _each_ CPU to:
336 	 *  - Check that CPUs in the mask have no assigned processor set.
337 	 *  - Check that at least one CPU from the mask is online.
338 	 *  - Find the first target CPU to migrate.
339 	 *
340 	 * To avoid the race with CPU online/offline calls and processor sets,
341 	 * cpu_lock will be locked for the entire operation.
342 	 */
343 	ci = NULL;
344 	alloff = false;
345 	mutex_enter(&cpu_lock);
346 	for (CPU_INFO_FOREACH(cii, ici)) {
347 		struct schedstate_percpu *ispc;
348 
349 		if (kcpuset_isset(cpu_index(ici), cpuset) == 0)
350 			continue;
351 
352 		ispc = &ici->ci_schedstate;
353 		/* Check that CPU is not in the processor-set */
354 		if (ispc->spc_psid != PS_NONE) {
355 			error = EPERM;
356 			goto out;
357 		}
358 		/* Skip offline CPUs */
359 		if (ispc->spc_flags & SPCF_OFFLINE) {
360 			alloff = true;
361 			continue;
362 		}
363 		/* Target CPU to migrate */
364 		if (ci == NULL) {
365 			ci = ici;
366 		}
367 	}
368 	if (ci == NULL) {
369 		if (alloff) {
370 			/* All CPUs in the set are offline */
371 			error = EPERM;
372 			goto out;
373 		}
		/*
		 * Empty set: drop our reference now; cpuset == NULL below
		 * selects the "clear affinity" path in the LWP loop.
		 */
374 		/* Empty set */
375 		kcpuset_unuse(cpuset, &cpulst);
376 		cpuset = NULL;
377 	}
378 
379 	if (SCARG(uap, pid) != 0) {
380 		/* Find the process */
381 		mutex_enter(proc_lock);
382 		p = p_find(SCARG(uap, pid), PFIND_LOCKED);
383 		if (p == NULL) {
384 			mutex_exit(proc_lock);
385 			error = ESRCH;
386 			goto out;
387 		}
		/* Take p_lock before dropping proc_lock (lock order above). */
388 		mutex_enter(p->p_lock);
389 		mutex_exit(proc_lock);
390 		/* Disallow modification of system processes. */
391 		if ((p->p_flag & PK_SYSTEM) != 0) {
392 			mutex_exit(p->p_lock);
393 			error = EPERM;
394 			goto out;
395 		}
396 	} else {
397 		/* Use the calling process */
398 		p = l->l_proc;
399 		mutex_enter(p->p_lock);
400 	}
401 
402 	/*
403 	 * Check the permission.
404 	 */
405 	error = kauth_authorize_process(l->l_cred,
406 	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
407 	if (error != 0) {
408 		mutex_exit(p->p_lock);
409 		goto out;
410 	}
411 
412 #ifdef KERN_SA
413 	/* Changing the affinity of a SA process is not supported */
414 	if ((p->p_sflag & (PS_SA | PS_WEXIT)) != 0 || p->p_sa != NULL) {
415 		mutex_exit(p->p_lock);
416 		error = EINVAL;
417 		goto out;
418 	}
419 #endif
420 
421 	/* Find the LWP(s) */
422 	lcnt = 0;
423 	lid = SCARG(uap, lid);
424 	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
425 		if (lid && lid != t->l_lid)
426 			continue;
427 		lwp_lock(t);
428 		/* It is not allowed to set the affinity for zombie LWPs */
429 		if (t->l_stat == LSZOMB) {
430 			lwp_unlock(t);
431 			continue;
432 		}
433 		if (cpuset) {
			/*
			 * Gain a reference for this LWP, release the old
			 * set (if any), and move the LWP to a CPU that is
			 * in the new set.
			 */
434 			/* Set the affinity flag and new CPU set */
435 			t->l_flag |= LW_AFFINITY;
436 			kcpuset_use(cpuset);
437 			if (t->l_affinity != NULL)
438 				kcpuset_unuse(t->l_affinity, &cpulst);
439 			t->l_affinity = cpuset;
440 			/* Migrate to another CPU, unlocks LWP */
441 			lwp_migrate(t, ci);
442 		} else {
443 			/* Unset the affinity flag */
444 			t->l_flag &= ~LW_AFFINITY;
445 			if (t->l_affinity != NULL)
446 				kcpuset_unuse(t->l_affinity, &cpulst);
447 			t->l_affinity = NULL;
448 			lwp_unlock(t);
449 		}
450 		lcnt++;
451 	}
452 	mutex_exit(p->p_lock);
453 	if (lcnt == 0)
454 		error = ESRCH;
455 out:
	/* cpu_lock has been held since the CPU scan; every path exits here. */
456 	mutex_exit(&cpu_lock);
457 	if (cpuset != NULL)
458 		kcpuset_unuse(cpuset, &cpulst);
	/*
	 * NOTE(review): kcpuset_unuse() appears to chain sets whose last
	 * reference was dropped onto 'cpulst' for the deferred destruction
	 * here -- confirm against the kcpuset API.
	 */
459 	kcpuset_destroy(cpulst);
460 	return error;
461 }
462 
463 /*
464  * Get affinity.
465  */
/*
 * sys__sched_getaffinity:
 *
 *	Copy the CPU affinity mask of the LWP identified by pid/lid out
 *	to user-space; an LWP without affinity yields an all-zero mask.
 *	Returns 0, ESRCH if the LWP was not found, EPERM if kauth denies
 *	access, or a copy-in/out error from the cpuset transfer.
 */
466 int
467 sys__sched_getaffinity(struct lwp *l,
468     const struct sys__sched_getaffinity_args *uap, register_t *retval)
469 {
470 	/* {
471 		syscallarg(pid_t) pid;
472 		syscallarg(lwpid_t) lid;
473 		syscallarg(size_t) size;
474 		syscallarg(cpuset_t *) cpuset;
475 	} */
476 	struct lwp *t;
477 	kcpuset_t *cpuset;
478 	int error;
479 
	/* On failure genkcpuset() has already dropped the set; just return. */
480 	error = genkcpuset(&cpuset, SCARG(uap, cpuset), SCARG(uap, size));
481 	if (error)
482 		return error;
483 
484 	/* Locks the LWP */
485 	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
486 	if (t == NULL) {
487 		error = ESRCH;
488 		goto out;
489 	}
	/* Check the permission; any kauth failure is reported as EPERM. */
490 	/* Check the permission */
491 	if (kauth_authorize_process(l->l_cred,
492 	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
493 		mutex_exit(t->l_proc->p_lock);
494 		error = EPERM;
495 		goto out;
496 	}
497 	lwp_lock(t);
498 	if (t->l_flag & LW_AFFINITY) {
499 		KASSERT(t->l_affinity != NULL);
500 		kcpuset_copy(cpuset, t->l_affinity);
501 	} else
502 		kcpuset_zero(cpuset);
503 	lwp_unlock(t);
504 	mutex_exit(t->l_proc->p_lock);
505 
506 	error = kcpuset_copyout(cpuset, SCARG(uap, cpuset), SCARG(uap, size));
507 out:
	/* Drop the temporary set created by genkcpuset() on every path. */
508 	kcpuset_unuse(cpuset, NULL);
509 	return error;
510 }
511 
512 /*
513  * Yield.
514  */
/*
 * sys_sched_yield:
 *
 *	Voluntarily give up the CPU.  The POSIX sched_yield() semantics
 *	for SCHED_FIFO are still a TODO (see the file header).
 */
515 int
516 sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
517 {
518 
519 	yield();
520 #ifdef KERN_SA
	/* Scheduler activations: let the SA run-time see the yield. */
521 	if (l->l_flag & LW_SA) {
522 		sa_preempt(l);
523 	}
524 #endif
525 	return 0;
526 }
527 
528 /*
529  * Sysctl nodes and initialization.
530  */
/*
 * sysctl_sched_setup:
 *
 *	Create the scheduler-related sysctl nodes:
 *	kern.posix_sched, kern.sched, kern.sched.pri_min and
 *	kern.sched.pri_max.
 */
531 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
532 {
533 	const struct sysctlnode *node = NULL;
534 
	/* Ensure the top-level "kern" node exists. */
535 	sysctl_createv(clog, 0, NULL, NULL,
536 		CTLFLAG_PERMANENT,
537 		CTLTYPE_NODE, "kern", NULL,
538 		NULL, 0, NULL, 0,
539 		CTL_KERN, CTL_EOL);
	/* kern.posix_sched: advertised POSIX Process Scheduling version. */
540 	sysctl_createv(clog, 0, NULL, NULL,
541 		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
542 		CTLTYPE_INT, "posix_sched",
543 		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
544 			     "Process Scheduling option to which the "
545 			     "system attempts to conform"),
546 		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
547 		CTL_KERN, CTL_CREATE, CTL_EOL);
	/* kern.sched: parent node for the scheduler options below. */
548 	sysctl_createv(clog, 0, NULL, &node,
549 		CTLFLAG_PERMANENT,
550 		CTLTYPE_NODE, "sched",
551 		SYSCTL_DESCR("Scheduler options"),
552 		NULL, 0, NULL, 0,
553 		CTL_KERN, CTL_CREATE, CTL_EOL);
554 
	/* Without the parent node the children cannot be created. */
555 	if (node == NULL)
556 		return;
557 
	/* kern.sched.pri_min / pri_max: POSIX real-time priority range. */
558 	sysctl_createv(clog, 0, &node, NULL,
559 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
560 		CTLTYPE_INT, "pri_min",
561 		SYSCTL_DESCR("Minimal POSIX real-time priority"),
562 		NULL, SCHED_PRI_MIN, NULL, 0,
563 		CTL_CREATE, CTL_EOL);
564 	sysctl_createv(clog, 0, &node, NULL,
565 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
566 		CTLTYPE_INT, "pri_max",
567 		SYSCTL_DESCR("Maximal POSIX real-time priority"),
568 		NULL, SCHED_PRI_MAX, NULL, 0,
569 		CTL_CREATE, CTL_EOL);
570 }
571