/*	$NetBSD: sys_sched.c,v 1.47 2020/01/27 22:05:10 ad Exp $	*/

/*
 * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * System calls relating to the scheduler.
 *
 * Lock order:
 *
 *	cpu_lock ->
 *	    proc_lock ->
 *		proc_t::p_lock ->
 *		    lwp_t::lwp_lock
 *
 * TODO:
 *  - Handle pthread_setschedprio() as defined by POSIX.
 */
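
/*
 * A sketch of the acquire/release pattern the above order implies, as
 * used by the setparam/setaffinity paths below.  An inner lock is
 * taken before the outer one is dropped, so the target cannot
 * disappear in between:
 *
 *	mutex_enter(proc_lock);
 *	p = proc_find(pid);
 *	mutex_enter(p->p_lock);
 *	mutex_exit(proc_lock);
 *	lwp_lock(t);
 *	...
 *	lwp_unlock(t);
 *	mutex_exit(p->p_lock);
 */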

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.47 2020/01/27 22:05:10 ad Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/unistd.h>

static struct sysctllog *sched_sysctl_log;
static kauth_listener_t sched_listener;

/*
 * Convert a user priority to the in-kernel range, or convert the
 * current priority to the range appropriate for the new policy.
 */
static pri_t
convert_pri(lwp_t *l, int policy, pri_t pri)
{

	/* Convert user priority to the in-kernel range */
	if (pri != PRI_NONE) {
		/* Only for real-time threads */
		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
		KASSERT(policy != SCHED_OTHER);
		return PRI_USER_RT + pri;
	}

	/* Neither policy nor priority changes */
	if (l->l_class == policy)
		return l->l_priority;

	/* Time-sharing -> real-time */
	if (l->l_class == SCHED_OTHER) {
		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
		return PRI_USER_RT;
	}

	/* Real-time -> time-sharing */
	if (policy == SCHED_OTHER) {
		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
		/*
		 * This is a bit arbitrary, because the priority is dynamic
		 * for SCHED_OTHER threads and will likely be changed by
		 * the scheduler soon anyway.
		 */
		return l->l_priority - PRI_USER_RT;
	}

	/* Real-time -> real-time */
	return l->l_priority;
}
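
/*
 * A worked example of the mapping (a sketch; the exact values depend
 * on the SCHED_PRI_MIN/SCHED_PRI_MAX and PRI_USER_RT constants):
 *
 *	convert_pri(l, SCHED_FIFO, 5)
 *		-> PRI_USER_RT + 5		(explicit RT priority)
 *	convert_pri(l, SCHED_RR, PRI_NONE), l in SCHED_OTHER
 *		-> PRI_USER_RT			(enter RT range at its base)
 *	convert_pri(l, SCHED_OTHER, PRI_NONE), l in SCHED_FIFO
 *		-> l->l_priority - PRI_USER_RT	(back to time-sharing)
 */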

int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(proc_lock);
			return ESRCH;
		}
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	return (lcnt == 0) ? ESRCH : error;
}

/*
 * Set scheduling parameters.
 */
int
sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int) policy;
		syscallarg(const struct sched_param *) params;
	} */
	struct sched_param params;
	int error;

	/* Get the parameters from the user-space */
	error = copyin(SCARG(uap, params), &params, sizeof(params));
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
	    SCARG(uap, policy), &params);
out:
	return error;
}
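
/*
 * A hypothetical userland sketch (not part of this file): the POSIX
 * wrappers are built on this syscall, so setting a thread's class and
 * priority would look roughly like:
 *
 *	#include <sched.h>
 *
 *	struct sched_param sp = {
 *		.sched_priority = sched_get_priority_min(SCHED_RR)
 *	};
 *	if (sched_setscheduler(0, SCHED_RR, &sp) == -1)
 *		err(EXIT_FAILURE, "sched_setscheduler");
 *
 * Entering a real-time class is privileged by default; see
 * sched_listener_cb() at the bottom of this file.
 */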

/*
 * do_sched_getparam:
 *
 * If lid == 0, return the parameters of the first LWP in the process.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	t = lwp_find2(pid, lid); /* acquire p_lock */
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	/*
	 * Convert to the user-visible priority value.
	 * It is an inversion of convert_pri().
	 *
	 * The SCHED_OTHER case is a bit arbitrary given that
	 *	- we do not allow setting the priority, and
	 *	- the priority is dynamic.
	 */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	return error;
}

/*
 * Get scheduling parameters.
 */
int
sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int *) policy;
		syscallarg(struct sched_param *) params;
	} */
	struct sched_param params;
	int error, policy;

	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
	    &params);
	if (error)
		goto out;

	error = copyout(&params, SCARG(uap, params), sizeof(params));
	if (error == 0 && SCARG(uap, policy) != NULL)
		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
out:
	return error;
}
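
/*
 * The read side, likewise a hypothetical sketch: sched_getparam() is
 * expected to land in do_sched_getparam() above and returns the
 * user-visible (converted) priority:
 *
 *	struct sched_param sp;
 *	if (sched_getparam(0, &sp) == -1)
 *		err(EXIT_FAILURE, "sched_getparam");
 *	printf("priority: %d\n", sp.sched_priority);
 */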

/*
 * Allocate a kernel CPU set and fill it in from the given user-space
 * set.  On success, the caller owns the single reference to the new
 * set; on error, that reference has already been dropped.
 */
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
	kcpuset_t *kset;
	int error;

	kcpuset_create(&kset, true);
	error = kcpuset_copyin(sset, kset, size);
	if (error) {
		kcpuset_unuse(kset, NULL);
	} else {
		*dset = kset;
	}
	return error;
}

/*
 * Set affinity.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *kcset, *kcpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate.
	 *
	 * To avoid the race with CPU online/offline calls and processor sets,
	 * cpu_lock will be locked for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		if (!kcpuset_isset(kcset, cpu_index(ici))) {
			continue;
		}

		ispc = &ici->ci_schedstate;
		/* Check that CPU is not in the processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/* Empty set */
		kcpuset_unuse(kcset, &kcpulst);
		kcset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = proc_find(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(proc_lock);
			error = ESRCH;
			goto out;
		}
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

	/* Iterate through LWP(s). */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid) {
			continue;
		}
		lwp_lock(t);
		/* No affinity for zombie LWPs. */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		/* First, release existing affinity, if any. */
		if (t->l_affinity) {
			kcpuset_unuse(t->l_affinity, &kcpulst);
		}
		if (kcset) {
			/*
			 * Hold a reference on affinity mask, assign mask to
			 * LWP and migrate it to another CPU (unlocks LWP).
			 */
			kcpuset_use(kcset);
			t->l_affinity = kcset;
			lwp_migrate(t, ci);
		} else {
			/* Old affinity mask is released, just clear. */
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0) {
		error = ESRCH;
	}
out:
	mutex_exit(&cpu_lock);

	/*
	 * Drop the initial reference (LWPs, if any, have the ownership now),
	 * and destroy whatever is in the G/C list, if filled.
	 */
	if (kcset) {
		kcpuset_unuse(kcset, &kcpulst);
	}
	if (kcpulst) {
		kcpuset_destroy(kcpulst);
	}
	return error;
}
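
/*
 * A hypothetical userland sketch using the native affinity(3) and
 * cpuset(3) interfaces, which wrap this syscall.  Setting affinity is
 * privileged by default (see sched_listener_cb() below).  Here the set
 * is reduced to CPU 0:
 *
 *	#include <sched.h>
 *
 *	cpuset_t *cs = cpuset_create();
 *	if (cs == NULL)
 *		err(EXIT_FAILURE, "cpuset_create");
 *	cpuset_zero(cs);
 *	cpuset_set(0, cs);
 *	if (sched_setaffinity_np(getpid(), cpuset_size(cs), cs) == -1)
 *		err(EXIT_FAILURE, "sched_setaffinity_np");
 *	cpuset_destroy(cs);
 */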

/*
 * Get affinity.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	lwp_lock(t);
	if (t->l_affinity) {
		kcpuset_copy(kcset, t->l_affinity);
	} else {
		kcpuset_zero(kcset);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
out:
	kcpuset_unuse(kcset, NULL);
	return error;
}
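
/*
 * Reading it back, likewise a hypothetical sketch; an all-zero set
 * means the LWP has no affinity assigned:
 *
 *	cpuset_t *cs = cpuset_create();
 *	if (sched_getaffinity_np(getpid(), cpuset_size(cs), cs) == -1)
 *		err(EXIT_FAILURE, "sched_getaffinity_np");
 *	if (cpuset_isset(0, cs))
 *		printf("affinity includes CPU 0\n");
 *	cpuset_destroy(cs);
 */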

/*
 * Priority protection for PTHREAD_PRIO_PROTECT.  This is a weak
 * analogue of priority inheritance: temporarily raise the priority
 * of the caller while it is accessing a protected resource.
 */
int
sys__sched_protect(struct lwp *l,
    const struct sys__sched_protect_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) priority;
		syscallarg(int *) opriority;
	} */
	int error;
	pri_t pri;

	KASSERT(l->l_inheritedprio == -1);
	KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

	pri = SCARG(uap, priority);
	error = 0;
	lwp_lock(l);
	if (pri == -1) {
		/* Back out priority changes */
		switch (l->l_protectdepth) {
		case 0:
			error = EINVAL;
			break;
		case 1:
			l->l_protectdepth = 0;
			l->l_protectprio = -1;
			l->l_auxprio = -1;
			break;
		default:
			l->l_protectdepth--;
			break;
		}
	} else if (pri < 0) {
		/* Just retrieve the current value, for debugging */
		if (l->l_protectprio == -1)
			error = ENOENT;
		else
			*retval = l->l_protectprio - PRI_USER_RT;
	} else if (__predict_false(pri < SCHED_PRI_MIN ||
	    pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
		/* Must fail if the existing priority is higher */
		error = EPERM;
	} else {
		/* Play along, but make no changes, if not a real-time LWP */
		l->l_protectdepth++;
		pri += PRI_USER_RT;
		if (__predict_true(l->l_class != SCHED_OTHER &&
		    pri > l->l_protectprio)) {
			l->l_protectprio = pri;
			l->l_auxprio = pri;
		}
	}
	lwp_unlock(l);

	return error;
}
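
/*
 * Userland view, a hypothetical sketch: this syscall backs
 * PTHREAD_PRIO_PROTECT mutexes, so it is normally reached through the
 * pthread(3) priority-ceiling attributes rather than directly:
 *
 *	pthread_mutexattr_t attr;
 *	pthread_mutexattr_init(&attr);
 *	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
 *	pthread_mutexattr_setprioceiling(&attr, ceiling);
 *	pthread_mutex_init(&mtx, &attr);
 *
 * Locking such a mutex temporarily raises the owner to the ceiling;
 * unlocking backs the change out (the pri == -1 case above).
 */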

/*
 * Yield.
 */
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}
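
/*
 * Userland sketch: this is plain POSIX sched_yield(3), which simply
 * gives the CPU up to another runnable LWP:
 *
 *	#include <sched.h>
 *
 *	sched_yield();
 */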

/*
 * Sysctl nodes and initialization.
 */
static void
sysctl_sched_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_sched",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Process Scheduling option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_min",
		SYSCTL_DESCR("Minimal POSIX real-time priority"),
		NULL, SCHED_PRI_MIN, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_max",
		SYSCTL_DESCR("Maximal POSIX real-time priority"),
		NULL, SCHED_PRI_MAX, NULL, 0,
		CTL_CREATE, CTL_EOL);
}
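
/*
 * These nodes are visible from userland; a sketch of querying them:
 *
 *	int pri_min;
 *	size_t len = sizeof(pri_min);
 *	if (sysctlbyname("kern.sched.pri_min", &pri_min, &len, NULL, 0) == -1)
 *		err(EXIT_FAILURE, "sysctlbyname");
 *
 * or, from the shell:
 *
 *	sysctl kern.posix_sched kern.sched.pri_min kern.sched.pri_max
 */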

static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	switch (action) {
	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred))
			result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred)) {
			struct lwp *l;
			int policy;
			pri_t priority;

			l = arg1;
			policy = (int)(unsigned long)arg2;
			priority = (pri_t)(unsigned long)arg3;

			/*
			 * Unprivileged users may keep the current policy
			 * or leave the real-time classes, and may only
			 * keep or lower the priority; anything else is
			 * deferred to the secmodel.
			 */
			if ((policy == l->l_class ||
			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
			    priority <= l->l_priority)
				result = KAUTH_RESULT_ALLOW;
		}

		break;

	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
		/* Privileged; we let the secmodel handle this. */
		break;

	default:
		break;
	}

	return result;
}

void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}