1 /* $NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 /* 30 * System calls relating to the scheduler. 
 *
 * Lock order:
 *
 *	cpu_lock ->
 *	    proc_lock ->
 *		proc_t::p_lock ->
 *		    lwp_t::lwp_lock
 *
 * TODO:
 *  - Handle pthread_setschedprio() as defined by POSIX;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/unistd.h>

/* Sysctl log for the nodes created in sysctl_sched_setup(). */
static struct sysctllog *sched_sysctl_log;
/* kauth listener registered in sched_init(); see sched_listener_cb(). */
static kauth_listener_t sched_listener;

/*
 * Convert user priority or the in-kernel priority or convert the current
 * priority to the appropriate range according to the policy change.
 *
 * => l is the target LWP; caller is expected to hold its lock since
 *    l_class/l_priority are read (NOTE(review): confirm against callers).
 * => policy is the new scheduling class, pri the user-supplied priority
 *    or PRI_NONE when only the class changes.
 * => returns the new in-kernel priority for the LWP.
 */
static pri_t
convert_pri(lwp_t *l, int policy, pri_t pri)
{

	/* Convert user priority to the in-kernel */
	if (pri != PRI_NONE) {
		/* Only for real-time threads */
		KASSERT(pri >= SCHED_PRI_MIN);
		KASSERT(pri <= SCHED_PRI_MAX);
		KASSERT(policy != SCHED_OTHER);
		/* User RT priorities are biased into the kernel RT range. */
		return PRI_USER_RT + pri;
	}

	/* Neither policy, nor priority change */
	if (l->l_class == policy)
		return l->l_priority;

	/* Time-sharing -> real-time */
	if (l->l_class == SCHED_OTHER) {
		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
		return PRI_USER_RT;
	}

	/* Real-time -> time-sharing */
	if (policy == SCHED_OTHER) {
		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
		/*
		 * this is a bit arbitrary because the priority is dynamic
		 * for SCHED_OTHER threads and will likely be changed by
		 * the scheduler soon anyway.
		 */
		return l->l_priority - PRI_USER_RT;
	}

	/* Real-time -> real-time */
	return l->l_priority;
}

/*
 * do_sched_setparam:
 *
 *	Set the scheduling class and/or priority for the LWP(s) of a
 *	process.  pid == 0 means the calling process; lid == 0 means
 *	all LWPs of the process.  policy may be SCHED_NONE to keep each
 *	LWP's current class; params->sched_priority may be PRI_NONE to
 *	keep/convert the current priority.
 *
 *	=> returns 0 on success, or EINVAL/ESRCH/EPERM, or a kauth error.
 */
int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(&proc_lock);
			return ESRCH;
		}
		/*
		 * Take p_lock while still holding proc_lock (lock order:
		 * proc_lock -> p_lock), so the process cannot go away
		 * once proc_lock is dropped.
		 */
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		/* Count matched LWPs: zero matches => ESRCH below. */
		lcnt++;
		lwp_lock(t);
		/* SCHED_NONE means: keep this LWP's current class. */
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	return (lcnt == 0) ? ESRCH : error;
}

/*
 * Set scheduling parameters.
 *
 *	Syscall entry point: copies the sched_param structure in from
 *	userspace and hands off to do_sched_setparam().
 */
int
sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int) policy;
		syscallarg(const struct sched_param *) params;
	} */
	struct sched_param params;
	int error;

	/* Get the parameters from the user-space */
	error = copyin(SCARG(uap, params), &params, sizeof(params));
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
	    SCARG(uap, policy), &params);
 out:
	return error;
}

/*
 * do_sched_getparam:
 *
 * if lid=0, returns the parameter of the first LWP in the process.
 *
 * => pid == 0 means the calling process.
 * => policy and/or params may be NULL if the caller does not want them.
 * => returns 0 on success, or EINVAL/ESRCH, or a kauth error.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	if (pid < 0 || lid < 0)
		return EINVAL;

	t = lwp_find2(pid, lid); /* acquire p_lock */
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		/* lwp_find2() left us holding the process lock; drop it. */
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	/* Snapshot priority and class under the LWP lock. */
	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	/*
	 * convert to the user-visible priority value.
	 * it's an inversion of convert_pri().
	 *
	 * the SCHED_OTHER case is a bit arbitrary given that
	 * - we don't allow setting the priority.
	 * - the priority is dynamic.
	 */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	return error;
}

/*
 * Get scheduling parameters.
 *
 *	Syscall entry point: fetches via do_sched_getparam() and copies
 *	the results out to userspace.
 */
int
sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int *) policy;
		syscallarg(struct sched_param *) params;
	} */
	struct sched_param params;
	int error, policy;

	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
	    &params);
	if (error)
		goto out;

	error = copyout(&params, SCARG(uap, params), sizeof(params));
	/* The policy pointer is optional; only copy out if supplied. */
	if (error == 0 && SCARG(uap, policy) != NULL)
		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
 out:
	return error;
}

/*
 * Allocate the CPU set, and get it from userspace.
 *
 * => on success *dset holds a new kcpuset with one reference;
 *    on failure the set is released and *dset is untouched.
 */
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
	kcpuset_t *kset;
	int error;

	kcpuset_create(&kset, true);
	error = kcpuset_copyin(sset, kset, size);
	if (error) {
		kcpuset_unuse(kset, NULL);
	} else {
		*dset = kset;
	}
	return error;
}

/*
 * Set affinity.
 *
 *	Binds the LWP(s) of a process (pid == 0: calling process,
 *	lid == 0: all LWPs) to the CPUs in the given set.  An empty set
 *	clears any existing affinity.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *kcset, *kcpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate.
	 *
	 * To avoid the race with CPU online/offline calls and processor sets,
	 * cpu_lock will be locked for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		if (!kcpuset_isset(kcset, cpu_index(ici))) {
			continue;
		}

		ispc = &ici->ci_schedstate;
		/* Check that CPU is not in the processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/*
		 * Empty set: drop our reference (deferring destruction to
		 * the kcpulst G/C list) and treat as "clear affinity".
		 */
		kcpuset_unuse(kcset, &kcpulst);
		kcset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(&proc_lock);
			error = ESRCH;
			goto out;
		}
		/* Lock order: proc_lock -> p_lock; hold p_lock, then drop. */
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

	/* Iterate through LWP(s). */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid) {
			continue;
		}
		lwp_lock(t);
		/* No affinity for zombie LWPs. */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		/* First, release existing affinity, if any. */
		if (t->l_affinity) {
			kcpuset_unuse(t->l_affinity, &kcpulst);
		}
		if (kcset) {
			/*
			 * Hold a reference on affinity mask, assign mask to
			 * LWP and migrate it to another CPU (unlocks LWP).
			 */
			kcpuset_use(kcset);
			t->l_affinity = kcset;
			lwp_migrate(t, ci);
		} else {
			/* Old affinity mask is released, just clear. */
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0) {
		error = ESRCH;
	}
out:
	mutex_exit(&cpu_lock);

	/*
	 * Drop the initial reference (LWPs, if any, have the ownership now),
	 * and destroy whatever is in the G/C list, if filled.
	 */
	if (kcset) {
		kcpuset_unuse(kcset, &kcpulst);
	}
	if (kcpulst) {
		kcpuset_destroy(kcpulst);
	}
	return error;
}

/*
 * Get affinity.
 *
 *	Copies the affinity mask of the given LWP out to userspace;
 *	an all-zero mask means no affinity is set.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	int error;

	if (SCARG(uap, pid) < 0 || SCARG(uap, lid) < 0)
		return EINVAL;

	/* Validates the user's buffer size and gives us a scratch set. */
	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	/* Snapshot the affinity mask under the LWP lock. */
	lwp_lock(t);
	if (t->l_affinity) {
		kcpuset_copy(kcset, t->l_affinity);
	} else {
		kcpuset_zero(kcset);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
out:
	kcpuset_unuse(kcset, NULL);
	return error;
}

/*
 * Priority protection for PTHREAD_PRIO_PROTECT. This is a weak
 * analogue of priority inheritance: temp raise the priority
 * of the caller when accessing a protected resource.
 *
 *	priority >= 0: push one protection level at that (user RT)
 *	priority; priority == -1: pop one level; other negative values:
 *	just read back the current protect priority (for debugging).
 */
int
sys__sched_protect(struct lwp *l,
    const struct sys__sched_protect_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) priority;
		syscallarg(int *) opriority;
	} */
	int error;
	pri_t pri;

	KASSERT(l->l_inheritedprio == -1);
	KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

	pri = SCARG(uap, priority);
	error = 0;
	lwp_lock(l);
	if (pri == -1) {
		/* back out priority changes */
		switch(l->l_protectdepth) {
		case 0:
			/* Nothing to back out: unbalanced unprotect. */
			error = EINVAL;
			break;
		case 1:
			/* Last level: drop the boost entirely. */
			l->l_protectdepth = 0;
			l->l_protectprio = -1;
			l->l_auxprio = -1;
			break;
		default:
			/* Nested: just pop one level, keep the boost. */
			l->l_protectdepth--;
			break;
		}
	} else if (pri < 0) {
		/* Just retrieve the current value, for debugging */
		if (l->l_protectprio == -1)
			error = ENOENT;
		else
			*retval = l->l_protectprio - PRI_USER_RT;
	} else if (__predict_false(pri < SCHED_PRI_MIN ||
	    pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
		/* must fail if existing priority is higher */
		error = EPERM;
	} else {
		/* play along but make no changes if not a realtime LWP. */
		l->l_protectdepth++;
		pri += PRI_USER_RT;
		if (__predict_true(l->l_class != SCHED_OTHER &&
		    pri > l->l_protectprio)) {
			l->l_protectprio = pri;
			l->l_auxprio = pri;
		}
	}
	lwp_unlock(l);

	return error;
}

/*
 * Yield.
 */
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

/*
 * Sysctl nodes and initialization.
 */
static void
sysctl_sched_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;

	/* kern.posix_sched: advertised POSIX conformance level. */
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_sched",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Process Scheduling option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	/* Without the parent node, the children below cannot be attached. */
	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_min",
		SYSCTL_DESCR("Minimal POSIX real-time priority"),
		NULL, SCHED_PRI_MIN, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_max",
		SYSCTL_DESCR("Maximal POSIX real-time priority"),
		NULL, SCHED_PRI_MAX, NULL, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * kauth(9) listener for scheduler actions on the process scope.
 *
 *	Grants same-uid access for getting parameters and affinity, and
 *	for setting parameters as long as that neither switches a
 *	time-sharing LWP into a real-time class nor raises its priority.
 *	Everything else is deferred to the secmodel.
 */
static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	switch (action) {
	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred))
			result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred)) {
			struct lwp *l;
			int policy;
			pri_t priority;

			l = arg1;
			policy = (int)(unsigned long)arg2;
			priority = (pri_t)(unsigned long)arg3;

			/*
			 * Allow only if the class stays the same (or the
			 * request is not for a real-time class) and the
			 * priority is not being raised.
			 */
			if ((policy == l->l_class ||
			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
			    priority <= l->l_priority)
				result = KAUTH_RESULT_ALLOW;
		}

		break;

	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
		/* Privileged; we let the secmodel handle this. */
		break;

	default:
		break;
	}

	return result;
}

/*
 * Module initialization: attach the sysctl nodes and register the
 * kauth listener above.
 */
void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}