/*	$NetBSD: sys_sched.c,v 1.49 2020/05/23 23:42:43 ad Exp $	*/

/*
 * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * System calls relating to the scheduler.
 *
 * Lock order:
 *
 *	cpu_lock ->
 *	    proc_lock ->
 *		proc_t::p_lock ->
 *		    lwp_t::lwp_lock
 *
 * TODO:
 *  - Handle pthread_setschedprio() as defined by POSIX.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.49 2020/05/23 23:42:43 ad Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/unistd.h>

static struct sysctllog *sched_sysctl_log;
static kauth_listener_t sched_listener;

/*
 * Convert a user priority to an in-kernel priority, or shift the current
 * priority to the appropriate range according to the policy change.
 */
static pri_t
convert_pri(lwp_t *l, int policy, pri_t pri)
{

	/* Convert user priority to the in-kernel one */
	if (pri != PRI_NONE) {
		/* Only for real-time threads */
		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
		KASSERT(policy != SCHED_OTHER);
		return PRI_USER_RT + pri;
	}

	/* Neither policy nor priority change */
	if (l->l_class == policy)
		return l->l_priority;

	/* Time-sharing -> real-time */
	if (l->l_class == SCHED_OTHER) {
		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
		return PRI_USER_RT;
	}

	/* Real-time -> time-sharing */
	if (policy == SCHED_OTHER) {
		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
		/*
		 * This is a bit arbitrary, because the priority is dynamic
		 * for SCHED_OTHER threads and will likely be changed by
		 * the scheduler soon anyway.
		 */
		return l->l_priority - PRI_USER_RT;
	}

	/* Real-time -> real-time */
	return l->l_priority;
}
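
/*
 * Example (illustrative): switching a thread to SCHED_FIFO with user
 * priority 5 maps to the in-kernel priority PRI_USER_RT + 5, while a
 * policy-only change from SCHED_OTHER to SCHED_FIFO (pri == PRI_NONE)
 * lands the thread at the bottom of the real-time range:
 *
 *	convert_pri(l, SCHED_FIFO, 5)		=> PRI_USER_RT + 5
 *	convert_pri(l, SCHED_FIFO, PRI_NONE)	=> PRI_USER_RT
 */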

int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(&proc_lock);
			return ESRCH;
		}
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	return (lcnt == 0) ? ESRCH : error;
}

/*
 * Set scheduling parameters.
 */
int
sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int) policy;
		syscallarg(const struct sched_param *) params;
	} */
	struct sched_param params;
	int error;

	/* Get the parameters from the user-space */
	error = copyin(SCARG(uap, params), &params, sizeof(params));
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
	    SCARG(uap, policy), &params);
 out:
	return error;
}
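
/*
 * Usage sketch (userland, illustrative): assuming the _sched_setparam(2)
 * wrapper declared in <sched.h> under _NETBSD_SOURCE, moving every LWP
 * of the calling process to SCHED_RR at the minimum real-time priority
 * would look roughly like:
 *
 *	struct sched_param sp;
 *
 *	sp.sched_priority = sched_get_priority_min(SCHED_RR);
 *	if (_sched_setparam(getpid(), 0, SCHED_RR, &sp) == -1)
 *		err(EXIT_FAILURE, "_sched_setparam");
 *
 * A lid of 0 matches all LWPs in the process; a policy of SCHED_NONE
 * keeps each LWP's current class and changes only the priority.
 */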

/*
 * do_sched_getparam:
 *
 *	If lid == 0, return the parameters of the first LWP in the process.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	if (pid < 0 || lid < 0)
		return EINVAL;

	t = lwp_find2(pid, lid); /* acquire p_lock */
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	/*
	 * Convert to the user-visible priority value.
	 * It is the inverse of convert_pri().
	 *
	 * The SCHED_OTHER case is a bit arbitrary, given that:
	 * - we do not allow setting the priority;
	 * - the priority is dynamic.
	 */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	return error;
}

/*
 * Get scheduling parameters.
 */
int
sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int *) policy;
		syscallarg(struct sched_param *) params;
	} */
	struct sched_param params;
	int error, policy;

	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
	    &params);
	if (error)
		goto out;

	error = copyout(&params, SCARG(uap, params), sizeof(params));
	if (error == 0 && SCARG(uap, policy) != NULL)
		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
 out:
	return error;
}

/*
 * Allocate the CPU set, and get it from userspace.
 */
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
	kcpuset_t *kset;
	int error;

	kcpuset_create(&kset, true);
	error = kcpuset_copyin(sset, kset, size);
	if (error) {
		kcpuset_unuse(kset, NULL);
	} else {
		*dset = kset;
	}
	return error;
}
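
/*
 * Note on the kcpuset life cycle in the affinity code below:
 * genkcpuset() returns a set holding one reference.  Each LWP that is
 * assigned the mask takes an additional reference via kcpuset_use();
 * kcpuset_unuse() drops a reference and, when the last one goes away,
 * either destroys the set immediately (NULL list) or queues it on the
 * caller-supplied G/C list so that it can be destroyed with
 * kcpuset_destroy() after the locks are released.
 */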

/*
 * Set affinity.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *kcset, *kcpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate.
	 *
	 * To avoid the race with CPU online/offline calls and processor sets,
	 * cpu_lock will be locked for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		if (!kcpuset_isset(kcset, cpu_index(ici))) {
			continue;
		}

		ispc = &ici->ci_schedstate;
		/* Check that CPU is not in the processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/* Empty set */
		kcpuset_unuse(kcset, &kcpulst);
		kcset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(&proc_lock);
			error = ESRCH;
			goto out;
		}
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

	/* Iterate through LWP(s). */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid) {
			continue;
		}
		lwp_lock(t);
		/* No affinity for zombie LWPs. */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		/* First, release existing affinity, if any. */
		if (t->l_affinity) {
			kcpuset_unuse(t->l_affinity, &kcpulst);
		}
		if (kcset) {
			/*
			 * Hold a reference on affinity mask, assign mask to
			 * LWP and migrate it to another CPU (unlocks LWP).
			 */
			kcpuset_use(kcset);
			t->l_affinity = kcset;
			lwp_migrate(t, ci);
		} else {
			/* Old affinity mask is released, just clear. */
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0) {
		error = ESRCH;
	}
 out:
	mutex_exit(&cpu_lock);

	/*
	 * Drop the initial reference (LWPs, if any, have the ownership now),
	 * and destroy whatever is in the G/C list, if filled.
	 */
	if (kcset) {
		kcpuset_unuse(kcset, &kcpulst);
	}
	if (kcpulst) {
		kcpuset_destroy(kcpulst);
	}
	return error;
}
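
/*
 * Usage sketch (userland, illustrative): assuming the cpuset(3) API and
 * the _sched_setaffinity(2) wrapper from <sched.h>, pinning all LWPs of
 * the calling process to CPU 0 would look roughly like:
 *
 *	cpuset_t *cs = cpuset_create();
 *
 *	cpuset_zero(cs);
 *	cpuset_set(0, cs);
 *	if (_sched_setaffinity(0, 0, cpuset_size(cs), cs) == -1)
 *		err(EXIT_FAILURE, "_sched_setaffinity");
 *	cpuset_destroy(cs);
 *
 * Passing a zeroed set clears the affinity again, via the "Empty set"
 * case above.
 */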

/*
 * Get affinity.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	int error;

	if (SCARG(uap, pid) < 0 || SCARG(uap, lid) < 0)
		return EINVAL;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	lwp_lock(t);
	if (t->l_affinity) {
		kcpuset_copy(kcset, t->l_affinity);
	} else {
		kcpuset_zero(kcset);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
 out:
	kcpuset_unuse(kcset, NULL);
	return error;
}

/*
 * Priority protection for PTHREAD_PRIO_PROTECT.  This is a weak
 * analogue of priority inheritance: temporarily raise the priority
 * of the caller when accessing a protected resource.
 */
int
sys__sched_protect(struct lwp *l,
    const struct sys__sched_protect_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) priority;
		syscallarg(int *) opriority;
	} */
	int error;
	pri_t pri;

	KASSERT(l->l_inheritedprio == -1);
	KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

	pri = SCARG(uap, priority);
	error = 0;
	lwp_lock(l);
	if (pri == -1) {
		/* Back out priority changes */
		switch (l->l_protectdepth) {
		case 0:
			error = EINVAL;
			break;
		case 1:
			l->l_protectdepth = 0;
			l->l_protectprio = -1;
			l->l_auxprio = -1;
			break;
		default:
			l->l_protectdepth--;
			break;
		}
	} else if (pri < 0) {
		/* Just retrieve the current value, for debugging */
		if (l->l_protectprio == -1)
			error = ENOENT;
		else
			*retval = l->l_protectprio - PRI_USER_RT;
	} else if (__predict_false(pri < SCHED_PRI_MIN ||
	    pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
		/* Must fail if the existing priority is higher */
		error = EPERM;
	} else {
		/* Play along but make no changes if not a real-time LWP. */
		l->l_protectdepth++;
		pri += PRI_USER_RT;
		if (__predict_true(l->l_class != SCHED_OTHER &&
		    pri > l->l_protectprio)) {
			l->l_protectprio = pri;
			l->l_auxprio = pri;
		}
	}
	lwp_unlock(l);

	return error;
}
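
/*
 * Usage sketch (userland, illustrative): a PTHREAD_PRIO_PROTECT mutex
 * implementation could bracket its critical section roughly as follows,
 * assuming a _sched_protect() wrapper that takes the ceiling as a user
 * real-time priority:
 *
 *	_sched_protect(ceiling);	raise to the priority ceiling
 *	...access the protected resource...
 *	_sched_protect(-1);		pop one protection level
 *
 * Calls nest through l_protectdepth; any other negative argument
 * returns the current protect priority, for debugging.
 */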

/*
 * Yield.
 */
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

/*
 * Sysctl nodes and initialization.
 */
static void
sysctl_sched_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_sched",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Process Scheduling option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_min",
		SYSCTL_DESCR("Minimal POSIX real-time priority"),
		NULL, SCHED_PRI_MIN, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_max",
		SYSCTL_DESCR("Maximal POSIX real-time priority"),
		NULL, SCHED_PRI_MAX, NULL, 0,
		CTL_CREATE, CTL_EOL);
}

static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	switch (action) {
	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred))
			result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred)) {
			struct lwp *l;
			int policy;
			pri_t priority;

			l = arg1;
			policy = (int)(unsigned long)arg2;
			priority = (pri_t)(unsigned long)arg3;

			if ((policy == l->l_class ||
			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
			    priority <= l->l_priority)
				result = KAUTH_RESULT_ALLOW;
		}

		break;

	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
		/* Privileged; we let the secmodel handle this. */
		break;

	default:
		break;
	}

	return result;
}

void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}