/*	$NetBSD: linux_sched.c,v 1.78 2020/05/23 23:42:41 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module.  Try to deal with scheduler related syscalls.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.78 2020/05/23 23:42:41 ad Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

/* Unlike Linux, dynamically calculate the CPU mask size */
#define	LINUX_CPU_MASK_SIZE (sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT))

#ifdef DEBUG_LINUX
#define DPRINTF(x) uprintf x
#else
#define DPRINTF(x)
#endif

static void
linux_child_return(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = l->l_emuldata;
	void *ctp = led->led_child_tidptr;
	int error;

	if (ctp) {
		if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, ctp, p->p_pid, error);
	}
	child_return(arg);
}

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct linux_emuldata *led;
	int flags, sig, error;

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return EINVAL;

	/*
	 * Thread group implies shared signals.  Shared signals
	 * imply shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return EINVAL;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return EINVAL;

	/*
	 * The thread group flavor is implemented totally differently.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
		return linux_clone_nptl(l, uap, retval);

	flags = 0;
	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return EINVAL;
	sig = linux_to_native_signo[sig];

	if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
		led = l->l_emuldata;
		led->led_child_tidptr = SCARG(uap, child_tidptr);
	}

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  So, we pass a stack size of 0, so that the code
	 * that makes this adjustment is a noop.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    linux_child_return, NULL, retval)) != 0) {
		DPRINTF(("%s: fork1: error %d\n", __func__, error));
		return error;
	}

	return 0;
}
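/*
 * Illustrative sketch (not from the original source): glibc implements
 * vfork(2) as, roughly,
 *
 *	clone(CLONE_VM | CLONE_VFORK | SIGCHLD, 0, NULL, NULL, NULL);
 *
 * which the flag translation above turns into a fork1() call with
 * FORK_SHAREVM | FORK_PPWAIT and the native signal mapped from the
 * Linux SIGCHLD number -- i.e. the moral equivalent of a native vfork.
 */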
static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct lwp *l2;
	struct linux_emuldata *led;
	void *parent_tidptr, *tls, *child_tidptr;
	vaddr_t uaddr;
	lwpid_t lid;
	int flags, error;

	p = l->l_proc;
	flags = SCARG(uap, flags);
	parent_tidptr = SCARG(uap, parent_tidptr);
	tls = SCARG(uap, tls);
	child_tidptr = SCARG(uap, child_tidptr);

	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		return ENOMEM;
	}

	error = lwp_create(l, p, uaddr, LWP_DETACHED,
	    SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class,
	    &l->l_sigmask, &l->l_sigstk);
	if (__predict_false(error)) {
		DPRINTF(("%s: lwp_create error=%d\n", __func__, error));
		uvm_uarea_free(uaddr);
		return error;
	}
	lid = l2->l_lid;

	/* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
	if (flags & LINUX_CLONE_CHILD_CLEARTID) {
		led = l2->l_emuldata;
		led->led_clear_tid = child_tidptr;
	}

	/* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
	if (flags & LINUX_CLONE_PARENT_SETTID) {
		if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_PARENT_SETTID "
			    "failed (parent_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, parent_tidptr, lid, error);
	}

	/* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
	if (flags & LINUX_CLONE_CHILD_SETTID) {
		if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, child_tidptr, lid, error);
	}

	if (flags & LINUX_CLONE_SETTLS) {
		error = LINUX_LWP_SETPRIVATE(l2, tls);
		if (error) {
			DPRINTF(("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
			    error));
			lwp_exit(l2);
			return error;
		}
	}

	/* Set the new LWP running. */
	lwp_start(l2, 0);

	retval[0] = lid;
	retval[1] = 0;
	return 0;
}
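/*
 * Illustrative sketch (assumed NPTL-style userland behaviour, not from
 * this file): thread libraries typically point all three TID arguments
 * at the tid field of the new thread's descriptor, e.g. (using the same
 * raw argument order as the syscallarg block above)
 *
 *	pid_t tid;
 *	clone(CLONE_THREAD | CLONE_VM | CLONE_SIGHAND | CLONE_SETTLS |
 *	    CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID,
 *	    stack, &tid, tls, &tid);
 *
 * so the parent sees the new TID as soon as clone() returns, and the
 * kernel clears the same word when the thread exits -- the case
 * led_clear_tid above arranges for.
 */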
/*
 * Linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

		DPRINTF(("%s: native: policy %d, priority %d\n",
		    __func__, native_policy, prio));

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
		DPRINTF(("%s: linux: policy %d, priority %d\n",
		    __func__, linux_policy != NULL ? *linux_policy : -1,
		    linux_params->sched_priority));
	}

	return 0;
}
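/*
 * Worked example of the two linear mappings above, assuming native
 * bounds of SCHED_PRI_MIN == 0 and SCHED_PRI_MAX == 63 (the actual
 * values are platform definitions):
 *
 *	linux 1  -> (1 - 1)  * 63 / 98 + 0 = 0	linux 99 -> 63
 *	linux 50 -> (50 - 1) * 63 / 98 + 0 = 31
 *
 * The endpoints map back exactly, but interior values can drift by one
 * on a round trip because of integer truncation: native 31 maps back
 * to 31 * 98 / 63 + 1 = 49, not 50.
 */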
int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}
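/*
 * Note on the conversion chain above (illustrative): Linux
 * sched_setparam(2) keeps the target's current policy, but priorities
 * only make sense relative to a policy.  So for a task currently
 * running SCHED_FIFO, a userland call such as
 *
 *	struct sched_param sp = { .sched_priority = 99 };
 *	sched_setparam(pid, &sp);
 *
 * first recovers LINUX_SCHED_FIFO via do_sched_getparam() and
 * sched_native2linux(), then lets sched_linux2native() validate 99
 * against the rtprio range [1, 99] and scale it for
 * do_sched_setparam().
 */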
int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, policy, lp.sched_priority));

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, SCARG(uap, policy), lp.sched_priority));

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

	lwp_exit(l);
	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_emuldata;
	led->led_clear_tid = SCARG(uap, tid);
	*retval = l->l_lid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

	*retval = l->l_lid;
	return 0;
}

/*
 * The affinity syscalls assume that the layout of our cpu kcpuset is
 * the same as Linux's: a linear bitmask.
 */
int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct proc *p;
	struct lwp *t;
	kcpuset_t *kcset;
	size_t size;
	cpuid_t i;
	int error;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	if (SCARG(uap, pid) == 0) {
		p = curproc;
		mutex_enter(p->p_lock);
		t = curlwp;
	} else {
		t = lwp_find2(-1, SCARG(uap, pid));
		if (__predict_false(t == NULL)) {
			return ESRCH;
		}
		p = t->l_proc;
		KASSERT(mutex_owned(p->p_lock));
	}

	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, p, NULL, NULL, NULL)) {
		mutex_exit(p->p_lock);
		return EPERM;
	}

	kcpuset_create(&kcset, true);
	lwp_lock(t);
	if (t->l_affinity != NULL)
		kcpuset_copy(kcset, t->l_affinity);
	else {
		/*
		 * All available CPUs should be masked when affinity has not
		 * been set.
		 */
		kcpuset_zero(kcset);
		for (i = 0; i < ncpu; i++)
			kcpuset_set(kcset, i);
	}
	lwp_unlock(t);
	mutex_exit(p->p_lock);
	error = kcpuset_copyout(kcset, (cpuset_t *)SCARG(uap, mask), size);
	kcpuset_unuse(kcset, NULL);
	*retval = size;
	return error;
}
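/*
 * Example of the LINUX_CPU_MASK_SIZE arithmetic (illustrative): with
 * LONG_BIT == 64, ncpu == 6 rounds up to one long, i.e. 8 bytes, and
 * ncpu == 65 to two longs, i.e. 16 bytes.  A caller whose buffer (len)
 * is smaller than that gets EINVAL, matching Linux's behaviour when
 * the user buffer cannot hold the kernel's cpumask.
 */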
int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct sys__sched_setaffinity_args ssa;
	size_t size;
	pid_t pid;
	lwpid_t lid;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	lid = SCARG(uap, pid);
	if (lid != 0) {
		/* Get the canonical PID for the process. */
		mutex_enter(&proc_lock);
		struct proc *p = proc_find_lwpid(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(&proc_lock);
			return ESRCH;
		}
		pid = p->p_pid;
		mutex_exit(&proc_lock);
	} else {
		pid = curproc->p_pid;
		lid = curlwp->l_lid;
	}

	SCARG(&ssa, pid) = pid;
	SCARG(&ssa, lid) = lid;
	SCARG(&ssa, size) = size;
	SCARG(&ssa, cpuset) = (cpuset_t *)SCARG(uap, mask);

	return sys__sched_setaffinity(l, &ssa, retval);
}