/*	$NetBSD: linux_sched.c,v 1.73 2019/11/23 19:42:52 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module. Try to deal with scheduler related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.73 2019/11/23 19:42:52 ad Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

/* Unlike Linux, dynamically calculate CPU mask size */
#define	LINUX_CPU_MASK_SIZE (sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT))
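
/*
 * Illustrative sizing, assuming an LP64 platform where sizeof(long) == 8
 * and LONG_BIT == 64: ncpu = 4 rounds up to one long, i.e. an 8 byte
 * mask, while ncpu = 65 rounds up to two longs, i.e. 16 bytes.  Linux,
 * by contrast, sizes its cpumask statically from NR_CPUS.
 */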

#ifdef DEBUG_LINUX
#define DPRINTF(x) uprintf x
#else
#define DPRINTF(x)
#endif

static void
linux_child_return(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = l->l_emuldata;
	void *ctp = led->led_child_tidptr;
	int error;

	if (ctp) {
		if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, ctp, p->p_pid, error);
	}
	child_return(arg);
}

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct linux_emuldata *led;
	int flags, sig, error;

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return EINVAL;

	/*
	 * A thread group implies shared signals, and shared signals
	 * imply shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return EINVAL;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return EINVAL;

	/*
	 * The thread group flavor is implemented totally differently.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
		return linux_clone_nptl(l, uap, retval);

	flags = 0;
	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return EINVAL;
	sig = linux_to_native_signo[sig];

	if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
		led = l->l_emuldata;
		led->led_child_tidptr = SCARG(uap, child_tidptr);
	}

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  Therefore we pass a stack size of 0, so that the
	 * code that would make this adjustment is a no-op.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    linux_child_return, NULL, retval)) != 0) {
		DPRINTF(("%s: fork1: error %d\n", __func__, error));
		return error;
	}

	return 0;
}

static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct lwp *l2;
	struct linux_emuldata *led;
	void *parent_tidptr, *tls, *child_tidptr;
	vaddr_t uaddr;
	lwpid_t lid;
	int flags, tnprocs, error;

	p = l->l_proc;
	flags = SCARG(uap, flags);
	parent_tidptr = SCARG(uap, parent_tidptr);
	tls = SCARG(uap, tls);
	child_tidptr = SCARG(uap, child_tidptr);

	tnprocs = atomic_inc_uint_nv(&nprocs);
	if (__predict_false(tnprocs >= maxproc) ||
	    kauth_authorize_process(l->l_cred, KAUTH_PROCESS_FORK, p,
	    KAUTH_ARG(tnprocs), NULL, NULL) != 0) {
		atomic_dec_uint(&nprocs);
		return EAGAIN;
	}

	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		atomic_dec_uint(&nprocs);
		return ENOMEM;
	}

	error = lwp_create(l, p, uaddr, LWP_DETACHED | LWP_PIDLID,
	    SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class,
	    &l->l_sigmask, &l->l_sigstk);
	if (__predict_false(error)) {
		DPRINTF(("%s: lwp_create error=%d\n", __func__, error));
		atomic_dec_uint(&nprocs);
		uvm_uarea_free(uaddr);
		return error;
	}
	lid = l2->l_lid;

	/* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
	if (flags & LINUX_CLONE_CHILD_CLEARTID) {
		led = l2->l_emuldata;
		led->led_clear_tid = child_tidptr;
	}

	/* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
	if (flags & LINUX_CLONE_PARENT_SETTID) {
		if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_PARENT_SETTID "
			    "failed (parent_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, parent_tidptr, lid, error);
	}

	/* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
	if (flags & LINUX_CLONE_CHILD_SETTID) {
		if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d, error = %d)\n",
			    __func__, child_tidptr, lid, error);
	}

	if (flags & LINUX_CLONE_SETTLS) {
		error = LINUX_LWP_SETPRIVATE(l2, tls);
		if (error) {
			DPRINTF(("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
			    error));
			lwp_exit(l2);
			return error;
		}
	}

	/* Set the new LWP running. */
	lwp_start(l2, 0);

	retval[0] = lid;
	retval[1] = 0;
	return 0;
}

/*
 * Linux realtime priority:
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99
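
/*
 * Worked example of the linear rescaling in sched_linux2native() below,
 * assuming NetBSD's usual SCHED_PRI_MIN == 0 and SCHED_PRI_MAX == 63:
 * Linux rtprio 1 maps to native priority 0, rtprio 99 to 63, and
 * rtprio 50 to (50 - 1) * 63 / 98 == 31 (integer division).
 */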

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

		DPRINTF(("%s: native: policy %d, priority %d\n",
		    __func__, native_policy, prio));

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
		DPRINTF(("%s: linux: policy %d, priority %d\n",
		    __func__, -1, linux_params->sched_priority));
	}

	return 0;
}
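
/*
 * Because both conversions above truncate, the two mappings are not exact
 * inverses.  Continuing the illustrative example (SCHED_PRI_MIN == 0,
 * SCHED_PRI_MAX == 63): Linux rtprio 50 converts to native 31, but
 * sched_native2linux() maps 31 back to 31 * 98 / 63 + 1 == 49, not 50.
 */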

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, policy, lp.sched_priority));

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, SCARG(uap, policy), lp.sched_priority));

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

	lwp_exit(l);
	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_emuldata;
	led->led_clear_tid = SCARG(uap, tid);
	*retval = l->l_lid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

	*retval = l->l_lid;
	return 0;
}

/*
 * The affinity syscalls assume that the layout of our cpu kcpuset is
 * the same as Linux's: a linear bitmask.
 */
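
/*
 * Illustrative layout, assuming the linear bitmask described above:
 * CPU n occupies bit (n % LONG_BIT) of word (n / LONG_BIT), so on an
 * LP64 machine a mask with CPUs 0 and 65 set reads as word 0 == 0x1
 * and word 1 == 0x2.
 */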

int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	size_t size;
	cpuid_t i;
	int error;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	/* Lock the LWP */
	t = lwp_find2(SCARG(uap, pid), l->l_lid);
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		return EPERM;
	}

	kcpuset_create(&kcset, true);
	lwp_lock(t);
	if (t->l_affinity != NULL)
		kcpuset_copy(kcset, t->l_affinity);
	else {
		/*
		 * All available CPUs should be masked when affinity has not
		 * been set.
		 */
		kcpuset_zero(kcset);
		for (i = 0; i < ncpu; i++)
			kcpuset_set(kcset, i);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);
	error = kcpuset_copyout(kcset, (cpuset_t *)SCARG(uap, mask), size);
	kcpuset_unuse(kcset, NULL);
	*retval = size;
	return error;
}

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct sys__sched_setaffinity_args ssa;
	size_t size;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	SCARG(&ssa, pid) = SCARG(uap, pid);
	SCARG(&ssa, lid) = l->l_lid;
	SCARG(&ssa, size) = size;
	SCARG(&ssa, cpuset) = (cpuset_t *)SCARG(uap, mask);

	return sys__sched_setaffinity(l, &ssa, retval);
}