/*	$NetBSD: linux_sched.c,v 1.67 2014/11/09 17:48:08 maxv Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module. Try to deal with scheduler related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.67 2014/11/09 17:48:08 maxv Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

#if DEBUG_LINUX
#define DPRINTF(x) uprintf x
#else
#define DPRINTF(x)
#endif

static void
linux_child_return(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = l->l_emuldata;
	void *ctp = led->led_child_tidptr;
	int error;

	if (ctp) {
		if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d error=%d)\n",
			    __func__, ctp, p->p_pid, error);
	}
	child_return(arg);
}

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct linux_emuldata *led;
	int flags, sig, error;

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return EINVAL;

	/*
	 * Thread group implies shared signals.  Shared signals
	 * imply shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return EINVAL;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return EINVAL;

	/*
	 * The thread group flavor is implemented totally differently.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
		return linux_clone_nptl(l, uap, retval);

	flags = 0;
	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return EINVAL;
	sig = linux_to_native_signo[sig];

	if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
		led = l->l_emuldata;
		led->led_child_tidptr = SCARG(uap, child_tidptr);
	}

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  So, we pass a stack size of 0, so that the code
	 * that makes this adjustment is a noop.
	 */
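	/*
	 * For reference: Linux clone(2) callers (e.g. the glibc and musl
	 * wrappers) conventionally pass the highest address of the child
	 * stack on machines where the stack grows down, which is why no
	 * size adjustment is wanted here.
	 */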
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    linux_child_return, NULL, retval, &p)) != 0) {
		DPRINTF(("%s: fork1: error %d\n", __func__, error));
		return error;
	}

	return 0;
}

static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct lwp *l2;
	struct linux_emuldata *led;
	void *parent_tidptr, *tls, *child_tidptr;
	struct schedstate_percpu *spc;
	vaddr_t uaddr;
	lwpid_t lid;
	int flags, tnprocs, error;

	p = l->l_proc;
	flags = SCARG(uap, flags);
	parent_tidptr = SCARG(uap, parent_tidptr);
	tls = SCARG(uap, tls);
	child_tidptr = SCARG(uap, child_tidptr);

	tnprocs = atomic_inc_uint_nv(&nprocs);
	if (__predict_false(tnprocs >= maxproc) ||
	    kauth_authorize_process(l->l_cred, KAUTH_PROCESS_FORK, p,
	    KAUTH_ARG(tnprocs), NULL, NULL) != 0) {
		atomic_dec_uint(&nprocs);
		return EAGAIN;
	}

	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		atomic_dec_uint(&nprocs);
		return ENOMEM;
	}

	error = lwp_create(l, p, uaddr, LWP_DETACHED | LWP_PIDLID,
	    SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class);
	if (__predict_false(error)) {
		DPRINTF(("%s: lwp_create error=%d\n", __func__, error));
		atomic_dec_uint(&nprocs);
		uvm_uarea_free(uaddr);
		return error;
	}
	lid = l2->l_lid;

	/* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
	if (flags & LINUX_CLONE_CHILD_CLEARTID) {
		led = l2->l_emuldata;
		led->led_clear_tid = child_tidptr;
	}

	/* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
	if (flags & LINUX_CLONE_PARENT_SETTID) {
		if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_PARENT_SETTID "
			    "failed (parent_tidptr = %p tid = %d error=%d)\n",
			    __func__, parent_tidptr, lid, error);
	}

	/* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
	if (flags & LINUX_CLONE_CHILD_SETTID) {
		if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d error=%d)\n",
			    __func__, child_tidptr, lid, error);
	}

	if (flags & LINUX_CLONE_SETTLS) {
		error = LINUX_LWP_SETPRIVATE(l2, tls);
		if (error) {
			DPRINTF(("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
			    error));
			lwp_exit(l2);
			return error;
		}
	}

	/*
	 * Set the new LWP running, unless the process is stopping,
	 * then the LWP is created stopped.
	 */
	mutex_enter(p->p_lock);
	lwp_lock(l2);
	spc = &l2->l_cpu->ci_schedstate;
	if ((l->l_flag & (LW_WREBOOT | LW_WSUSPEND | LW_WEXIT)) == 0) {
		if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
			KASSERT(l2->l_wchan == NULL);
			l2->l_stat = LSSTOP;
			p->p_nrlwps--;
			lwp_unlock_to(l2, spc->spc_lwplock);
		} else {
			KASSERT(lwp_locked(l2, spc->spc_mutex));
			l2->l_stat = LSRUN;
			sched_enqueue(l2, false);
			lwp_unlock(l2);
		}
	} else {
		l2->l_stat = LSSUSPENDED;
		p->p_nrlwps--;
		lwp_unlock_to(l2, spc->spc_lwplock);
	}
	mutex_exit(p->p_lock);

	retval[0] = lid;
	retval[1] = 0;
	return 0;
}

/*
 * linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   in particular, sched_param::sched_priority is always 0.
 */

#define LINUX_SCHED_RTPRIO_MIN 1
#define LINUX_SCHED_RTPRIO_MAX 99

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

		DPRINTF(("%s: native: policy %d, priority %d\n",
		    __func__, native_policy, prio));

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
		DPRINTF(("%s: linux: policy %d, priority %d\n",
		    __func__, -1, linux_params->sched_priority));
	}

	return 0;
}
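
/*
 * Worked example of the mapping above, purely illustrative: the native
 * SCHED_PRI_MIN/SCHED_PRI_MAX values are scheduler-defined, and a 0..63
 * range is assumed here only to make the arithmetic concrete.
 *
 *	Linux rtprio  1 -> ( 1 - 1) * (63 - 0) / (99 - 1) + 0 =  0
 *	Linux rtprio 50 -> (50 - 1) * (63 - 0) / (99 - 1) + 0 = 31
 *	Linux rtprio 99 -> (99 - 1) * (63 - 0) / (99 - 1) + 0 = 63
 *
 * sched_native2linux() applies the same linear scaling in the other
 * direction, so a round trip can lose a step to integer truncation
 * (e.g. 50 -> 31 -> 49 with the assumed range).
 */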

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, policy, lp.sched_priority));

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, SCARG(uap, policy), lp.sched_priority));

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

	lwp_exit(l);
	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_emuldata;
	led->led_clear_tid = SCARG(uap, tid);
	*retval = l->l_lid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

	*retval = l->l_lid;
	return 0;
}

int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	proc_t *p;
	unsigned long *lp, *data;
	int error, size, nb = ncpu;

	/* Unlike Linux, dynamically calculate the CPU mask size. */
	size = sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT);
	if (SCARG(uap, len) < size)
		return EINVAL;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/*
	 * Return the actual number of CPUs and tag all of them as available.
	 * The result is a mask, the first CPU being in the least significant
	 * bit.
	 */
	data = kmem_zalloc(size, KM_SLEEP);
	lp = data;
	while (nb >= LONG_BIT) {
		*lp++ = ~0UL;
		nb -= LONG_BIT;
	}
	if (nb)
		*lp = (1UL << nb) - 1;	/* remaining CPUs in the last word */

	error = copyout(data, SCARG(uap, mask), size);
	kmem_free(data, size);
	*retval = size;
	return error;
}

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	proc_t *p;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/* Let's ignore it */
	DPRINTF(("%s\n", __func__));
	return 0;
}