/*	$NetBSD: linux_sched.c,v 1.62 2010/07/01 02:38:29 rmind Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module.  Try to deal with scheduler-related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.62 2010/07/01 02:38:29 rmind Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kmem.h>		/* for kmem_zalloc()/kmem_free() */
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_machdep.h>	/* For LINUX_NPTL */
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
#ifdef LINUX_NPTL
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) child_tidptr;
#endif
	} */
	int flags, sig;
	int error;
	struct proc *p;
#ifdef LINUX_NPTL
	struct linux_emuldata *led;
#endif

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return (EINVAL);

	/*
	 * Thread group implies shared signals.  Shared signals
	 * imply shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return (EINVAL);
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return (EINVAL);

	flags = 0;

	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return (EINVAL);
	sig = linux_to_native_signo[sig];

#ifdef LINUX_NPTL
	led = (struct linux_emuldata *)l->l_proc->p_emuldata;

	led->parent_tidptr = SCARG(uap, parent_tidptr);
	led->child_tidptr = SCARG(uap, child_tidptr);
	led->clone_flags = SCARG(uap, flags);
#endif /* LINUX_NPTL */

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  So, we pass a stack size of 0, so that the code
	 * that makes this adjustment is a no-op.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    NULL, NULL, retval, &p)) != 0)
		return error;

#ifdef LINUX_NPTL
	if ((SCARG(uap, flags) & LINUX_CLONE_SETTLS) != 0)
		return linux_init_thread_area(l, LIST_FIRST(&p->p_lwps));
#endif /* LINUX_NPTL */

	return 0;
}
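
/*
 * For illustration: a Linux NPTL pthread_create() typically calls
 * clone(2) with flags along the lines of
 *
 *	CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
 *	CLONE_THREAD | CLONE_SETTLS | CLONE_PARENT_SETTID |
 *	CLONE_CHILD_CLEARTID
 *
 * which the mapping above turns into
 *
 *	FORK_SHAREVM | FORK_SHARECWD | FORK_SHAREFILES | FORK_SHARESIGS
 *
 * for fork1().  Flags with no FORK_* equivalent are either recorded in
 * the emuldata (the tid pointers, CLONE_SETTLS) or silently ignored.
 */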

/*
 * Linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}
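
/*
 * Worked example of the Linux-to-native scaling above, assuming a
 * native range of SCHED_PRI_MIN = 0 .. SCHED_PRI_MAX = 63 (the formula
 * itself only depends on the two endpoints, whatever their values):
 *
 *	Linux prio  1 -> (1-1)*63/98 + 0  =  0	(native minimum)
 *	Linux prio 50 -> (50-1)*63/98 + 0 = 31	(3087/98, truncated)
 *	Linux prio 99 -> (99-1)*63/98 + 0 = 63	(native maximum)
 *
 * Since the division truncates, several Linux priorities can collapse
 * onto one native priority whenever the native range is the narrower
 * of the two.
 */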

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

#ifdef DEBUG_LINUX
		printf("native2linux: native: policy %d, priority %d\n",
		    native_policy, prio);
#endif

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
#ifdef DEBUG_LINUX
		printf("native2linux: linux: policy %d, priority %d\n",
		    -1, linux_params->sched_priority);
#endif
	}

	return 0;
}

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("getparam: native: policy %d, priority %d\n",
	    policy, sp.sched_priority);
#endif

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("getparam: linux: policy %d, priority %d\n",
	    policy, lp.sched_priority);
#endif

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}
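
/*
 * Round-trip note: both conversions truncate, so converting a priority
 * native -> Linux -> native (as a getparam/setparam pair effectively
 * does) may drift by one step.  With the illustrative 0..63 native
 * range: native 31 maps to Linux (31-0)*98/63 + 1 = 49, which maps
 * back to (49-1)*63/98 = 30.
 */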

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("setscheduler: linux: policy %d, priority %d\n",
	    SCARG(uap, policy), lp.sched_priority);
#endif

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("setscheduler: native: policy %d, priority %d\n",
	    policy, sp.sched_priority);
#endif

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{
#ifdef LINUX_NPTL
	/* {
		syscallarg(int) error_code;
	} */
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = p->p_emuldata;
	struct linux_emuldata *e;

	if (led->s->flags & LINUX_LES_USE_NPTL) {

#ifdef DEBUG_LINUX
		printf("%s:%d, led->s->refs = %d\n", __func__, __LINE__,
		    led->s->refs);
#endif

		/*
		 * The calling thread is supposed to kill all threads
		 * in the same thread group (i.e. all threads created
		 * via clone(2) with the CLONE_THREAD flag set).
		 *
		 * If there is only one thread, things are quite simple.
		 */
		if (led->s->refs == 1)
			return sys_exit(l, (const void *)uap, retval);

#ifdef DEBUG_LINUX
		printf("%s:%d\n", __func__, __LINE__);
#endif

		mutex_enter(proc_lock);
		led->s->flags |= LINUX_LES_INEXITGROUP;
		led->s->xstat = W_EXITCODE(SCARG(uap, error_code), 0);

		/*
		 * Kill all threads in the group.  The emulation exit hook
		 * takes care of hiding the zombies and reporting the exit
		 * code properly.
		 */
		LIST_FOREACH(e, &led->s->threads, threads) {
			if (e->proc == p)
				continue;

#ifdef DEBUG_LINUX
			printf("%s: kill PID %d\n", __func__, e->proc->p_pid);
#endif
			psignal(e->proc, SIGKILL);
		}

		/* Now, kill ourselves. */
		psignal(p, SIGKILL);
		mutex_exit(proc_lock);

		return 0;

	}
#endif /* LINUX_NPTL */

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

#ifdef LINUX_NPTL
int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_proc->p_emuldata;
	led->clear_tid = SCARG(uap, tid);

	led->s->flags |= LINUX_LES_USE_NPTL;

	*retval = l->l_proc->p_pid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{
	/* The Linux kernel does it exactly this way. */
	*retval = l->l_proc->p_pid;
	return 0;
}

/* ARGSUSED1 */
int
linux_sys_getpid(struct lwp *l, const void *v, register_t *retval)
{
	struct linux_emuldata *led = l->l_proc->p_emuldata;

	if (led->s->flags & LINUX_LES_USE_NPTL) {
		/* The Linux kernel does it exactly this way. */
		*retval = led->s->group_pid;
	} else {
		*retval = l->l_proc->p_pid;
	}

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_getppid(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = p->p_emuldata;
	struct proc *glp;
	struct proc *pp;

	mutex_enter(proc_lock);
	if (led->s->flags & LINUX_LES_USE_NPTL) {

		/* Find the thread group leader's parent. */
		glp = proc_find(led->s->group_pid);
		if (glp == NULL) {
			/* Maybe panic... */
			printf("linux_sys_getppid: missing group leader PID"
			    " %d\n", led->s->group_pid);
			mutex_exit(proc_lock);
			return ESRCH;
		}
		pp = glp->p_pptr;

		/* If this is a Linux process too, return the thread group PID. */
		if (pp->p_emul == p->p_emul) {
			struct linux_emuldata *pled;

			pled = pp->p_emuldata;
			*retval = pled->s->group_pid;
		} else {
			*retval = pp->p_pid;
		}

	} else {
		*retval = p->p_pptr->p_pid;
	}
	mutex_exit(proc_lock);

	return 0;
}
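
/*
 * Summary of the NPTL pid model implemented above, for a hypothetical
 * thread group whose leader is PID 100 and whose second thread
 * (created with CLONE_THREAD) is PID 101:
 *
 *	syscall		leader (100)	second thread (101)
 *	gettid()	100		101
 *	getpid()	100		100	(led->s->group_pid)
 *	getppid()	leader's parent	leader's parent
 *
 * Each Linux thread is a full NetBSD process here; the state shared
 * across the thread group lives in led->s.
 */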

int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	int error, size, nb = ncpu;
	unsigned long *c, *data;
	proc_t *p;

	/* Unlike Linux, dynamically calculate the CPU mask size. */
	size = sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT);
	if (SCARG(uap, len) < size)
		return EINVAL;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/*
	 * Return a mask that tags all CPUs as available, the first CPU
	 * being in the least significant bit.
	 */
	data = kmem_zalloc(size, KM_SLEEP);
	c = data;
	while (nb >= LONG_BIT) {
		*c++ = ~0UL;
		nb -= LONG_BIT;
	}
	if (nb)
		*c = (1UL << nb) - 1;

	error = copyout(data, SCARG(uap, mask), size);
	kmem_free(data, size);

	*retval = size;
	return error;
}
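
/*
 * Mask layout example: with 64-bit longs and ncpu = 3, size is 8 and
 * the single word copied out is 0x7 (CPU0 in the least significant
 * bit).  With ncpu = 65, two words are copied out: ~0UL followed by
 * 0x1.  All CPUs are always reported as available.
 */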

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	proc_t *p;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/* Silently ignore the requested affinity mask. */
#ifdef DEBUG_LINUX
	printf("linux_sys_sched_setaffinity\n");
#endif
	return 0;
}
#endif /* LINUX_NPTL */