/*	$NetBSD: kern_lwp.c,v 1.206 2019/11/07 19:45:18 joerg Exp $	*/

/*-
 * Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Nathan J. Williams, and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Overview
 *
 *	Lightweight processes (LWPs) are the basic unit or thread of
 *	execution within the kernel.  The core state of an LWP is described
 *	by "struct lwp", also known as lwp_t.
 *
 *	Each LWP is contained within a process (described by "struct proc").
 *	Every process contains at least one LWP, but may contain more.  The
 *	process describes attributes shared among all of its LWPs such as a
 *	private address space, global execution state (stopped, active,
 *	zombie, ...), signal disposition and so on.  On a multiprocessor
 *	machine, multiple LWPs may be executing concurrently in the kernel.
 *
 * Execution states
 *
 *	At any given time, an LWP has overall state that is described by
 *	lwp::l_stat.  The states are broken into two sets below.  The first
 *	set is guaranteed to represent the absolute, current state of the
 *	LWP:
 *
 *	LSONPROC
 *
 *		On processor: the LWP is executing on a CPU, either in the
 *		kernel or in user space.
 *
 *	LSRUN
 *
 *		Runnable: the LWP is parked on a run queue, and may soon be
 *		chosen to run by an idle processor, or by a processor that
 *		has been asked to preempt a currently running but lower
 *		priority LWP.
 *
 *	LSIDL
 *
 *		Idle: the LWP has been created but has not yet executed,
 *		or it has ceased executing a unit of work and is waiting
 *		to be started again.
 *
 *	LSSUSPENDED:
 *
 *		Suspended: the LWP has had its execution suspended by
 *		another LWP in the same process using the _lwp_suspend()
 *		system call.  User-level LWPs also enter the suspended
 *		state when the system is shutting down.
 *
 *	The second set represents a "statement of intent" on behalf of the
 *	LWP.  The LWP may in fact be executing on a processor, or may be
 *	sleeping or idle.  It is expected to take the necessary action to
 *	stop executing or become "running" again within a short timeframe.
 *	The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
 *	Importantly, it indicates that its state is tied to a CPU.
 *
 *	LSZOMB:
 *
 *		Dead or dying: the LWP has released most of its resources
 *		and is about to switch away into oblivion, or has already
 *		switched away.  When it switches away, its few remaining
 *		resources can be collected.
 *
 *	LSSLEEP:
 *
 *		Sleeping: the LWP has entered itself onto a sleep queue, and
 *		has switched away or will switch away shortly to allow other
 *		LWPs to run on the CPU.
 *
 *	LSSTOP:
 *
 *		Stopped: the LWP has been stopped as a result of a job
 *		control signal, or as a result of the ptrace() interface.
 *
 *		Stopped LWPs may run briefly within the kernel to handle
 *		signals that they receive, but will not return to user space
 *		until their process' state is changed away from stopped.
 *
 *		Single LWPs within a process cannot be set stopped
 *		selectively: all actions that can stop or continue LWPs
 *		occur at the process level.
 *
 * State transitions
 *
 *	Note that the LSSTOP state may only be set when returning to
 *	user space in userret(), or when sleeping interruptibly.  The
 *	LSSUSPENDED state may only be set in userret().  Before setting
 *	those states, we try to ensure that the LWPs will release all
 *	locks that they hold, and at a minimum try to ensure that the
 *	LWP can be set runnable again by a signal.
 *
 *	LWPs may transition states in the following ways:
 *
 *	 RUN -------> ONPROC		ONPROC -----> RUN
 *					            > SLEEP
 *					            > STOPPED
 *					            > SUSPENDED
 *					            > ZOMB
 *					            > IDL (special cases)
 *
 *	 STOPPED ---> RUN		SUSPENDED --> RUN
 *	            > SLEEP
 *
 *	 SLEEP -----> ONPROC		IDL --------> RUN
 *	            > RUN		            > SUSPENDED
 *	            > STOPPED		            > STOPPED
 *					            > ONPROC (special cases)
 *
 *	Some state transitions are only possible with kernel threads (e.g.
 *	ONPROC -> IDL) and happen under tightly controlled circumstances
 *	free of unwanted side effects.
 *
 * Migration
 *
 *	Migration of threads from one CPU to another may be performed
 *	internally by the scheduler via the sched_takecpu() or
 *	sched_catchlwp() functions.  The universal lwp_migrate() function
 *	should be used for any other cases.  Subsystems in the kernel must
 *	be aware that an LWP's CPU may change while the LWP is not locked.
 *
 * Locking
 *
 *	The majority of fields in 'struct lwp' are covered by a single,
 *	general spin lock pointed to by lwp::l_mutex.  The locks covering
 *	each field are documented in sys/lwp.h.
 *
 *	State transitions must be made with the LWP's general lock held,
 *	and may cause the LWP's lock pointer to change.  Manipulation of
 *	the general lock is not performed directly, but through calls to
 *	lwp_lock(), lwp_unlock() and others.  It should be noted that the
 *	adaptive locks are not allowed to be released while the LWP's lock
 *	is being held (unlike for other spin-locks).
 *
 *	States and their associated locks:
 *
 *	LSONPROC, LSZOMB:
 *
 *		Always covered by spc_lwplock, which protects running LWPs.
 *		This is a per-CPU lock and matches lwp::l_cpu.
 *
 *	LSIDL, LSRUN:
 *
 *		Always covered by spc_mutex, which protects the run queues.
 *		This is a per-CPU lock and matches lwp::l_cpu.
 *
 *	LSSLEEP:
 *
 *		Covered by a lock associated with the sleep queue that the
 *		LWP resides on.  Matches lwp::l_sleepq::sq_mutex.
 *
 *	LSSTOP, LSSUSPENDED:
 *
 *		If the LWP was previously sleeping (l_wchan != NULL), then
 *		l_mutex references the sleep queue lock.  If the LWP was
 *		runnable or on the CPU when halted, or has been removed from
 *		the sleep queue since halted, then the lock is spc_lwplock.
 *
 *	The lock order is as follows:
 *
 *		spc::spc_lwplock ->
 *		    sleeptab::st_mutex ->
 *		    tschain_t::tc_mutex ->
 *			spc::spc_mutex
 *
 *	Each process has a scheduler state lock (proc::p_lock), and a
 *	number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
 *	so on.  When an LWP is to be entered into or removed from one of the
 *	following states, p_lock must be held and the process-wide counters
 *	adjusted:
 *
 *		LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
 *
 *	(But not always for kernel threads.  There are some special cases
 *	as mentioned above.  See kern_softint.c.)
 *
 *	Note that an LWP is considered running or likely to run soon if in
 *	one of the following states.  This affects the value of p_nrlwps:
 *
 *		LSRUN, LSONPROC, LSSLEEP
 *
 *	p_lock does not need to be held when transitioning among these
 *	three states, hence p_lock is rarely taken for state transitions.
 */
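
/*
 * Illustrative sketch (not part of the implementation): code elsewhere in
 * the kernel typically examines or changes an LWP's state by taking the
 * LWP's general lock, which pins both l_stat and the l_mutex pointer for
 * the duration:
 *
 *	lwp_lock(l);
 *	if (l->l_stat == LSSLEEP) {
 *		... the state cannot change while the lock is held ...
 *	}
 *	lwp_unlock(l);
 *
 * Because routines such as lwp_unlock_to() and setrunnable() may re-point
 * l_mutex while changing state, the lock pointer has to be re-checked
 * after the mutex is acquired; lwp_trylock() below spells out the same
 * idea.
 */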

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.206 2019/11/07 19:45:18 joerg Exp $");

#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_dtrace.h"

#define _LWP_API_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpu.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/syscallargs.h>
#include <sys/syscall_stats.h>
#include <sys/kauth.h>
#include <sys/pserialize.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/kmem.h>
#include <sys/pset.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/dtrace_bsd.h>
#include <sys/sdt.h>
#include <sys/ptrace.h>
#include <sys/xcall.h>
#include <sys/uidinfo.h>
#include <sys/sysctl.h>
#include <sys/psref.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

static pool_cache_t	lwp_cache	__read_mostly;
struct lwplist		alllwp		__cacheline_aligned;

static void		lwp_dtor(void *, void *);

/* DTrace proc provider probes */
SDT_PROVIDER_DEFINE(proc);

SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *");

struct turnstile turnstile0;
struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = {
#ifdef LWP0_CPU_INFO
	.l_cpu = LWP0_CPU_INFO,
#endif
#ifdef LWP0_MD_INITIALIZER
	.l_md = LWP0_MD_INITIALIZER,
#endif
	.l_proc = &proc0,
	.l_lid = 1,
	.l_flag = LW_SYSTEM,
	.l_stat = LSONPROC,
	.l_ts = &turnstile0,
	.l_syncobj = &sched_syncobj,
	.l_refcnt = 1,
	.l_priority = PRI_USER + NPRI_USER - 1,
	.l_inheritedprio = -1,
	.l_class = SCHED_OTHER,
	.l_psid = PS_NONE,
	.l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders),
	.l_name = __UNCONST("swapper"),
	.l_fd = &filedesc0,
};

static int
sysctl_kern_maxlwp(SYSCTLFN_PROTO); 288 289 /* 290 * sysctl helper routine for kern.maxlwp. Ensures that the new 291 * values are not too low or too high. 292 */ 293 static int 294 sysctl_kern_maxlwp(SYSCTLFN_ARGS) 295 { 296 int error, nmaxlwp; 297 struct sysctlnode node; 298 299 nmaxlwp = maxlwp; 300 node = *rnode; 301 node.sysctl_data = &nmaxlwp; 302 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 303 if (error || newp == NULL) 304 return error; 305 306 if (nmaxlwp < 0 || nmaxlwp >= 65536) 307 return EINVAL; 308 if (nmaxlwp > cpu_maxlwp()) 309 return EINVAL; 310 maxlwp = nmaxlwp; 311 312 return 0; 313 } 314 315 static void 316 sysctl_kern_lwp_setup(void) 317 { 318 struct sysctllog *clog = NULL; 319 320 sysctl_createv(&clog, 0, NULL, NULL, 321 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 322 CTLTYPE_INT, "maxlwp", 323 SYSCTL_DESCR("Maximum number of simultaneous threads"), 324 sysctl_kern_maxlwp, 0, NULL, 0, 325 CTL_KERN, CTL_CREATE, CTL_EOL); 326 } 327 328 void 329 lwpinit(void) 330 { 331 332 LIST_INIT(&alllwp); 333 lwpinit_specificdata(); 334 lwp_sys_init(); 335 lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0, 0, 336 "lwppl", NULL, IPL_NONE, NULL, lwp_dtor, NULL); 337 338 maxlwp = cpu_maxlwp(); 339 sysctl_kern_lwp_setup(); 340 } 341 342 void 343 lwp0_init(void) 344 { 345 struct lwp *l = &lwp0; 346 347 KASSERT((void *)uvm_lwp_getuarea(l) != NULL); 348 KASSERT(l->l_lid == proc0.p_nlwpid); 349 350 LIST_INSERT_HEAD(&alllwp, l, l_list); 351 352 callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE); 353 callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l); 354 cv_init(&l->l_sigcv, "sigwait"); 355 cv_init(&l->l_waitcv, "vfork"); 356 357 kauth_cred_hold(proc0.p_cred); 358 l->l_cred = proc0.p_cred; 359 360 kdtrace_thread_ctor(NULL, l); 361 lwp_initspecific(l); 362 363 SYSCALL_TIME_LWP_INIT(l); 364 } 365 366 static void 367 lwp_dtor(void *arg, void *obj) 368 { 369 lwp_t *l = obj; 370 (void)l; 371 372 /* 373 * Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu() 374 * calls will exit before memory of LWP is returned to the pool, where 375 * KVA of LWP structure might be freed and re-used for other purposes. 376 * Kernel preemption is disabled around mutex_oncpu() and rw_oncpu() 377 * callers, therefore cross-call to all CPUs will do the job. Also, 378 * the value of l->l_cpu must be still valid at this point. 379 */ 380 KASSERT(l->l_cpu != NULL); 381 xc_barrier(0); 382 } 383 384 /* 385 * Set an suspended. 386 * 387 * Must be called with p_lock held, and the LWP locked. Will unlock the 388 * LWP before return. 389 */ 390 int 391 lwp_suspend(struct lwp *curl, struct lwp *t) 392 { 393 int error; 394 395 KASSERT(mutex_owned(t->l_proc->p_lock)); 396 KASSERT(lwp_locked(t, NULL)); 397 398 KASSERT(curl != t || curl->l_stat == LSONPROC); 399 400 /* 401 * If the current LWP has been told to exit, we must not suspend anyone 402 * else or deadlock could occur. We won't return to userspace. 403 */ 404 if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) { 405 lwp_unlock(t); 406 return (EDEADLK); 407 } 408 409 if ((t->l_flag & LW_DBGSUSPEND) != 0) { 410 lwp_unlock(t); 411 return 0; 412 } 413 414 error = 0; 415 416 switch (t->l_stat) { 417 case LSRUN: 418 case LSONPROC: 419 t->l_flag |= LW_WSUSPEND; 420 lwp_need_userret(t); 421 lwp_unlock(t); 422 break; 423 424 case LSSLEEP: 425 t->l_flag |= LW_WSUSPEND; 426 427 /* 428 * Kick the LWP and try to get it to the kernel boundary 429 * so that it will release any locks that it holds. 430 * setrunnable() will release the lock. 
431 */ 432 if ((t->l_flag & LW_SINTR) != 0) 433 setrunnable(t); 434 else 435 lwp_unlock(t); 436 break; 437 438 case LSSUSPENDED: 439 lwp_unlock(t); 440 break; 441 442 case LSSTOP: 443 t->l_flag |= LW_WSUSPEND; 444 setrunnable(t); 445 break; 446 447 case LSIDL: 448 case LSZOMB: 449 error = EINTR; /* It's what Solaris does..... */ 450 lwp_unlock(t); 451 break; 452 } 453 454 return (error); 455 } 456 457 /* 458 * Restart a suspended LWP. 459 * 460 * Must be called with p_lock held, and the LWP locked. Will unlock the 461 * LWP before return. 462 */ 463 void 464 lwp_continue(struct lwp *l) 465 { 466 467 KASSERT(mutex_owned(l->l_proc->p_lock)); 468 KASSERT(lwp_locked(l, NULL)); 469 470 /* If rebooting or not suspended, then just bail out. */ 471 if ((l->l_flag & LW_WREBOOT) != 0) { 472 lwp_unlock(l); 473 return; 474 } 475 476 l->l_flag &= ~LW_WSUSPEND; 477 478 if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) { 479 lwp_unlock(l); 480 return; 481 } 482 483 /* setrunnable() will release the lock. */ 484 setrunnable(l); 485 } 486 487 /* 488 * Restart a stopped LWP. 489 * 490 * Must be called with p_lock held, and the LWP NOT locked. Will unlock the 491 * LWP before return. 492 */ 493 void 494 lwp_unstop(struct lwp *l) 495 { 496 struct proc *p = l->l_proc; 497 498 KASSERT(mutex_owned(proc_lock)); 499 KASSERT(mutex_owned(p->p_lock)); 500 501 lwp_lock(l); 502 503 KASSERT((l->l_flag & LW_DBGSUSPEND) == 0); 504 505 /* If not stopped, then just bail out. */ 506 if (l->l_stat != LSSTOP) { 507 lwp_unlock(l); 508 return; 509 } 510 511 p->p_stat = SACTIVE; 512 p->p_sflag &= ~PS_STOPPING; 513 514 if (!p->p_waited) 515 p->p_pptr->p_nstopchild--; 516 517 if (l->l_wchan == NULL) { 518 /* setrunnable() will release the lock. */ 519 setrunnable(l); 520 } else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) { 521 /* setrunnable() so we can receive the signal */ 522 setrunnable(l); 523 } else { 524 l->l_stat = LSSLEEP; 525 p->p_nrlwps++; 526 lwp_unlock(l); 527 } 528 } 529 530 /* 531 * Wait for an LWP within the current process to exit. If 'lid' is 532 * non-zero, we are waiting for a specific LWP. 533 * 534 * Must be called with p->p_lock held. 535 */ 536 int 537 lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting) 538 { 539 const lwpid_t curlid = l->l_lid; 540 proc_t *p = l->l_proc; 541 lwp_t *l2; 542 int error; 543 544 KASSERT(mutex_owned(p->p_lock)); 545 546 p->p_nlwpwait++; 547 l->l_waitingfor = lid; 548 549 for (;;) { 550 int nfound; 551 552 /* 553 * Avoid a race between exit1() and sigexit(): if the 554 * process is dumping core, then we need to bail out: call 555 * into lwp_userret() where we will be suspended until the 556 * deed is done. 557 */ 558 if ((p->p_sflag & PS_WCORE) != 0) { 559 mutex_exit(p->p_lock); 560 lwp_userret(l); 561 KASSERT(false); 562 } 563 564 /* 565 * First off, drain any detached LWP that is waiting to be 566 * reaped. 567 */ 568 while ((l2 = p->p_zomblwp) != NULL) { 569 p->p_zomblwp = NULL; 570 lwp_free(l2, false, false);/* releases proc mutex */ 571 mutex_enter(p->p_lock); 572 } 573 574 /* 575 * Now look for an LWP to collect. If the whole process is 576 * exiting, count detached LWPs as eligible to be collected, 577 * but don't drain them here. 578 */ 579 nfound = 0; 580 error = 0; 581 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 582 /* 583 * If a specific wait and the target is waiting on 584 * us, then avoid deadlock. This also traps LWPs 585 * that try to wait on themselves. 
586 * 587 * Note that this does not handle more complicated 588 * cycles, like: t1 -> t2 -> t3 -> t1. The process 589 * can still be killed so it is not a major problem. 590 */ 591 if (l2->l_lid == lid && l2->l_waitingfor == curlid) { 592 error = EDEADLK; 593 break; 594 } 595 if (l2 == l) 596 continue; 597 if ((l2->l_prflag & LPR_DETACHED) != 0) { 598 nfound += exiting; 599 continue; 600 } 601 if (lid != 0) { 602 if (l2->l_lid != lid) 603 continue; 604 /* 605 * Mark this LWP as the first waiter, if there 606 * is no other. 607 */ 608 if (l2->l_waiter == 0) 609 l2->l_waiter = curlid; 610 } else if (l2->l_waiter != 0) { 611 /* 612 * It already has a waiter - so don't 613 * collect it. If the waiter doesn't 614 * grab it we'll get another chance 615 * later. 616 */ 617 nfound++; 618 continue; 619 } 620 nfound++; 621 622 /* No need to lock the LWP in order to see LSZOMB. */ 623 if (l2->l_stat != LSZOMB) 624 continue; 625 626 /* 627 * We're no longer waiting. Reset the "first waiter" 628 * pointer on the target, in case it was us. 629 */ 630 l->l_waitingfor = 0; 631 l2->l_waiter = 0; 632 p->p_nlwpwait--; 633 if (departed) 634 *departed = l2->l_lid; 635 sched_lwp_collect(l2); 636 637 /* lwp_free() releases the proc lock. */ 638 lwp_free(l2, false, false); 639 mutex_enter(p->p_lock); 640 return 0; 641 } 642 643 if (error != 0) 644 break; 645 if (nfound == 0) { 646 error = ESRCH; 647 break; 648 } 649 650 /* 651 * Note: since the lock will be dropped, need to restart on 652 * wakeup to run all LWPs again, e.g. there may be new LWPs. 653 */ 654 if (exiting) { 655 KASSERT(p->p_nlwps > 1); 656 cv_wait(&p->p_lwpcv, p->p_lock); 657 error = EAGAIN; 658 break; 659 } 660 661 /* 662 * If all other LWPs are waiting for exits or suspends 663 * and the supply of zombies and potential zombies is 664 * exhausted, then we are about to deadlock. 665 * 666 * If the process is exiting (and this LWP is not the one 667 * that is coordinating the exit) then bail out now. 668 */ 669 if ((p->p_sflag & PS_WEXIT) != 0 || 670 p->p_nrlwps + p->p_nzlwps - p->p_ndlwps <= p->p_nlwpwait) { 671 error = EDEADLK; 672 break; 673 } 674 675 /* 676 * Sit around and wait for something to happen. We'll be 677 * awoken if any of the conditions examined change: if an 678 * LWP exits, is collected, or is detached. 679 */ 680 if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0) 681 break; 682 } 683 684 /* 685 * We didn't find any LWPs to collect, we may have received a 686 * signal, or some other condition has caused us to bail out. 687 * 688 * If waiting on a specific LWP, clear the waiters marker: some 689 * other LWP may want it. Then, kick all the remaining waiters 690 * so that they can re-check for zombies and for deadlock. 691 */ 692 if (lid != 0) { 693 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 694 if (l2->l_lid == lid) { 695 if (l2->l_waiter == curlid) 696 l2->l_waiter = 0; 697 break; 698 } 699 } 700 } 701 p->p_nlwpwait--; 702 l->l_waitingfor = 0; 703 cv_broadcast(&p->p_lwpcv); 704 705 return error; 706 } 707 708 static lwpid_t 709 lwp_find_free_lid(lwpid_t try_lid, lwp_t * new_lwp, proc_t *p) 710 { 711 #define LID_SCAN (1u << 31) 712 lwp_t *scan, *free_before; 713 lwpid_t nxt_lid; 714 715 /* 716 * We want the first unused lid greater than or equal to 717 * try_lid (modulo 2^31). 718 * (If nothing else ld.elf_so doesn't want lwpid with the top bit set.) 719 * We must not return 0, and avoiding 'LID_SCAN - 1' makes 720 * the outer test easier. 721 * This would be much easier if the list were sorted in 722 * increasing order. 
723 * The list is kept sorted in decreasing order. 724 * This code is only used after a process has generated 2^31 lwp. 725 * 726 * Code assumes it can always find an id. 727 */ 728 729 try_lid &= LID_SCAN - 1; 730 if (try_lid <= 1) 731 try_lid = 2; 732 733 free_before = NULL; 734 nxt_lid = LID_SCAN - 1; 735 LIST_FOREACH(scan, &p->p_lwps, l_sibling) { 736 if (scan->l_lid != nxt_lid) { 737 /* There are available lid before this entry */ 738 free_before = scan; 739 if (try_lid > scan->l_lid) 740 break; 741 } 742 if (try_lid == scan->l_lid) { 743 /* The ideal lid is busy, take a higher one */ 744 if (free_before != NULL) { 745 try_lid = free_before->l_lid + 1; 746 break; 747 } 748 /* No higher ones, reuse low numbers */ 749 try_lid = 2; 750 } 751 752 nxt_lid = scan->l_lid - 1; 753 if (LIST_NEXT(scan, l_sibling) == NULL) { 754 /* The value we have is lower than any existing lwp */ 755 LIST_INSERT_AFTER(scan, new_lwp, l_sibling); 756 return try_lid; 757 } 758 } 759 760 LIST_INSERT_BEFORE(free_before, new_lwp, l_sibling); 761 return try_lid; 762 } 763 764 /* 765 * Create a new LWP within process 'p2', using LWP 'l1' as a template. 766 * The new LWP is created in state LSIDL and must be set running, 767 * suspended, or stopped by the caller. 768 */ 769 int 770 lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags, 771 void *stack, size_t stacksize, void (*func)(void *), void *arg, 772 lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask, 773 const stack_t *sigstk) 774 { 775 struct lwp *l2, *isfree; 776 turnstile_t *ts; 777 lwpid_t lid; 778 779 KASSERT(l1 == curlwp || l1->l_proc == &proc0); 780 781 /* 782 * Enforce limits, excluding the first lwp and kthreads. 783 */ 784 if (p2->p_nlwps != 0 && p2 != &proc0) { 785 uid_t uid = kauth_cred_getuid(l1->l_cred); 786 int count = chglwpcnt(uid, 1); 787 if (__predict_false(count > 788 p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) { 789 if (kauth_authorize_process(l1->l_cred, 790 KAUTH_PROCESS_RLIMIT, p2, 791 KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS), 792 &p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR)) 793 != 0) { 794 (void)chglwpcnt(uid, -1); 795 return EAGAIN; 796 } 797 } 798 } 799 800 /* 801 * First off, reap any detached LWP waiting to be collected. 802 * We can re-use its LWP structure and turnstile. 803 */ 804 isfree = NULL; 805 if (p2->p_zomblwp != NULL) { 806 mutex_enter(p2->p_lock); 807 if ((isfree = p2->p_zomblwp) != NULL) { 808 p2->p_zomblwp = NULL; 809 lwp_free(isfree, true, false);/* releases proc mutex */ 810 } else 811 mutex_exit(p2->p_lock); 812 } 813 if (isfree == NULL) { 814 l2 = pool_cache_get(lwp_cache, PR_WAITOK); 815 memset(l2, 0, sizeof(*l2)); 816 l2->l_ts = pool_cache_get(turnstile_cache, PR_WAITOK); 817 SLIST_INIT(&l2->l_pi_lenders); 818 } else { 819 l2 = isfree; 820 ts = l2->l_ts; 821 KASSERT(l2->l_inheritedprio == -1); 822 KASSERT(SLIST_EMPTY(&l2->l_pi_lenders)); 823 memset(l2, 0, sizeof(*l2)); 824 l2->l_ts = ts; 825 } 826 827 l2->l_stat = LSIDL; 828 l2->l_proc = p2; 829 l2->l_refcnt = 1; 830 l2->l_class = sclass; 831 832 /* 833 * If vfork(), we want the LWP to run fast and on the same CPU 834 * as its parent, so that it can reuse the VM context and cache 835 * footprint on the local CPU. 836 */ 837 l2->l_kpriority = ((flags & LWP_VFORK) ? 
true : false); 838 l2->l_kpribase = PRI_KERNEL; 839 l2->l_priority = l1->l_priority; 840 l2->l_inheritedprio = -1; 841 l2->l_protectprio = -1; 842 l2->l_auxprio = -1; 843 l2->l_flag = 0; 844 l2->l_pflag = LP_MPSAFE; 845 TAILQ_INIT(&l2->l_ld_locks); 846 l2->l_psrefs = 0; 847 848 /* 849 * For vfork, borrow parent's lwpctl context if it exists. 850 * This also causes us to return via lwp_userret. 851 */ 852 if (flags & LWP_VFORK && l1->l_lwpctl) { 853 l2->l_lwpctl = l1->l_lwpctl; 854 l2->l_flag |= LW_LWPCTL; 855 } 856 857 /* 858 * If not the first LWP in the process, grab a reference to the 859 * descriptor table. 860 */ 861 l2->l_fd = p2->p_fd; 862 if (p2->p_nlwps != 0) { 863 KASSERT(l1->l_proc == p2); 864 fd_hold(l2); 865 } else { 866 KASSERT(l1->l_proc != p2); 867 } 868 869 if (p2->p_flag & PK_SYSTEM) { 870 /* Mark it as a system LWP. */ 871 l2->l_flag |= LW_SYSTEM; 872 } 873 874 kpreempt_disable(); 875 l2->l_mutex = l1->l_cpu->ci_schedstate.spc_mutex; 876 l2->l_cpu = l1->l_cpu; 877 kpreempt_enable(); 878 879 kdtrace_thread_ctor(NULL, l2); 880 lwp_initspecific(l2); 881 sched_lwp_fork(l1, l2); 882 lwp_update_creds(l2); 883 callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE); 884 callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2); 885 cv_init(&l2->l_sigcv, "sigwait"); 886 cv_init(&l2->l_waitcv, "vfork"); 887 l2->l_syncobj = &sched_syncobj; 888 PSREF_DEBUG_INIT_LWP(l2); 889 890 if (rnewlwpp != NULL) 891 *rnewlwpp = l2; 892 893 /* 894 * PCU state needs to be saved before calling uvm_lwp_fork() so that 895 * the MD cpu_lwp_fork() can copy the saved state to the new LWP. 896 */ 897 pcu_save_all(l1); 898 899 uvm_lwp_setuarea(l2, uaddr); 900 uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2); 901 902 if ((flags & LWP_PIDLID) != 0) { 903 lid = proc_alloc_pid(p2); 904 l2->l_pflag |= LP_PIDLID; 905 } else if (p2->p_nlwps == 0) { 906 lid = l1->l_lid; 907 } else { 908 lid = 0; 909 } 910 911 mutex_enter(p2->p_lock); 912 913 if ((flags & LWP_DETACHED) != 0) { 914 l2->l_prflag = LPR_DETACHED; 915 p2->p_ndlwps++; 916 } else 917 l2->l_prflag = 0; 918 919 l2->l_sigstk = *sigstk; 920 l2->l_sigmask = *sigmask; 921 TAILQ_INIT(&l2->l_sigpend.sp_info); 922 sigemptyset(&l2->l_sigpend.sp_set); 923 924 if (__predict_true(lid == 0)) { 925 /* 926 * XXX: l_lid are expected to be unique (for a process) 927 * if LWP_PIDLID is sometimes set this won't be true. 928 * Once 2^31 threads have been allocated we have to 929 * scan to ensure we allocate a unique value. 930 */ 931 lid = ++p2->p_nlwpid; 932 if (__predict_false(lid & LID_SCAN)) { 933 lid = lwp_find_free_lid(lid, l2, p2); 934 p2->p_nlwpid = lid | LID_SCAN; 935 /* l2 as been inserted into p_lwps in order */ 936 goto skip_insert; 937 } 938 p2->p_nlwpid = lid; 939 } 940 LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling); 941 skip_insert: 942 l2->l_lid = lid; 943 p2->p_nlwps++; 944 p2->p_nrlwps++; 945 946 KASSERT(l2->l_affinity == NULL); 947 948 if ((p2->p_flag & PK_SYSTEM) == 0) { 949 /* Inherit the affinity mask. */ 950 if (l1->l_affinity) { 951 /* 952 * Note that we hold the state lock while inheriting 953 * the affinity to avoid race with sched_setaffinity(). 
954 */ 955 lwp_lock(l1); 956 if (l1->l_affinity) { 957 kcpuset_use(l1->l_affinity); 958 l2->l_affinity = l1->l_affinity; 959 } 960 lwp_unlock(l1); 961 } 962 lwp_lock(l2); 963 /* Inherit a processor-set */ 964 l2->l_psid = l1->l_psid; 965 /* Look for a CPU to start */ 966 l2->l_cpu = sched_takecpu(l2); 967 lwp_unlock_to(l2, l2->l_cpu->ci_schedstate.spc_mutex); 968 } 969 mutex_exit(p2->p_lock); 970 971 SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0); 972 973 mutex_enter(proc_lock); 974 LIST_INSERT_HEAD(&alllwp, l2, l_list); 975 mutex_exit(proc_lock); 976 977 SYSCALL_TIME_LWP_INIT(l2); 978 979 if (p2->p_emul->e_lwp_fork) 980 (*p2->p_emul->e_lwp_fork)(l1, l2); 981 982 return (0); 983 } 984 985 /* 986 * Called by MD code when a new LWP begins execution. Must be called 987 * with the previous LWP locked (so at splsched), or if there is no 988 * previous LWP, at splsched. 989 */ 990 void 991 lwp_startup(struct lwp *prev, struct lwp *new_lwp) 992 { 993 KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev); 994 995 SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0); 996 997 KASSERT(kpreempt_disabled()); 998 if (prev != NULL) { 999 /* 1000 * Normalize the count of the spin-mutexes, it was 1001 * increased in mi_switch(). Unmark the state of 1002 * context switch - it is finished for previous LWP. 1003 */ 1004 curcpu()->ci_mtx_count++; 1005 membar_exit(); 1006 prev->l_ctxswtch = 0; 1007 } 1008 KPREEMPT_DISABLE(new_lwp); 1009 if (__predict_true(new_lwp->l_proc->p_vmspace)) 1010 pmap_activate(new_lwp); 1011 spl0(); 1012 1013 /* Note trip through cpu_switchto(). */ 1014 pserialize_switchpoint(); 1015 1016 LOCKDEBUG_BARRIER(NULL, 0); 1017 KPREEMPT_ENABLE(new_lwp); 1018 if ((new_lwp->l_pflag & LP_MPSAFE) == 0) { 1019 KERNEL_LOCK(1, new_lwp); 1020 } 1021 } 1022 1023 /* 1024 * Exit an LWP. 1025 */ 1026 void 1027 lwp_exit(struct lwp *l) 1028 { 1029 struct proc *p = l->l_proc; 1030 struct lwp *l2; 1031 bool current; 1032 1033 current = (l == curlwp); 1034 1035 KASSERT(current || (l->l_stat == LSIDL && l->l_target_cpu == NULL)); 1036 KASSERT(p == curproc); 1037 1038 SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0); 1039 1040 /* 1041 * Verify that we hold no locks other than the kernel lock. 1042 */ 1043 LOCKDEBUG_BARRIER(&kernel_lock, 0); 1044 1045 /* 1046 * If we are the last live LWP in a process, we need to exit the 1047 * entire process. We do so with an exit status of zero, because 1048 * it's a "controlled" exit, and because that's what Solaris does. 1049 * 1050 * We are not quite a zombie yet, but for accounting purposes we 1051 * must increment the count of zombies here. 1052 * 1053 * Note: the last LWP's specificdata will be deleted here. 1054 */ 1055 mutex_enter(p->p_lock); 1056 if (p->p_nlwps - p->p_nzlwps == 1) { 1057 KASSERT(current == true); 1058 KASSERT(p != &proc0); 1059 /* XXXSMP kernel_lock not held */ 1060 exit1(l, 0, 0); 1061 /* NOTREACHED */ 1062 } 1063 p->p_nzlwps++; 1064 mutex_exit(p->p_lock); 1065 1066 if (p->p_emul->e_lwp_exit) 1067 (*p->p_emul->e_lwp_exit)(l); 1068 1069 /* Drop filedesc reference. */ 1070 fd_free(); 1071 1072 /* Release fstrans private data. */ 1073 fstrans_lwp_dtor(l); 1074 1075 /* Delete the specificdata while it's still safe to sleep. */ 1076 lwp_finispecific(l); 1077 1078 /* 1079 * Release our cached credentials. 1080 */ 1081 kauth_cred_free(l->l_cred); 1082 callout_destroy(&l->l_timeout_ch); 1083 1084 /* 1085 * If traced, report LWP exit event to the debugger. 1086 * 1087 * Remove the LWP from the global list. 
1088 * Free its LID from the PID namespace if needed. 1089 */ 1090 mutex_enter(proc_lock); 1091 1092 if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) == 1093 (PSL_TRACED|PSL_TRACELWP_EXIT)) { 1094 mutex_enter(p->p_lock); 1095 if (ISSET(p->p_sflag, PS_WEXIT)) { 1096 mutex_exit(p->p_lock); 1097 /* 1098 * We are exiting, bail out without informing parent 1099 * about a terminating LWP as it would deadlock. 1100 */ 1101 } else { 1102 eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid); 1103 mutex_enter(proc_lock); 1104 } 1105 } 1106 1107 LIST_REMOVE(l, l_list); 1108 if ((l->l_pflag & LP_PIDLID) != 0 && l->l_lid != p->p_pid) { 1109 proc_free_pid(l->l_lid); 1110 } 1111 mutex_exit(proc_lock); 1112 1113 /* 1114 * Get rid of all references to the LWP that others (e.g. procfs) 1115 * may have, and mark the LWP as a zombie. If the LWP is detached, 1116 * mark it waiting for collection in the proc structure. Note that 1117 * before we can do that, we need to free any other dead, deatched 1118 * LWP waiting to meet its maker. 1119 */ 1120 mutex_enter(p->p_lock); 1121 lwp_drainrefs(l); 1122 1123 if ((l->l_prflag & LPR_DETACHED) != 0) { 1124 while ((l2 = p->p_zomblwp) != NULL) { 1125 p->p_zomblwp = NULL; 1126 lwp_free(l2, false, false);/* releases proc mutex */ 1127 mutex_enter(p->p_lock); 1128 l->l_refcnt++; 1129 lwp_drainrefs(l); 1130 } 1131 p->p_zomblwp = l; 1132 } 1133 1134 /* 1135 * If we find a pending signal for the process and we have been 1136 * asked to check for signals, then we lose: arrange to have 1137 * all other LWPs in the process check for signals. 1138 */ 1139 if ((l->l_flag & LW_PENDSIG) != 0 && 1140 firstsig(&p->p_sigpend.sp_set) != 0) { 1141 LIST_FOREACH(l2, &p->p_lwps, l_sibling) { 1142 lwp_lock(l2); 1143 l2->l_flag |= LW_PENDSIG; 1144 lwp_unlock(l2); 1145 } 1146 } 1147 1148 /* 1149 * Release any PCU resources before becoming a zombie. 1150 */ 1151 pcu_discard_all(l); 1152 1153 lwp_lock(l); 1154 l->l_stat = LSZOMB; 1155 if (l->l_name != NULL) { 1156 strcpy(l->l_name, "(zombie)"); 1157 } 1158 lwp_unlock(l); 1159 p->p_nrlwps--; 1160 cv_broadcast(&p->p_lwpcv); 1161 if (l->l_lwpctl != NULL) 1162 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED; 1163 mutex_exit(p->p_lock); 1164 1165 /* 1166 * We can no longer block. At this point, lwp_free() may already 1167 * be gunning for us. On a multi-CPU system, we may be off p_lwps. 1168 * 1169 * Free MD LWP resources. 1170 */ 1171 cpu_lwp_free(l, 0); 1172 1173 if (current) { 1174 pmap_deactivate(l); 1175 1176 /* 1177 * Release the kernel lock, and switch away into 1178 * oblivion. 1179 */ 1180 #ifdef notyet 1181 /* XXXSMP hold in lwp_userret() */ 1182 KERNEL_UNLOCK_LAST(l); 1183 #else 1184 KERNEL_UNLOCK_ALL(l, NULL); 1185 #endif 1186 lwp_exit_switchaway(l); 1187 } 1188 } 1189 1190 /* 1191 * Free a dead LWP's remaining resources. 1192 * 1193 * XXXLWP limits. 1194 */ 1195 void 1196 lwp_free(struct lwp *l, bool recycle, bool last) 1197 { 1198 struct proc *p = l->l_proc; 1199 struct rusage *ru; 1200 ksiginfoq_t kq; 1201 1202 KASSERT(l != curlwp); 1203 KASSERT(last || mutex_owned(p->p_lock)); 1204 1205 /* 1206 * We use the process credentials instead of the lwp credentials here 1207 * because the lwp credentials maybe cached (just after a setuid call) 1208 * and we don't want pay for syncing, since the lwp is going away 1209 * anyway 1210 */ 1211 if (p != &proc0 && p->p_nlwps != 1) 1212 (void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1); 1213 /* 1214 * If this was not the last LWP in the process, then adjust 1215 * counters and unlock. 
1216 */ 1217 if (!last) { 1218 /* 1219 * Add the LWP's run time to the process' base value. 1220 * This needs to co-incide with coming off p_lwps. 1221 */ 1222 bintime_add(&p->p_rtime, &l->l_rtime); 1223 p->p_pctcpu += l->l_pctcpu; 1224 ru = &p->p_stats->p_ru; 1225 ruadd(ru, &l->l_ru); 1226 ru->ru_nvcsw += (l->l_ncsw - l->l_nivcsw); 1227 ru->ru_nivcsw += l->l_nivcsw; 1228 LIST_REMOVE(l, l_sibling); 1229 p->p_nlwps--; 1230 p->p_nzlwps--; 1231 if ((l->l_prflag & LPR_DETACHED) != 0) 1232 p->p_ndlwps--; 1233 1234 /* 1235 * Have any LWPs sleeping in lwp_wait() recheck for 1236 * deadlock. 1237 */ 1238 cv_broadcast(&p->p_lwpcv); 1239 mutex_exit(p->p_lock); 1240 } 1241 1242 #ifdef MULTIPROCESSOR 1243 /* 1244 * In the unlikely event that the LWP is still on the CPU, 1245 * then spin until it has switched away. We need to release 1246 * all locks to avoid deadlock against interrupt handlers on 1247 * the target CPU. 1248 */ 1249 if ((l->l_pflag & LP_RUNNING) != 0 || l->l_cpu->ci_curlwp == l) { 1250 int count; 1251 (void)count; /* XXXgcc */ 1252 KERNEL_UNLOCK_ALL(curlwp, &count); 1253 while ((l->l_pflag & LP_RUNNING) != 0 || 1254 l->l_cpu->ci_curlwp == l) 1255 SPINLOCK_BACKOFF_HOOK; 1256 KERNEL_LOCK(count, curlwp); 1257 } 1258 #endif 1259 1260 /* 1261 * Destroy the LWP's remaining signal information. 1262 */ 1263 ksiginfo_queue_init(&kq); 1264 sigclear(&l->l_sigpend, NULL, &kq); 1265 ksiginfo_queue_drain(&kq); 1266 cv_destroy(&l->l_sigcv); 1267 cv_destroy(&l->l_waitcv); 1268 1269 /* 1270 * Free lwpctl structure and affinity. 1271 */ 1272 if (l->l_lwpctl) { 1273 lwp_ctl_free(l); 1274 } 1275 if (l->l_affinity) { 1276 kcpuset_unuse(l->l_affinity, NULL); 1277 l->l_affinity = NULL; 1278 } 1279 1280 /* 1281 * Free the LWP's turnstile and the LWP structure itself unless the 1282 * caller wants to recycle them. Also, free the scheduler specific 1283 * data. 1284 * 1285 * We can't return turnstile0 to the pool (it didn't come from it), 1286 * so if it comes up just drop it quietly and move on. 1287 * 1288 * We don't recycle the VM resources at this time. 1289 */ 1290 1291 if (!recycle && l->l_ts != &turnstile0) 1292 pool_cache_put(turnstile_cache, l->l_ts); 1293 if (l->l_name != NULL) 1294 kmem_free(l->l_name, MAXCOMLEN); 1295 1296 cpu_lwp_free2(l); 1297 uvm_lwp_exit(l); 1298 1299 KASSERT(SLIST_EMPTY(&l->l_pi_lenders)); 1300 KASSERT(l->l_inheritedprio == -1); 1301 KASSERT(l->l_blcnt == 0); 1302 kdtrace_thread_dtor(NULL, l); 1303 if (!recycle) 1304 pool_cache_put(lwp_cache, l); 1305 } 1306 1307 /* 1308 * Migrate the LWP to the another CPU. Unlocks the LWP. 1309 */ 1310 void 1311 lwp_migrate(lwp_t *l, struct cpu_info *tci) 1312 { 1313 struct schedstate_percpu *tspc; 1314 int lstat = l->l_stat; 1315 1316 KASSERT(lwp_locked(l, NULL)); 1317 KASSERT(tci != NULL); 1318 1319 /* If LWP is still on the CPU, it must be handled like LSONPROC */ 1320 if ((l->l_pflag & LP_RUNNING) != 0) { 1321 lstat = LSONPROC; 1322 } 1323 1324 /* 1325 * The destination CPU could be changed while previous migration 1326 * was not finished. 
1327 */ 1328 if (l->l_target_cpu != NULL) { 1329 l->l_target_cpu = tci; 1330 lwp_unlock(l); 1331 return; 1332 } 1333 1334 /* Nothing to do if trying to migrate to the same CPU */ 1335 if (l->l_cpu == tci) { 1336 lwp_unlock(l); 1337 return; 1338 } 1339 1340 KASSERT(l->l_target_cpu == NULL); 1341 tspc = &tci->ci_schedstate; 1342 switch (lstat) { 1343 case LSRUN: 1344 l->l_target_cpu = tci; 1345 break; 1346 case LSIDL: 1347 l->l_cpu = tci; 1348 lwp_unlock_to(l, tspc->spc_mutex); 1349 return; 1350 case LSSLEEP: 1351 l->l_cpu = tci; 1352 break; 1353 case LSSTOP: 1354 case LSSUSPENDED: 1355 l->l_cpu = tci; 1356 if (l->l_wchan == NULL) { 1357 lwp_unlock_to(l, tspc->spc_lwplock); 1358 return; 1359 } 1360 break; 1361 case LSONPROC: 1362 l->l_target_cpu = tci; 1363 spc_lock(l->l_cpu); 1364 cpu_need_resched(l->l_cpu, RESCHED_KPREEMPT); 1365 spc_unlock(l->l_cpu); 1366 break; 1367 } 1368 lwp_unlock(l); 1369 } 1370 1371 /* 1372 * Find the LWP in the process. Arguments may be zero, in such case, 1373 * the calling process and first LWP in the list will be used. 1374 * On success - returns proc locked. 1375 */ 1376 struct lwp * 1377 lwp_find2(pid_t pid, lwpid_t lid) 1378 { 1379 proc_t *p; 1380 lwp_t *l; 1381 1382 /* Find the process. */ 1383 if (pid != 0) { 1384 mutex_enter(proc_lock); 1385 p = proc_find(pid); 1386 if (p == NULL) { 1387 mutex_exit(proc_lock); 1388 return NULL; 1389 } 1390 mutex_enter(p->p_lock); 1391 mutex_exit(proc_lock); 1392 } else { 1393 p = curlwp->l_proc; 1394 mutex_enter(p->p_lock); 1395 } 1396 /* Find the thread. */ 1397 if (lid != 0) { 1398 l = lwp_find(p, lid); 1399 } else { 1400 l = LIST_FIRST(&p->p_lwps); 1401 } 1402 if (l == NULL) { 1403 mutex_exit(p->p_lock); 1404 } 1405 return l; 1406 } 1407 1408 /* 1409 * Look up a live LWP within the specified process. 1410 * 1411 * Must be called with p->p_lock held. 1412 */ 1413 struct lwp * 1414 lwp_find(struct proc *p, lwpid_t id) 1415 { 1416 struct lwp *l; 1417 1418 KASSERT(mutex_owned(p->p_lock)); 1419 1420 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1421 if (l->l_lid == id) 1422 break; 1423 } 1424 1425 /* 1426 * No need to lock - all of these conditions will 1427 * be visible with the process level mutex held. 1428 */ 1429 if (l != NULL && (l->l_stat == LSIDL || l->l_stat == LSZOMB)) 1430 l = NULL; 1431 1432 return l; 1433 } 1434 1435 /* 1436 * Update an LWP's cached credentials to mirror the process' master copy. 1437 * 1438 * This happens early in the syscall path, on user trap, and on LWP 1439 * creation. A long-running LWP can also voluntarily choose to update 1440 * its credentials by calling this routine. This may be called from 1441 * LWP_CACHE_CREDS(), which checks l->l_cred != p->p_cred beforehand. 1442 */ 1443 void 1444 lwp_update_creds(struct lwp *l) 1445 { 1446 kauth_cred_t oc; 1447 struct proc *p; 1448 1449 p = l->l_proc; 1450 oc = l->l_cred; 1451 1452 mutex_enter(p->p_lock); 1453 kauth_cred_hold(p->p_cred); 1454 l->l_cred = p->p_cred; 1455 l->l_prflag &= ~LPR_CRMOD; 1456 mutex_exit(p->p_lock); 1457 if (oc != NULL) 1458 kauth_cred_free(oc); 1459 } 1460 1461 /* 1462 * Verify that an LWP is locked, and optionally verify that the lock matches 1463 * one we specify. 1464 */ 1465 int 1466 lwp_locked(struct lwp *l, kmutex_t *mtx) 1467 { 1468 kmutex_t *cur = l->l_mutex; 1469 1470 return mutex_owned(cur) && (mtx == cur || mtx == NULL); 1471 } 1472 1473 /* 1474 * Lend a new mutex to an LWP. The old mutex must be held. 
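 *
 * Illustrative sketch (an assumption-laden example, not code from this
 * file; the sq_mutex name follows the overview comment above): lock
 * lending is how an LWP is handed from one lock domain to another, for
 * example from the CPU's spc_lwplock to a sleep queue lock:
 *
 *	lwp_lock(l);			(l_mutex is, say, spc_lwplock)
 *	... place l on the sleep queue ...
 *	lwp_unlock_to(l, sq_mutex);	(lend sq_mutex, drop the old lock)
 *
 * lwp_setlock() only lends the new mutex; the caller still owns, and must
 * eventually release, the old one.  lwp_unlock_to(), below, does both.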
1475 */ 1476 void 1477 lwp_setlock(struct lwp *l, kmutex_t *mtx) 1478 { 1479 1480 KASSERT(mutex_owned(l->l_mutex)); 1481 1482 membar_exit(); 1483 l->l_mutex = mtx; 1484 } 1485 1486 /* 1487 * Lend a new mutex to an LWP, and release the old mutex. The old mutex 1488 * must be held. 1489 */ 1490 void 1491 lwp_unlock_to(struct lwp *l, kmutex_t *mtx) 1492 { 1493 kmutex_t *old; 1494 1495 KASSERT(lwp_locked(l, NULL)); 1496 1497 old = l->l_mutex; 1498 membar_exit(); 1499 l->l_mutex = mtx; 1500 mutex_spin_exit(old); 1501 } 1502 1503 int 1504 lwp_trylock(struct lwp *l) 1505 { 1506 kmutex_t *old; 1507 1508 for (;;) { 1509 if (!mutex_tryenter(old = l->l_mutex)) 1510 return 0; 1511 if (__predict_true(l->l_mutex == old)) 1512 return 1; 1513 mutex_spin_exit(old); 1514 } 1515 } 1516 1517 void 1518 lwp_unsleep(lwp_t *l, bool cleanup) 1519 { 1520 1521 KASSERT(mutex_owned(l->l_mutex)); 1522 (*l->l_syncobj->sobj_unsleep)(l, cleanup); 1523 } 1524 1525 /* 1526 * Handle exceptions for mi_userret(). Called if a member of LW_USERRET is 1527 * set. 1528 */ 1529 void 1530 lwp_userret(struct lwp *l) 1531 { 1532 struct proc *p; 1533 int sig; 1534 1535 KASSERT(l == curlwp); 1536 KASSERT(l->l_stat == LSONPROC); 1537 p = l->l_proc; 1538 1539 #ifndef __HAVE_FAST_SOFTINTS 1540 /* Run pending soft interrupts. */ 1541 if (l->l_cpu->ci_data.cpu_softints != 0) 1542 softint_overlay(); 1543 #endif 1544 1545 /* 1546 * It is safe to do this read unlocked on a MP system.. 1547 */ 1548 while ((l->l_flag & LW_USERRET) != 0) { 1549 /* 1550 * Process pending signals first, unless the process 1551 * is dumping core or exiting, where we will instead 1552 * enter the LW_WSUSPEND case below. 1553 */ 1554 if ((l->l_flag & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == 1555 LW_PENDSIG) { 1556 mutex_enter(p->p_lock); 1557 while ((sig = issignal(l)) != 0) 1558 postsig(sig); 1559 mutex_exit(p->p_lock); 1560 } 1561 1562 /* 1563 * Core-dump or suspend pending. 1564 * 1565 * In case of core dump, suspend ourselves, so that the kernel 1566 * stack and therefore the userland registers saved in the 1567 * trapframe are around for coredump() to write them out. 1568 * We also need to save any PCU resources that we have so that 1569 * they accessible for coredump(). We issue a wakeup on 1570 * p->p_lwpcv so that sigexit() will write the core file out 1571 * once all other LWPs are suspended. 1572 */ 1573 if ((l->l_flag & LW_WSUSPEND) != 0) { 1574 pcu_save_all(l); 1575 mutex_enter(p->p_lock); 1576 p->p_nrlwps--; 1577 cv_broadcast(&p->p_lwpcv); 1578 lwp_lock(l); 1579 l->l_stat = LSSUSPENDED; 1580 lwp_unlock(l); 1581 mutex_exit(p->p_lock); 1582 lwp_lock(l); 1583 mi_switch(l); 1584 } 1585 1586 /* Process is exiting. */ 1587 if ((l->l_flag & LW_WEXIT) != 0) { 1588 lwp_exit(l); 1589 KASSERT(0); 1590 /* NOTREACHED */ 1591 } 1592 1593 /* update lwpctl processor (for vfork child_return) */ 1594 if (l->l_flag & LW_LWPCTL) { 1595 lwp_lock(l); 1596 KASSERT(kpreempt_disabled()); 1597 l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu); 1598 l->l_lwpctl->lc_pctr++; 1599 l->l_flag &= ~LW_LWPCTL; 1600 lwp_unlock(l); 1601 } 1602 } 1603 } 1604 1605 /* 1606 * Force an LWP to enter the kernel, to take a trip through lwp_userret(). 1607 */ 1608 void 1609 lwp_need_userret(struct lwp *l) 1610 { 1611 KASSERT(lwp_locked(l, NULL)); 1612 1613 /* 1614 * Since the tests in lwp_userret() are done unlocked, make sure 1615 * that the condition will be seen before forcing the LWP to enter 1616 * kernel mode. 
1617 */ 1618 membar_producer(); 1619 cpu_signotify(l); 1620 } 1621 1622 /* 1623 * Add one reference to an LWP. This will prevent the LWP from 1624 * exiting, thus keep the lwp structure and PCB around to inspect. 1625 */ 1626 void 1627 lwp_addref(struct lwp *l) 1628 { 1629 1630 KASSERT(mutex_owned(l->l_proc->p_lock)); 1631 KASSERT(l->l_stat != LSZOMB); 1632 KASSERT(l->l_refcnt != 0); 1633 1634 l->l_refcnt++; 1635 } 1636 1637 /* 1638 * Remove one reference to an LWP. If this is the last reference, 1639 * then we must finalize the LWP's death. 1640 */ 1641 void 1642 lwp_delref(struct lwp *l) 1643 { 1644 struct proc *p = l->l_proc; 1645 1646 mutex_enter(p->p_lock); 1647 lwp_delref2(l); 1648 mutex_exit(p->p_lock); 1649 } 1650 1651 /* 1652 * Remove one reference to an LWP. If this is the last reference, 1653 * then we must finalize the LWP's death. The proc mutex is held 1654 * on entry. 1655 */ 1656 void 1657 lwp_delref2(struct lwp *l) 1658 { 1659 struct proc *p = l->l_proc; 1660 1661 KASSERT(mutex_owned(p->p_lock)); 1662 KASSERT(l->l_stat != LSZOMB); 1663 KASSERT(l->l_refcnt > 0); 1664 if (--l->l_refcnt == 0) 1665 cv_broadcast(&p->p_lwpcv); 1666 } 1667 1668 /* 1669 * Drain all references to the current LWP. 1670 */ 1671 void 1672 lwp_drainrefs(struct lwp *l) 1673 { 1674 struct proc *p = l->l_proc; 1675 1676 KASSERT(mutex_owned(p->p_lock)); 1677 KASSERT(l->l_refcnt != 0); 1678 1679 l->l_refcnt--; 1680 while (l->l_refcnt != 0) 1681 cv_wait(&p->p_lwpcv, p->p_lock); 1682 } 1683 1684 /* 1685 * Return true if the specified LWP is 'alive'. Only p->p_lock need 1686 * be held. 1687 */ 1688 bool 1689 lwp_alive(lwp_t *l) 1690 { 1691 1692 KASSERT(mutex_owned(l->l_proc->p_lock)); 1693 1694 switch (l->l_stat) { 1695 case LSSLEEP: 1696 case LSRUN: 1697 case LSONPROC: 1698 case LSSTOP: 1699 case LSSUSPENDED: 1700 return true; 1701 default: 1702 return false; 1703 } 1704 } 1705 1706 /* 1707 * Return first live LWP in the process. 1708 */ 1709 lwp_t * 1710 lwp_find_first(proc_t *p) 1711 { 1712 lwp_t *l; 1713 1714 KASSERT(mutex_owned(p->p_lock)); 1715 1716 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1717 if (lwp_alive(l)) { 1718 return l; 1719 } 1720 } 1721 1722 return NULL; 1723 } 1724 1725 /* 1726 * Allocate a new lwpctl structure for a user LWP. 1727 */ 1728 int 1729 lwp_ctl_alloc(vaddr_t *uaddr) 1730 { 1731 lcproc_t *lp; 1732 u_int bit, i, offset; 1733 struct uvm_object *uao; 1734 int error; 1735 lcpage_t *lcp; 1736 proc_t *p; 1737 lwp_t *l; 1738 1739 l = curlwp; 1740 p = l->l_proc; 1741 1742 /* don't allow a vforked process to create lwp ctls */ 1743 if (p->p_lflag & PL_PPWAIT) 1744 return EBUSY; 1745 1746 if (l->l_lcpage != NULL) { 1747 lcp = l->l_lcpage; 1748 *uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr; 1749 return 0; 1750 } 1751 1752 /* First time around, allocate header structure for the process. */ 1753 if ((lp = p->p_lwpctl) == NULL) { 1754 lp = kmem_alloc(sizeof(*lp), KM_SLEEP); 1755 mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE); 1756 lp->lp_uao = NULL; 1757 TAILQ_INIT(&lp->lp_pages); 1758 mutex_enter(p->p_lock); 1759 if (p->p_lwpctl == NULL) { 1760 p->p_lwpctl = lp; 1761 mutex_exit(p->p_lock); 1762 } else { 1763 mutex_exit(p->p_lock); 1764 mutex_destroy(&lp->lp_lock); 1765 kmem_free(lp, sizeof(*lp)); 1766 lp = p->p_lwpctl; 1767 } 1768 } 1769 1770 /* 1771 * Set up an anonymous memory region to hold the shared pages. 1772 * Map them into the process' address space. The user vmspace 1773 * gets the first reference on the UAO. 
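 *
 * Illustrative arithmetic (restating what the allocation code below does,
 * not adding to it): each page holds LWPCTL_PER_PAGE lwpctl_t slots,
 * tracked by a bitmap.  For a slot found at bitmap word 'i', bit 'bit':
 *
 *	offset = (i << 5) + bit;
 *	l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset;
 *	*uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t);
 *
 * so the kernel and the process see the same slot through the two
 * mappings of the same page.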
1774 */ 1775 mutex_enter(&lp->lp_lock); 1776 if (lp->lp_uao == NULL) { 1777 lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0); 1778 lp->lp_cur = 0; 1779 lp->lp_max = LWPCTL_UAREA_SZ; 1780 lp->lp_uva = p->p_emul->e_vm_default_addr(p, 1781 (vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ, 1782 p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN); 1783 error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva, 1784 LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW, 1785 UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0)); 1786 if (error != 0) { 1787 uao_detach(lp->lp_uao); 1788 lp->lp_uao = NULL; 1789 mutex_exit(&lp->lp_lock); 1790 return error; 1791 } 1792 } 1793 1794 /* Get a free block and allocate for this LWP. */ 1795 TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) { 1796 if (lcp->lcp_nfree != 0) 1797 break; 1798 } 1799 if (lcp == NULL) { 1800 /* Nothing available - try to set up a free page. */ 1801 if (lp->lp_cur == lp->lp_max) { 1802 mutex_exit(&lp->lp_lock); 1803 return ENOMEM; 1804 } 1805 lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP); 1806 1807 /* 1808 * Wire the next page down in kernel space. Since this 1809 * is a new mapping, we must add a reference. 1810 */ 1811 uao = lp->lp_uao; 1812 (*uao->pgops->pgo_reference)(uao); 1813 lcp->lcp_kaddr = vm_map_min(kernel_map); 1814 error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE, 1815 uao, lp->lp_cur, PAGE_SIZE, 1816 UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, 1817 UVM_INH_NONE, UVM_ADV_RANDOM, 0)); 1818 if (error != 0) { 1819 mutex_exit(&lp->lp_lock); 1820 kmem_free(lcp, LWPCTL_LCPAGE_SZ); 1821 (*uao->pgops->pgo_detach)(uao); 1822 return error; 1823 } 1824 error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr, 1825 lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0); 1826 if (error != 0) { 1827 mutex_exit(&lp->lp_lock); 1828 uvm_unmap(kernel_map, lcp->lcp_kaddr, 1829 lcp->lcp_kaddr + PAGE_SIZE); 1830 kmem_free(lcp, LWPCTL_LCPAGE_SZ); 1831 return error; 1832 } 1833 /* Prepare the page descriptor and link into the list. */ 1834 lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur; 1835 lp->lp_cur += PAGE_SIZE; 1836 lcp->lcp_nfree = LWPCTL_PER_PAGE; 1837 lcp->lcp_rotor = 0; 1838 memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ); 1839 TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain); 1840 } 1841 for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) { 1842 if (++i >= LWPCTL_BITMAP_ENTRIES) 1843 i = 0; 1844 } 1845 bit = ffs(lcp->lcp_bitmap[i]) - 1; 1846 lcp->lcp_bitmap[i] ^= (1U << bit); 1847 lcp->lcp_rotor = i; 1848 lcp->lcp_nfree--; 1849 l->l_lcpage = lcp; 1850 offset = (i << 5) + bit; 1851 l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset; 1852 *uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t); 1853 mutex_exit(&lp->lp_lock); 1854 1855 KPREEMPT_DISABLE(l); 1856 l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu()); 1857 KPREEMPT_ENABLE(l); 1858 1859 return 0; 1860 } 1861 1862 /* 1863 * Free an lwpctl structure back to the per-process list. 
1864 */ 1865 void 1866 lwp_ctl_free(lwp_t *l) 1867 { 1868 struct proc *p = l->l_proc; 1869 lcproc_t *lp; 1870 lcpage_t *lcp; 1871 u_int map, offset; 1872 1873 /* don't free a lwp context we borrowed for vfork */ 1874 if (p->p_lflag & PL_PPWAIT) { 1875 l->l_lwpctl = NULL; 1876 return; 1877 } 1878 1879 lp = p->p_lwpctl; 1880 KASSERT(lp != NULL); 1881 1882 lcp = l->l_lcpage; 1883 offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr); 1884 KASSERT(offset < LWPCTL_PER_PAGE); 1885 1886 mutex_enter(&lp->lp_lock); 1887 lcp->lcp_nfree++; 1888 map = offset >> 5; 1889 lcp->lcp_bitmap[map] |= (1U << (offset & 31)); 1890 if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0) 1891 lcp->lcp_rotor = map; 1892 if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) { 1893 TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain); 1894 TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain); 1895 } 1896 mutex_exit(&lp->lp_lock); 1897 } 1898 1899 /* 1900 * Process is exiting; tear down lwpctl state. This can only be safely 1901 * called by the last LWP in the process. 1902 */ 1903 void 1904 lwp_ctl_exit(void) 1905 { 1906 lcpage_t *lcp, *next; 1907 lcproc_t *lp; 1908 proc_t *p; 1909 lwp_t *l; 1910 1911 l = curlwp; 1912 l->l_lwpctl = NULL; 1913 l->l_lcpage = NULL; 1914 p = l->l_proc; 1915 lp = p->p_lwpctl; 1916 1917 KASSERT(lp != NULL); 1918 KASSERT(p->p_nlwps == 1); 1919 1920 for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) { 1921 next = TAILQ_NEXT(lcp, lcp_chain); 1922 uvm_unmap(kernel_map, lcp->lcp_kaddr, 1923 lcp->lcp_kaddr + PAGE_SIZE); 1924 kmem_free(lcp, LWPCTL_LCPAGE_SZ); 1925 } 1926 1927 if (lp->lp_uao != NULL) { 1928 uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva, 1929 lp->lp_uva + LWPCTL_UAREA_SZ); 1930 } 1931 1932 mutex_destroy(&lp->lp_lock); 1933 kmem_free(lp, sizeof(*lp)); 1934 p->p_lwpctl = NULL; 1935 } 1936 1937 /* 1938 * Return the current LWP's "preemption counter". Used to detect 1939 * preemption across operations that can tolerate preemption without 1940 * crashing, but which may generate incorrect results if preempted. 1941 */ 1942 uint64_t 1943 lwp_pctr(void) 1944 { 1945 1946 return curlwp->l_ncsw; 1947 } 1948 1949 /* 1950 * Set an LWP's private data pointer. 1951 */ 1952 int 1953 lwp_setprivate(struct lwp *l, void *ptr) 1954 { 1955 int error = 0; 1956 1957 l->l_private = ptr; 1958 #ifdef __HAVE_CPU_LWP_SETPRIVATE 1959 error = cpu_lwp_setprivate(l, ptr); 1960 #endif 1961 return error; 1962 } 1963 1964 #if defined(DDB) 1965 #include <machine/pcb.h> 1966 1967 void 1968 lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...)) 1969 { 1970 lwp_t *l; 1971 1972 LIST_FOREACH(l, &alllwp, l_list) { 1973 uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l); 1974 1975 if (addr < stack || stack + KSTACK_SIZE <= addr) { 1976 continue; 1977 } 1978 (*pr)("%p is %p+%zu, LWP %p's stack\n", 1979 (void *)addr, (void *)stack, 1980 (size_t)(addr - stack), l); 1981 } 1982 } 1983 #endif /* defined(DDB) */ 1984
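
/*
 * Illustrative sketch (not part of the implementation): lwp_pctr(),
 * defined above, is typically used to detect whether the current LWP was
 * preempted, and possibly migrated, across a lock-free section:
 *
 *	uint64_t pctr;
 *
 *	do {
 *		pctr = lwp_pctr();
 *		... compute using curcpu()-local data ...
 *	} while (pctr != lwp_pctr());
 */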