1 /* 2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.120 2008/10/26 04:29:19 sephe Exp $ 35 */ 36 37 /* 38 * Each cpu in a system has its own self-contained light weight kernel 39 * thread scheduler, which means that generally speaking we only need 40 * to use a critical section to avoid problems. Foreign thread 41 * scheduling is queued via (async) IPIs. 
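 *
 * An illustrative sketch of the model (not lifted from any real caller):
 * code that only touches its own cpu's scheduler state needs nothing
 * heavier than a critical section,
 *
 *	crit_enter();
 *	... examine or adjust this cpu's run queues ...
 *	crit_exit();
 *
 * whereas scheduling a thread owned by another cpu is handed off to that
 * cpu, e.g. lwkt_schedule() forwards such requests with lwkt_send_ipiq()
 * instead of touching the remote queues directly.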
42 */ 43 #include "opt_ddb.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/proc.h> 49 #include <sys/rtprio.h> 50 #include <sys/queue.h> 51 #include <sys/sysctl.h> 52 #include <sys/kthread.h> 53 #include <machine/cpu.h> 54 #include <sys/lock.h> 55 #include <sys/caps.h> 56 #include <sys/spinlock.h> 57 #include <sys/ktr.h> 58 59 #include <sys/thread2.h> 60 #include <sys/spinlock2.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_param.h> 64 #include <vm/vm_kern.h> 65 #include <vm/vm_object.h> 66 #include <vm/vm_page.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_pager.h> 69 #include <vm/vm_extern.h> 70 71 #include <machine/stdarg.h> 72 #include <machine/smp.h> 73 74 #ifdef DDB 75 #include <ddb/ddb.h> 76 #endif 77 78 static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads"); 79 80 static int untimely_switch = 0; 81 #ifdef INVARIANTS 82 static int panic_on_cscount = 0; 83 #endif 84 static __int64_t switch_count = 0; 85 static __int64_t preempt_hit = 0; 86 static __int64_t preempt_miss = 0; 87 static __int64_t preempt_weird = 0; 88 static __int64_t token_contention_count = 0; 89 static __int64_t mplock_contention_count = 0; 90 static int lwkt_use_spin_port; 91 #ifdef SMP 92 static int chain_mplock = 0; 93 #endif 94 static struct objcache *thread_cache; 95 96 volatile cpumask_t mp_lock_contention_mask; 97 98 /* 99 * We can make all thread ports use the spin backend instead of the thread 100 * backend. This should only be set to debug the spin backend. 101 */ 102 TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port); 103 104 SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, ""); 105 #ifdef INVARIANTS 106 SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, ""); 107 #endif 108 #ifdef SMP 109 SYSCTL_INT(_lwkt, OID_AUTO, chain_mplock, CTLFLAG_RW, &chain_mplock, 0, ""); 110 #endif 111 SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, ""); 112 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, ""); 113 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, ""); 114 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, ""); 115 #ifdef INVARIANTS 116 SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, 117 &token_contention_count, 0, "spinning due to token contention"); 118 SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW, 119 &mplock_contention_count, 0, "spinning due to MPLOCK contention"); 120 #endif 121 122 /* 123 * Kernel Trace 124 */ 125 #if !defined(KTR_GIANT_CONTENTION) 126 #define KTR_GIANT_CONTENTION KTR_ALL 127 #endif 128 129 KTR_INFO_MASTER(giant); 130 KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *)); 131 KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *)); 132 133 #define loggiant(name) KTR_LOG(giant_ ## name, curthread) 134 135 /* 136 * These helper procedures handle the runq, they can only be called from 137 * within a critical section. 138 * 139 * WARNING! Prior to SMP being brought up it is possible to enqueue and 140 * dequeue threads belonging to other cpus, so be sure to use td->td_gd 141 * instead of 'mycpu' when referencing the globaldata structure. Once 142 * SMP live enqueuing and dequeueing only occurs on the current cpu. 
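 *
 * For example (a reading of the helpers below, not an additional rule):
 * when a thread is enqueued for another cpu during early boot,
 * _lwkt_enqueue() indexes td->td_gd->gd_tdrunq[], the owning cpu's queues,
 * which would be the wrong queues if 'mycpu' were used instead.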
143 */ 144 static __inline 145 void 146 _lwkt_dequeue(thread_t td) 147 { 148 if (td->td_flags & TDF_RUNQ) { 149 int nq = td->td_pri & TDPRI_MASK; 150 struct globaldata *gd = td->td_gd; 151 152 td->td_flags &= ~TDF_RUNQ; 153 TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq); 154 /* runqmask is passively cleaned up by the switcher */ 155 } 156 } 157 158 static __inline 159 void 160 _lwkt_enqueue(thread_t td) 161 { 162 if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_TSLEEPQ|TDF_BLOCKQ)) == 0) { 163 int nq = td->td_pri & TDPRI_MASK; 164 struct globaldata *gd = td->td_gd; 165 166 td->td_flags |= TDF_RUNQ; 167 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq); 168 gd->gd_runqmask |= 1 << nq; 169 } 170 } 171 172 static __boolean_t 173 _lwkt_thread_ctor(void *obj, void *privdata, int ocflags) 174 { 175 struct thread *td = (struct thread *)obj; 176 177 td->td_kstack = NULL; 178 td->td_kstack_size = 0; 179 td->td_flags = TDF_ALLOCATED_THREAD; 180 return (1); 181 } 182 183 static void 184 _lwkt_thread_dtor(void *obj, void *privdata) 185 { 186 struct thread *td = (struct thread *)obj; 187 188 KASSERT(td->td_flags & TDF_ALLOCATED_THREAD, 189 ("_lwkt_thread_dtor: not allocated from objcache")); 190 KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack && 191 td->td_kstack_size > 0, 192 ("_lwkt_thread_dtor: corrupted stack")); 193 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); 194 } 195 196 /* 197 * Initialize the lwkt s/system. 198 */ 199 void 200 lwkt_init(void) 201 { 202 /* An objcache has 2 magazines per CPU so divide cache size by 2. */ 203 thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread), 204 NULL, CACHE_NTHREADS/2, 205 _lwkt_thread_ctor, _lwkt_thread_dtor, NULL); 206 } 207 208 /* 209 * Schedule a thread to run. As the current thread we can always safely 210 * schedule ourselves, and a shortcut procedure is provided for that 211 * function. 212 * 213 * (non-blocking, self contained on a per cpu basis) 214 */ 215 void 216 lwkt_schedule_self(thread_t td) 217 { 218 crit_enter_quick(td); 219 KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); 220 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 221 _lwkt_enqueue(td); 222 crit_exit_quick(td); 223 } 224 225 /* 226 * Deschedule a thread. 227 * 228 * (non-blocking, self contained on a per cpu basis) 229 */ 230 void 231 lwkt_deschedule_self(thread_t td) 232 { 233 crit_enter_quick(td); 234 _lwkt_dequeue(td); 235 crit_exit_quick(td); 236 } 237 238 /* 239 * LWKTs operate on a per-cpu basis 240 * 241 * WARNING! Called from early boot, 'mycpu' may not work yet. 242 */ 243 void 244 lwkt_gdinit(struct globaldata *gd) 245 { 246 int i; 247 248 for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i) 249 TAILQ_INIT(&gd->gd_tdrunq[i]); 250 gd->gd_runqmask = 0; 251 TAILQ_INIT(&gd->gd_tdallq); 252 } 253 254 /* 255 * Create a new thread. The thread must be associated with a process context 256 * or LWKT start address before it can be scheduled. If the target cpu is 257 * -1 the thread will be created on the current cpu. 258 * 259 * If you intend to create a thread without a process context this function 260 * does everything except load the startup and switcher function. 261 */ 262 thread_t 263 lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) 264 { 265 globaldata_t gd = mycpu; 266 void *stack; 267 268 /* 269 * If static thread storage is not supplied allocate a thread. 
Reuse 270 * a cached free thread if possible. gd_freetd is used to keep an exiting 271 * thread intact through the exit. 272 */ 273 if (td == NULL) { 274 if ((td = gd->gd_freetd) != NULL) 275 gd->gd_freetd = NULL; 276 else 277 td = objcache_get(thread_cache, M_WAITOK); 278 KASSERT((td->td_flags & 279 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD, 280 ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags)); 281 flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK); 282 } 283 284 /* 285 * Try to reuse cached stack. 286 */ 287 if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) { 288 if (flags & TDF_ALLOCATED_STACK) { 289 kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size); 290 stack = NULL; 291 } 292 } 293 if (stack == NULL) { 294 stack = (void *)kmem_alloc(&kernel_map, stksize); 295 flags |= TDF_ALLOCATED_STACK; 296 } 297 if (cpu < 0) 298 lwkt_init_thread(td, stack, stksize, flags, gd); 299 else 300 lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu)); 301 return(td); 302 } 303 304 /* 305 * Initialize a preexisting thread structure. This function is used by 306 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread. 307 * 308 * All threads start out in a critical section at a priority of 309 * TDPRI_KERN_DAEMON. Higher level code will modify the priority as 310 * appropriate. This function may send an IPI message when the 311 * requested cpu is not the current cpu and consequently gd_tdallq may 312 * not be initialized synchronously from the point of view of the originating 313 * cpu. 314 * 315 * NOTE! we have to be careful in regards to creating threads for other cpus 316 * if SMP has not yet been activated. 317 */ 318 #ifdef SMP 319 320 static void 321 lwkt_init_thread_remote(void *arg) 322 { 323 thread_t td = arg; 324 325 /* 326 * Protected by critical section held by IPI dispatch 327 */ 328 TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq); 329 } 330 331 #endif 332 333 void 334 lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, 335 struct globaldata *gd) 336 { 337 globaldata_t mygd = mycpu; 338 339 bzero(td, sizeof(struct thread)); 340 td->td_kstack = stack; 341 td->td_kstack_size = stksize; 342 td->td_flags = flags; 343 td->td_gd = gd; 344 td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT; 345 #ifdef SMP 346 if ((flags & TDF_MPSAFE) == 0) 347 td->td_mpcount = 1; 348 #endif 349 if (lwkt_use_spin_port) 350 lwkt_initport_spin(&td->td_msgport); 351 else 352 lwkt_initport_thread(&td->td_msgport, td); 353 pmap_init_thread(td); 354 #ifdef SMP 355 /* 356 * Normally initializing a thread for a remote cpu requires sending an 357 * IPI. However, the idlethread is setup before the other cpus are 358 * activated so we have to treat it as a special case. XXX manipulation 359 * of gd_tdallq requires the BGL. 360 */ 361 if (gd == mygd || td == &gd->gd_idlethread) { 362 crit_enter_gd(mygd); 363 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); 364 crit_exit_gd(mygd); 365 } else { 366 lwkt_send_ipiq(gd, lwkt_init_thread_remote, td); 367 } 368 #else 369 crit_enter_gd(mygd); 370 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); 371 crit_exit_gd(mygd); 372 #endif 373 } 374 375 void 376 lwkt_set_comm(thread_t td, const char *ctl, ...) 
377 { 378 __va_list va; 379 380 __va_start(va, ctl); 381 kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va); 382 __va_end(va); 383 } 384 385 void 386 lwkt_hold(thread_t td) 387 { 388 ++td->td_refs; 389 } 390 391 void 392 lwkt_rele(thread_t td) 393 { 394 KKASSERT(td->td_refs > 0); 395 --td->td_refs; 396 } 397 398 void 399 lwkt_wait_free(thread_t td) 400 { 401 while (td->td_refs) 402 tsleep(td, 0, "tdreap", hz); 403 } 404 405 void 406 lwkt_free_thread(thread_t td) 407 { 408 KASSERT((td->td_flags & TDF_RUNNING) == 0, 409 ("lwkt_free_thread: did not exit! %p", td)); 410 411 if (td->td_flags & TDF_ALLOCATED_THREAD) { 412 objcache_put(thread_cache, td); 413 } else if (td->td_flags & TDF_ALLOCATED_STACK) { 414 /* client-allocated struct with internally allocated stack */ 415 KASSERT(td->td_kstack && td->td_kstack_size > 0, 416 ("lwkt_free_thread: corrupted stack")); 417 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); 418 td->td_kstack = NULL; 419 td->td_kstack_size = 0; 420 } 421 } 422 423 424 /* 425 * Switch to the next runnable lwkt. If no LWKTs are runnable then 426 * switch to the idlethread. Switching must occur within a critical 427 * section to avoid races with the scheduling queue. 428 * 429 * We always have full control over our cpu's run queue. Other cpus 430 * that wish to manipulate our queue must use the cpu_*msg() calls to 431 * talk to our cpu, so a critical section is all that is needed and 432 * the result is very, very fast thread switching. 433 * 434 * The LWKT scheduler uses a fixed priority model and round-robins at 435 * each priority level. User process scheduling is a totally 436 * different beast and LWKT priorities should not be confused with 437 * user process priorities. 438 * 439 * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch() 440 * cleans it up. Note that the td_switch() function cannot do anything that 441 * requires the MP lock since the MP lock will have already been setup for 442 * the target thread (not the current thread). It's nice to have a scheduler 443 * that does not need the MP lock to work because it allows us to do some 444 * really cool high-performance MP lock optimizations. 445 * 446 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch() 447 * is not called by the current thread in the preemption case, only when 448 * the preempting thread blocks (in order to return to the original thread). 449 */ 450 void 451 lwkt_switch(void) 452 { 453 globaldata_t gd = mycpu; 454 thread_t td = gd->gd_curthread; 455 thread_t ntd; 456 #ifdef SMP 457 int mpheld; 458 #endif 459 460 /* 461 * Switching from within a 'fast' (non thread switched) interrupt or IPI 462 * is illegal. However, we may have to do it anyway if we hit a fatal 463 * kernel trap or we have paniced. 464 * 465 * If this case occurs save and restore the interrupt nesting level. 
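 *
 * For example, a panic taken while inside an IPI or fast interrupt handler
 * reaches this point with gd_intr_nesting_level non-zero; the block below
 * temporarily zeroes the nesting counters around the recursive
 * lwkt_switch() call and restores them before returning.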
466 */ 467 if (gd->gd_intr_nesting_level) { 468 int savegdnest; 469 int savegdtrap; 470 471 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) { 472 panic("lwkt_switch: cannot switch from within " 473 "a fast interrupt, yet, td %p\n", td); 474 } else { 475 savegdnest = gd->gd_intr_nesting_level; 476 savegdtrap = gd->gd_trap_nesting_level; 477 gd->gd_intr_nesting_level = 0; 478 gd->gd_trap_nesting_level = 0; 479 if ((td->td_flags & TDF_PANICWARN) == 0) { 480 td->td_flags |= TDF_PANICWARN; 481 kprintf("Warning: thread switch from interrupt or IPI, " 482 "thread %p (%s)\n", td, td->td_comm); 483 #ifdef DDB 484 db_print_backtrace(); 485 #endif 486 } 487 lwkt_switch(); 488 gd->gd_intr_nesting_level = savegdnest; 489 gd->gd_trap_nesting_level = savegdtrap; 490 return; 491 } 492 } 493 494 /* 495 * Passive release (used to transition from user to kernel mode 496 * when we block or switch rather then when we enter the kernel). 497 * This function is NOT called if we are switching into a preemption 498 * or returning from a preemption. Typically this causes us to lose 499 * our current process designation (if we have one) and become a true 500 * LWKT thread, and may also hand the current process designation to 501 * another process and schedule thread. 502 */ 503 if (td->td_release) 504 td->td_release(td); 505 506 crit_enter_gd(gd); 507 if (td->td_toks) 508 lwkt_relalltokens(td); 509 510 /* 511 * We had better not be holding any spin locks, but don't get into an 512 * endless panic loop. 513 */ 514 KASSERT(gd->gd_spinlock_rd == NULL || panicstr != NULL, 515 ("lwkt_switch: still holding a shared spinlock %p!", 516 gd->gd_spinlock_rd)); 517 KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL, 518 ("lwkt_switch: still holding %d exclusive spinlocks!", 519 gd->gd_spinlocks_wr)); 520 521 522 #ifdef SMP 523 /* 524 * td_mpcount cannot be used to determine if we currently hold the 525 * MP lock because get_mplock() will increment it prior to attempting 526 * to get the lock, and switch out if it can't. Our ownership of 527 * the actual lock will remain stable while we are in a critical section 528 * (but, of course, another cpu may own or release the lock so the 529 * actual value of mp_lock is not stable). 530 */ 531 mpheld = MP_LOCK_HELD(); 532 #ifdef INVARIANTS 533 if (td->td_cscount) { 534 kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n", 535 td); 536 if (panic_on_cscount) 537 panic("switching while mastering cpusync"); 538 } 539 #endif 540 #endif 541 if ((ntd = td->td_preempted) != NULL) { 542 /* 543 * We had preempted another thread on this cpu, resume the preempted 544 * thread. This occurs transparently, whether the preempted thread 545 * was scheduled or not (it may have been preempted after descheduling 546 * itself). 547 * 548 * We have to setup the MP lock for the original thread after backing 549 * out the adjustment that was made to curthread when the original 550 * was preempted. 551 */ 552 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); 553 #ifdef SMP 554 if (ntd->td_mpcount && mpheld == 0) { 555 panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d", 556 td, ntd, td->td_mpcount, ntd->td_mpcount); 557 } 558 if (ntd->td_mpcount) { 559 td->td_mpcount -= ntd->td_mpcount; 560 KKASSERT(td->td_mpcount >= 0); 561 } 562 #endif 563 ntd->td_flags |= TDF_PREEMPT_DONE; 564 565 /* 566 * The interrupt may have woken a thread up, we need to properly 567 * set the reschedule flag if the originally interrupted thread is 568 * at a lower priority. 
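 *
 * A worked example of the mask test below: if ntd runs at priority 5,
 * (2 << 5) - 1 = 0x3f covers run queue bits 0..5, so gd_runqmask being
 * greater than that mask means some queue at priority 6 or above is
 * non-empty and the resumed thread should not simply keep running.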
569 */ 570 if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1) 571 need_lwkt_resched(); 572 /* YYY release mp lock on switchback if original doesn't need it */ 573 } else { 574 /* 575 * Priority queue / round-robin at each priority. Note that user 576 * processes run at a fixed, low priority and the user process 577 * scheduler deals with interactions between user processes 578 * by scheduling and descheduling them from the LWKT queue as 579 * necessary. 580 * 581 * We have to adjust the MP lock for the target thread. If we 582 * need the MP lock and cannot obtain it we try to locate a 583 * thread that does not need the MP lock. If we cannot, we spin 584 * instead of HLT. 585 * 586 * A similar issue exists for the tokens held by the target thread. 587 * If we cannot obtain ownership of the tokens we cannot immediately 588 * schedule the thread. 589 */ 590 591 /* 592 * If an LWKT reschedule was requested, well that is what we are 593 * doing now so clear it. 594 */ 595 clear_lwkt_resched(); 596 again: 597 if (gd->gd_runqmask) { 598 int nq = bsrl(gd->gd_runqmask); 599 if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) { 600 gd->gd_runqmask &= ~(1 << nq); 601 goto again; 602 } 603 #ifdef SMP 604 /* 605 * THREAD SELECTION FOR AN SMP MACHINE BUILD 606 * 607 * If the target needs the MP lock and we couldn't get it, 608 * or if the target is holding tokens and we could not 609 * gain ownership of the tokens, continue looking for a 610 * thread to schedule and spin instead of HLT if we can't. 611 * 612 * NOTE: the mpheld variable invalid after this conditional, it 613 * can change due to both cpu_try_mplock() returning success 614 * AND interactions in lwkt_getalltokens() due to the fact that 615 * we are trying to check the mpcount of a thread other then 616 * the current thread. Because of this, if the current thread 617 * is not holding td_mpcount, an IPI indirectly run via 618 * lwkt_getalltokens() can obtain and release the MP lock and 619 * cause the core MP lock to be released. 620 */ 621 if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) || 622 (ntd->td_toks && lwkt_getalltokens(ntd) == 0) 623 ) { 624 u_int32_t rqmask = gd->gd_runqmask; 625 626 mpheld = MP_LOCK_HELD(); 627 ntd = NULL; 628 while (rqmask) { 629 TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) { 630 if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) { 631 /* spinning due to MP lock being held */ 632 #ifdef INVARIANTS 633 ++mplock_contention_count; 634 #endif 635 /* mplock still not held, 'mpheld' still valid */ 636 continue; 637 } 638 639 /* 640 * mpheld state invalid after getalltokens call returns 641 * failure, but the variable is only needed for 642 * the loop. 643 */ 644 if (ntd->td_toks && !lwkt_getalltokens(ntd)) { 645 /* spinning due to token contention */ 646 #ifdef INVARIANTS 647 ++token_contention_count; 648 #endif 649 mpheld = MP_LOCK_HELD(); 650 continue; 651 } 652 break; 653 } 654 if (ntd) 655 break; 656 rqmask &= ~(1 << nq); 657 nq = bsrl(rqmask); 658 659 /* 660 * We have two choices. We can either refuse to run a 661 * user thread when a kernel thread needs the MP lock 662 * but could not get it, or we can allow it to run but 663 * then expect an IPI (hopefully) later on to force a 664 * reschedule when the MP lock might become available. 
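 *
 * (When the second choice is taken the code below records this cpu in
 * mp_lock_contention_mask; the cpu that eventually drops the last MP lock
 * reference calls lwkt_mp_lock_uncontested(), near the end of this file,
 * which IPIs one recorded cpu to force the reschedule.)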
665 */ 666 if (nq < TDPRI_KERN_LPSCHED) { 667 if (chain_mplock == 0) 668 break; 669 atomic_set_int(&mp_lock_contention_mask, 670 gd->gd_cpumask); 671 /* continue loop, allow user threads to be scheduled */ 672 } 673 } 674 if (ntd == NULL) { 675 cpu_mplock_contested(); 676 ntd = &gd->gd_idlethread; 677 ntd->td_flags |= TDF_IDLE_NOHLT; 678 goto using_idle_thread; 679 } else { 680 ++gd->gd_cnt.v_swtch; 681 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); 682 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); 683 } 684 } else { 685 ++gd->gd_cnt.v_swtch; 686 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); 687 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); 688 } 689 #else 690 /* 691 * THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to 692 * worry about tokens or the BGL. However, we still have 693 * to call lwkt_getalltokens() in order to properly detect 694 * stale tokens. This call cannot fail for a UP build! 695 */ 696 lwkt_getalltokens(ntd); 697 ++gd->gd_cnt.v_swtch; 698 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); 699 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); 700 #endif 701 } else { 702 /* 703 * We have nothing to run but only let the idle loop halt 704 * the cpu if there are no pending interrupts. 705 */ 706 ntd = &gd->gd_idlethread; 707 if (gd->gd_reqflags & RQF_IDLECHECK_MASK) 708 ntd->td_flags |= TDF_IDLE_NOHLT; 709 #ifdef SMP 710 using_idle_thread: 711 /* 712 * The idle thread should not be holding the MP lock unless we 713 * are trapping in the kernel or in a panic. Since we select the 714 * idle thread unconditionally when no other thread is available, 715 * if the MP lock is desired during a panic or kernel trap, we 716 * have to loop in the scheduler until we get it. 717 */ 718 if (ntd->td_mpcount) { 719 mpheld = MP_LOCK_HELD(); 720 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) { 721 panic("Idle thread %p was holding the BGL!", ntd); 722 } else if (mpheld == 0) { 723 cpu_mplock_contested(); 724 goto again; 725 } 726 } 727 #endif 728 } 729 } 730 KASSERT(ntd->td_pri >= TDPRI_CRIT, 731 ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); 732 733 /* 734 * Do the actual switch. If the new target does not need the MP lock 735 * and we are holding it, release the MP lock. If the new target requires 736 * the MP lock we have already acquired it for the target. 737 */ 738 #ifdef SMP 739 if (ntd->td_mpcount == 0 ) { 740 if (MP_LOCK_HELD()) 741 cpu_rel_mplock(); 742 } else { 743 ASSERT_MP_LOCK_HELD(ntd); 744 } 745 #endif 746 if (td != ntd) { 747 ++switch_count; 748 td->td_switch(ntd); 749 } 750 /* NOTE: current cpu may have changed after switch */ 751 crit_exit_quick(td); 752 } 753 754 /* 755 * Request that the target thread preempt the current thread. Preemption 756 * only works under a specific set of conditions: 757 * 758 * - We are not preempting ourselves 759 * - The target thread is owned by the current cpu 760 * - We are not currently being preempted 761 * - The target is not currently being preempted 762 * - We are not holding any spin locks 763 * - The target thread is not holding any tokens 764 * - We are able to satisfy the target's MP lock requirements (if any). 765 * 766 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically 767 * this is called via lwkt_schedule() through the td_preemptable callback. 
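 * (As wired up later in this file, the td_preemptable hook ends up being
 * invoked from _lwkt_schedule_post() as ntd->td_preemptable(ntd, TDPRI_CRIT).)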
768 * critpri is the managed critical priority that we should ignore in order 769 * to determine whether preemption is possible (aka usually just the crit 770 * priority of lwkt_schedule() itself). 771 * 772 * XXX at the moment we run the target thread in a critical section during 773 * the preemption in order to prevent the target from taking interrupts 774 * that *WE* can't. Preemption is strictly limited to interrupt threads 775 * and interrupt-like threads, outside of a critical section, and the 776 * preempted source thread will be resumed the instant the target blocks 777 * whether or not the source is scheduled (i.e. preemption is supposed to 778 * be as transparent as possible). 779 * 780 * The target thread inherits our MP count (added to its own) for the 781 * duration of the preemption in order to preserve the atomicy of the 782 * MP lock during the preemption. Therefore, any preempting targets must be 783 * careful in regards to MP assertions. Note that the MP count may be 784 * out of sync with the physical mp_lock, but we do not have to preserve 785 * the original ownership of the lock if it was out of synch (that is, we 786 * can leave it synchronized on return). 787 */ 788 void 789 lwkt_preempt(thread_t ntd, int critpri) 790 { 791 struct globaldata *gd = mycpu; 792 thread_t td; 793 #ifdef SMP 794 int mpheld; 795 int savecnt; 796 #endif 797 798 /* 799 * The caller has put us in a critical section. We can only preempt 800 * if the caller of the caller was not in a critical section (basically 801 * a local interrupt), as determined by the 'critpri' parameter. We 802 * also can't preempt if the caller is holding any spinlocks (even if 803 * he isn't in a critical section). This also handles the tokens test. 804 * 805 * YYY The target thread must be in a critical section (else it must 806 * inherit our critical section? I dunno yet). 807 * 808 * Set need_lwkt_resched() unconditionally for now YYY. 809 */ 810 KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri)); 811 812 td = gd->gd_curthread; 813 if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) { 814 ++preempt_miss; 815 return; 816 } 817 if ((td->td_pri & ~TDPRI_MASK) > critpri) { 818 ++preempt_miss; 819 need_lwkt_resched(); 820 return; 821 } 822 #ifdef SMP 823 if (ntd->td_gd != gd) { 824 ++preempt_miss; 825 need_lwkt_resched(); 826 return; 827 } 828 #endif 829 /* 830 * Take the easy way out and do not preempt if we are holding 831 * any spinlocks. We could test whether the thread(s) being 832 * preempted interlock against the target thread's tokens and whether 833 * we can get all the target thread's tokens, but this situation 834 * should not occur very often so its easier to simply not preempt. 835 * Also, plain spinlocks are impossible to figure out at this point so 836 * just don't preempt. 837 * 838 * Do not try to preempt if the target thread is holding any tokens. 839 * We could try to acquire the tokens but this case is so rare there 840 * is no need to support it. 
841 */ 842 if (gd->gd_spinlock_rd || gd->gd_spinlocks_wr) { 843 ++preempt_miss; 844 need_lwkt_resched(); 845 return; 846 } 847 if (ntd->td_toks) { 848 ++preempt_miss; 849 need_lwkt_resched(); 850 return; 851 } 852 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) { 853 ++preempt_weird; 854 need_lwkt_resched(); 855 return; 856 } 857 if (ntd->td_preempted) { 858 ++preempt_hit; 859 need_lwkt_resched(); 860 return; 861 } 862 #ifdef SMP 863 /* 864 * note: an interrupt might have occured just as we were transitioning 865 * to or from the MP lock. In this case td_mpcount will be pre-disposed 866 * (non-zero) but not actually synchronized with the actual state of the 867 * lock. We can use it to imply an MP lock requirement for the 868 * preemption but we cannot use it to test whether we hold the MP lock 869 * or not. 870 */ 871 savecnt = td->td_mpcount; 872 mpheld = MP_LOCK_HELD(); 873 ntd->td_mpcount += td->td_mpcount; 874 if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) { 875 ntd->td_mpcount -= td->td_mpcount; 876 ++preempt_miss; 877 need_lwkt_resched(); 878 return; 879 } 880 #endif 881 882 /* 883 * Since we are able to preempt the current thread, there is no need to 884 * call need_lwkt_resched(). 885 */ 886 ++preempt_hit; 887 ntd->td_preempted = td; 888 td->td_flags |= TDF_PREEMPT_LOCK; 889 td->td_switch(ntd); 890 891 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE)); 892 #ifdef SMP 893 KKASSERT(savecnt == td->td_mpcount); 894 mpheld = MP_LOCK_HELD(); 895 if (mpheld && td->td_mpcount == 0) 896 cpu_rel_mplock(); 897 else if (mpheld == 0 && td->td_mpcount) 898 panic("lwkt_preempt(): MP lock was not held through"); 899 #endif 900 ntd->td_preempted = NULL; 901 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE); 902 } 903 904 /* 905 * Yield our thread while higher priority threads are pending. This is 906 * typically called when we leave a critical section but it can be safely 907 * called while we are in a critical section. 908 * 909 * This function will not generally yield to equal priority threads but it 910 * can occur as a side effect. Note that lwkt_switch() is called from 911 * inside the critical section to prevent its own crit_exit() from reentering 912 * lwkt_yield_quick(). 913 * 914 * gd_reqflags indicates that *something* changed, e.g. an interrupt or softint 915 * came along but was blocked and made pending. 916 * 917 * (self contained on a per cpu basis) 918 */ 919 void 920 lwkt_yield_quick(void) 921 { 922 globaldata_t gd = mycpu; 923 thread_t td = gd->gd_curthread; 924 925 /* 926 * gd_reqflags is cleared in splz if the cpl is 0. If we were to clear 927 * it with a non-zero cpl then we might not wind up calling splz after 928 * a task switch when the critical section is exited even though the 929 * new task could accept the interrupt. 930 * 931 * XXX from crit_exit() only called after last crit section is released. 932 * If called directly will run splz() even if in a critical section. 933 * 934 * td_nest_count prevent deep nesting via splz() or doreti(). Note that 935 * except for this special case, we MUST call splz() here to handle any 936 * pending ints, particularly after we switch, or we might accidently 937 * halt the cpu with interrupts pending. 938 */ 939 if (gd->gd_reqflags && td->td_nest_count < 2) 940 splz(); 941 942 /* 943 * YYY enabling will cause wakeup() to task-switch, which really 944 * confused the old 4.x code. 
This is a good way to simulate 945 * preemption and MP without actually doing preemption or MP, because a 946 * lot of code assumes that wakeup() does not block. 947 */ 948 if (untimely_switch && td->td_nest_count == 0 && 949 gd->gd_intr_nesting_level == 0 950 ) { 951 crit_enter_quick(td); 952 /* 953 * YYY temporary hacks until we disassociate the userland scheduler 954 * from the LWKT scheduler. 955 */ 956 if (td->td_flags & TDF_RUNQ) { 957 lwkt_switch(); /* will not reenter yield function */ 958 } else { 959 lwkt_schedule_self(td); /* make sure we are scheduled */ 960 lwkt_switch(); /* will not reenter yield function */ 961 lwkt_deschedule_self(td); /* make sure we are descheduled */ 962 } 963 crit_exit_noyield(td); 964 } 965 } 966 967 /* 968 * This implements a normal yield which, unlike _quick, will yield to equal 969 * priority threads as well. Note that gd_reqflags tests will be handled by 970 * the crit_exit() call in lwkt_switch(). 971 * 972 * (self contained on a per cpu basis) 973 */ 974 void 975 lwkt_yield(void) 976 { 977 lwkt_schedule_self(curthread); 978 lwkt_switch(); 979 } 980 981 /* 982 * Return 0 if no runnable threads are pending at the same or higher 983 * priority as the passed thread. 984 * 985 * Return 1 if runnable threads are pending at the same priority. 986 * 987 * Return 2 if runnable threads are pending at a higher priority. 988 */ 989 int 990 lwkt_check_resched(thread_t td) 991 { 992 int pri = td->td_pri & TDPRI_MASK; 993 994 if (td->td_gd->gd_runqmask > (2 << pri) - 1) 995 return(2); 996 if (TAILQ_NEXT(td, td_threadq)) 997 return(1); 998 return(0); 999 } 1000 1001 /* 1002 * Generic schedule. Possibly schedule threads belonging to other cpus and 1003 * deal with threads that might be blocked on a wait queue. 1004 * 1005 * We have a little helper inline function which does additional work after 1006 * the thread has been enqueued, including dealing with preemption and 1007 * setting need_lwkt_resched() (which prevents the kernel from returning 1008 * to userland until it has processed higher priority threads). 1009 * 1010 * It is possible for this routine to be called after a failed _enqueue 1011 * (due to the target thread migrating, sleeping, or otherwise blocked). 1012 * We have to check that the thread is actually on the run queue! 1013 * 1014 * reschedok is an optimized constant propagated from lwkt_schedule() or 1015 * lwkt_schedule_noresched(). By default it is non-zero, causing a 1016 * reschedule to be requested if the target thread has a higher priority. 1017 * The port messaging code will set MSG_NORESCHED and cause reschedok to 1018 * be 0, prevented undesired reschedules. 
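 *
 * Concretely, lwkt_schedule() below is _lwkt_schedule(td, 1) and
 * lwkt_schedule_noresched() is _lwkt_schedule(td, 0); since reschedok is a
 * constant at each call site the inlined dead branch can be optimized away.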
1019 */ 1020 static __inline 1021 void 1022 _lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok) 1023 { 1024 thread_t otd; 1025 1026 if (ntd->td_flags & TDF_RUNQ) { 1027 if (ntd->td_preemptable && reschedok) { 1028 ntd->td_preemptable(ntd, cpri); /* YYY +token */ 1029 } else if (reschedok) { 1030 otd = curthread; 1031 if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK)) 1032 need_lwkt_resched(); 1033 } 1034 } 1035 } 1036 1037 static __inline 1038 void 1039 _lwkt_schedule(thread_t td, int reschedok) 1040 { 1041 globaldata_t mygd = mycpu; 1042 1043 KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!")); 1044 crit_enter_gd(mygd); 1045 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 1046 if (td == mygd->gd_curthread) { 1047 _lwkt_enqueue(td); 1048 } else { 1049 /* 1050 * If we own the thread, there is no race (since we are in a 1051 * critical section). If we do not own the thread there might 1052 * be a race but the target cpu will deal with it. 1053 */ 1054 #ifdef SMP 1055 if (td->td_gd == mygd) { 1056 _lwkt_enqueue(td); 1057 _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); 1058 } else { 1059 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td); 1060 } 1061 #else 1062 _lwkt_enqueue(td); 1063 _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); 1064 #endif 1065 } 1066 crit_exit_gd(mygd); 1067 } 1068 1069 void 1070 lwkt_schedule(thread_t td) 1071 { 1072 _lwkt_schedule(td, 1); 1073 } 1074 1075 void 1076 lwkt_schedule_noresched(thread_t td) 1077 { 1078 _lwkt_schedule(td, 0); 1079 } 1080 1081 #ifdef SMP 1082 1083 /* 1084 * Thread migration using a 'Pull' method. The thread may or may not be 1085 * the current thread. It MUST be descheduled and in a stable state. 1086 * lwkt_giveaway() must be called on the cpu owning the thread. 1087 * 1088 * At any point after lwkt_giveaway() is called, the target cpu may 1089 * 'pull' the thread by calling lwkt_acquire(). 1090 * 1091 * MPSAFE - must be called under very specific conditions. 1092 */ 1093 void 1094 lwkt_giveaway(thread_t td) 1095 { 1096 globaldata_t gd = mycpu; 1097 1098 crit_enter_gd(gd); 1099 KKASSERT(td->td_gd == gd); 1100 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); 1101 td->td_flags |= TDF_MIGRATING; 1102 crit_exit_gd(gd); 1103 } 1104 1105 void 1106 lwkt_acquire(thread_t td) 1107 { 1108 globaldata_t gd; 1109 globaldata_t mygd; 1110 1111 KKASSERT(td->td_flags & TDF_MIGRATING); 1112 gd = td->td_gd; 1113 mygd = mycpu; 1114 if (gd != mycpu) { 1115 cpu_lfence(); 1116 KKASSERT((td->td_flags & TDF_RUNQ) == 0); 1117 crit_enter_gd(mygd); 1118 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { 1119 #ifdef SMP 1120 lwkt_process_ipiq(); 1121 #endif 1122 cpu_lfence(); 1123 } 1124 td->td_gd = mygd; 1125 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); 1126 td->td_flags &= ~TDF_MIGRATING; 1127 crit_exit_gd(mygd); 1128 } else { 1129 crit_enter_gd(mygd); 1130 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); 1131 td->td_flags &= ~TDF_MIGRATING; 1132 crit_exit_gd(mygd); 1133 } 1134 } 1135 1136 #endif 1137 1138 /* 1139 * Generic deschedule. Descheduling threads other then your own should be 1140 * done only in carefully controlled circumstances. Descheduling is 1141 * asynchronous. 1142 * 1143 * This function may block if the cpu has run out of messages. 
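 *
 * A hedged usage note: when the target thread lives on another cpu the
 * dequeue is performed by the IPI handler on that cpu, so the caller must
 * not assume the thread is already off its run queue when
 * lwkt_deschedule() returns.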
1144 */ 1145 void 1146 lwkt_deschedule(thread_t td) 1147 { 1148 crit_enter(); 1149 #ifdef SMP 1150 if (td == curthread) { 1151 _lwkt_dequeue(td); 1152 } else { 1153 if (td->td_gd == mycpu) { 1154 _lwkt_dequeue(td); 1155 } else { 1156 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td); 1157 } 1158 } 1159 #else 1160 _lwkt_dequeue(td); 1161 #endif 1162 crit_exit(); 1163 } 1164 1165 /* 1166 * Set the target thread's priority. This routine does not automatically 1167 * switch to a higher priority thread, LWKT threads are not designed for 1168 * continuous priority changes. Yield if you want to switch. 1169 * 1170 * We have to retain the critical section count which uses the high bits 1171 * of the td_pri field. The specified priority may also indicate zero or 1172 * more critical sections by adding TDPRI_CRIT*N. 1173 * 1174 * Note that we requeue the thread whether it winds up on a different runq 1175 * or not. uio_yield() depends on this and the routine is not normally 1176 * called with the same priority otherwise. 1177 */ 1178 void 1179 lwkt_setpri(thread_t td, int pri) 1180 { 1181 KKASSERT(pri >= 0); 1182 KKASSERT(td->td_gd == mycpu); 1183 crit_enter(); 1184 if (td->td_flags & TDF_RUNQ) { 1185 _lwkt_dequeue(td); 1186 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; 1187 _lwkt_enqueue(td); 1188 } else { 1189 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; 1190 } 1191 crit_exit(); 1192 } 1193 1194 void 1195 lwkt_setpri_self(int pri) 1196 { 1197 thread_t td = curthread; 1198 1199 KKASSERT(pri >= 0 && pri <= TDPRI_MAX); 1200 crit_enter(); 1201 if (td->td_flags & TDF_RUNQ) { 1202 _lwkt_dequeue(td); 1203 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; 1204 _lwkt_enqueue(td); 1205 } else { 1206 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; 1207 } 1208 crit_exit(); 1209 } 1210 1211 /* 1212 * Migrate the current thread to the specified cpu. 1213 * 1214 * This is accomplished by descheduling ourselves from the current cpu, 1215 * moving our thread to the tdallq of the target cpu, IPI messaging the 1216 * target cpu, and switching out. TDF_MIGRATING prevents scheduling 1217 * races while the thread is being migrated. 1218 */ 1219 #ifdef SMP 1220 static void lwkt_setcpu_remote(void *arg); 1221 #endif 1222 1223 void 1224 lwkt_setcpu_self(globaldata_t rgd) 1225 { 1226 #ifdef SMP 1227 thread_t td = curthread; 1228 1229 if (td->td_gd != rgd) { 1230 crit_enter_quick(td); 1231 td->td_flags |= TDF_MIGRATING; 1232 lwkt_deschedule_self(td); 1233 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); 1234 lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td); 1235 lwkt_switch(); 1236 /* we are now on the target cpu */ 1237 TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); 1238 crit_exit_quick(td); 1239 } 1240 #endif 1241 } 1242 1243 void 1244 lwkt_migratecpu(int cpuid) 1245 { 1246 #ifdef SMP 1247 globaldata_t rgd; 1248 1249 rgd = globaldata_find(cpuid); 1250 lwkt_setcpu_self(rgd); 1251 #endif 1252 } 1253 1254 /* 1255 * Remote IPI for cpu migration (called while in a critical section so we 1256 * do not have to enter another one). The thread has already been moved to 1257 * our cpu's allq, but we must wait for the thread to be completely switched 1258 * out on the originating cpu before we schedule it on ours or the stack 1259 * state may be corrupt. We clear TDF_MIGRATING after flushing the GD 1260 * change to main memory. 1261 * 1262 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races 1263 * against wakeups. 
It is best if this interface is used only when there 1264 * are no pending events that might try to schedule the thread. 1265 */ 1266 #ifdef SMP 1267 static void 1268 lwkt_setcpu_remote(void *arg) 1269 { 1270 thread_t td = arg; 1271 globaldata_t gd = mycpu; 1272 1273 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { 1274 #ifdef SMP 1275 lwkt_process_ipiq(); 1276 #endif 1277 cpu_lfence(); 1278 } 1279 td->td_gd = gd; 1280 cpu_sfence(); 1281 td->td_flags &= ~TDF_MIGRATING; 1282 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 1283 _lwkt_enqueue(td); 1284 } 1285 #endif 1286 1287 struct lwp * 1288 lwkt_preempted_proc(void) 1289 { 1290 thread_t td = curthread; 1291 while (td->td_preempted) 1292 td = td->td_preempted; 1293 return(td->td_lwp); 1294 } 1295 1296 /* 1297 * Create a kernel process/thread/whatever. It shares it's address space 1298 * with proc0 - ie: kernel only. 1299 * 1300 * NOTE! By default new threads are created with the MP lock held. A 1301 * thread which does not require the MP lock should release it by calling 1302 * rel_mplock() at the start of the new thread. 1303 */ 1304 int 1305 lwkt_create(void (*func)(void *), void *arg, 1306 struct thread **tdp, thread_t template, int tdflags, int cpu, 1307 const char *fmt, ...) 1308 { 1309 thread_t td; 1310 __va_list ap; 1311 1312 td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu, 1313 tdflags); 1314 if (tdp) 1315 *tdp = td; 1316 cpu_set_thread_handler(td, lwkt_exit, func, arg); 1317 1318 /* 1319 * Set up arg0 for 'ps' etc 1320 */ 1321 __va_start(ap, fmt); 1322 kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap); 1323 __va_end(ap); 1324 1325 /* 1326 * Schedule the thread to run 1327 */ 1328 if ((td->td_flags & TDF_STOPREQ) == 0) 1329 lwkt_schedule(td); 1330 else 1331 td->td_flags &= ~TDF_STOPREQ; 1332 return 0; 1333 } 1334 1335 /* 1336 * Destroy an LWKT thread. Warning! This function is not called when 1337 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and 1338 * uses a different reaping mechanism. 1339 */ 1340 void 1341 lwkt_exit(void) 1342 { 1343 thread_t td = curthread; 1344 thread_t std; 1345 globaldata_t gd; 1346 1347 if (td->td_flags & TDF_VERBOSE) 1348 kprintf("kthread %p %s has exited\n", td, td->td_comm); 1349 caps_exit(td); 1350 1351 /* 1352 * Get us into a critical section to interlock gd_freetd and loop 1353 * until we can get it freed. 1354 * 1355 * We have to cache the current td in gd_freetd because objcache_put()ing 1356 * it would rip it out from under us while our thread is still active. 1357 */ 1358 gd = mycpu; 1359 crit_enter_quick(td); 1360 while ((std = gd->gd_freetd) != NULL) { 1361 gd->gd_freetd = NULL; 1362 objcache_put(thread_cache, std); 1363 } 1364 lwkt_deschedule_self(td); 1365 lwkt_remove_tdallq(td); 1366 if (td->td_flags & TDF_ALLOCATED_THREAD) 1367 gd->gd_freetd = td; 1368 cpu_thread_exit(); 1369 } 1370 1371 void 1372 lwkt_remove_tdallq(thread_t td) 1373 { 1374 KKASSERT(td->td_gd == mycpu); 1375 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); 1376 } 1377 1378 void 1379 crit_panic(void) 1380 { 1381 thread_t td = curthread; 1382 int lpri = td->td_pri; 1383 1384 td->td_pri = 0; 1385 panic("td_pri is/would-go negative! %p %d", td, lpri); 1386 } 1387 1388 #ifdef SMP 1389 1390 /* 1391 * Called from debugger/panic on cpus which have been stopped. We must still 1392 * process the IPIQ while stopped, even if we were stopped while in a critical 1393 * section (XXX). 1394 * 1395 * If we are dumping also try to process any pending interrupts. 
This may 1396 * or may not work depending on the state of the cpu at the point it was 1397 * stopped. 1398 */ 1399 void 1400 lwkt_smp_stopped(void) 1401 { 1402 globaldata_t gd = mycpu; 1403 1404 crit_enter_gd(gd); 1405 if (dumping) { 1406 lwkt_process_ipiq(); 1407 splz(); 1408 } else { 1409 lwkt_process_ipiq(); 1410 } 1411 crit_exit_gd(gd); 1412 } 1413 1414 /* 1415 * get_mplock() calls this routine if it is unable to obtain the MP lock. 1416 * get_mplock() has already incremented td_mpcount. We must block and 1417 * not return until giant is held. 1418 * 1419 * All we have to do is lwkt_switch() away. The LWKT scheduler will not 1420 * reschedule the thread until it can obtain the giant lock for it. 1421 */ 1422 void 1423 lwkt_mp_lock_contested(void) 1424 { 1425 loggiant(beg); 1426 lwkt_switch(); 1427 loggiant(end); 1428 } 1429 1430 /* 1431 * The rel_mplock() code will call this function after releasing the 1432 * last reference on the MP lock if mp_lock_contention_mask is non-zero. 1433 * 1434 * We then chain an IPI to a single other cpu potentially needing the 1435 * lock. This is a bit heuristical and we can wind up with IPIs flying 1436 * all over the place. 1437 */ 1438 static void lwkt_mp_lock_uncontested_remote(void *arg __unused); 1439 1440 void 1441 lwkt_mp_lock_uncontested(void) 1442 { 1443 globaldata_t gd; 1444 globaldata_t dgd; 1445 cpumask_t mask; 1446 cpumask_t tmpmask; 1447 int cpuid; 1448 1449 if (chain_mplock) { 1450 gd = mycpu; 1451 atomic_clear_int(&mp_lock_contention_mask, gd->gd_cpumask); 1452 mask = mp_lock_contention_mask; 1453 tmpmask = ~((1 << gd->gd_cpuid) - 1); 1454 1455 if (mask) { 1456 if (mask & tmpmask) 1457 cpuid = bsfl(mask & tmpmask); 1458 else 1459 cpuid = bsfl(mask); 1460 atomic_clear_int(&mp_lock_contention_mask, 1 << cpuid); 1461 dgd = globaldata_find(cpuid); 1462 lwkt_send_ipiq(dgd, lwkt_mp_lock_uncontested_remote, NULL); 1463 } 1464 } 1465 } 1466 1467 /* 1468 * The idea is for this IPI to interrupt a potentially lower priority 1469 * thread, such as a user thread, to allow the scheduler to reschedule 1470 * a higher priority kernel thread that needs the MP lock. 1471 * 1472 * For now we set the LWKT reschedule flag which generates an AST in 1473 * doreti, though theoretically it is also possible to possibly preempt 1474 * here if the underlying thread was operating in user mode. Nah. 1475 */ 1476 static void 1477 lwkt_mp_lock_uncontested_remote(void *arg __unused) 1478 { 1479 need_lwkt_resched(); 1480 } 1481 1482 #endif 1483
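
/*
 * Example usage of lwkt_create() (an illustrative sketch only; 'mydaemond',
 * 'softc' and 'unit' are hypothetical, and passing cpu == -1 creates the
 * thread on the current cpu):
 *
 *	static thread_t mydaemon_td;
 *
 *	lwkt_create(mydaemond, softc, &mydaemon_td, NULL, 0, -1,
 *		    "mydaemon %d", unit);
 *
 * As noted above lwkt_create(), new threads start out holding the MP lock;
 * an MPSAFE daemon should call rel_mplock() at the top of its thread
 * function.
 */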