1 /* 2 * Copyright (c) 2003-2010 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Each cpu in a system has its own self-contained light weight kernel 37 * thread scheduler, which means that generally speaking we only need 38 * to use a critical section to avoid problems. Foreign thread 39 * scheduling is queued via (async) IPIs. 40 */ 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/rtprio.h> 47 #include <sys/kinfo.h> 48 #include <sys/queue.h> 49 #include <sys/sysctl.h> 50 #include <sys/kthread.h> 51 #include <machine/cpu.h> 52 #include <sys/lock.h> 53 #include <sys/caps.h> 54 #include <sys/spinlock.h> 55 #include <sys/ktr.h> 56 57 #include <sys/thread2.h> 58 #include <sys/spinlock2.h> 59 #include <sys/mplock2.h> 60 61 #include <sys/dsched.h> 62 63 #include <vm/vm.h> 64 #include <vm/vm_param.h> 65 #include <vm/vm_kern.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_page.h> 68 #include <vm/vm_map.h> 69 #include <vm/vm_pager.h> 70 #include <vm/vm_extern.h> 71 72 #include <machine/stdarg.h> 73 #include <machine/smp.h> 74 75 #if !defined(KTR_CTXSW) 76 #define KTR_CTXSW KTR_ALL 77 #endif 78 KTR_INFO_MASTER(ctxsw); 79 KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", 80 sizeof(int) + sizeof(struct thread *)); 81 KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", 82 sizeof(int) + sizeof(struct thread *)); 83 KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", 84 sizeof (struct thread *) + sizeof(char *)); 85 KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof (struct thread *)); 86 87 static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads"); 88 89 #ifdef INVARIANTS 90 static int panic_on_cscount = 0; 91 #endif 92 static __int64_t switch_count = 0; 93 static __int64_t preempt_hit = 0; 94 static __int64_t preempt_miss = 0; 95 static __int64_t preempt_weird = 0; 96 static __int64_t token_contention_count __debugvar = 0; 97 static int lwkt_use_spin_port; 98 static struct objcache *thread_cache; 99 100 #ifdef SMP 101 static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame); 102 #endif 103 static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td); 104 105 extern void cpu_heavy_restore(void); 106 extern void cpu_lwkt_restore(void); 107 extern void cpu_kthread_restore(void); 108 extern void cpu_idle_restore(void); 109 110 /* 111 * We can make all thread ports use the spin backend instead of the thread 112 * backend. This should only be set to debug the spin backend. 113 */ 114 TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port); 115 116 #ifdef INVARIANTS 117 SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, 118 "Panic if attempting to switch lwkt's while mastering cpusync"); 119 #endif 120 SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, 121 "Number of switched threads"); 122 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, 123 "Successful preemption events"); 124 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, 125 "Failed preemption events"); 126 SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, 127 "Number of preempted threads."); 128 #ifdef INVARIANTS 129 SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, 130 &token_contention_count, 0, "spinning due to token contention"); 131 #endif 132 static int fairq_enable = 1; 133 SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, 134 &fairq_enable, 0, "Turn on fairq priority accumulators"); 135 static int lwkt_spin_loops = 10; 136 SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW, 137 &lwkt_spin_loops, 0, ""); 138 static int lwkt_spin_delay = 1; 139 SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW, 140 &lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto"); 141 static int lwkt_spin_method = 1; 142 SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW, 143 &lwkt_spin_method, 0, "LWKT scheduler behavior when contended"); 144 static int lwkt_spin_fatal = 0; /* disabled */ 145 SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW, 146 &lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic"); 147 static int preempt_enable = 1; 148 SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW, 149 &preempt_enable, 0, "Enable preemption"); 150 static int lwkt_cache_threads = 32; 151 SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD, 152 &lwkt_cache_threads, 0, "thread+kstack cache"); 153 154 static __cachealign int lwkt_cseq_rindex; 155 static __cachealign int lwkt_cseq_windex; 156 157 /* 158 * These helper procedures handle the runq, they can only be called from 159 * within a critical section. 160 * 161 * WARNING! Prior to SMP being brought up it is possible to enqueue and 162 * dequeue threads belonging to other cpus, so be sure to use td->td_gd 163 * instead of 'mycpu' when referencing the globaldata structure. Once 164 * SMP live enqueuing and dequeueing only occurs on the current cpu. 165 */ 166 static __inline 167 void 168 _lwkt_dequeue(thread_t td) 169 { 170 if (td->td_flags & TDF_RUNQ) { 171 struct globaldata *gd = td->td_gd; 172 173 td->td_flags &= ~TDF_RUNQ; 174 TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq); 175 gd->gd_fairq_total_pri -= td->td_pri; 176 if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL) 177 atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING); 178 } 179 } 180 181 /* 182 * Priority enqueue. 183 * 184 * NOTE: There are a limited number of lwkt threads runnable since user 185 * processes only schedule one at a time per cpu. 186 */ 187 static __inline 188 void 189 _lwkt_enqueue(thread_t td) 190 { 191 thread_t xtd; 192 193 if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { 194 struct globaldata *gd = td->td_gd; 195 196 td->td_flags |= TDF_RUNQ; 197 xtd = TAILQ_FIRST(&gd->gd_tdrunq); 198 if (xtd == NULL) { 199 TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); 200 atomic_set_int(&gd->gd_reqflags, RQF_RUNNING); 201 } else { 202 while (xtd && xtd->td_pri > td->td_pri) 203 xtd = TAILQ_NEXT(xtd, td_threadq); 204 if (xtd) 205 TAILQ_INSERT_BEFORE(xtd, td, td_threadq); 206 else 207 TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); 208 } 209 gd->gd_fairq_total_pri += td->td_pri; 210 } 211 } 212 213 static __boolean_t 214 _lwkt_thread_ctor(void *obj, void *privdata, int ocflags) 215 { 216 struct thread *td = (struct thread *)obj; 217 218 td->td_kstack = NULL; 219 td->td_kstack_size = 0; 220 td->td_flags = TDF_ALLOCATED_THREAD; 221 return (1); 222 } 223 224 static void 225 _lwkt_thread_dtor(void *obj, void *privdata) 226 { 227 struct thread *td = (struct thread *)obj; 228 229 KASSERT(td->td_flags & TDF_ALLOCATED_THREAD, 230 ("_lwkt_thread_dtor: not allocated from objcache")); 231 KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack && 232 td->td_kstack_size > 0, 233 ("_lwkt_thread_dtor: corrupted stack")); 234 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); 235 } 236 237 /* 238 * Initialize the lwkt s/system. 239 * 240 * Nominally cache up to 32 thread + kstack structures. 241 */ 242 void 243 lwkt_init(void) 244 { 245 TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads); 246 thread_cache = objcache_create_mbacked( 247 M_THREAD, sizeof(struct thread), 248 NULL, lwkt_cache_threads, 249 _lwkt_thread_ctor, _lwkt_thread_dtor, NULL); 250 } 251 252 /* 253 * Schedule a thread to run. As the current thread we can always safely 254 * schedule ourselves, and a shortcut procedure is provided for that 255 * function. 256 * 257 * (non-blocking, self contained on a per cpu basis) 258 */ 259 void 260 lwkt_schedule_self(thread_t td) 261 { 262 KKASSERT((td->td_flags & TDF_MIGRATING) == 0); 263 crit_enter_quick(td); 264 KASSERT(td != &td->td_gd->gd_idlethread, 265 ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); 266 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 267 _lwkt_enqueue(td); 268 crit_exit_quick(td); 269 } 270 271 /* 272 * Deschedule a thread. 273 * 274 * (non-blocking, self contained on a per cpu basis) 275 */ 276 void 277 lwkt_deschedule_self(thread_t td) 278 { 279 crit_enter_quick(td); 280 _lwkt_dequeue(td); 281 crit_exit_quick(td); 282 } 283 284 /* 285 * LWKTs operate on a per-cpu basis 286 * 287 * WARNING! Called from early boot, 'mycpu' may not work yet. 288 */ 289 void 290 lwkt_gdinit(struct globaldata *gd) 291 { 292 TAILQ_INIT(&gd->gd_tdrunq); 293 TAILQ_INIT(&gd->gd_tdallq); 294 } 295 296 /* 297 * Create a new thread. The thread must be associated with a process context 298 * or LWKT start address before it can be scheduled. If the target cpu is 299 * -1 the thread will be created on the current cpu. 300 * 301 * If you intend to create a thread without a process context this function 302 * does everything except load the startup and switcher function. 303 */ 304 thread_t 305 lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) 306 { 307 globaldata_t gd = mycpu; 308 void *stack; 309 310 /* 311 * If static thread storage is not supplied allocate a thread. Reuse 312 * a cached free thread if possible. gd_freetd is used to keep an exiting 313 * thread intact through the exit. 314 */ 315 if (td == NULL) { 316 crit_enter_gd(gd); 317 if ((td = gd->gd_freetd) != NULL) { 318 KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| 319 TDF_RUNQ)) == 0); 320 gd->gd_freetd = NULL; 321 } else { 322 td = objcache_get(thread_cache, M_WAITOK); 323 KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| 324 TDF_RUNQ)) == 0); 325 } 326 crit_exit_gd(gd); 327 KASSERT((td->td_flags & 328 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD, 329 ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags)); 330 flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK); 331 } 332 333 /* 334 * Try to reuse cached stack. 335 */ 336 if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) { 337 if (flags & TDF_ALLOCATED_STACK) { 338 kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size); 339 stack = NULL; 340 } 341 } 342 if (stack == NULL) { 343 stack = (void *)kmem_alloc_stack(&kernel_map, stksize); 344 flags |= TDF_ALLOCATED_STACK; 345 } 346 if (cpu < 0) 347 lwkt_init_thread(td, stack, stksize, flags, gd); 348 else 349 lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu)); 350 return(td); 351 } 352 353 /* 354 * Initialize a preexisting thread structure. This function is used by 355 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread. 356 * 357 * All threads start out in a critical section at a priority of 358 * TDPRI_KERN_DAEMON. Higher level code will modify the priority as 359 * appropriate. This function may send an IPI message when the 360 * requested cpu is not the current cpu and consequently gd_tdallq may 361 * not be initialized synchronously from the point of view of the originating 362 * cpu. 363 * 364 * NOTE! we have to be careful in regards to creating threads for other cpus 365 * if SMP has not yet been activated. 366 */ 367 #ifdef SMP 368 369 static void 370 lwkt_init_thread_remote(void *arg) 371 { 372 thread_t td = arg; 373 374 /* 375 * Protected by critical section held by IPI dispatch 376 */ 377 TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq); 378 } 379 380 #endif 381 382 /* 383 * lwkt core thread structural initialization. 384 * 385 * NOTE: All threads are initialized as mpsafe threads. 386 */ 387 void 388 lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, 389 struct globaldata *gd) 390 { 391 globaldata_t mygd = mycpu; 392 393 bzero(td, sizeof(struct thread)); 394 td->td_kstack = stack; 395 td->td_kstack_size = stksize; 396 td->td_flags = flags; 397 td->td_gd = gd; 398 td->td_pri = TDPRI_KERN_DAEMON; 399 td->td_critcount = 1; 400 td->td_toks_stop = &td->td_toks_base; 401 if (lwkt_use_spin_port) 402 lwkt_initport_spin(&td->td_msgport); 403 else 404 lwkt_initport_thread(&td->td_msgport, td); 405 pmap_init_thread(td); 406 #ifdef SMP 407 /* 408 * Normally initializing a thread for a remote cpu requires sending an 409 * IPI. However, the idlethread is setup before the other cpus are 410 * activated so we have to treat it as a special case. XXX manipulation 411 * of gd_tdallq requires the BGL. 412 */ 413 if (gd == mygd || td == &gd->gd_idlethread) { 414 crit_enter_gd(mygd); 415 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); 416 crit_exit_gd(mygd); 417 } else { 418 lwkt_send_ipiq(gd, lwkt_init_thread_remote, td); 419 } 420 #else 421 crit_enter_gd(mygd); 422 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); 423 crit_exit_gd(mygd); 424 #endif 425 426 dsched_new_thread(td); 427 } 428 429 void 430 lwkt_set_comm(thread_t td, const char *ctl, ...) 431 { 432 __va_list va; 433 434 __va_start(va, ctl); 435 kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va); 436 __va_end(va); 437 KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]); 438 } 439 440 void 441 lwkt_hold(thread_t td) 442 { 443 atomic_add_int(&td->td_refs, 1); 444 } 445 446 void 447 lwkt_rele(thread_t td) 448 { 449 KKASSERT(td->td_refs > 0); 450 atomic_add_int(&td->td_refs, -1); 451 } 452 453 void 454 lwkt_wait_free(thread_t td) 455 { 456 while (td->td_refs) 457 tsleep(td, 0, "tdreap", hz); 458 } 459 460 void 461 lwkt_free_thread(thread_t td) 462 { 463 KKASSERT(td->td_refs == 0); 464 KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0); 465 if (td->td_flags & TDF_ALLOCATED_THREAD) { 466 objcache_put(thread_cache, td); 467 } else if (td->td_flags & TDF_ALLOCATED_STACK) { 468 /* client-allocated struct with internally allocated stack */ 469 KASSERT(td->td_kstack && td->td_kstack_size > 0, 470 ("lwkt_free_thread: corrupted stack")); 471 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); 472 td->td_kstack = NULL; 473 td->td_kstack_size = 0; 474 } 475 KTR_LOG(ctxsw_deadtd, td); 476 } 477 478 479 /* 480 * Switch to the next runnable lwkt. If no LWKTs are runnable then 481 * switch to the idlethread. Switching must occur within a critical 482 * section to avoid races with the scheduling queue. 483 * 484 * We always have full control over our cpu's run queue. Other cpus 485 * that wish to manipulate our queue must use the cpu_*msg() calls to 486 * talk to our cpu, so a critical section is all that is needed and 487 * the result is very, very fast thread switching. 488 * 489 * The LWKT scheduler uses a fixed priority model and round-robins at 490 * each priority level. User process scheduling is a totally 491 * different beast and LWKT priorities should not be confused with 492 * user process priorities. 493 * 494 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch() 495 * is not called by the current thread in the preemption case, only when 496 * the preempting thread blocks (in order to return to the original thread). 497 * 498 * SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread 499 * migration and tsleep deschedule the current lwkt thread and call 500 * lwkt_switch(). In particular, the target cpu of the migration fully 501 * expects the thread to become non-runnable and can deadlock against 502 * cpusync operations if we run any IPIs prior to switching the thread out. 503 * 504 * WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF 505 * THE CURRENT THREAD HAS BEEN DESCHEDULED! 506 */ 507 void 508 lwkt_switch(void) 509 { 510 globaldata_t gd = mycpu; 511 thread_t td = gd->gd_curthread; 512 thread_t ntd; 513 thread_t xtd; 514 int spinning = lwkt_spin_loops; /* loops before HLTing */ 515 int reqflags; 516 int cseq; 517 int oseq; 518 int fatal_count; 519 520 /* 521 * Switching from within a 'fast' (non thread switched) interrupt or IPI 522 * is illegal. However, we may have to do it anyway if we hit a fatal 523 * kernel trap or we have paniced. 524 * 525 * If this case occurs save and restore the interrupt nesting level. 526 */ 527 if (gd->gd_intr_nesting_level) { 528 int savegdnest; 529 int savegdtrap; 530 531 if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) { 532 panic("lwkt_switch: Attempt to switch from a " 533 "a fast interrupt, ipi, or hard code section, " 534 "td %p\n", 535 td); 536 } else { 537 savegdnest = gd->gd_intr_nesting_level; 538 savegdtrap = gd->gd_trap_nesting_level; 539 gd->gd_intr_nesting_level = 0; 540 gd->gd_trap_nesting_level = 0; 541 if ((td->td_flags & TDF_PANICWARN) == 0) { 542 td->td_flags |= TDF_PANICWARN; 543 kprintf("Warning: thread switch from interrupt, IPI, " 544 "or hard code section.\n" 545 "thread %p (%s)\n", td, td->td_comm); 546 print_backtrace(-1); 547 } 548 lwkt_switch(); 549 gd->gd_intr_nesting_level = savegdnest; 550 gd->gd_trap_nesting_level = savegdtrap; 551 return; 552 } 553 } 554 555 /* 556 * Passive release (used to transition from user to kernel mode 557 * when we block or switch rather then when we enter the kernel). 558 * This function is NOT called if we are switching into a preemption 559 * or returning from a preemption. Typically this causes us to lose 560 * our current process designation (if we have one) and become a true 561 * LWKT thread, and may also hand the current process designation to 562 * another process and schedule thread. 563 */ 564 if (td->td_release) 565 td->td_release(td); 566 567 crit_enter_gd(gd); 568 if (TD_TOKS_HELD(td)) 569 lwkt_relalltokens(td); 570 571 /* 572 * We had better not be holding any spin locks, but don't get into an 573 * endless panic loop. 574 */ 575 KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL, 576 ("lwkt_switch: still holding %d exclusive spinlocks!", 577 gd->gd_spinlocks_wr)); 578 579 580 #ifdef SMP 581 #ifdef INVARIANTS 582 if (td->td_cscount) { 583 kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n", 584 td); 585 if (panic_on_cscount) 586 panic("switching while mastering cpusync"); 587 } 588 #endif 589 #endif 590 591 /* 592 * If we had preempted another thread on this cpu, resume the preempted 593 * thread. This occurs transparently, whether the preempted thread 594 * was scheduled or not (it may have been preempted after descheduling 595 * itself). 596 * 597 * We have to setup the MP lock for the original thread after backing 598 * out the adjustment that was made to curthread when the original 599 * was preempted. 600 */ 601 if ((ntd = td->td_preempted) != NULL) { 602 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); 603 ntd->td_flags |= TDF_PREEMPT_DONE; 604 605 /* 606 * The interrupt may have woken a thread up, we need to properly 607 * set the reschedule flag if the originally interrupted thread is 608 * at a lower priority. 609 */ 610 if (TAILQ_FIRST(&gd->gd_tdrunq) && 611 TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) { 612 need_lwkt_resched(); 613 } 614 /* YYY release mp lock on switchback if original doesn't need it */ 615 goto havethread_preempted; 616 } 617 618 /* 619 * Implement round-robin fairq with priority insertion. The priority 620 * insertion is handled by _lwkt_enqueue() 621 * 622 * If we cannot obtain ownership of the tokens we cannot immediately 623 * schedule the target thread. 624 * 625 * Reminder: Again, we cannot afford to run any IPIs in this path if 626 * the current thread has been descheduled. 627 */ 628 for (;;) { 629 /* 630 * Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request) 631 * and set RQF_WAKEUP (prevent unnecessary IPIs from being 632 * received). 633 */ 634 for (;;) { 635 reqflags = gd->gd_reqflags; 636 if (atomic_cmpset_int(&gd->gd_reqflags, reqflags, 637 (reqflags & ~RQF_AST_LWKT_RESCHED) | 638 RQF_WAKEUP)) { 639 break; 640 } 641 } 642 643 /* 644 * Hotpath - pull the head of the run queue and attempt to schedule 645 * it. Fairq exhaustion moves the task to the end of the list. If 646 * no threads are runnable we switch to the idle thread. 647 */ 648 for (;;) { 649 ntd = TAILQ_FIRST(&gd->gd_tdrunq); 650 651 if (ntd == NULL) { 652 /* 653 * Runq is empty, switch to idle and clear RQF_WAKEUP 654 * to allow it to halt. 655 */ 656 ntd = &gd->gd_idlethread; 657 #ifdef SMP 658 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) 659 ASSERT_NO_TOKENS_HELD(ntd); 660 #endif 661 cpu_time.cp_msg[0] = 0; 662 cpu_time.cp_stallpc = 0; 663 atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); 664 goto haveidle; 665 } 666 667 if (ntd->td_fairq_accum >= 0) 668 break; 669 670 /*splz_check(); cannot do this here, see above */ 671 lwkt_fairq_accumulate(gd, ntd); 672 TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); 673 TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq); 674 } 675 676 /* 677 * Hotpath - schedule ntd. Leaves RQF_WAKEUP set to prevent 678 * unwanted decontention IPIs. 679 * 680 * NOTE: For UP there is no mplock and lwkt_getalltokens() 681 * always succeeds. 682 */ 683 if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) 684 goto havethread; 685 686 /* 687 * Coldpath (SMP only since tokens always succeed on UP) 688 * 689 * We had some contention on the thread we wanted to schedule. 690 * What we do now is try to find a thread that we can schedule 691 * in its stead until decontention reschedules on our cpu. 692 * 693 * The coldpath scan does NOT rearrange threads in the run list 694 * and it also ignores the accumulator. 695 * 696 * We do not immediately schedule a user priority thread, instead 697 * we record it in xtd and continue looking for kernel threads. 698 * A cpu can only have one user priority thread (normally) so just 699 * record the first one. 700 * 701 * NOTE: This scan will also include threads whos fairq's were 702 * accumulated in the first loop. 703 */ 704 ++token_contention_count; 705 xtd = NULL; 706 while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) { 707 /* 708 * Try to switch to this thread. If the thread is running at 709 * user priority we clear WAKEUP to allow decontention IPIs 710 * (since this thread is simply running until the one we wanted 711 * decontends), and we make sure that LWKT_RESCHED is not set. 712 * 713 * Otherwise for kernel threads we leave WAKEUP set to avoid 714 * unnecessary decontention IPIs. 715 */ 716 if (ntd->td_pri < TDPRI_KERN_LPSCHED) { 717 if (xtd == NULL) 718 xtd = ntd; 719 continue; 720 } 721 722 /* 723 * Do not let the fairq get too negative. Even though we are 724 * ignoring it atm once the scheduler decontends a very negative 725 * thread will get moved to the end of the queue. 726 */ 727 if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) { 728 if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) 729 ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); 730 goto havethread; 731 } 732 733 /* 734 * Well fubar, this thread is contended as well, loop 735 */ 736 /* */ 737 } 738 739 /* 740 * We exhausted the run list but we may have recorded a user 741 * thread to try. We have three choices based on 742 * lwkt.decontention_method. 743 * 744 * (0) Atomically clear RQF_WAKEUP in order to receive decontention 745 * IPIs (to interrupt the user process) and test 746 * RQF_AST_LWKT_RESCHED at the same time. 747 * 748 * This results in significant decontention IPI traffic but may 749 * be more responsive. 750 * 751 * (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI. 752 * An automatic LWKT reschedule will occur on the next hardclock 753 * (typically 100hz). 754 * 755 * This results in no decontention IPI traffic but may be less 756 * responsive. This is the default. 757 * 758 * (2) Refuse to schedule the user process at this time. 759 * 760 * This is highly experimental and should not be used under 761 * normal circumstances. This can cause a user process to 762 * get starved out in situations where kernel threads are 763 * fighting each other for tokens. 764 */ 765 if (xtd) { 766 ntd = xtd; 767 768 switch(lwkt_spin_method) { 769 case 0: 770 for (;;) { 771 reqflags = gd->gd_reqflags; 772 if (atomic_cmpset_int(&gd->gd_reqflags, 773 reqflags, 774 reqflags & ~RQF_WAKEUP)) { 775 break; 776 } 777 } 778 break; 779 case 1: 780 reqflags = gd->gd_reqflags; 781 break; 782 default: 783 goto skip; 784 break; 785 } 786 if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 && 787 (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) 788 ) { 789 if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) 790 ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); 791 goto havethread; 792 } 793 794 skip: 795 /* 796 * Make sure RQF_WAKEUP is set if we failed to schedule the 797 * user thread to prevent the idle thread from halting. 798 */ 799 atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP); 800 } 801 802 /* 803 * We exhausted the run list, meaning that all runnable threads 804 * are contended. 805 */ 806 cpu_pause(); 807 ntd = &gd->gd_idlethread; 808 #ifdef SMP 809 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) 810 ASSERT_NO_TOKENS_HELD(ntd); 811 /* contention case, do not clear contention mask */ 812 #endif 813 814 /* 815 * Ok, we might want to spin a few times as some tokens are held for 816 * very short periods of time and IPI overhead is 1uS or worse 817 * (meaning it is usually better to spin). Regardless we have to 818 * call splz_check() to be sure to service any interrupts blocked 819 * by our critical section, otherwise we could livelock e.g. IPIs. 820 * 821 * The IPI mechanic is really a last resort. In nearly all other 822 * cases RQF_WAKEUP is left set to prevent decontention IPIs. 823 * 824 * When we decide not to spin we clear RQF_WAKEUP and switch to 825 * the idle thread. Clearing RQF_WEAKEUP allows the idle thread 826 * to halt and decontended tokens will issue an IPI to us. The 827 * idle thread will check for pending reschedules already set 828 * (RQF_AST_LWKT_RESCHED) before actually halting so we don't have 829 * to here. 830 * 831 * Also, if TDF_RUNQ is not set the current thread is trying to 832 * deschedule, possibly in an atomic fashion. We cannot afford to 833 * stay here. 834 */ 835 if (spinning <= 0 || (td->td_flags & TDF_RUNQ) == 0) { 836 atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); 837 goto haveidle; 838 } 839 --spinning; 840 841 /* 842 * When spinning a delay is required both to avoid livelocks from 843 * token order reversals (a thread may be trying to acquire multiple 844 * tokens), and also to reduce cpu cache management traffic. 845 * 846 * In order to scale to a large number of CPUs we use a time slot 847 * resequencer to force contending cpus into non-contending 848 * time-slots. The scheduler may still contend with the lock holder 849 * but will not (generally) contend with all the other cpus trying 850 * trying to get the same token. 851 * 852 * The resequencer uses a FIFO counter mechanic. The owner of the 853 * rindex at the head of the FIFO is allowed to pull itself off 854 * the FIFO and fetchadd is used to enter into the FIFO. This bit 855 * of code is VERY cache friendly and forces all spinning schedulers 856 * into their own time slots. 857 * 858 * This code has been tested to 48-cpus and caps the cache 859 * contention load at ~1uS intervals regardless of the number of 860 * cpus. Scaling beyond 64 cpus might require additional smarts 861 * (such as separate FIFOs for specific token cases). 862 * 863 * WARNING! We can't call splz_check() or anything else here as 864 * it could cause a deadlock. 865 */ 866 #if defined(INVARIANTS) && defined(__amd64__) 867 if ((read_rflags() & PSL_I) == 0) { 868 cpu_enable_intr(); 869 panic("lwkt_switch() called with interrupts disabled"); 870 } 871 #endif 872 cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1); 873 fatal_count = lwkt_spin_fatal; 874 while ((oseq = lwkt_cseq_rindex) != cseq) { 875 cpu_ccfence(); 876 #if !defined(_KERNEL_VIRTUAL) 877 if (cpu_mi_feature & CPU_MI_MONITOR) { 878 cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq); 879 } else 880 #endif 881 { 882 DELAY(1); 883 cpu_lfence(); 884 } 885 if (fatal_count && --fatal_count == 0) 886 panic("lwkt_switch: fatal spin wait"); 887 } 888 cseq = lwkt_spin_delay; /* don't trust the system operator */ 889 cpu_ccfence(); 890 if (cseq < 1) 891 cseq = 1; 892 if (cseq > 1000) 893 cseq = 1000; 894 DELAY(cseq); 895 atomic_add_int(&lwkt_cseq_rindex, 1); 896 splz_check(); /* ok, we already checked that td is still scheduled */ 897 /* highest level for(;;) loop */ 898 } 899 900 havethread: 901 /* 902 * We must always decrement td_fairq_accum on non-idle threads just 903 * in case a thread never gets a tick due to being in a continuous 904 * critical section. The page-zeroing code does this, for example. 905 * 906 * If the thread we came up with is a higher or equal priority verses 907 * the thread at the head of the queue we move our thread to the 908 * front. This way we can always check the front of the queue. 909 * 910 * Clear gd_idle_repeat when doing a normal switch to a non-idle 911 * thread. 912 */ 913 ++gd->gd_cnt.v_swtch; 914 --ntd->td_fairq_accum; 915 ntd->td_wmesg = NULL; 916 xtd = TAILQ_FIRST(&gd->gd_tdrunq); 917 if (ntd != xtd && ntd->td_pri >= xtd->td_pri) { 918 TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); 919 TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq); 920 } 921 gd->gd_idle_repeat = 0; 922 923 havethread_preempted: 924 /* 925 * If the new target does not need the MP lock and we are holding it, 926 * release the MP lock. If the new target requires the MP lock we have 927 * already acquired it for the target. 928 */ 929 ; 930 haveidle: 931 KASSERT(ntd->td_critcount, 932 ("priority problem in lwkt_switch %d %d", 933 td->td_critcount, ntd->td_critcount)); 934 935 if (td != ntd) { 936 ++switch_count; 937 KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd); 938 td->td_switch(ntd); 939 } 940 /* NOTE: current cpu may have changed after switch */ 941 crit_exit_quick(td); 942 } 943 944 /* 945 * Request that the target thread preempt the current thread. Preemption 946 * only works under a specific set of conditions: 947 * 948 * - We are not preempting ourselves 949 * - The target thread is owned by the current cpu 950 * - We are not currently being preempted 951 * - The target is not currently being preempted 952 * - We are not holding any spin locks 953 * - The target thread is not holding any tokens 954 * - We are able to satisfy the target's MP lock requirements (if any). 955 * 956 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically 957 * this is called via lwkt_schedule() through the td_preemptable callback. 958 * critcount is the managed critical priority that we should ignore in order 959 * to determine whether preemption is possible (aka usually just the crit 960 * priority of lwkt_schedule() itself). 961 * 962 * XXX at the moment we run the target thread in a critical section during 963 * the preemption in order to prevent the target from taking interrupts 964 * that *WE* can't. Preemption is strictly limited to interrupt threads 965 * and interrupt-like threads, outside of a critical section, and the 966 * preempted source thread will be resumed the instant the target blocks 967 * whether or not the source is scheduled (i.e. preemption is supposed to 968 * be as transparent as possible). 969 */ 970 void 971 lwkt_preempt(thread_t ntd, int critcount) 972 { 973 struct globaldata *gd = mycpu; 974 thread_t td; 975 int save_gd_intr_nesting_level; 976 977 /* 978 * The caller has put us in a critical section. We can only preempt 979 * if the caller of the caller was not in a critical section (basically 980 * a local interrupt), as determined by the 'critcount' parameter. We 981 * also can't preempt if the caller is holding any spinlocks (even if 982 * he isn't in a critical section). This also handles the tokens test. 983 * 984 * YYY The target thread must be in a critical section (else it must 985 * inherit our critical section? I dunno yet). 986 * 987 * Set need_lwkt_resched() unconditionally for now YYY. 988 */ 989 KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri)); 990 991 if (preempt_enable == 0) { 992 ++preempt_miss; 993 return; 994 } 995 996 td = gd->gd_curthread; 997 if (ntd->td_pri <= td->td_pri) { 998 ++preempt_miss; 999 return; 1000 } 1001 if (td->td_critcount > critcount) { 1002 ++preempt_miss; 1003 need_lwkt_resched(); 1004 return; 1005 } 1006 #ifdef SMP 1007 if (ntd->td_gd != gd) { 1008 ++preempt_miss; 1009 need_lwkt_resched(); 1010 return; 1011 } 1012 #endif 1013 /* 1014 * We don't have to check spinlocks here as they will also bump 1015 * td_critcount. 1016 * 1017 * Do not try to preempt if the target thread is holding any tokens. 1018 * We could try to acquire the tokens but this case is so rare there 1019 * is no need to support it. 1020 */ 1021 KKASSERT(gd->gd_spinlocks_wr == 0); 1022 1023 if (TD_TOKS_HELD(ntd)) { 1024 ++preempt_miss; 1025 need_lwkt_resched(); 1026 return; 1027 } 1028 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) { 1029 ++preempt_weird; 1030 need_lwkt_resched(); 1031 return; 1032 } 1033 if (ntd->td_preempted) { 1034 ++preempt_hit; 1035 need_lwkt_resched(); 1036 return; 1037 } 1038 1039 /* 1040 * Since we are able to preempt the current thread, there is no need to 1041 * call need_lwkt_resched(). 1042 * 1043 * We must temporarily clear gd_intr_nesting_level around the switch 1044 * since switchouts from the target thread are allowed (they will just 1045 * return to our thread), and since the target thread has its own stack. 1046 */ 1047 ++preempt_hit; 1048 ntd->td_preempted = td; 1049 td->td_flags |= TDF_PREEMPT_LOCK; 1050 KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd); 1051 save_gd_intr_nesting_level = gd->gd_intr_nesting_level; 1052 gd->gd_intr_nesting_level = 0; 1053 td->td_switch(ntd); 1054 gd->gd_intr_nesting_level = save_gd_intr_nesting_level; 1055 1056 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE)); 1057 ntd->td_preempted = NULL; 1058 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE); 1059 } 1060 1061 /* 1062 * Conditionally call splz() if gd_reqflags indicates work is pending. 1063 * This will work inside a critical section but not inside a hard code 1064 * section. 1065 * 1066 * (self contained on a per cpu basis) 1067 */ 1068 void 1069 splz_check(void) 1070 { 1071 globaldata_t gd = mycpu; 1072 thread_t td = gd->gd_curthread; 1073 1074 if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && 1075 gd->gd_intr_nesting_level == 0 && 1076 td->td_nest_count < 2) 1077 { 1078 splz(); 1079 } 1080 } 1081 1082 /* 1083 * This version is integrated into crit_exit, reqflags has already 1084 * been tested but td_critcount has not. 1085 * 1086 * We only want to execute the splz() on the 1->0 transition of 1087 * critcount and not in a hard code section or if too deeply nested. 1088 */ 1089 void 1090 lwkt_maybe_splz(thread_t td) 1091 { 1092 globaldata_t gd = td->td_gd; 1093 1094 if (td->td_critcount == 0 && 1095 gd->gd_intr_nesting_level == 0 && 1096 td->td_nest_count < 2) 1097 { 1098 splz(); 1099 } 1100 } 1101 1102 /* 1103 * This function is used to negotiate a passive release of the current 1104 * process/lwp designation with the user scheduler, allowing the user 1105 * scheduler to schedule another user thread. The related kernel thread 1106 * (curthread) continues running in the released state. 1107 */ 1108 void 1109 lwkt_passive_release(struct thread *td) 1110 { 1111 struct lwp *lp = td->td_lwp; 1112 1113 td->td_release = NULL; 1114 lwkt_setpri_self(TDPRI_KERN_USER); 1115 lp->lwp_proc->p_usched->release_curproc(lp); 1116 } 1117 1118 1119 /* 1120 * This implements a normal yield. This routine is virtually a nop if 1121 * there is nothing to yield to but it will always run any pending interrupts 1122 * if called from a critical section. 1123 * 1124 * This yield is designed for kernel threads without a user context. 1125 * 1126 * (self contained on a per cpu basis) 1127 */ 1128 void 1129 lwkt_yield(void) 1130 { 1131 globaldata_t gd = mycpu; 1132 thread_t td = gd->gd_curthread; 1133 thread_t xtd; 1134 1135 if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) 1136 splz(); 1137 if (td->td_fairq_accum < 0) { 1138 lwkt_schedule_self(curthread); 1139 lwkt_switch(); 1140 } else { 1141 xtd = TAILQ_FIRST(&gd->gd_tdrunq); 1142 if (xtd && xtd->td_pri > td->td_pri) { 1143 lwkt_schedule_self(curthread); 1144 lwkt_switch(); 1145 } 1146 } 1147 } 1148 1149 /* 1150 * This yield is designed for kernel threads with a user context. 1151 * 1152 * The kernel acting on behalf of the user is potentially cpu-bound, 1153 * this function will efficiently allow other threads to run and also 1154 * switch to other processes by releasing. 1155 * 1156 * The lwkt_user_yield() function is designed to have very low overhead 1157 * if no yield is determined to be needed. 1158 */ 1159 void 1160 lwkt_user_yield(void) 1161 { 1162 globaldata_t gd = mycpu; 1163 thread_t td = gd->gd_curthread; 1164 1165 /* 1166 * Always run any pending interrupts in case we are in a critical 1167 * section. 1168 */ 1169 if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) 1170 splz(); 1171 1172 /* 1173 * Switch (which forces a release) if another kernel thread needs 1174 * the cpu, if userland wants us to resched, or if our kernel 1175 * quantum has run out. 1176 */ 1177 if (lwkt_resched_wanted() || 1178 user_resched_wanted() || 1179 td->td_fairq_accum < 0) 1180 { 1181 lwkt_switch(); 1182 } 1183 1184 #if 0 1185 /* 1186 * Reacquire the current process if we are released. 1187 * 1188 * XXX not implemented atm. The kernel may be holding locks and such, 1189 * so we want the thread to continue to receive cpu. 1190 */ 1191 if (td->td_release == NULL && lp) { 1192 lp->lwp_proc->p_usched->acquire_curproc(lp); 1193 td->td_release = lwkt_passive_release; 1194 lwkt_setpri_self(TDPRI_USER_NORM); 1195 } 1196 #endif 1197 } 1198 1199 /* 1200 * Generic schedule. Possibly schedule threads belonging to other cpus and 1201 * deal with threads that might be blocked on a wait queue. 1202 * 1203 * We have a little helper inline function which does additional work after 1204 * the thread has been enqueued, including dealing with preemption and 1205 * setting need_lwkt_resched() (which prevents the kernel from returning 1206 * to userland until it has processed higher priority threads). 1207 * 1208 * It is possible for this routine to be called after a failed _enqueue 1209 * (due to the target thread migrating, sleeping, or otherwise blocked). 1210 * We have to check that the thread is actually on the run queue! 1211 * 1212 * reschedok is an optimized constant propagated from lwkt_schedule() or 1213 * lwkt_schedule_noresched(). By default it is non-zero, causing a 1214 * reschedule to be requested if the target thread has a higher priority. 1215 * The port messaging code will set MSG_NORESCHED and cause reschedok to 1216 * be 0, prevented undesired reschedules. 1217 */ 1218 static __inline 1219 void 1220 _lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok) 1221 { 1222 thread_t otd; 1223 1224 if (ntd->td_flags & TDF_RUNQ) { 1225 if (ntd->td_preemptable && reschedok) { 1226 ntd->td_preemptable(ntd, ccount); /* YYY +token */ 1227 } else if (reschedok) { 1228 otd = curthread; 1229 if (ntd->td_pri > otd->td_pri) 1230 need_lwkt_resched(); 1231 } 1232 1233 /* 1234 * Give the thread a little fair share scheduler bump if it 1235 * has been asleep for a while. This is primarily to avoid 1236 * a degenerate case for interrupt threads where accumulator 1237 * crosses into negative territory unnecessarily. 1238 */ 1239 if (ntd->td_fairq_lticks != ticks) { 1240 ntd->td_fairq_lticks = ticks; 1241 ntd->td_fairq_accum += gd->gd_fairq_total_pri; 1242 if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd)) 1243 ntd->td_fairq_accum = TDFAIRQ_MAX(gd); 1244 } 1245 } 1246 } 1247 1248 static __inline 1249 void 1250 _lwkt_schedule(thread_t td, int reschedok) 1251 { 1252 globaldata_t mygd = mycpu; 1253 1254 KASSERT(td != &td->td_gd->gd_idlethread, 1255 ("lwkt_schedule(): scheduling gd_idlethread is illegal!")); 1256 KKASSERT((td->td_flags & TDF_MIGRATING) == 0); 1257 crit_enter_gd(mygd); 1258 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 1259 if (td == mygd->gd_curthread) { 1260 _lwkt_enqueue(td); 1261 } else { 1262 /* 1263 * If we own the thread, there is no race (since we are in a 1264 * critical section). If we do not own the thread there might 1265 * be a race but the target cpu will deal with it. 1266 */ 1267 #ifdef SMP 1268 if (td->td_gd == mygd) { 1269 _lwkt_enqueue(td); 1270 _lwkt_schedule_post(mygd, td, 1, reschedok); 1271 } else { 1272 lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0); 1273 } 1274 #else 1275 _lwkt_enqueue(td); 1276 _lwkt_schedule_post(mygd, td, 1, reschedok); 1277 #endif 1278 } 1279 crit_exit_gd(mygd); 1280 } 1281 1282 void 1283 lwkt_schedule(thread_t td) 1284 { 1285 _lwkt_schedule(td, 1); 1286 } 1287 1288 void 1289 lwkt_schedule_noresched(thread_t td) 1290 { 1291 _lwkt_schedule(td, 0); 1292 } 1293 1294 #ifdef SMP 1295 1296 /* 1297 * When scheduled remotely if frame != NULL the IPIQ is being 1298 * run via doreti or an interrupt then preemption can be allowed. 1299 * 1300 * To allow preemption we have to drop the critical section so only 1301 * one is present in _lwkt_schedule_post. 1302 */ 1303 static void 1304 lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame) 1305 { 1306 thread_t td = curthread; 1307 thread_t ntd = arg; 1308 1309 if (frame && ntd->td_preemptable) { 1310 crit_exit_noyield(td); 1311 _lwkt_schedule(ntd, 1); 1312 crit_enter_quick(td); 1313 } else { 1314 _lwkt_schedule(ntd, 1); 1315 } 1316 } 1317 1318 /* 1319 * Thread migration using a 'Pull' method. The thread may or may not be 1320 * the current thread. It MUST be descheduled and in a stable state. 1321 * lwkt_giveaway() must be called on the cpu owning the thread. 1322 * 1323 * At any point after lwkt_giveaway() is called, the target cpu may 1324 * 'pull' the thread by calling lwkt_acquire(). 1325 * 1326 * We have to make sure the thread is not sitting on a per-cpu tsleep 1327 * queue or it will blow up when it moves to another cpu. 1328 * 1329 * MPSAFE - must be called under very specific conditions. 1330 */ 1331 void 1332 lwkt_giveaway(thread_t td) 1333 { 1334 globaldata_t gd = mycpu; 1335 1336 crit_enter_gd(gd); 1337 if (td->td_flags & TDF_TSLEEPQ) 1338 tsleep_remove(td); 1339 KKASSERT(td->td_gd == gd); 1340 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); 1341 td->td_flags |= TDF_MIGRATING; 1342 crit_exit_gd(gd); 1343 } 1344 1345 void 1346 lwkt_acquire(thread_t td) 1347 { 1348 globaldata_t gd; 1349 globaldata_t mygd; 1350 1351 KKASSERT(td->td_flags & TDF_MIGRATING); 1352 gd = td->td_gd; 1353 mygd = mycpu; 1354 if (gd != mycpu) { 1355 cpu_lfence(); 1356 KKASSERT((td->td_flags & TDF_RUNQ) == 0); 1357 crit_enter_gd(mygd); 1358 DEBUG_PUSH_INFO("lwkt_acquire"); 1359 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { 1360 #ifdef SMP 1361 lwkt_process_ipiq(); 1362 #endif 1363 cpu_lfence(); 1364 } 1365 DEBUG_POP_INFO(); 1366 cpu_mfence(); 1367 td->td_gd = mygd; 1368 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); 1369 td->td_flags &= ~TDF_MIGRATING; 1370 crit_exit_gd(mygd); 1371 } else { 1372 crit_enter_gd(mygd); 1373 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); 1374 td->td_flags &= ~TDF_MIGRATING; 1375 crit_exit_gd(mygd); 1376 } 1377 } 1378 1379 #endif 1380 1381 /* 1382 * Generic deschedule. Descheduling threads other then your own should be 1383 * done only in carefully controlled circumstances. Descheduling is 1384 * asynchronous. 1385 * 1386 * This function may block if the cpu has run out of messages. 1387 */ 1388 void 1389 lwkt_deschedule(thread_t td) 1390 { 1391 crit_enter(); 1392 #ifdef SMP 1393 if (td == curthread) { 1394 _lwkt_dequeue(td); 1395 } else { 1396 if (td->td_gd == mycpu) { 1397 _lwkt_dequeue(td); 1398 } else { 1399 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td); 1400 } 1401 } 1402 #else 1403 _lwkt_dequeue(td); 1404 #endif 1405 crit_exit(); 1406 } 1407 1408 /* 1409 * Set the target thread's priority. This routine does not automatically 1410 * switch to a higher priority thread, LWKT threads are not designed for 1411 * continuous priority changes. Yield if you want to switch. 1412 */ 1413 void 1414 lwkt_setpri(thread_t td, int pri) 1415 { 1416 KKASSERT(td->td_gd == mycpu); 1417 if (td->td_pri != pri) { 1418 KKASSERT(pri >= 0); 1419 crit_enter(); 1420 if (td->td_flags & TDF_RUNQ) { 1421 _lwkt_dequeue(td); 1422 td->td_pri = pri; 1423 _lwkt_enqueue(td); 1424 } else { 1425 td->td_pri = pri; 1426 } 1427 crit_exit(); 1428 } 1429 } 1430 1431 /* 1432 * Set the initial priority for a thread prior to it being scheduled for 1433 * the first time. The thread MUST NOT be scheduled before or during 1434 * this call. The thread may be assigned to a cpu other then the current 1435 * cpu. 1436 * 1437 * Typically used after a thread has been created with TDF_STOPPREQ, 1438 * and before the thread is initially scheduled. 1439 */ 1440 void 1441 lwkt_setpri_initial(thread_t td, int pri) 1442 { 1443 KKASSERT(pri >= 0); 1444 KKASSERT((td->td_flags & TDF_RUNQ) == 0); 1445 td->td_pri = pri; 1446 } 1447 1448 void 1449 lwkt_setpri_self(int pri) 1450 { 1451 thread_t td = curthread; 1452 1453 KKASSERT(pri >= 0 && pri <= TDPRI_MAX); 1454 crit_enter(); 1455 if (td->td_flags & TDF_RUNQ) { 1456 _lwkt_dequeue(td); 1457 td->td_pri = pri; 1458 _lwkt_enqueue(td); 1459 } else { 1460 td->td_pri = pri; 1461 } 1462 crit_exit(); 1463 } 1464 1465 /* 1466 * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle. 1467 * 1468 * Example: two competing threads, same priority N. decrement by (2*N) 1469 * increment by N*8, each thread will get 4 ticks. 1470 */ 1471 void 1472 lwkt_fairq_schedulerclock(thread_t td) 1473 { 1474 globaldata_t gd; 1475 1476 if (fairq_enable) { 1477 while (td) { 1478 gd = td->td_gd; 1479 if (td != &gd->gd_idlethread) { 1480 td->td_fairq_accum -= gd->gd_fairq_total_pri; 1481 if (td->td_fairq_accum < -TDFAIRQ_MAX(gd)) 1482 td->td_fairq_accum = -TDFAIRQ_MAX(gd); 1483 if (td->td_fairq_accum < 0) 1484 need_lwkt_resched(); 1485 td->td_fairq_lticks = ticks; 1486 } 1487 td = td->td_preempted; 1488 } 1489 } 1490 } 1491 1492 static void 1493 lwkt_fairq_accumulate(globaldata_t gd, thread_t td) 1494 { 1495 td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE; 1496 if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd)) 1497 td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd); 1498 } 1499 1500 /* 1501 * Migrate the current thread to the specified cpu. 1502 * 1503 * This is accomplished by descheduling ourselves from the current cpu, 1504 * moving our thread to the tdallq of the target cpu, IPI messaging the 1505 * target cpu, and switching out. TDF_MIGRATING prevents scheduling 1506 * races while the thread is being migrated. 1507 * 1508 * We must be sure to remove ourselves from the current cpu's tsleepq 1509 * before potentially moving to another queue. The thread can be on 1510 * a tsleepq due to a left-over tsleep_interlock(). 1511 * 1512 * We also have to make sure that the switch code doesn't allow an IPI 1513 * processing operation to leak in between our send and our switch, or 1514 * any other potential livelock such that might occur when we release the 1515 * current process designation, so do that first. 1516 */ 1517 #ifdef SMP 1518 static void lwkt_setcpu_remote(void *arg); 1519 #endif 1520 1521 void 1522 lwkt_setcpu_self(globaldata_t rgd) 1523 { 1524 #ifdef SMP 1525 thread_t td = curthread; 1526 1527 if (td->td_gd != rgd) { 1528 crit_enter_quick(td); 1529 if (td->td_release) 1530 td->td_release(td); 1531 if (td->td_flags & TDF_TSLEEPQ) 1532 tsleep_remove(td); 1533 td->td_flags |= TDF_MIGRATING; 1534 lwkt_deschedule_self(td); 1535 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); 1536 lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td); 1537 lwkt_switch(); 1538 /* we are now on the target cpu */ 1539 TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); 1540 crit_exit_quick(td); 1541 } 1542 #endif 1543 } 1544 1545 void 1546 lwkt_migratecpu(int cpuid) 1547 { 1548 #ifdef SMP 1549 globaldata_t rgd; 1550 1551 rgd = globaldata_find(cpuid); 1552 lwkt_setcpu_self(rgd); 1553 #endif 1554 } 1555 1556 /* 1557 * Remote IPI for cpu migration (called while in a critical section so we 1558 * do not have to enter another one). The thread has already been moved to 1559 * our cpu's allq, but we must wait for the thread to be completely switched 1560 * out on the originating cpu before we schedule it on ours or the stack 1561 * state may be corrupt. We clear TDF_MIGRATING after flushing the GD 1562 * change to main memory. 1563 * 1564 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races 1565 * against wakeups. It is best if this interface is used only when there 1566 * are no pending events that might try to schedule the thread. 1567 */ 1568 #ifdef SMP 1569 static void 1570 lwkt_setcpu_remote(void *arg) 1571 { 1572 thread_t td = arg; 1573 globaldata_t gd = mycpu; 1574 int retry = 10000000; 1575 1576 DEBUG_PUSH_INFO("lwkt_setcpu_remote"); 1577 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { 1578 #ifdef SMP 1579 lwkt_process_ipiq(); 1580 #endif 1581 cpu_lfence(); 1582 cpu_pause(); 1583 if (--retry == 0) { 1584 kprintf("lwkt_setcpu_remote: td->td_flags %08x\n", 1585 td->td_flags); 1586 retry = 10000000; 1587 } 1588 } 1589 DEBUG_POP_INFO(); 1590 td->td_gd = gd; 1591 cpu_mfence(); 1592 td->td_flags &= ~TDF_MIGRATING; 1593 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); 1594 _lwkt_enqueue(td); 1595 } 1596 #endif 1597 1598 struct lwp * 1599 lwkt_preempted_proc(void) 1600 { 1601 thread_t td = curthread; 1602 while (td->td_preempted) 1603 td = td->td_preempted; 1604 return(td->td_lwp); 1605 } 1606 1607 /* 1608 * Create a kernel process/thread/whatever. It shares it's address space 1609 * with proc0 - ie: kernel only. 1610 * 1611 * NOTE! By default new threads are created with the MP lock held. A 1612 * thread which does not require the MP lock should release it by calling 1613 * rel_mplock() at the start of the new thread. 1614 */ 1615 int 1616 lwkt_create(void (*func)(void *), void *arg, struct thread **tdp, 1617 thread_t template, int tdflags, int cpu, const char *fmt, ...) 1618 { 1619 thread_t td; 1620 __va_list ap; 1621 1622 td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu, 1623 tdflags); 1624 if (tdp) 1625 *tdp = td; 1626 cpu_set_thread_handler(td, lwkt_exit, func, arg); 1627 1628 /* 1629 * Set up arg0 for 'ps' etc 1630 */ 1631 __va_start(ap, fmt); 1632 kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap); 1633 __va_end(ap); 1634 1635 /* 1636 * Schedule the thread to run 1637 */ 1638 if ((td->td_flags & TDF_STOPREQ) == 0) 1639 lwkt_schedule(td); 1640 else 1641 td->td_flags &= ~TDF_STOPREQ; 1642 return 0; 1643 } 1644 1645 /* 1646 * Destroy an LWKT thread. Warning! This function is not called when 1647 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and 1648 * uses a different reaping mechanism. 1649 */ 1650 void 1651 lwkt_exit(void) 1652 { 1653 thread_t td = curthread; 1654 thread_t std; 1655 globaldata_t gd; 1656 1657 /* 1658 * Do any cleanup that might block here 1659 */ 1660 if (td->td_flags & TDF_VERBOSE) 1661 kprintf("kthread %p %s has exited\n", td, td->td_comm); 1662 caps_exit(td); 1663 biosched_done(td); 1664 dsched_exit_thread(td); 1665 1666 /* 1667 * Get us into a critical section to interlock gd_freetd and loop 1668 * until we can get it freed. 1669 * 1670 * We have to cache the current td in gd_freetd because objcache_put()ing 1671 * it would rip it out from under us while our thread is still active. 1672 */ 1673 gd = mycpu; 1674 crit_enter_quick(td); 1675 while ((std = gd->gd_freetd) != NULL) { 1676 KKASSERT((std->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0); 1677 gd->gd_freetd = NULL; 1678 objcache_put(thread_cache, std); 1679 } 1680 1681 /* 1682 * Remove thread resources from kernel lists and deschedule us for 1683 * the last time. We cannot block after this point or we may end 1684 * up with a stale td on the tsleepq. 1685 */ 1686 if (td->td_flags & TDF_TSLEEPQ) 1687 tsleep_remove(td); 1688 lwkt_deschedule_self(td); 1689 lwkt_remove_tdallq(td); 1690 KKASSERT(td->td_refs == 0); 1691 1692 /* 1693 * Final cleanup 1694 */ 1695 KKASSERT(gd->gd_freetd == NULL); 1696 if (td->td_flags & TDF_ALLOCATED_THREAD) 1697 gd->gd_freetd = td; 1698 cpu_thread_exit(); 1699 } 1700 1701 void 1702 lwkt_remove_tdallq(thread_t td) 1703 { 1704 KKASSERT(td->td_gd == mycpu); 1705 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); 1706 } 1707 1708 /* 1709 * Code reduction and branch prediction improvements. Call/return 1710 * overhead on modern cpus often degenerates into 0 cycles due to 1711 * the cpu's branch prediction hardware and return pc cache. We 1712 * can take advantage of this by not inlining medium-complexity 1713 * functions and we can also reduce the branch prediction impact 1714 * by collapsing perfectly predictable branches into a single 1715 * procedure instead of duplicating it. 1716 * 1717 * Is any of this noticeable? Probably not, so I'll take the 1718 * smaller code size. 1719 */ 1720 void 1721 crit_exit_wrapper(__DEBUG_CRIT_ARG__) 1722 { 1723 _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__); 1724 } 1725 1726 void 1727 crit_panic(void) 1728 { 1729 thread_t td = curthread; 1730 int lcrit = td->td_critcount; 1731 1732 td->td_critcount = 0; 1733 panic("td_critcount is/would-go negative! %p %d", td, lcrit); 1734 /* NOT REACHED */ 1735 } 1736 1737 #ifdef SMP 1738 1739 /* 1740 * Called from debugger/panic on cpus which have been stopped. We must still 1741 * process the IPIQ while stopped, even if we were stopped while in a critical 1742 * section (XXX). 1743 * 1744 * If we are dumping also try to process any pending interrupts. This may 1745 * or may not work depending on the state of the cpu at the point it was 1746 * stopped. 1747 */ 1748 void 1749 lwkt_smp_stopped(void) 1750 { 1751 globaldata_t gd = mycpu; 1752 1753 crit_enter_gd(gd); 1754 if (dumping) { 1755 lwkt_process_ipiq(); 1756 splz(); 1757 } else { 1758 lwkt_process_ipiq(); 1759 } 1760 crit_exit_gd(gd); 1761 } 1762 1763 #endif 1764