/*
 * Copyright (c) 2003-2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Each cpu in a system has its own self-contained light weight kernel
 * thread scheduler, which means that generally speaking we only need
 * to use a critical section to avoid problems.  Foreign thread
 * scheduling is queued via (async) IPIs.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/kinfo.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>
#include <sys/spinlock.h>
#include <sys/ktr.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/dsched.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <machine/stdarg.h>
#include <machine/smp.h>

#if !defined(KTR_CTXSW)
#define KTR_CTXSW	KTR_ALL
#endif
KTR_INFO_MASTER(ctxsw);
KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
	 sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
	 sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
	 sizeof(struct thread *) + sizeof(char *));
KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>",
	 sizeof(struct thread *));

static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");

#ifdef INVARIANTS
static int panic_on_cscount = 0;
#endif
static __int64_t switch_count = 0;
static __int64_t preempt_hit = 0;
static __int64_t preempt_miss = 0;
static __int64_t preempt_weird = 0;
static __int64_t token_contention_count __debugvar = 0;
static int lwkt_use_spin_port;
static struct objcache *thread_cache;

#ifdef SMP
static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
#endif
static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);

extern void cpu_heavy_restore(void);
extern void cpu_lwkt_restore(void);
extern void cpu_kthread_restore(void);
extern void cpu_idle_restore(void);

/*
 * We can make all thread ports use the spin backend instead of the thread
 * backend.  This should only be set to debug the spin backend.
 */
TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);

#ifdef INVARIANTS
SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
    "Panic if attempting to switch lwkt's while mastering cpusync");
#endif
SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
    "Number of switched threads");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
    "Successful preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
    "Failed preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
    "Number of weird preemption events");
#ifdef INVARIANTS
SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
    &token_contention_count, 0, "spinning due to token contention");
#endif
static int fairq_enable = 1;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, &fairq_enable, 0,
    "Turn on fairq priority accumulators");
static int user_pri_sched = 0;
SYSCTL_INT(_lwkt, OID_AUTO, user_pri_sched, CTLFLAG_RW, &user_pri_sched, 0,
    "");
static int preempt_enable = 1;
SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW, &preempt_enable, 0,
    "Enable preemption");
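
/*
 * Note (hedged): lwkt.use_spin_port is a boot-time tunable, so on a
 * typical install it would be set from the loader, e.g. in
 * /boot/loader.conf:
 *
 *	lwkt.use_spin_port="1"
 *
 * The knobs and counters declared above are exported under the lwkt
 * sysctl tree and can be inspected or adjusted from userland, e.g.:
 *
 *	sysctl lwkt.switch_count lwkt.preempt_hit lwkt.preempt_miss
 */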

/*
 * These helper procedures handle the runq, they can only be called from
 * within a critical section.
 *
 * WARNING!  Prior to SMP being brought up it is possible to enqueue and
 * dequeue threads belonging to other cpus, so be sure to use td->td_gd
 * instead of 'mycpu' when referencing the globaldata structure.  Once
 * SMP is live, enqueueing and dequeueing only occurs on the current cpu.
 */
static __inline
void
_lwkt_dequeue(thread_t td)
{
    if (td->td_flags & TDF_RUNQ) {
	struct globaldata *gd = td->td_gd;

	td->td_flags &= ~TDF_RUNQ;
	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	gd->gd_fairq_total_pri -= td->td_pri;
	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
	    atomic_clear_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING);
    }
}

/*
 * Priority enqueue.
 *
 * NOTE: There are a limited number of lwkt threads runnable since user
 *	 processes only schedule one at a time per cpu.
 */
static __inline
void
_lwkt_enqueue(thread_t td)
{
    thread_t xtd;

    if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) {
	struct globaldata *gd = td->td_gd;

	td->td_flags |= TDF_RUNQ;
	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	if (xtd == NULL) {
	    TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	    atomic_set_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING);
	} else {
	    while (xtd && xtd->td_pri > td->td_pri)
		xtd = TAILQ_NEXT(xtd, td_threadq);
	    if (xtd)
		TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	    else
		TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	}
	gd->gd_fairq_total_pri += td->td_pri;
    }
}

static __boolean_t
_lwkt_thread_ctor(void *obj, void *privdata, int ocflags)
{
    struct thread *td = (struct thread *)obj;

    td->td_kstack = NULL;
    td->td_kstack_size = 0;
    td->td_flags = TDF_ALLOCATED_THREAD;
    return (1);
}

static void
_lwkt_thread_dtor(void *obj, void *privdata)
{
    struct thread *td = (struct thread *)obj;

    KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	    ("_lwkt_thread_dtor: not allocated from objcache"));
    KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	    td->td_kstack_size > 0,
	    ("_lwkt_thread_dtor: corrupted stack"));
    kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
}

/*
 * Initialize the lwkt subsystem.
 */
void
lwkt_init(void)
{
    /* An objcache has 2 magazines per CPU so divide cache size by 2. */
    thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
			NULL, CACHE_NTHREADS/2,
			_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
}

/*
 * Schedule a thread to run.  As the current thread we can always safely
 * schedule ourselves, and a shortcut procedure is provided for that
 * function.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_schedule_self(thread_t td)
{
    crit_enter_quick(td);
    KASSERT(td != &td->td_gd->gd_idlethread,
	    ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
    crit_exit_quick(td);
}

/*
 * Deschedule a thread.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_deschedule_self(thread_t td)
{
    crit_enter_quick(td);
    _lwkt_dequeue(td);
    crit_exit_quick(td);
}
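
/*
 * Example (hedged sketch, not taken verbatim from this file): the usual
 * way for a thread to block on an event is to deschedule itself inside
 * a critical section, arrange for a wakeup, and then switch away.  The
 * wakeup side simply lwkt_schedule()s the thread again:
 *
 *	crit_enter();
 *	lwkt_deschedule_self(curthread);
 *	... record curthread where the event source can find it ...
 *	lwkt_switch();
 *	crit_exit();
 *
 * The critical section keeps the deschedule/switch sequence atomic with
 * respect to interrupts on this cpu so an early wakeup is not lost.
 */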

/*
 * LWKTs operate on a per-cpu basis
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
lwkt_gdinit(struct globaldata *gd)
{
    TAILQ_INIT(&gd->gd_tdrunq);
    TAILQ_INIT(&gd->gd_tdallq);
}

/*
 * Create a new thread.  The thread must be associated with a process context
 * or LWKT start address before it can be scheduled.  If the target cpu is
 * -1 the thread will be created on the current cpu.
 *
 * If you intend to create a thread without a process context this function
 * does everything except load the startup and switcher function.
 */
thread_t
lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
{
    globaldata_t gd = mycpu;
    void *stack;

    /*
     * If static thread storage is not supplied allocate a thread.  Reuse
     * a cached free thread if possible.  gd_freetd is used to keep an
     * exiting thread intact through the exit.
     */
    if (td == NULL) {
	crit_enter_gd(gd);
	if ((td = gd->gd_freetd) != NULL) {
	    KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|
				      TDF_RUNQ)) == 0);
	    gd->gd_freetd = NULL;
	} else {
	    td = objcache_get(thread_cache, M_WAITOK);
	    KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|
				      TDF_RUNQ)) == 0);
	}
	crit_exit_gd(gd);
	KASSERT((td->td_flags &
		 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
		("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK);
    }

    /*
     * Try to reuse cached stack.
     */
    if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	if (flags & TDF_ALLOCATED_STACK) {
	    kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	    stack = NULL;
	}
    }
    if (stack == NULL) {
	stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
	flags |= TDF_ALLOCATED_STACK;
    }
    if (cpu < 0)
	lwkt_init_thread(td, stack, stksize, flags, gd);
    else
	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
    return(td);
}

/*
 * Initialize a preexisting thread structure.  This function is used by
 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
 *
 * All threads start out in a critical section at a priority of
 * TDPRI_KERN_DAEMON.  Higher level code will modify the priority as
 * appropriate.  This function may send an IPI message when the
 * requested cpu is not the current cpu and consequently gd_tdallq may
 * not be initialized synchronously from the point of view of the originating
 * cpu.
 *
 * NOTE! we have to be careful in regards to creating threads for other cpus
 * if SMP has not yet been activated.
 */
#ifdef SMP

static void
lwkt_init_thread_remote(void *arg)
{
    thread_t td = arg;

    /*
     * Protected by critical section held by IPI dispatch
     */
    TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
}

#endif

/*
 * lwkt core thread structural initialization.
 *
 * NOTE: All threads are initialized as mpsafe threads.
 */
void
lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
		 struct globaldata *gd)
{
    globaldata_t mygd = mycpu;

    bzero(td, sizeof(struct thread));
    td->td_kstack = stack;
    td->td_kstack_size = stksize;
    td->td_flags = flags;
    td->td_gd = gd;
    td->td_pri = TDPRI_KERN_DAEMON;
    td->td_critcount = 1;
    td->td_toks_stop = &td->td_toks_base;
    if (lwkt_use_spin_port)
	lwkt_initport_spin(&td->td_msgport);
    else
	lwkt_initport_thread(&td->td_msgport, td);
    pmap_init_thread(td);
#ifdef SMP
    /*
     * Normally initializing a thread for a remote cpu requires sending an
     * IPI.  However, the idlethread is setup before the other cpus are
     * activated so we have to treat it as a special case.  XXX manipulation
     * of gd_tdallq requires the BGL.
     */
    if (gd == mygd || td == &gd->gd_idlethread) {
	crit_enter_gd(mygd);
	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	crit_exit_gd(mygd);
    } else {
	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
    }
#else
    crit_enter_gd(mygd);
    TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
    crit_exit_gd(mygd);
#endif

    dsched_new_thread(td);
}

void
lwkt_set_comm(thread_t td, const char *ctl, ...)
{
    __va_list va;

    __va_start(va, ctl);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
    __va_end(va);
    KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
}

void
lwkt_hold(thread_t td)
{
    ++td->td_refs;
}

void
lwkt_rele(thread_t td)
{
    KKASSERT(td->td_refs > 0);
    --td->td_refs;
}

void
lwkt_wait_free(thread_t td)
{
    while (td->td_refs)
	tsleep(td, 0, "tdreap", hz);
}

void
lwkt_free_thread(thread_t td)
{
    KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0);
    if (td->td_flags & TDF_ALLOCATED_THREAD) {
	objcache_put(thread_cache, td);
    } else if (td->td_flags & TDF_ALLOCATED_STACK) {
	/* client-allocated struct with internally allocated stack */
	KASSERT(td->td_kstack && td->td_kstack_size > 0,
		("lwkt_free_thread: corrupted stack"));
	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	td->td_kstack = NULL;
	td->td_kstack_size = 0;
    }
    KTR_LOG(ctxsw_deadtd, td);
}


/*
 * Switch to the next runnable lwkt.  If no LWKTs are runnable then
 * switch to the idlethread.  Switching must occur within a critical
 * section to avoid races with the scheduling queue.
 *
 * We always have full control over our cpu's run queue.  Other cpus
 * that wish to manipulate our queue must use the cpu_*msg() calls to
 * talk to our cpu, so a critical section is all that is needed and
 * the result is very, very fast thread switching.
 *
 * The LWKT scheduler uses a fixed priority model and round-robins at
 * each priority level.  User process scheduling is a totally
 * different beast and LWKT priorities should not be confused with
 * user process priorities.
 *
 * Note that the td_switch() function cannot do anything that requires
 * the MP lock since the MP lock will have already been setup for
 * the target thread (not the current thread).  It's nice to have a scheduler
 * that does not need the MP lock to work because it allows us to do some
 * really cool high-performance MP lock optimizations.
 *
 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt().  lwkt_switch()
 * is not called by the current thread in the preemption case, only when
 * the preempting thread blocks (in order to return to the original thread).
 */
void
lwkt_switch(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t ntd;
    thread_t xtd;
    thread_t nlast;
    int nquserok;
    int didaccumulate;

    /*
     * Switching from within a 'fast' (non thread switched) interrupt or IPI
     * is illegal.  However, we may have to do it anyway if we hit a fatal
     * kernel trap or we have panicked.
     *
     * If this case occurs save and restore the interrupt nesting level.
     */
    if (gd->gd_intr_nesting_level) {
	int savegdnest;
	int savegdtrap;

	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
	    panic("lwkt_switch: Attempt to switch from a "
		  "fast interrupt, ipi, or hard code section, "
		  "td %p\n",
		  td);
	} else {
	    savegdnest = gd->gd_intr_nesting_level;
	    savegdtrap = gd->gd_trap_nesting_level;
	    gd->gd_intr_nesting_level = 0;
	    gd->gd_trap_nesting_level = 0;
	    if ((td->td_flags & TDF_PANICWARN) == 0) {
		td->td_flags |= TDF_PANICWARN;
		kprintf("Warning: thread switch from interrupt, IPI, "
			"or hard code section.\n"
			"thread %p (%s)\n", td, td->td_comm);
		print_backtrace(-1);
	    }
	    lwkt_switch();
	    gd->gd_intr_nesting_level = savegdnest;
	    gd->gd_trap_nesting_level = savegdtrap;
	    return;
	}
    }

    /*
     * Passive release (used to transition from user to kernel mode
     * when we block or switch rather than when we enter the kernel).
     * This function is NOT called if we are switching into a preemption
     * or returning from a preemption.  Typically this causes us to lose
     * our current process designation (if we have one) and become a true
     * LWKT thread, and may also hand the current process designation to
     * another process and schedule its thread.
     */
    if (td->td_release)
	td->td_release(td);

    crit_enter_gd(gd);
    if (TD_TOKS_HELD(td))
	lwkt_relalltokens(td);

    /*
     * We had better not be holding any spin locks, but don't get into an
     * endless panic loop.
     */
    KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL,
	    ("lwkt_switch: still holding %d exclusive spinlocks!",
	     gd->gd_spinlocks_wr));


#ifdef SMP
#ifdef INVARIANTS
    if (td->td_cscount) {
	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
		td);
	if (panic_on_cscount)
	    panic("switching while mastering cpusync");
    }
#endif
#endif

    /*
     * If we had preempted another thread on this cpu, resume the preempted
     * thread.  This occurs transparently, whether the preempted thread
     * was scheduled or not (it may have been preempted after descheduling
     * itself).
     *
     * We have to setup the MP lock for the original thread after backing
     * out the adjustment that was made to curthread when the original
     * was preempted.
     */
    if ((ntd = td->td_preempted) != NULL) {
	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	ntd->td_flags |= TDF_PREEMPT_DONE;

	/*
	 * The interrupt may have woken a thread up, we need to properly
	 * set the reschedule flag if the originally interrupted thread is
	 * at a lower priority.
	 */
	if (TAILQ_FIRST(&gd->gd_tdrunq) &&
	    TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) {
	    need_lwkt_resched();
	}
	/* YYY release mp lock on switchback if original doesn't need it */
	goto havethread_preempted;
    }

    /*
     * Implement round-robin fairq with priority insertion.  The priority
     * insertion is handled by _lwkt_enqueue()
     *
     * We have to adjust the MP lock for the target thread.  If we
     * need the MP lock and cannot obtain it we try to locate a
     * thread that does not need the MP lock.  If we cannot, we spin
     * instead of HLT.
     *
     * A similar issue exists for the tokens held by the target thread.
     * If we cannot obtain ownership of the tokens we cannot immediately
     * schedule the thread.
     */
    for (;;) {
	clear_lwkt_resched();
	didaccumulate = 0;
	ntd = TAILQ_FIRST(&gd->gd_tdrunq);

	/*
	 * Hotpath if we can get all necessary resources.
	 *
	 * If nothing is runnable switch to the idle thread
	 */
	if (ntd == NULL) {
	    ntd = &gd->gd_idlethread;
	    if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
		ntd->td_flags |= TDF_IDLE_NOHLT;
#ifdef SMP
	    if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
		ASSERT_NO_TOKENS_HELD(ntd);
	    clr_cpu_contention_mask(gd);
#endif
	    cpu_time.cp_msg[0] = 0;
	    cpu_time.cp_stallpc = 0;
	    goto haveidle;
	}

	/*
	 * Hotpath schedule
	 *
	 * NOTE: For UP there is no mplock and lwkt_getalltokens()
	 *	 always succeeds.
	 */
	if (ntd->td_fairq_accum >= 0 &&
	    (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
	) {
#ifdef SMP
	    clr_cpu_contention_mask(gd);
#endif
	    goto havethread;
	}

#ifdef SMP
	if (ntd->td_fairq_accum >= 0)
	    set_cpu_contention_mask(gd);
#endif

	/*
	 * Coldpath - unable to schedule ntd, continue looking for threads
	 * to schedule.  This is only allowed if the (presumably) kernel
	 * thread has exhausted its fair share.  A kernel thread stuck on
	 * resources does not currently allow a user thread to get in
	 * front of it.
	 */
#ifdef SMP
	nquserok = ((ntd->td_pri < TDPRI_KERN_LPSCHED) ||
		    (ntd->td_fairq_accum < 0));
#else
	nquserok = 1;
#endif
	nlast = NULL;

	for (;;) {
	    /*
	     * If the fair-share scheduler ran out, ntd gets moved to the
	     * end and its accumulator is bumped; if it didn't we maintain
	     * the same queue position.
	     *
	     * nlast keeps track of the last element prior to any moves.
	     */
	    if (ntd->td_fairq_accum < 0) {
		lwkt_fairq_accumulate(gd, ntd);
		didaccumulate = 1;

		/*
		 * Move to end
		 */
		xtd = TAILQ_NEXT(ntd, td_threadq);
		TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
		TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);

		/*
		 * Set terminal element (nlast)
		 */
		if (nlast == NULL) {
		    nlast = ntd;
		    if (xtd == NULL)
			xtd = ntd;
		}
		ntd = xtd;
	    } else {
		ntd = TAILQ_NEXT(ntd, td_threadq);
	    }

	    /*
	     * If we exhausted the run list switch to the idle thread.
	     * Since one or more threads had resource acquisition issues
	     * we do not allow the idle thread to halt.
	     *
	     * NOTE: nlast can be NULL.
	     */
	    if (ntd == nlast) {
		cpu_pause();
		ntd = &gd->gd_idlethread;
		ntd->td_flags |= TDF_IDLE_NOHLT;
#ifdef SMP
		if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
		    ASSERT_NO_TOKENS_HELD(ntd);
		/* contention case, do not clear contention mask */
#endif

		/*
		 * If fairq accumulations occurred we do not schedule the
		 * idle thread.  This will cause us to try again from
		 * the (almost) top.
		 */
		if (didaccumulate)
		    break;		/* try again from the top, almost */
		goto haveidle;
	    }

	    /*
	     * Try to switch to this thread.
	     *
	     * NOTE: For UP there is no mplock and lwkt_getalltokens()
	     *	     always succeeds.
	     */
	    if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok ||
		 user_pri_sched) && ntd->td_fairq_accum >= 0 &&
		(TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
	    ) {
#ifdef SMP
		clr_cpu_contention_mask(gd);
#endif
		goto havethread;
	    }

	    /*
	     * Thread was runnable but we were unable to get the required
	     * resources (tokens and/or mplock).
	     */
#ifdef SMP
	    if (ntd->td_fairq_accum >= 0)
		set_cpu_contention_mask(gd);
	    if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0)
		nquserok = 0;
#endif
	}

	/*
	 * All threads exhausted but we can loop due to a negative
	 * accumulator.
	 *
	 * While we are looping in the scheduler be sure to service
	 * any interrupts which were made pending due to our critical
	 * section, otherwise we could livelock (e.g.) IPIs.
	 */
	splz_check();
    }

    /*
     * We must always decrement td_fairq_accum on non-idle threads just
     * in case a thread never gets a tick due to being in a continuous
     * critical section.  The page-zeroing code does that.
     *
     * If the thread we came up with is a higher or equal priority versus
     * the thread at the head of the queue we move our thread to the
     * front.  This way we can always check the front of the queue.
     */
havethread:
    ++gd->gd_cnt.v_swtch;
    --ntd->td_fairq_accum;
    ntd->td_wmesg = NULL;
    xtd = TAILQ_FIRST(&gd->gd_tdrunq);
    if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
	TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
	TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
    }
havethread_preempted:
    ;
    /*
     * If the new target does not need the MP lock and we are holding it,
     * release the MP lock.  If the new target requires the MP lock we have
     * already acquired it for the target.
     */
haveidle:
    KASSERT(ntd->td_critcount,
	    ("priority problem in lwkt_switch %d %d",
	     td->td_critcount, ntd->td_critcount));

    if (td != ntd) {
	++switch_count;
	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	td->td_switch(ntd);
    }
    /* NOTE: current cpu may have changed after switch */
    crit_exit_quick(td);
}

/*
 * Request that the target thread preempt the current thread.  Preemption
 * only works under a specific set of conditions:
 *
 *	- We are not preempting ourselves
 *	- The target thread is owned by the current cpu
 *	- We are not currently being preempted
 *	- The target is not currently being preempted
 *	- We are not holding any spin locks
 *	- The target thread is not holding any tokens
 *	- We are able to satisfy the target's MP lock requirements (if any).
 *
 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION.  Typically
 * this is called via lwkt_schedule() through the td_preemptable callback.
 * critcount is the managed critical priority that we should ignore in order
 * to determine whether preemption is possible (aka usually just the crit
 * priority of lwkt_schedule() itself).
 *
 * XXX at the moment we run the target thread in a critical section during
 * the preemption in order to prevent the target from taking interrupts
 * that *WE* can't.  Preemption is strictly limited to interrupt threads
 * and interrupt-like threads, outside of a critical section, and the
 * preempted source thread will be resumed the instant the target blocks
 * whether or not the source is scheduled (i.e. preemption is supposed to
 * be as transparent as possible).
 */
void
lwkt_preempt(thread_t ntd, int critcount)
{
    struct globaldata *gd = mycpu;
    thread_t td;
    int save_gd_intr_nesting_level;

    /*
     * The caller has put us in a critical section.
     * We can only preempt if the caller of the caller was not in a critical
     * section (basically a local interrupt), as determined by the
     * 'critcount' parameter.  We also can't preempt if the caller is holding
     * any spinlocks (even if he isn't in a critical section).  This also
     * handles the tokens test.
     *
     * YYY The target thread must be in a critical section (else it must
     * inherit our critical section?  I dunno yet).
     *
     * Set need_lwkt_resched() unconditionally for now YYY.
     */
    KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));

    if (preempt_enable == 0) {
	++preempt_miss;
	return;
    }

    td = gd->gd_curthread;
    if (ntd->td_pri <= td->td_pri) {
	++preempt_miss;
	return;
    }
    if (td->td_critcount > critcount) {
	++preempt_miss;
	need_lwkt_resched();
	return;
    }
#ifdef SMP
    if (ntd->td_gd != gd) {
	++preempt_miss;
	need_lwkt_resched();
	return;
    }
#endif
    /*
     * We don't have to check spinlocks here as they will also bump
     * td_critcount.
     *
     * Do not try to preempt if the target thread is holding any tokens.
     * We could try to acquire the tokens but this case is so rare there
     * is no need to support it.
     */
    KKASSERT(gd->gd_spinlocks_wr == 0);

    if (TD_TOKS_HELD(ntd)) {
	++preempt_miss;
	need_lwkt_resched();
	return;
    }
    if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	++preempt_weird;
	need_lwkt_resched();
	return;
    }
    if (ntd->td_preempted) {
	++preempt_hit;
	need_lwkt_resched();
	return;
    }

    /*
     * Since we are able to preempt the current thread, there is no need to
     * call need_lwkt_resched().
     *
     * We must temporarily clear gd_intr_nesting_level around the switch
     * since switchouts from the target thread are allowed (they will just
     * return to our thread), and since the target thread has its own stack.
     */
    ++preempt_hit;
    ntd->td_preempted = td;
    td->td_flags |= TDF_PREEMPT_LOCK;
    KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
    save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
    gd->gd_intr_nesting_level = 0;
    td->td_switch(ntd);
    gd->gd_intr_nesting_level = save_gd_intr_nesting_level;

    KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
    ntd->td_preempted = NULL;
    td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
}

/*
 * Conditionally call splz() if gd_reqflags indicates work is pending.
 * This will work inside a critical section but not inside a hard code
 * section.
 *
 * (self contained on a per cpu basis)
 */
void
splz_check(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	gd->gd_intr_nesting_level == 0 &&
	td->td_nest_count < 2)
    {
	splz();
    }
}

/*
 * This version is integrated into crit_exit, reqflags has already
 * been tested but td_critcount has not.
 *
 * We only want to execute the splz() on the 1->0 transition of
 * critcount and not in a hard code section or if too deeply nested.
 */
void
lwkt_maybe_splz(thread_t td)
{
    globaldata_t gd = td->td_gd;

    if (td->td_critcount == 0 &&
	gd->gd_intr_nesting_level == 0 &&
	td->td_nest_count < 2)
    {
	splz();
    }
}

/*
 * This function is used to negotiate a passive release of the current
 * process/lwp designation with the user scheduler, allowing the user
 * scheduler to schedule another user thread.  The related kernel thread
 * (curthread) continues running in the released state.
 */
void
lwkt_passive_release(struct thread *td)
{
    struct lwp *lp = td->td_lwp;

    td->td_release = NULL;
    lwkt_setpri_self(TDPRI_KERN_USER);
    lp->lwp_proc->p_usched->release_curproc(lp);
}


/*
 * This implements a normal yield.  This routine is virtually a nop if
 * there is nothing to yield to but it will always run any pending interrupts
 * if called from a critical section.
 *
 * This yield is designed for kernel threads without a user context.
 *
 * (self contained on a per cpu basis)
 */
void
lwkt_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t xtd;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	splz();
    if (td->td_fairq_accum < 0) {
	lwkt_schedule_self(curthread);
	lwkt_switch();
    } else {
	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	if (xtd && xtd->td_pri > td->td_pri) {
	    lwkt_schedule_self(curthread);
	    lwkt_switch();
	}
    }
}

/*
 * This yield is designed for kernel threads with a user context.
 *
 * The kernel acting on behalf of the user is potentially cpu-bound;
 * this function will efficiently allow other threads to run and also
 * switch to other processes by releasing.
 *
 * The lwkt_user_yield() function is designed to have very low overhead
 * if no yield is determined to be needed.
 */
void
lwkt_user_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    /*
     * Always run any pending interrupts in case we are in a critical
     * section.
     */
    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	splz();

    /*
     * Switch (which forces a release) if another kernel thread needs
     * the cpu, if userland wants us to resched, or if our kernel
     * quantum has run out.
     */
    if (lwkt_resched_wanted() ||
	user_resched_wanted() ||
	td->td_fairq_accum < 0)
    {
	lwkt_switch();
    }

#if 0
    /*
     * Reacquire the current process if we are released.
     *
     * XXX not implemented atm.  The kernel may be holding locks and such,
     * so we want the thread to continue to receive cpu.
     */
    if (td->td_release == NULL && lp) {
	lp->lwp_proc->p_usched->acquire_curproc(lp);
	td->td_release = lwkt_passive_release;
	lwkt_setpri_self(TDPRI_USER_NORM);
    }
#endif
}
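
/*
 * Example (hedged sketch): a cpu-bound kernel thread without a user
 * context should call lwkt_yield() periodically from its main loop so
 * other LWKTs, and interrupts deferred by a critical section, get
 * serviced.  The names here are hypothetical:
 *
 *	static void
 *	my_worker(void *arg)
 *	{
 *	    for (;;) {
 *		... do a chunk of work ...
 *		lwkt_yield();
 *	    }
 *	}
 *
 * Kernel code running on behalf of a user process would call
 * lwkt_user_yield() instead so the user scheduler can also intervene.
 */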

/*
 * Generic schedule.  Possibly schedule threads belonging to other cpus and
 * deal with threads that might be blocked on a wait queue.
 *
 * We have a little helper inline function which does additional work after
 * the thread has been enqueued, including dealing with preemption and
 * setting need_lwkt_resched() (which prevents the kernel from returning
 * to userland until it has processed higher priority threads).
 *
 * It is possible for this routine to be called after a failed _enqueue
 * (due to the target thread migrating, sleeping, or otherwise blocked).
 * We have to check that the thread is actually on the run queue!
 *
 * reschedok is an optimized constant propagated from lwkt_schedule() or
 * lwkt_schedule_noresched().  By default it is non-zero, causing a
 * reschedule to be requested if the target thread has a higher priority.
 * The port messaging code will set MSG_NORESCHED and cause reschedok to
 * be 0, preventing undesired reschedules.
 */
static __inline
void
_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok)
{
    thread_t otd;

    if (ntd->td_flags & TDF_RUNQ) {
	if (ntd->td_preemptable && reschedok) {
	    ntd->td_preemptable(ntd, ccount);	/* YYY +token */
	} else if (reschedok) {
	    otd = curthread;
	    if (ntd->td_pri > otd->td_pri)
		need_lwkt_resched();
	}

	/*
	 * Give the thread a little fair share scheduler bump if it
	 * has been asleep for a while.  This is primarily to avoid
	 * a degenerate case for interrupt threads where accumulator
	 * crosses into negative territory unnecessarily.
	 */
	if (ntd->td_fairq_lticks != ticks) {
	    ntd->td_fairq_lticks = ticks;
	    ntd->td_fairq_accum += gd->gd_fairq_total_pri;
	    if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd))
		ntd->td_fairq_accum = TDFAIRQ_MAX(gd);
	}
    }
}

static __inline
void
_lwkt_schedule(thread_t td, int reschedok)
{
    globaldata_t mygd = mycpu;

    KASSERT(td != &td->td_gd->gd_idlethread,
	    ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
    crit_enter_gd(mygd);
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    if (td == mygd->gd_curthread) {
	_lwkt_enqueue(td);
    } else {
	/*
	 * If we own the thread, there is no race (since we are in a
	 * critical section).  If we do not own the thread there might
	 * be a race but the target cpu will deal with it.
	 */
#ifdef SMP
	if (td->td_gd == mygd) {
	    _lwkt_enqueue(td);
	    _lwkt_schedule_post(mygd, td, 1, reschedok);
	} else {
	    lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	}
#else
	_lwkt_enqueue(td);
	_lwkt_schedule_post(mygd, td, 1, reschedok);
#endif
    }
    crit_exit_gd(mygd);
}

void
lwkt_schedule(thread_t td)
{
    _lwkt_schedule(td, 1);
}

void
lwkt_schedule_noresched(thread_t td)
{
    _lwkt_schedule(td, 0);
}

#ifdef SMP

/*
 * When scheduled remotely, if frame != NULL the IPIQ is being run via
 * doreti or an interrupt and preemption can be allowed.
 *
 * To allow preemption we have to drop the critical section so only
 * one is present in _lwkt_schedule_post.
 */
static void
lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame)
{
    thread_t td = curthread;
    thread_t ntd = arg;

    if (frame && ntd->td_preemptable) {
	crit_exit_noyield(td);
	_lwkt_schedule(ntd, 1);
	crit_enter_quick(td);
    } else {
	_lwkt_schedule(ntd, 1);
    }
}

/*
 * Thread migration using a 'Pull' method.  The thread may or may not be
 * the current thread.  It MUST be descheduled and in a stable state.
 * lwkt_giveaway() must be called on the cpu owning the thread.
 *
 * At any point after lwkt_giveaway() is called, the target cpu may
 * 'pull' the thread by calling lwkt_acquire().
 *
 * We have to make sure the thread is not sitting on a per-cpu tsleep
 * queue or it will blow up when it moves to another cpu.
 *
 * MPSAFE - must be called under very specific conditions.
 */
void
lwkt_giveaway(thread_t td)
{
    globaldata_t gd = mycpu;

    crit_enter_gd(gd);
    if (td->td_flags & TDF_TSLEEPQ)
	tsleep_remove(td);
    KKASSERT(td->td_gd == gd);
    TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
    td->td_flags |= TDF_MIGRATING;
    crit_exit_gd(gd);
}

void
lwkt_acquire(thread_t td)
{
    globaldata_t gd;
    globaldata_t mygd;

    KKASSERT(td->td_flags & TDF_MIGRATING);
    gd = td->td_gd;
    mygd = mycpu;
    if (gd != mycpu) {
	cpu_lfence();
	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	crit_enter_gd(mygd);
	while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
	    lwkt_process_ipiq();
#endif
	    cpu_lfence();
	}
	cpu_mfence();
	td->td_gd = mygd;
	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	td->td_flags &= ~TDF_MIGRATING;
	crit_exit_gd(mygd);
    } else {
	crit_enter_gd(mygd);
	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	td->td_flags &= ~TDF_MIGRATING;
	crit_exit_gd(mygd);
    }
}

#endif
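
/*
 * Example (hedged sketch): a typical pull-style handoff of a descheduled,
 * stable thread from its current cpu to the cpu running this code.  The
 * notification step between the two cpus is left abstract here:
 *
 *	(on the cpu currently owning td)
 *	lwkt_giveaway(td);
 *	... tell the target cpu the thread is available ...
 *
 *	(later, on the target cpu)
 *	lwkt_acquire(td);
 *	lwkt_schedule(td);
 */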

/*
 * Generic deschedule.  Descheduling threads other than your own should be
 * done only in carefully controlled circumstances.  Descheduling is
 * asynchronous.
 *
 * This function may block if the cpu has run out of messages.
 */
void
lwkt_deschedule(thread_t td)
{
    crit_enter();
#ifdef SMP
    if (td == curthread) {
	_lwkt_dequeue(td);
    } else {
	if (td->td_gd == mycpu) {
	    _lwkt_dequeue(td);
	} else {
	    lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	}
    }
#else
    _lwkt_dequeue(td);
#endif
    crit_exit();
}

/*
 * Set the target thread's priority.  This routine does not automatically
 * switch to a higher priority thread, LWKT threads are not designed for
 * continuous priority changes.  Yield if you want to switch.
 */
void
lwkt_setpri(thread_t td, int pri)
{
    KKASSERT(td->td_gd == mycpu);
    if (td->td_pri != pri) {
	KKASSERT(pri >= 0);
	crit_enter();
	if (td->td_flags & TDF_RUNQ) {
	    _lwkt_dequeue(td);
	    td->td_pri = pri;
	    _lwkt_enqueue(td);
	} else {
	    td->td_pri = pri;
	}
	crit_exit();
    }
}

/*
 * Set the initial priority for a thread prior to it being scheduled for
 * the first time.  The thread MUST NOT be scheduled before or during
 * this call.  The thread may be assigned to a cpu other than the current
 * cpu.
 *
 * Typically used after a thread has been created with TDF_STOPREQ,
 * and before the thread is initially scheduled.
 */
void
lwkt_setpri_initial(thread_t td, int pri)
{
    KKASSERT(pri >= 0);
    KKASSERT((td->td_flags & TDF_RUNQ) == 0);
    td->td_pri = pri;
}

void
lwkt_setpri_self(int pri)
{
    thread_t td = curthread;

    KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
    crit_enter();
    if (td->td_flags & TDF_RUNQ) {
	_lwkt_dequeue(td);
	td->td_pri = pri;
	_lwkt_enqueue(td);
    } else {
	td->td_pri = pri;
    }
    crit_exit();
}

/*
 * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle.
 *
 * Example: two competing threads, same priority N.  The running thread's
 * accumulator is decremented by (2*N) per tick and bumped by N*8 when it
 * runs out, so each thread gets roughly 4 ticks per cycle.
 */
void
lwkt_fairq_schedulerclock(thread_t td)
{
    if (fairq_enable) {
	while (td) {
	    if (td != &td->td_gd->gd_idlethread) {
		td->td_fairq_accum -= td->td_gd->gd_fairq_total_pri;
		if (td->td_fairq_accum < -TDFAIRQ_MAX(td->td_gd))
		    td->td_fairq_accum = -TDFAIRQ_MAX(td->td_gd);
		if (td->td_fairq_accum < 0)
		    need_lwkt_resched();
		td->td_fairq_lticks = ticks;
	    }
	    td = td->td_preempted;
	}
    }
}

static void
lwkt_fairq_accumulate(globaldata_t gd, thread_t td)
{
    td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE;
    if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd))
	td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
}

/*
 * Migrate the current thread to the specified cpu.
 *
 * This is accomplished by descheduling ourselves from the current cpu,
 * moving our thread to the tdallq of the target cpu, IPI messaging the
 * target cpu, and switching out.  TDF_MIGRATING prevents scheduling
 * races while the thread is being migrated.
 *
 * We must be sure to remove ourselves from the current cpu's tsleepq
 * before potentially moving to another queue.  The thread can be on
 * a tsleepq due to a left-over tsleep_interlock().
 */
#ifdef SMP
static void lwkt_setcpu_remote(void *arg);
#endif

void
lwkt_setcpu_self(globaldata_t rgd)
{
#ifdef SMP
    thread_t td = curthread;

    if (td->td_gd != rgd) {
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
	    tsleep_remove(td);
	td->td_flags |= TDF_MIGRATING;
	lwkt_deschedule_self(td);
	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	lwkt_switch();
	/* we are now on the target cpu */
	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	crit_exit_quick(td);
    }
#endif
}

void
lwkt_migratecpu(int cpuid)
{
#ifdef SMP
    globaldata_t rgd;

    rgd = globaldata_find(cpuid);
    lwkt_setcpu_self(rgd);
#endif
}
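
/*
 * Example (hedged): a kernel thread pinning itself to a particular cpu,
 * e.g. cpu 0.  After the call returns the thread is running on the
 * target cpu:
 *
 *	lwkt_migratecpu(0);
 *	KKASSERT(mycpu->gd_cpuid == 0);
 */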

/*
 * Remote IPI for cpu migration (called while in a critical section so we
 * do not have to enter another one).  The thread has already been moved to
 * our cpu's allq, but we must wait for the thread to be completely switched
 * out on the originating cpu before we schedule it on ours or the stack
 * state may be corrupt.  We clear TDF_MIGRATING after flushing the GD
 * change to main memory.
 *
 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
 * against wakeups.  It is best if this interface is used only when there
 * are no pending events that might try to schedule the thread.
 */
#ifdef SMP
static void
lwkt_setcpu_remote(void *arg)
{
    thread_t td = arg;
    globaldata_t gd = mycpu;

    while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
	lwkt_process_ipiq();
#endif
	cpu_lfence();
	cpu_pause();
    }
    td->td_gd = gd;
    cpu_mfence();
    td->td_flags &= ~TDF_MIGRATING;
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
}
#endif

struct lwp *
lwkt_preempted_proc(void)
{
    thread_t td = curthread;
    while (td->td_preempted)
	td = td->td_preempted;
    return(td->td_lwp);
}

/*
 * Create a kernel process/thread/whatever.  It shares its address space
 * with proc0 - ie: kernel only.
 *
 * NOTE!  By default new threads are created with the MP lock held.  A
 * thread which does not require the MP lock should release it by calling
 * rel_mplock() at the start of the new thread.
 */
int
lwkt_create(void (*func)(void *), void *arg, struct thread **tdp,
	    thread_t template, int tdflags, int cpu, const char *fmt, ...)
{
    thread_t td;
    __va_list ap;

    td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
			   tdflags);
    if (tdp)
	*tdp = td;
    cpu_set_thread_handler(td, lwkt_exit, func, arg);

    /*
     * Set up arg0 for 'ps' etc
     */
    __va_start(ap, fmt);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
    __va_end(ap);

    /*
     * Schedule the thread to run
     */
    if ((td->td_flags & TDF_STOPREQ) == 0)
	lwkt_schedule(td);
    else
	td->td_flags &= ~TDF_STOPREQ;
    return 0;
}
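
/*
 * Example (hedged sketch): creating and starting a simple kernel thread
 * on the current cpu (cpu == -1).  The names "my_worker" and
 * "my_worker_td" are hypothetical.  lwkt_create() arranges, via
 * cpu_set_thread_handler(), for lwkt_exit() to run when the function
 * returns, though many threads call it explicitly.  The thread is
 * scheduled immediately unless TDF_STOPREQ was passed in tdflags:
 *
 *	static struct thread *my_worker_td;
 *
 *	static void
 *	my_worker(void *arg)
 *	{
 *	    while (... not told to stop ...) {
 *		... do work ...
 *		lwkt_yield();
 *	    }
 *	    lwkt_exit();
 *	}
 *
 *	lwkt_create(my_worker, NULL, &my_worker_td, NULL, 0, -1,
 *		    "my_worker");
 */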

/*
 * Destroy an LWKT thread.  Warning!  This function is not called when
 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
 * uses a different reaping mechanism.
 */
void
lwkt_exit(void)
{
    thread_t td = curthread;
    thread_t std;
    globaldata_t gd;

    /*
     * Do any cleanup that might block here
     */
    if (td->td_flags & TDF_VERBOSE)
	kprintf("kthread %p %s has exited\n", td, td->td_comm);
    caps_exit(td);
    biosched_done(td);
    dsched_exit_thread(td);

    /*
     * Get us into a critical section to interlock gd_freetd and loop
     * until we can get it freed.
     *
     * We have to cache the current td in gd_freetd because objcache_put()ing
     * it would rip it out from under us while our thread is still active.
     */
    gd = mycpu;
    crit_enter_quick(td);
    while ((std = gd->gd_freetd) != NULL) {
	KKASSERT((std->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0);
	gd->gd_freetd = NULL;
	objcache_put(thread_cache, std);
    }

    /*
     * Remove thread resources from kernel lists and deschedule us for
     * the last time.  We cannot block after this point or we may end
     * up with a stale td on the tsleepq.
     */
    if (td->td_flags & TDF_TSLEEPQ)
	tsleep_remove(td);
    lwkt_deschedule_self(td);
    lwkt_remove_tdallq(td);

    /*
     * Final cleanup
     */
    KKASSERT(gd->gd_freetd == NULL);
    if (td->td_flags & TDF_ALLOCATED_THREAD)
	gd->gd_freetd = td;
    cpu_thread_exit();
}

void
lwkt_remove_tdallq(thread_t td)
{
    KKASSERT(td->td_gd == mycpu);
    TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
}

/*
 * Code reduction and branch prediction improvements.  Call/return
 * overhead on modern cpus often degenerates into 0 cycles due to
 * the cpu's branch prediction hardware and return pc cache.  We
 * can take advantage of this by not inlining medium-complexity
 * functions and we can also reduce the branch prediction impact
 * by collapsing perfectly predictable branches into a single
 * procedure instead of duplicating it.
 *
 * Is any of this noticeable?  Probably not, so I'll take the
 * smaller code size.
 */
void
crit_exit_wrapper(__DEBUG_CRIT_ARG__)
{
    _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
}

void
crit_panic(void)
{
    thread_t td = curthread;
    int lcrit = td->td_critcount;

    td->td_critcount = 0;
    panic("td_critcount is/would-go negative! %p %d", td, lcrit);
    /* NOT REACHED */
}

#ifdef SMP

/*
 * Called from debugger/panic on cpus which have been stopped.  We must still
 * process the IPIQ while stopped, even if we were stopped while in a critical
 * section (XXX).
 *
 * If we are dumping also try to process any pending interrupts.  This may
 * or may not work depending on the state of the cpu at the point it was
 * stopped.
 */
void
lwkt_smp_stopped(void)
{
    globaldata_t gd = mycpu;

    crit_enter_gd(gd);
    if (dumping) {
	lwkt_process_ipiq();
	splz();
    } else {
	lwkt_process_ipiq();
    }
    crit_exit_gd(gd);
}

#endif