/*
 * Copyright (c) 2003-2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Each cpu in a system has its own self-contained light weight kernel
 * thread scheduler, which means that generally speaking we only need
 * to use a critical section to avoid problems.  Foreign thread
 * scheduling is queued via (async) IPIs.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/kinfo.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>
#include <sys/spinlock.h>
#include <sys/ktr.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/dsched.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <machine/stdarg.h>
#include <machine/smp.h>

#if !defined(KTR_CTXSW)
#define KTR_CTXSW KTR_ALL
#endif
KTR_INFO_MASTER(ctxsw);
KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
    sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
    sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
    sizeof (struct thread *) + sizeof(char *));
KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof (struct thread *));

static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");

#ifdef INVARIANTS
static int panic_on_cscount = 0;
#endif
static __int64_t switch_count = 0;
static __int64_t preempt_hit = 0;
static __int64_t preempt_miss = 0;
static __int64_t preempt_weird = 0;
static __int64_t token_contention_count __debugvar = 0;
static int lwkt_use_spin_port;
static struct objcache *thread_cache;

#ifdef SMP
static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
#endif
static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);

extern void cpu_heavy_restore(void);
extern void cpu_lwkt_restore(void);
extern void cpu_kthread_restore(void);
extern void cpu_idle_restore(void);
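
/*
 * Debug helper (x86_64 only): sanity-check the saved top-of-stack of a
 * thread we are about to switch to.  The first pointer on td_sp should
 * be one of the known context-restore entry points declared above;
 * anything else suggests a corrupted switch frame.  Used from a
 * KKASSERT in lwkt_switch().
 */
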
#ifdef __x86_64__

static int
jg_tos_ok(struct thread *td)
{
    void *tos;
    int tos_ok;

    if (td == NULL) {
        return 1;
    }
    KKASSERT(td->td_sp != NULL);
    tos = ((void **)td->td_sp)[0];
    tos_ok = 0;
    if ((tos == cpu_heavy_restore) || (tos == cpu_lwkt_restore) ||
        (tos == cpu_kthread_restore) || (tos == cpu_idle_restore)) {
        tos_ok = 1;
    }
    return tos_ok;
}

#endif

/*
 * We can make all thread ports use the spin backend instead of the thread
 * backend.  This should only be set to debug the spin backend.
 */
TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);

#ifdef INVARIANTS
SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
#endif
SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
    "Successful preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
    "Failed preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
#ifdef INVARIANTS
SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
    &token_contention_count, 0, "spinning due to token contention");
#endif
static int fairq_enable = 1;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, &fairq_enable, 0, "");
static int user_pri_sched = 0;
SYSCTL_INT(_lwkt, OID_AUTO, user_pri_sched, CTLFLAG_RW, &user_pri_sched, 0, "");

/*
 * These helper procedures handle the runq, they can only be called from
 * within a critical section.
 *
 * WARNING! Prior to SMP being brought up it is possible to enqueue and
 * dequeue threads belonging to other cpus, so be sure to use td->td_gd
 * instead of 'mycpu' when referencing the globaldata structure.  Once
 * SMP is live, enqueuing and dequeuing only occurs on the current cpu.
 */
static __inline
void
_lwkt_dequeue(thread_t td)
{
    if (td->td_flags & TDF_RUNQ) {
        struct globaldata *gd = td->td_gd;

        td->td_flags &= ~TDF_RUNQ;
        TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
        gd->gd_fairq_total_pri -= td->td_pri;
        if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
            atomic_clear_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING);
    }
}

/*
 * Priority enqueue.
 *
 * NOTE: There are a limited number of lwkt threads runnable since user
 *       processes only schedule one at a time per cpu.
 */
static __inline
void
_lwkt_enqueue(thread_t td)
{
    thread_t xtd;

    if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) {
        struct globaldata *gd = td->td_gd;

        td->td_flags |= TDF_RUNQ;
        xtd = TAILQ_FIRST(&gd->gd_tdrunq);
        if (xtd == NULL) {
            TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
            atomic_set_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING);
        } else {
            while (xtd && xtd->td_pri > td->td_pri)
                xtd = TAILQ_NEXT(xtd, td_threadq);
            if (xtd)
                TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
            else
                TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
        }
        gd->gd_fairq_total_pri += td->td_pri;
    }
}
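
/*
 * Illustration of the ordering maintained above: the run queue is kept
 * sorted by td_pri, highest first.  Given a queue holding priorities
 * { 30, 20, 10 }, enqueueing a thread of priority 20 places it ahead of
 * the existing priority-20 entry, giving { 30, 20 (new), 20, 10 }.
 */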

static __boolean_t
_lwkt_thread_ctor(void *obj, void *privdata, int ocflags)
{
    struct thread *td = (struct thread *)obj;

    td->td_kstack = NULL;
    td->td_kstack_size = 0;
    td->td_flags = TDF_ALLOCATED_THREAD;
    return (1);
}

static void
_lwkt_thread_dtor(void *obj, void *privdata)
{
    struct thread *td = (struct thread *)obj;

    KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
        ("_lwkt_thread_dtor: not allocated from objcache"));
    KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
        td->td_kstack_size > 0,
        ("_lwkt_thread_dtor: corrupted stack"));
    kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
}

/*
 * Initialize the lwkt subsystem.
 */
void
lwkt_init(void)
{
    /* An objcache has 2 magazines per CPU so divide cache size by 2. */
    thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
        NULL, CACHE_NTHREADS/2,
        _lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
}

/*
 * Schedule a thread to run.  As the current thread we can always safely
 * schedule ourselves, and a shortcut procedure is provided for that
 * function.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_schedule_self(thread_t td)
{
    crit_enter_quick(td);
    KASSERT(td != &td->td_gd->gd_idlethread,
        ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
    crit_exit_quick(td);
}

/*
 * Deschedule a thread.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_deschedule_self(thread_t td)
{
    crit_enter_quick(td);
    _lwkt_dequeue(td);
    crit_exit_quick(td);
}

/*
 * LWKTs operate on a per-cpu basis
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
lwkt_gdinit(struct globaldata *gd)
{
    TAILQ_INIT(&gd->gd_tdrunq);
    TAILQ_INIT(&gd->gd_tdallq);
}

/*
 * Create a new thread.  The thread must be associated with a process context
 * or LWKT start address before it can be scheduled.  If the target cpu is
 * -1 the thread will be created on the current cpu.
 *
 * If you intend to create a thread without a process context this function
 * does everything except load the startup and switcher function.
 */
thread_t
lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
{
    globaldata_t gd = mycpu;
    void *stack;

    /*
     * If static thread storage is not supplied, allocate a thread.  Reuse
     * a cached free thread if possible.  gd_freetd is used to keep an
     * exiting thread intact through the exit.
     */
    if (td == NULL) {
        if ((td = gd->gd_freetd) != NULL)
            gd->gd_freetd = NULL;
        else
            td = objcache_get(thread_cache, M_WAITOK);
        KASSERT((td->td_flags &
                 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
            ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
        flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK);
    }

    /*
     * Try to reuse cached stack.
     */
    if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
        if (flags & TDF_ALLOCATED_STACK) {
            kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
            stack = NULL;
        }
    }
    if (stack == NULL) {
        stack = (void *)kmem_alloc(&kernel_map, stksize);
        flags |= TDF_ALLOCATED_STACK;
    }
    if (cpu < 0)
        lwkt_init_thread(td, stack, stksize, flags, gd);
    else
        lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
    return(td);
}

/*
 * Initialize a preexisting thread structure.  This function is used by
 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
 *
 * All threads start out in a critical section at a priority of
 * TDPRI_KERN_DAEMON.  Higher level code will modify the priority as
 * appropriate.  This function may send an IPI message when the
 * requested cpu is not the current cpu and consequently gd_tdallq may
 * not be initialized synchronously from the point of view of the originating
 * cpu.
 *
 * NOTE! We have to be careful with regard to creating threads for other cpus
 * if SMP has not yet been activated.
 */
#ifdef SMP

static void
lwkt_init_thread_remote(void *arg)
{
    thread_t td = arg;

    /*
     * Protected by critical section held by IPI dispatch
     */
    TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
}

#endif

/*
 * lwkt core thread structural initialization.
 *
 * NOTE: All threads are initialized as mpsafe threads.
 */
void
lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
                 struct globaldata *gd)
{
    globaldata_t mygd = mycpu;

    bzero(td, sizeof(struct thread));
    td->td_kstack = stack;
    td->td_kstack_size = stksize;
    td->td_flags = flags;
    td->td_gd = gd;
    td->td_pri = TDPRI_KERN_DAEMON;
    td->td_critcount = 1;
    td->td_toks_stop = &td->td_toks_base;
    if (lwkt_use_spin_port)
        lwkt_initport_spin(&td->td_msgport);
    else
        lwkt_initport_thread(&td->td_msgport, td);
    pmap_init_thread(td);
#ifdef SMP
    /*
     * Normally initializing a thread for a remote cpu requires sending an
     * IPI.  However, the idlethread is set up before the other cpus are
     * activated so we have to treat it as a special case.  XXX manipulation
     * of gd_tdallq requires the BGL.
     */
    if (gd == mygd || td == &gd->gd_idlethread) {
        crit_enter_gd(mygd);
        TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
        crit_exit_gd(mygd);
    } else {
        lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
    }
#else
    crit_enter_gd(mygd);
    TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
    crit_exit_gd(mygd);
#endif

    dsched_new_thread(td);
}

void
lwkt_set_comm(thread_t td, const char *ctl, ...)
{
    __va_list va;

    __va_start(va, ctl);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
    __va_end(va);
    KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
}

void
lwkt_hold(thread_t td)
{
    ++td->td_refs;
}

void
lwkt_rele(thread_t td)
{
    KKASSERT(td->td_refs > 0);
    --td->td_refs;
}

void
lwkt_wait_free(thread_t td)
{
    while (td->td_refs)
        tsleep(td, 0, "tdreap", hz);
}

void
lwkt_free_thread(thread_t td)
{
    KASSERT((td->td_flags & TDF_RUNNING) == 0,
        ("lwkt_free_thread: did not exit! %p", td));

    if (td->td_flags & TDF_ALLOCATED_THREAD) {
        objcache_put(thread_cache, td);
    } else if (td->td_flags & TDF_ALLOCATED_STACK) {
        /* client-allocated struct with internally allocated stack */
        KASSERT(td->td_kstack && td->td_kstack_size > 0,
            ("lwkt_free_thread: corrupted stack"));
        kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
        td->td_kstack = NULL;
        td->td_kstack_size = 0;
    }
    KTR_LOG(ctxsw_deadtd, td);
}
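
/*
 * Thread lifecycle summary for the routines above: lwkt_alloc_thread()
 * obtains storage (caller-supplied, gd_freetd, or the objcache),
 * lwkt_init_thread() zeroes it and binds it to a cpu, lwkt_hold()/
 * lwkt_rele() adjust the reference count that lwkt_wait_free() sleeps
 * on, and lwkt_free_thread() releases the storage once the thread is
 * no longer TDF_RUNNING.
 */
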
/*
 * Switch to the next runnable lwkt.  If no LWKTs are runnable then
 * switch to the idlethread.  Switching must occur within a critical
 * section to avoid races with the scheduling queue.
 *
 * We always have full control over our cpu's run queue.  Other cpus
 * that wish to manipulate our queue must use the cpu_*msg() calls to
 * talk to our cpu, so a critical section is all that is needed and
 * the result is very, very fast thread switching.
 *
 * The LWKT scheduler uses a fixed priority model and round-robins at
 * each priority level.  User process scheduling is a totally
 * different beast and LWKT priorities should not be confused with
 * user process priorities.
 *
 * The MP lock may be out of sync with the thread's td_mpcount + td_xpcount.
 * lwkt_switch() cleans it up.
 *
 * Note that the td_switch() function cannot do anything that requires
 * the MP lock since the MP lock will have already been set up for
 * the target thread (not the current thread).  It's nice to have a scheduler
 * that does not need the MP lock to work because it allows us to do some
 * really cool high-performance MP lock optimizations.
 *
 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt().  lwkt_switch()
 * is not called by the current thread in the preemption case, only when
 * the preempting thread blocks (in order to return to the original thread).
 */
void
lwkt_switch(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t ntd;
    thread_t xtd;
    thread_t nlast;
    int nquserok;
#ifdef SMP
    int mpheld;
#endif
    int didaccumulate;
    const char *lmsg;	/* diagnostic - 'systat -pv 1' */
    const void *laddr;

    /*
     * Switching from within a 'fast' (non thread switched) interrupt or IPI
     * is illegal.  However, we may have to do it anyway if we hit a fatal
     * kernel trap or we have panicked.
     *
     * If this case occurs save and restore the interrupt nesting level.
     */
    if (gd->gd_intr_nesting_level) {
        int savegdnest;
        int savegdtrap;

        if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
            panic("lwkt_switch: Attempt to switch from a "
                  "fast interrupt, ipi, or hard code section, "
                  "td %p\n",
                  td);
        } else {
            savegdnest = gd->gd_intr_nesting_level;
            savegdtrap = gd->gd_trap_nesting_level;
            gd->gd_intr_nesting_level = 0;
            gd->gd_trap_nesting_level = 0;
            if ((td->td_flags & TDF_PANICWARN) == 0) {
                td->td_flags |= TDF_PANICWARN;
                kprintf("Warning: thread switch from interrupt, IPI, "
                        "or hard code section.\n"
                        "thread %p (%s)\n", td, td->td_comm);
                print_backtrace(-1);
            }
            lwkt_switch();
            gd->gd_intr_nesting_level = savegdnest;
            gd->gd_trap_nesting_level = savegdtrap;
            return;
        }
    }

    /*
     * Passive release (used to transition from user to kernel mode
     * when we block or switch rather than when we enter the kernel).
     * This function is NOT called if we are switching into a preemption
     * or returning from a preemption.  Typically this causes us to lose
     * our current process designation (if we have one) and become a true
     * LWKT thread, and may also hand the current process designation to
     * another process and schedule its thread.
     */
    if (td->td_release)
        td->td_release(td);

    crit_enter_gd(gd);
    if (TD_TOKS_HELD(td))
        lwkt_relalltokens(td);

    /*
     * We had better not be holding any spin locks, but don't get into an
     * endless panic loop.
     */
    KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL,
        ("lwkt_switch: still holding %d exclusive spinlocks!",
         gd->gd_spinlocks_wr));

#ifdef SMP
    /*
     * td_mpcount + td_xpcount cannot be used to determine if we currently
     * hold the MP lock because get_mplock() will increment it prior to
     * attempting to get the lock, and switch out if it can't.  Our
     * ownership of the actual lock will remain stable while we are
     * in a critical section, and once we actually acquire the underlying
     * lock as long as the count is greater than 0.
     */
    mpheld = MP_LOCK_HELD(gd);
#ifdef INVARIANTS
    if (td->td_cscount) {
        kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
            td);
        if (panic_on_cscount)
            panic("switching while mastering cpusync");
    }
#endif
#endif

    /*
     * If we had preempted another thread on this cpu, resume the preempted
     * thread.  This occurs transparently, whether the preempted thread
     * was scheduled or not (it may have been preempted after descheduling
     * itself).
     *
     * We have to set up the MP lock for the original thread after backing
     * out the adjustment that was made to curthread when the original
     * was preempted.
     */
    if ((ntd = td->td_preempted) != NULL) {
        KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
#ifdef SMP
        if (ntd->td_mpcount + ntd->td_xpcount && mpheld == 0) {
            panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
                td, ntd, td->td_mpcount, ntd->td_mpcount + ntd->td_xpcount);
        }
        td->td_xpcount = 0;
#endif
        ntd->td_flags |= TDF_PREEMPT_DONE;

        /*
         * The interrupt may have woken a thread up; we need to properly
         * set the reschedule flag if the originally interrupted thread is
         * at a lower priority.
         */
        if (TAILQ_FIRST(&gd->gd_tdrunq) &&
            TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) {
            need_lwkt_resched();
        }
        /* YYY release mp lock on switchback if original doesn't need it */
        goto havethread_preempted;
    }

    /*
     * Implement round-robin fairq with priority insertion.  The priority
     * insertion is handled by _lwkt_enqueue().
     *
     * We have to adjust the MP lock for the target thread.  If we
     * need the MP lock and cannot obtain it we try to locate a
     * thread that does not need the MP lock.  If we cannot, we spin
     * instead of HLT.
     *
     * A similar issue exists for the tokens held by the target thread.
     * If we cannot obtain ownership of the tokens we cannot immediately
     * schedule the thread.
     */
    for (;;) {
        clear_lwkt_resched();
        didaccumulate = 0;
        ntd = TAILQ_FIRST(&gd->gd_tdrunq);

        /*
         * Hotpath if we can get all necessary resources.
         *
         * If nothing is runnable, switch to the idle thread.
         */
        if (ntd == NULL) {
            ntd = &gd->gd_idlethread;
            if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
                ntd->td_flags |= TDF_IDLE_NOHLT;
#ifdef SMP
            KKASSERT(ntd->td_xpcount == 0);
            if (ntd->td_mpcount) {
                if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
                    panic("Idle thread %p was holding the BGL!", ntd);
                if (mpheld == 0) {
                    set_cpu_contention_mask(gd);
                    handle_cpu_contention_mask();
                    cpu_try_mplock();
                    mpheld = MP_LOCK_HELD(gd);
                    cpu_pause();
                    continue;
                }
            }
            clr_cpu_contention_mask(gd);
#endif
            cpu_time.cp_msg[0] = 0;
            cpu_time.cp_stallpc = 0;
            goto haveidle;
        }

        /*
         * Hotpath schedule
         *
         * NOTE: For UP there is no mplock and lwkt_getalltokens()
         *       always succeeds.
         */
        if (ntd->td_fairq_accum >= 0 &&
#ifdef SMP
            (ntd->td_mpcount + ntd->td_xpcount == 0 ||
             mpheld || cpu_try_mplock()) &&
#endif
            (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr))
        ) {
#ifdef SMP
            clr_cpu_contention_mask(gd);
#endif
            goto havethread;
        }

        lmsg = NULL;
        laddr = NULL;

#ifdef SMP
        if (ntd->td_fairq_accum >= 0)
            set_cpu_contention_mask(gd);
        /* Reload mpheld (it becomes stale after mplock/token ops) */
        mpheld = MP_LOCK_HELD(gd);
        if (ntd->td_mpcount + ntd->td_xpcount && mpheld == 0) {
            lmsg = "mplock";
            laddr = ntd->td_mplock_stallpc;
        }
#endif

        /*
         * Coldpath - unable to schedule ntd, continue looking for threads
         * to schedule.  This is only allowed if the (presumably) kernel
         * thread exhausted its fair share.  A kernel thread stuck on
         * resources does not currently allow a user thread to get in
         * front of it.
         */
#ifdef SMP
        nquserok = ((ntd->td_pri < TDPRI_KERN_LPSCHED) ||
                    (ntd->td_fairq_accum < 0));
#else
        nquserok = 1;
#endif
        nlast = NULL;

        for (;;) {
            /*
             * If the fair-share scheduler ran out, ntd gets moved to the
             * end and its accumulator will be bumped; if it didn't, we
             * maintain the same queue position.
             *
             * nlast keeps track of the last element prior to any moves.
             */
            if (ntd->td_fairq_accum < 0) {
                lwkt_fairq_accumulate(gd, ntd);
                didaccumulate = 1;

                /*
                 * Move to end
                 */
                xtd = TAILQ_NEXT(ntd, td_threadq);
                TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
                TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);

                /*
                 * Set terminal element (nlast)
                 */
                if (nlast == NULL) {
                    nlast = ntd;
                    if (xtd == NULL)
                        xtd = ntd;
                }
                ntd = xtd;
            } else {
                ntd = TAILQ_NEXT(ntd, td_threadq);
            }

            /*
             * If we exhausted the run list, switch to the idle thread.
             * Since one or more threads had resource acquisition issues
             * we do not allow the idle thread to halt.
             *
             * NOTE: nlast can be NULL.
             */
            if (ntd == nlast) {
                cpu_pause();
                ntd = &gd->gd_idlethread;
                ntd->td_flags |= TDF_IDLE_NOHLT;
#ifdef SMP
                KKASSERT(ntd->td_xpcount == 0);
                if (ntd->td_mpcount) {
                    mpheld = MP_LOCK_HELD(gd);
                    if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
                        panic("Idle thread %p was holding the BGL!", ntd);
                    if (mpheld == 0) {
                        set_cpu_contention_mask(gd);
                        handle_cpu_contention_mask();
                        cpu_try_mplock();
                        mpheld = MP_LOCK_HELD(gd);
                        cpu_pause();
                        break;		/* try again from the top, almost */
                    }
                }
#endif

                /*
                 * If fairq accumulations occurred we do not schedule the
                 * idle thread.  This will cause us to try again from
                 * the (almost) top.
                 */
                if (didaccumulate)
                    break;		/* try again from the top, almost */
                if (lmsg)
                    strlcpy(cpu_time.cp_msg, lmsg, sizeof(cpu_time.cp_msg));
                cpu_time.cp_stallpc = (uintptr_t)laddr;
                goto haveidle;
            }

            /*
             * Try to switch to this thread.
             *
             * NOTE: For UP there is no mplock and lwkt_getalltokens()
             *       always succeeds.
             */
            if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok ||
                 user_pri_sched) && ntd->td_fairq_accum >= 0 &&
#ifdef SMP
                (ntd->td_mpcount + ntd->td_xpcount == 0 ||
                 mpheld || cpu_try_mplock()) &&
#endif
                (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr))
            ) {
#ifdef SMP
                clr_cpu_contention_mask(gd);
#endif
                goto havethread;
            }
#ifdef SMP
            if (ntd->td_fairq_accum >= 0)
                set_cpu_contention_mask(gd);
            /*
             * Reload mpheld (it becomes stale after mplock/token ops).
             */
            mpheld = MP_LOCK_HELD(gd);
            if (ntd->td_mpcount + ntd->td_xpcount && mpheld == 0) {
                lmsg = "mplock";
                laddr = ntd->td_mplock_stallpc;
            }
            if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0)
                nquserok = 0;
#endif
        }

        /*
         * All threads exhausted but we can loop due to a negative
         * accumulator.
         *
         * While we are looping in the scheduler be sure to service
         * any interrupts which were made pending due to our critical
         * section, otherwise we could livelock (e.g.) IPIs.
         *
         * NOTE: splz can enter and exit the mplock so mpheld is
         *       stale after this call.
         */
        splz_check();

#ifdef SMP
        /*
         * Our mplock can be cached and cause other cpus to livelock
         * if we loop due to e.g. not being able to acquire tokens.
         */
        if (MP_LOCK_HELD(gd))
            cpu_rel_mplock(gd->gd_cpuid);
        mpheld = 0;
#endif
    }

    /*
     * Do the actual switch.  WARNING: mpheld is stale here.
     *
     * We must always decrement td_fairq_accum on non-idle threads just
     * in case a thread never gets a tick due to being in a continuous
     * critical section.  The page-zeroing code does that.
     *
     * If the thread we came up with is a higher or equal priority versus
     * the thread at the head of the queue we move our thread to the
     * front.  This way we can always check the front of the queue.
     */
havethread:
    ++gd->gd_cnt.v_swtch;
    --ntd->td_fairq_accum;
    xtd = TAILQ_FIRST(&gd->gd_tdrunq);
    if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
        TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
        TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
    }
havethread_preempted:
    ;
    /*
     * If the new target does not need the MP lock and we are holding it,
     * release the MP lock.  If the new target requires the MP lock we have
     * already acquired it for the target.
     *
     * WARNING: mpheld is stale here.
     */
haveidle:
    KASSERT(ntd->td_critcount,
        ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
#ifdef SMP
    if (ntd->td_mpcount + ntd->td_xpcount == 0) {
        if (MP_LOCK_HELD(gd))
            cpu_rel_mplock(gd->gd_cpuid);
    } else {
        ASSERT_MP_LOCK_HELD(ntd);
    }
#endif
    if (td != ntd) {
        ++switch_count;
#ifdef __x86_64__
        {
            int tos_ok __debugvar = jg_tos_ok(ntd);
            KKASSERT(tos_ok);
        }
#endif
        KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
        td->td_switch(ntd);
    }
    /* NOTE: current cpu may have changed after switch */
    crit_exit_quick(td);
}
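
/*
 * Roughly, the selection order implemented above is: (1) resume a thread
 * we had previously preempted, otherwise (2) take the first runnable
 * thread (highest priority) whose fairq accumulator is non-negative and
 * whose MP lock / token requirements can be satisfied, otherwise (3) run
 * the idle thread.
 */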

/*
 * Request that the target thread preempt the current thread.  Preemption
 * only works under a specific set of conditions:
 *
 *	- We are not preempting ourselves
 *	- The target thread is owned by the current cpu
 *	- We are not currently being preempted
 *	- The target is not currently being preempted
 *	- We are not holding any spin locks
 *	- The target thread is not holding any tokens
 *	- We are able to satisfy the target's MP lock requirements (if any).
 *
 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION.  Typically
 * this is called via lwkt_schedule() through the td_preemptable callback.
 * critcount is the managed critical priority that we should ignore in order
 * to determine whether preemption is possible (aka usually just the crit
 * priority of lwkt_schedule() itself).
 *
 * XXX at the moment we run the target thread in a critical section during
 * the preemption in order to prevent the target from taking interrupts
 * that *WE* can't.  Preemption is strictly limited to interrupt threads
 * and interrupt-like threads, outside of a critical section, and the
 * preempted source thread will be resumed the instant the target blocks
 * whether or not the source is scheduled (i.e. preemption is supposed to
 * be as transparent as possible).
 *
 * The target thread inherits our MP count (added to its own) for the
 * duration of the preemption in order to preserve the atomicity of the
 * MP lock during the preemption.  Therefore, any preempting targets must be
 * careful with regard to MP assertions.  Note that the MP count may be
 * out of sync with the physical mp_lock, but we do not have to preserve
 * the original ownership of the lock if it was out of sync (that is, we
 * can leave it synchronized on return).
 */
void
lwkt_preempt(thread_t ntd, int critcount)
{
    struct globaldata *gd = mycpu;
    thread_t td;
#ifdef SMP
    int mpheld;
    int savecnt;
#endif

    /*
     * The caller has put us in a critical section.  We can only preempt
     * if the caller of the caller was not in a critical section (basically
     * a local interrupt), as determined by the 'critcount' parameter.  We
     * also can't preempt if the caller is holding any spinlocks (even if
     * he isn't in a critical section).  This also handles the tokens test.
     *
     * YYY The target thread must be in a critical section (else it must
     * inherit our critical section?  I dunno yet).
     *
     * Set need_lwkt_resched() unconditionally for now YYY.
     */
    KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));

    td = gd->gd_curthread;
    if (ntd->td_pri <= td->td_pri) {
        ++preempt_miss;
        return;
    }
    if (td->td_critcount > critcount) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
#ifdef SMP
    if (ntd->td_gd != gd) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
#endif
    /*
     * We don't have to check spinlocks here as they will also bump
     * td_critcount.
     *
     * Do not try to preempt if the target thread is holding any tokens.
     * We could try to acquire the tokens but this case is so rare there
     * is no need to support it.
     */
    KKASSERT(gd->gd_spinlocks_wr == 0);

    if (TD_TOKS_HELD(ntd)) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
    if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
        ++preempt_weird;
        need_lwkt_resched();
        return;
    }
    if (ntd->td_preempted) {
        ++preempt_hit;
        need_lwkt_resched();
        return;
    }
#ifdef SMP
    /*
     * NOTE: An interrupt might have occurred just as we were transitioning
     * to or from the MP lock.  In this case td_mpcount will be pre-disposed
     * (non-zero) but not actually synchronized with the mp_lock itself.
     * We can use it to imply an MP lock requirement for the preemption but
     * we cannot use it to test whether we hold the MP lock or not.
     */
    savecnt = td->td_mpcount;
    mpheld = MP_LOCK_HELD(gd);
    ntd->td_xpcount = td->td_mpcount + td->td_xpcount;
    if (mpheld == 0 && ntd->td_mpcount + ntd->td_xpcount && !cpu_try_mplock()) {
        ntd->td_xpcount = 0;
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
#endif

    /*
     * Since we are able to preempt the current thread, there is no need to
     * call need_lwkt_resched().
     */
    ++preempt_hit;
    ntd->td_preempted = td;
    td->td_flags |= TDF_PREEMPT_LOCK;
    KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
    td->td_switch(ntd);

    KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
#ifdef SMP
    KKASSERT(savecnt == td->td_mpcount);
    mpheld = MP_LOCK_HELD(gd);
    if (mpheld && td->td_mpcount == 0)
        cpu_rel_mplock(gd->gd_cpuid);
    else if (mpheld == 0 && td->td_mpcount + td->td_xpcount)
        panic("lwkt_preempt(): MP lock was not held through");
#endif
    ntd->td_preempted = NULL;
    td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
}

/*
 * Conditionally call splz() if gd_reqflags indicates work is pending.
 * This will work inside a critical section but not inside a hard code
 * section.
 *
 * (self contained on a per cpu basis)
 */
void
splz_check(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
        gd->gd_intr_nesting_level == 0 &&
        td->td_nest_count < 2)
    {
        splz();
    }
}

/*
 * This version is integrated into crit_exit; reqflags has already
 * been tested but td_critcount has not.
 *
 * We only want to execute the splz() on the 1->0 transition of
 * critcount and not in a hard code section or if too deeply nested.
 */
void
lwkt_maybe_splz(thread_t td)
{
    globaldata_t gd = td->td_gd;

    if (td->td_critcount == 0 &&
        gd->gd_intr_nesting_level == 0 &&
        td->td_nest_count < 2)
    {
        splz();
    }
}

/*
 * This function is used to negotiate a passive release of the current
 * process/lwp designation with the user scheduler, allowing the user
 * scheduler to schedule another user thread.  The related kernel thread
 * (curthread) continues running in the released state.
 */
void
lwkt_passive_release(struct thread *td)
{
    struct lwp *lp = td->td_lwp;

    td->td_release = NULL;
    lwkt_setpri_self(TDPRI_KERN_USER);
    lp->lwp_proc->p_usched->release_curproc(lp);
}

/*
 * This implements a normal yield.  This routine is virtually a nop if
 * there is nothing to yield to, but it will always run any pending interrupts
 * if called from a critical section.
 *
 * This yield is designed for kernel threads without a user context.
 *
 * (self contained on a per cpu basis)
 */
void
lwkt_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t xtd;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
        splz();
    if (td->td_fairq_accum < 0) {
        lwkt_schedule_self(curthread);
        lwkt_switch();
    } else {
        xtd = TAILQ_FIRST(&gd->gd_tdrunq);
        if (xtd && xtd->td_pri > td->td_pri) {
            lwkt_schedule_self(curthread);
            lwkt_switch();
        }
    }
}

/*
 * This yield is designed for kernel threads with a user context.
 *
 * The kernel acting on behalf of the user is potentially cpu-bound;
 * this function will efficiently allow other threads to run and also
 * switch to other processes by releasing.
 *
 * The lwkt_user_yield() function is designed to have very low overhead
 * if no yield is determined to be needed.
 */
void
lwkt_user_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    /*
     * Always run any pending interrupts in case we are in a critical
     * section.
     */
    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
        splz();

#ifdef SMP
    /*
     * XXX SEVERE TEMPORARY HACK.  A cpu-bound operation running in the
     * kernel can prevent other cpus from servicing interrupt threads
     * which still require the MP lock (which is a lot of them).  This
     * has a chaining effect since if the interrupt is blocked, so is
     * the event, so normal scheduling will not pick up on the problem.
     */
    if (cpu_contention_mask && td->td_mpcount + td->td_xpcount) {
        yield_mplock(td);
    }
#endif

    /*
     * Switch (which forces a release) if another kernel thread needs
     * the cpu, if userland wants us to resched, or if our kernel
     * quantum has run out.
     */
    if (lwkt_resched_wanted() ||
        user_resched_wanted() ||
        td->td_fairq_accum < 0)
    {
        lwkt_switch();
    }

#if 0
    /*
     * Reacquire the current process if we are released.
     *
     * XXX not implemented atm.  The kernel may be holding locks and such,
     * so we want the thread to continue to receive cpu.
     */
    if (td->td_release == NULL && lp) {
        lp->lwp_proc->p_usched->acquire_curproc(lp);
        td->td_release = lwkt_passive_release;
        lwkt_setpri_self(TDPRI_USER_NORM);
    }
#endif
}
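
/*
 * Usage sketch (illustrative; more_work()/do_chunk() are placeholders):
 * a cpu-bound kernel loop calls lwkt_yield() -- or lwkt_user_yield() when
 * running on behalf of a user process -- every so often so that other
 * LWKTs and pending interrupts get serviced:
 *
 *	while (more_work()) {
 *		do_chunk();
 *		lwkt_yield();
 *	}
 */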

/*
 * Generic schedule.  Possibly schedule threads belonging to other cpus and
 * deal with threads that might be blocked on a wait queue.
 *
 * We have a little helper inline function which does additional work after
 * the thread has been enqueued, including dealing with preemption and
 * setting need_lwkt_resched() (which prevents the kernel from returning
 * to userland until it has processed higher priority threads).
 *
 * It is possible for this routine to be called after a failed _enqueue
 * (due to the target thread migrating, sleeping, or otherwise blocked).
 * We have to check that the thread is actually on the run queue!
 *
 * reschedok is an optimized constant propagated from lwkt_schedule() or
 * lwkt_schedule_noresched().  By default it is non-zero, causing a
 * reschedule to be requested if the target thread has a higher priority.
 * The port messaging code will set MSG_NORESCHED and cause reschedok to
 * be 0, preventing undesired reschedules.
 */
static __inline
void
_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok)
{
    thread_t otd;

    if (ntd->td_flags & TDF_RUNQ) {
        if (ntd->td_preemptable && reschedok) {
            ntd->td_preemptable(ntd, ccount);	/* YYY +token */
        } else if (reschedok) {
            otd = curthread;
            if (ntd->td_pri > otd->td_pri)
                need_lwkt_resched();
        }

        /*
         * Give the thread a little fair share scheduler bump if it
         * has been asleep for a while.  This is primarily to avoid
         * a degenerate case for interrupt threads where the accumulator
         * crosses into negative territory unnecessarily.
         */
        if (ntd->td_fairq_lticks != ticks) {
            ntd->td_fairq_lticks = ticks;
            ntd->td_fairq_accum += gd->gd_fairq_total_pri;
            if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd))
                ntd->td_fairq_accum = TDFAIRQ_MAX(gd);
        }
    }
}

static __inline
void
_lwkt_schedule(thread_t td, int reschedok)
{
    globaldata_t mygd = mycpu;

    KASSERT(td != &td->td_gd->gd_idlethread,
        ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
    crit_enter_gd(mygd);
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    if (td == mygd->gd_curthread) {
        _lwkt_enqueue(td);
    } else {
        /*
         * If we own the thread, there is no race (since we are in a
         * critical section).  If we do not own the thread there might
         * be a race but the target cpu will deal with it.
         */
#ifdef SMP
        if (td->td_gd == mygd) {
            _lwkt_enqueue(td);
            _lwkt_schedule_post(mygd, td, 1, reschedok);
        } else {
            lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
        }
#else
        _lwkt_enqueue(td);
        _lwkt_schedule_post(mygd, td, 1, reschedok);
#endif
    }
    crit_exit_gd(mygd);
}

void
lwkt_schedule(thread_t td)
{
    _lwkt_schedule(td, 1);
}

void
lwkt_schedule_noresched(thread_t td)
{
    _lwkt_schedule(td, 0);
}

#ifdef SMP

/*
 * When scheduled remotely, if frame != NULL the IPIQ is being
 * run via doreti or an interrupt and preemption can be allowed.
 *
 * To allow preemption we have to drop the critical section so only
 * one is present in _lwkt_schedule_post.
 */
static void
lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame)
{
    thread_t td = curthread;
    thread_t ntd = arg;

    if (frame && ntd->td_preemptable) {
        crit_exit_noyield(td);
        _lwkt_schedule(ntd, 1);
        crit_enter_quick(td);
    } else {
        _lwkt_schedule(ntd, 1);
    }
}
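
/*
 * Note that a foreign cpu's run queue is never manipulated directly:
 * _lwkt_schedule() sends an IPI and lwkt_schedule_remote() performs the
 * actual enqueue on the owning cpu, under that cpu's own critical
 * section, consistent with the per-cpu design described at the top of
 * this file.
 */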

/*
 * Thread migration using a 'Pull' method.  The thread may or may not be
 * the current thread.  It MUST be descheduled and in a stable state.
 * lwkt_giveaway() must be called on the cpu owning the thread.
 *
 * At any point after lwkt_giveaway() is called, the target cpu may
 * 'pull' the thread by calling lwkt_acquire().
 *
 * We have to make sure the thread is not sitting on a per-cpu tsleep
 * queue or it will blow up when it moves to another cpu.
 *
 * MPSAFE - must be called under very specific conditions.
 */
void
lwkt_giveaway(thread_t td)
{
    globaldata_t gd = mycpu;

    crit_enter_gd(gd);
    if (td->td_flags & TDF_TSLEEPQ)
        tsleep_remove(td);
    KKASSERT(td->td_gd == gd);
    TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
    td->td_flags |= TDF_MIGRATING;
    crit_exit_gd(gd);
}

void
lwkt_acquire(thread_t td)
{
    globaldata_t gd;
    globaldata_t mygd;

    KKASSERT(td->td_flags & TDF_MIGRATING);
    gd = td->td_gd;
    mygd = mycpu;
    if (gd != mycpu) {
        cpu_lfence();
        KKASSERT((td->td_flags & TDF_RUNQ) == 0);
        crit_enter_gd(mygd);
        while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
            lwkt_process_ipiq();
#endif
            cpu_lfence();
        }
        td->td_gd = mygd;
        TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
        td->td_flags &= ~TDF_MIGRATING;
        crit_exit_gd(mygd);
    } else {
        crit_enter_gd(mygd);
        TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
        td->td_flags &= ~TDF_MIGRATING;
        crit_exit_gd(mygd);
    }
}

#endif

/*
 * Generic deschedule.  Descheduling threads other than your own should be
 * done only in carefully controlled circumstances.  Descheduling is
 * asynchronous.
 *
 * This function may block if the cpu has run out of messages.
 */
void
lwkt_deschedule(thread_t td)
{
    crit_enter();
#ifdef SMP
    if (td == curthread) {
        _lwkt_dequeue(td);
    } else {
        if (td->td_gd == mycpu) {
            _lwkt_dequeue(td);
        } else {
            lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
        }
    }
#else
    _lwkt_dequeue(td);
#endif
    crit_exit();
}

/*
 * Set the target thread's priority.  This routine does not automatically
 * switch to a higher priority thread; LWKT threads are not designed for
 * continuous priority changes.  Yield if you want to switch.
 */
void
lwkt_setpri(thread_t td, int pri)
{
    KKASSERT(td->td_gd == mycpu);
    if (td->td_pri != pri) {
        KKASSERT(pri >= 0);
        crit_enter();
        if (td->td_flags & TDF_RUNQ) {
            _lwkt_dequeue(td);
            td->td_pri = pri;
            _lwkt_enqueue(td);
        } else {
            td->td_pri = pri;
        }
        crit_exit();
    }
}

/*
 * Set the initial priority for a thread prior to it being scheduled for
 * the first time.  The thread MUST NOT be scheduled before or during
 * this call.  The thread may be assigned to a cpu other than the current
 * cpu.
 *
 * Typically used after a thread has been created with TDF_STOPREQ,
 * and before the thread is initially scheduled.
 */
void
lwkt_setpri_initial(thread_t td, int pri)
{
    KKASSERT(pri >= 0);
    KKASSERT((td->td_flags & TDF_RUNQ) == 0);
    td->td_pri = pri;
}

void
lwkt_setpri_self(int pri)
{
    thread_t td = curthread;

    KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
    crit_enter();
    if (td->td_flags & TDF_RUNQ) {
        _lwkt_dequeue(td);
        td->td_pri = pri;
        _lwkt_enqueue(td);
    } else {
        td->td_pri = pri;
    }
    crit_exit();
}

/*
 * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle.
 *
 * Example: two competing threads, same priority N.  Decrement by (2*N),
 * increment by N*8; each thread will get 4 ticks.
 */
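/*
 * Spelling out the example above: with two runnable threads of priority N,
 * gd_fairq_total_pri is 2*N.  lwkt_fairq_accumulate() below bumps a thread
 * by N * TDFAIRQ_SCALE = 8*N, and each scheduler tick subtracts 2*N from
 * the running thread, so the accumulator runs dry after roughly
 * 8N / 2N = 4 ticks and need_lwkt_resched() is asserted.
 */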
void
lwkt_fairq_schedulerclock(thread_t td)
{
    if (fairq_enable) {
        while (td) {
            if (td != &td->td_gd->gd_idlethread) {
                td->td_fairq_accum -= td->td_gd->gd_fairq_total_pri;
                if (td->td_fairq_accum < -TDFAIRQ_MAX(td->td_gd))
                    td->td_fairq_accum = -TDFAIRQ_MAX(td->td_gd);
                if (td->td_fairq_accum < 0)
                    need_lwkt_resched();
                td->td_fairq_lticks = ticks;
            }
            td = td->td_preempted;
        }
    }
}

static void
lwkt_fairq_accumulate(globaldata_t gd, thread_t td)
{
    td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE;
    if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd))
        td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
}

/*
 * Migrate the current thread to the specified cpu.
 *
 * This is accomplished by descheduling ourselves from the current cpu,
 * moving our thread to the tdallq of the target cpu, IPI messaging the
 * target cpu, and switching out.  TDF_MIGRATING prevents scheduling
 * races while the thread is being migrated.
 *
 * We must be sure to remove ourselves from the current cpu's tsleepq
 * before potentially moving to another queue.  The thread can be on
 * a tsleepq due to a left-over tsleep_interlock().
 */
#ifdef SMP
static void lwkt_setcpu_remote(void *arg);
#endif

void
lwkt_setcpu_self(globaldata_t rgd)
{
#ifdef SMP
    thread_t td = curthread;

    if (td->td_gd != rgd) {
        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ)
            tsleep_remove(td);
        td->td_flags |= TDF_MIGRATING;
        lwkt_deschedule_self(td);
        TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
        lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
        lwkt_switch();
        /* we are now on the target cpu */
        TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
        crit_exit_quick(td);
    }
#endif
}

void
lwkt_migratecpu(int cpuid)
{
#ifdef SMP
    globaldata_t rgd;

    rgd = globaldata_find(cpuid);
    lwkt_setcpu_self(rgd);
#endif
}
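
/*
 * For example, lwkt_migratecpu(0) moves the calling thread to cpu 0; when
 * the call returns the thread is executing on the target cpu and has been
 * re-added to that cpu's tdallq.
 */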

/*
 * Remote IPI for cpu migration (called while in a critical section so we
 * do not have to enter another one).  The thread has already been moved to
 * our cpu's allq, but we must wait for the thread to be completely switched
 * out on the originating cpu before we schedule it on ours or the stack
 * state may be corrupt.  We clear TDF_MIGRATING after flushing the GD
 * change to main memory.
 *
 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
 * against wakeups.  It is best if this interface is used only when there
 * are no pending events that might try to schedule the thread.
 */
#ifdef SMP
static void
lwkt_setcpu_remote(void *arg)
{
    thread_t td = arg;
    globaldata_t gd = mycpu;

    while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
        lwkt_process_ipiq();
#endif
        cpu_lfence();
    }
    td->td_gd = gd;
    cpu_sfence();
    td->td_flags &= ~TDF_MIGRATING;
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
}
#endif

struct lwp *
lwkt_preempted_proc(void)
{
    thread_t td = curthread;
    while (td->td_preempted)
        td = td->td_preempted;
    return(td->td_lwp);
}

/*
 * Create a kernel process/thread/whatever.  It shares its address space
 * with proc0, i.e. kernel only.
 *
 * NOTE! By default new threads are created with the MP lock held.  A
 * thread which does not require the MP lock should release it by calling
 * rel_mplock() at the start of the new thread.
 */
int
lwkt_create(void (*func)(void *), void *arg, struct thread **tdp,
            thread_t template, int tdflags, int cpu, const char *fmt, ...)
{
    thread_t td;
    __va_list ap;

    td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
                           tdflags);
    if (tdp)
        *tdp = td;
    cpu_set_thread_handler(td, lwkt_exit, func, arg);

    /*
     * Set up arg0 for 'ps' etc
     */
    __va_start(ap, fmt);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
    __va_end(ap);

    /*
     * Schedule the thread to run
     */
    if ((td->td_flags & TDF_STOPREQ) == 0)
        lwkt_schedule(td);
    else
        td->td_flags &= ~TDF_STOPREQ;
    return 0;
}
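
/*
 * Illustrative call (my_kthread_main is a placeholder): create a kernel
 * thread on the current cpu (cpu == -1) with default flags.  The thread
 * is scheduled immediately unless TDF_STOPREQ was requested, and
 * lwkt_exit() runs if func ever returns, since it is installed as the
 * return handler by cpu_set_thread_handler() above.
 *
 *	thread_t td;
 *
 *	lwkt_create(my_kthread_main, NULL, &td, NULL, 0, -1, "mykthread");
 */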
We 1680 * can take advantage of this by not inlining medium-complexity 1681 * functions and we can also reduce the branch prediction impact 1682 * by collapsing perfectly predictable branches into a single 1683 * procedure instead of duplicating it. 1684 * 1685 * Is any of this noticeable? Probably not, so I'll take the 1686 * smaller code size. 1687 */ 1688 void 1689 crit_exit_wrapper(__DEBUG_CRIT_ARG__) 1690 { 1691 _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__); 1692 } 1693 1694 void 1695 crit_panic(void) 1696 { 1697 thread_t td = curthread; 1698 int lcrit = td->td_critcount; 1699 1700 td->td_critcount = 0; 1701 panic("td_critcount is/would-go negative! %p %d", td, lcrit); 1702 /* NOT REACHED */ 1703 } 1704 1705 #ifdef SMP 1706 1707 /* 1708 * Called from debugger/panic on cpus which have been stopped. We must still 1709 * process the IPIQ while stopped, even if we were stopped while in a critical 1710 * section (XXX). 1711 * 1712 * If we are dumping also try to process any pending interrupts. This may 1713 * or may not work depending on the state of the cpu at the point it was 1714 * stopped. 1715 */ 1716 void 1717 lwkt_smp_stopped(void) 1718 { 1719 globaldata_t gd = mycpu; 1720 1721 crit_enter_gd(gd); 1722 if (dumping) { 1723 lwkt_process_ipiq(); 1724 splz(); 1725 } else { 1726 lwkt_process_ipiq(); 1727 } 1728 crit_exit_gd(gd); 1729 } 1730 1731 #endif 1732