/*	$NetBSD: kern_runq.c,v 1.71 2025/01/17 04:11:33 mrg Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.71 2025/01/17 04:11:33 mrg Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

const int	schedppq = 1;

static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
#endif

/*
 * Preemption control.
 */
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;	/* Cache hotness time */
static u_int	min_catch;	/* Minimal LWP count for catching */
static u_int	skim_interval;	/* Rate limit for stealing LWPs */

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Pulling from remote packages, LWP must not have run for 10ms. */
	cacheht_time = 10;

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Steal from other CPUs at most every 10ms. */
	skim_interval = 10;
}

void
sched_cpuattach(struct cpu_info *ci)
{
	struct schedstate_percpu *spc;
	size_t size;
	void *p;
	u_int i;

	spc = &ci->ci_schedstate;
	spc->spc_nextpkg = ci;

	if (spc->spc_lwplock == NULL) {
		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = spc->spc_lwplock;
	}
	if (spc->spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
	    coherency_unit;
	p = kmem_alloc(size, KM_SLEEP);
	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);

	/* Initialize run queues */
	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_COUNT; i++)
		TAILQ_INIT(&spc->spc_queue[i]);
}

/*
 * Control of the runqueue.
 */
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return &spc->spc_queue[prio];
}

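/*
 * Illustration of the bitmap encoding used by the enqueue/dequeue code
 * below (derived from that code; nothing here is normative): each
 * spc_bitmap word covers BITMAP_BITS (32) priorities, with higher
 * priorities stored toward the most significant bit.  For example,
 * priority 40 lives in word 40 >> BITMAP_SHIFT == 1, at mask
 * BITMAP_MSB >> (40 & BITMAP_MASK) == 0x00800000.  Because of this
 * reversed layout, ffs() on a word finds the highest queued priority in
 * it: with priorities 35 and 40 queued, the word holds 0x10800000,
 * ffs() returns 24, and (1 << BITMAP_SHIFT) + (BITMAP_BITS - 24)
 * recovers 40, which is how sched_dequeue() refreshes spc_maxpriority.
 */
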
/*
 * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
 * l_cpu.
 */
void
sched_enqueue(struct lwp *l)
{
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Enqueue the thread */
	q_head = sched_getrq(spc, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) == 0);
		spc->spc_bitmap[i] |= q;
	}

	/*
	 * Determine run queue position according to POSIX.  XXX Explicitly
	 * lowering a thread's priority with pthread_setschedparam() is not
	 * handled.
	 */
	if ((l->l_pflag & LP_PREEMPTING) != 0) {
		switch (l->l_class) {
		case SCHED_OTHER:
			TAILQ_INSERT_TAIL(q_head, l, l_runq);
			break;
		case SCHED_FIFO:
			TAILQ_INSERT_HEAD(q_head, l, l_runq);
			break;
		case SCHED_RR:
			if (getticks() - l->l_rticks >= sched_rrticks) {
				TAILQ_INSERT_TAIL(q_head, l, l_runq);
			} else {
				TAILQ_INSERT_HEAD(q_head, l, l_runq);
			}
			break;
		default:
			panic("sched_enqueue: LWP %p has class %d\n",
			    l, l->l_class);
		}
	} else {
		TAILQ_INSERT_TAIL(q_head, l, l_runq);
	}
	spc->spc_flags &= ~SPCF_IDLE;
	spc->spc_count++;
	if ((l->l_pflag & LP_BOUND) == 0) {
		atomic_store_relaxed(&spc->spc_mcount,
		    atomic_load_relaxed(&spc->spc_mcount) + 1);
	}

	/*
	 * Update the value of highest priority in the runqueue,
	 * if priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);
}

/*
 * Remove an LWP from the run queue it's on.  The LWP must be in state
 * LSRUN.
 */
void
sched_dequeue(struct lwp *l)
{
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;

	KASSERT(lwp_locked(l, spc->spc_mutex));
	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(spc->spc_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	spc->spc_count--;
	if ((l->l_pflag & LP_BOUND) == 0) {
		atomic_store_relaxed(&spc->spc_mcount,
		    atomic_load_relaxed(&spc->spc_mcount) - 1);
	}

	q_head = sched_getrq(spc, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) != 0);
		spc->spc_bitmap[i] &= ~q;

		/*
		 * Update the value of highest priority in the runqueue, in
		 * case it was the last thread in the queue of highest
		 * priority.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (spc->spc_bitmap[i] != 0) {
				q = ffs(spc->spc_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

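/*
 * A note on how sched_resched_cpu() below posts its request (a reading
 * of the code, offered for orientation): RESCHED_* bits are merged into
 * ci_want_resched with a CAS loop and are only ever escalated.  The
 * loop gives up without calling cpu_need_resched() when the bits
 * already posted, masked with RESCHED_KPREEMPT|RESCHED_UPREEMPT,
 * compare greater than or equal to the new request -- which relies on
 * RESCHED_KPREEMPT comparing greater than RESCHED_UPREEMPT under that
 * mask.
 */
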
/*
 * Cause a preemption on the given CPU, if the priority "pri" is higher
 * than that of the running LWP.  If "unlock" is specified, and ideally it
 * will be for concurrency reasons, spc_mutex will be dropped before return.
 */
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
	struct schedstate_percpu *spc;
	u_int o, n, f;
	lwp_t *l;

	spc = &ci->ci_schedstate;

	KASSERT(mutex_owned(spc->spc_mutex));

	/*
	 * If the priority level we're evaluating wouldn't cause a new LWP
	 * to be run on the CPU, then we have nothing to do.
	 */
	if (pri <= spc->spc_curpriority || !mp_online) {
		if (__predict_true(unlock)) {
			spc_unlock(ci);
		}
		return;
	}

	/*
	 * Figure out what kind of preemption we should do.
	 */
	l = ci->ci_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		f = RESCHED_IDLE | RESCHED_UPREEMPT;
	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
		/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
		f = RESCHED_KPREEMPT;
#else
		/* Leave door open for test: set kpreempt_pri with sysctl. */
		f = RESCHED_UPREEMPT;
#endif
		/*
		 * l_dopreempt must be set with the CPU locked to sync with
		 * mi_switch().  It must also be set with an atomic to sync
		 * with kpreempt().
		 */
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
	} else {
		f = RESCHED_UPREEMPT;
	}
	if (ci != curcpu()) {
		f |= RESCHED_REMOTE;
	}

	/*
	 * Things can start as soon as ci_want_resched is touched: x86 has
	 * an instruction that monitors the memory cell it's in.  Drop the
	 * schedstate lock in advance, otherwise the remote CPU can awaken
	 * and immediately block on the lock.
	 */
	if (__predict_true(unlock)) {
		spc_unlock(ci);
	}

	/*
	 * The caller almost always has a second scheduler lock held: either
	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
	 * keeps preemption disabled, which among other things ensures all
	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
	 */
	KASSERT(kpreempt_disabled());

	for (o = 0;; o = n) {
		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
		if (__predict_true(o == n)) {
			/*
			 * We're the first to set a resched on the CPU.  Try
			 * to avoid causing a needless trip through trap()
			 * to handle an AST fault, if it's known the LWP
			 * will either block or go through userret() soon.
			 */
			if (l != curlwp || cpu_intr_p()) {
				cpu_need_resched(ci, l, f);
			}
			break;
		}
		if (__predict_true(
		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
			/* Already in progress, nothing to do. */
			break;
		}
	}
}

/*
 * Cause a preemption on the given CPU, if the priority of LWP "l" in state
 * LSRUN is higher than that of the running LWP.  If "unlock" is specified,
 * and ideally it will be for concurrency reasons, spc_mutex will be dropped
 * before return.
 */
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
	KASSERT(l->l_stat == LSRUN);

	sched_resched_cpu(ci, lwp_eprio(l), unlock);
}

/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/*
 * Estimate if LWP is cache-hot.
 */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	/* Leave new LWPs in peace, determination has already been made. */
	if (l->l_stat == LSIDL)
		return true;

	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
		return false;

	return (getticks() - l->l_rticks < mstohz(cacheht_time));
}

/*
 * Check if LWP can migrate to the chosen CPU.
 */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * A small helper to do round robin through CPU packages.
 */
static struct cpu_info *
sched_nextpkg(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	spc->spc_nextpkg =
	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];

	return spc->spc_nextpkg;
}

/*
 * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
 * thread.  In case of equal priority, prefer first class CPUs, and amongst
 * the remainder choose the CPU with the fewest runqueue entries.
 *
 * Begin the search in the CPU package which "pivot" is a member of.
 */
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
	struct cpu_info *bestci, *curci, *outer;
	struct schedstate_percpu *bestspc, *curspc;
	pri_t bestpri, curpri;

	/*
	 * If this fails (it shouldn't), run on the given CPU.  This also
	 * gives us a weak preference for "pivot" to begin with.
	 */
	bestci = pivot;
	bestspc = &bestci->ci_schedstate;
	if (sched_migratable(l, bestci)) {
		bestpri = MAX(bestspc->spc_curpriority,
		    bestspc->spc_maxpriority);
	} else {
		/* Invalidate the priority. */
		bestpri = PRI_COUNT;
	}

	/* In the outer loop scroll through all CPU packages. */
	pivot = pivot->ci_package1st;
	outer = pivot;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		curci = outer;
		do {
			if (!sched_migratable(l, curci)) {
				continue;
			}

			curspc = &curci->ci_schedstate;

			/* If this CPU is idle and 1st class, we're done. */
			if (cpu_is_idle_1stclass(curci)) {
				return curci;
			}

			curpri = MAX(curspc->spc_curpriority,
			    curspc->spc_maxpriority);

			if (curpri > bestpri) {
				continue;
			}
			if (curpri == bestpri) {
				/* Prefer first class CPUs over others. */
				if (cpu_is_better(bestci, curci)) {
					continue;
				}
				/*
				 * Pick the least busy CPU.  Make sure this
				 * is not <=, otherwise it defeats the above
				 * preference.
				 */
				if (bestspc->spc_count < curspc->spc_count) {
					continue;
				}
			}

			bestpri = curpri;
			bestci = curci;
			bestspc = curspc;

		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
		    curci != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != pivot);

	return bestci;
}

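/*
 * An informal summary of the topology lists walked above and below,
 * inferred from their use in this file: each ci_sibling[] entry is a
 * circular list, so the do/while loops stop when they come back around
 * to their starting CPU.  CPUREL_CORE links the SMT siblings within a
 * core, CPUREL_PACKAGE links the CPUs within a package, and
 * CPUREL_PACKAGE1ST links the first CPU of each package, which is why
 * the nested loops in sched_bestcpu() and sched_idle() cover the whole
 * system.
 */
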
/*
 * Estimate whether the LWP should migrate to another CPU; choose and
 * return the CPU to run it on.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci, *curci, *tci;
	pri_t eprio;
	int flags;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	eprio = lwp_eprio(l);

	/*
	 * Handle new LWPs.  For vfork() with a timeshared child, make it
	 * run on the same CPU as the parent if no other LWPs in queue.
	 * Otherwise scatter far and wide - try for an even distribution
	 * across all CPU packages and CPUs.
	 */
	if (l->l_stat == LSIDL) {
		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
			if (sched_migratable(l, curlwp->l_cpu) && eprio >
			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
				return curlwp->l_cpu;
			}
		} else {
			return sched_bestcpu(l, sched_nextpkg());
		}
		flags = SPCF_IDLE;
	} else {
		flags = SPCF_IDLE | SPCF_1STCLASS;
	}

	/*
	 * Try to send the LWP back to the first CPU in the same core if
	 * idle.  This keeps LWPs clustered in the run queues of 1st class
	 * CPUs.  This implies stickiness.  If we didn't find a home for
	 * a vfork() child above, try to use any SMT sibling to help out.
	 */
	tci = ci;
	do {
		if (cpu_is_type(tci, flags) && sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != ci);

	/*
	 * Otherwise the LWP is "sticky", i.e. generally preferring to stay
	 * on the same CPU.
	 */
	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
		return ci;
	}

	/*
	 * If the current CPU core is idle, run there and avoid the
	 * expensive scan of CPUs below.
	 */
	curci = curcpu();
	tci = curci;
	do {
		if (cpu_is_type(tci, flags) && sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != curci);

	/*
	 * Didn't find a new home above - happens infrequently.  Start the
	 * search in the last CPU package that the LWP ran in, but expand to
	 * include the whole system if needed.
	 */
	return sched_bestcpu(l, l->l_cpu);
}

/*
 * Try to catch an LWP from the run queue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;
	bool gentle;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;

	/*
	 * Be more aggressive if this CPU is first class, and the other
	 * is not.
	 */
	gentle = cpu_is_better(curci, ci);

	if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
	    curspc->spc_psid != spc->spc_psid) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) ||
		    (gentle && lwp_cache_hot(l)) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			/* XXX Gap: could walk down priority list. */
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);
		l->l_cpu = curci;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Called from sched_idle() to handle migration.  Return the CPU that we
 * pushed the LWP to (may be NULL).
 */
static struct cpu_info *
sched_idle_migrate(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	bool dlock = false;

	spc = &ci->ci_schedstate;
	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l);
		sched_resched_lwp(l, true);
		/* tci now unlocked */
		spc_unlock(ci);
		return tci;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);
	return NULL;
}

/*
 * Try to steal an LWP from "tci".
 */
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
	struct schedstate_percpu *spc, *tspc;
	lwp_t *l;

	spc = &ci->ci_schedstate;
	tspc = &tci->ci_schedstate;
	if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
	    spc->spc_psid == tspc->spc_psid) {
		spc_dlock(ci, tci);
		l = sched_catchlwp(tci);
		spc_unlock(ci);
		if (l != NULL) {
			return true;
		}
	}
	return false;
}

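/*
 * A note on the double-locking above (an observation, not a rule stated
 * elsewhere in this file): when two run queue locks must be held at
 * once, sched_idle_migrate() takes them in a fixed order based on the
 * CPU pointers, falling back to mutex_tryenter() plus a full release
 * and reacquire when it would otherwise lock "backwards".  spc_dlock()
 * as used by sched_steal() and sched_idle() is assumed to apply the
 * same kind of ordering, which is what keeps two CPUs stealing from
 * each other from deadlocking.
 */
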
/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
	struct schedstate_percpu *spc, *tspc;
	struct lwp *l;

	ci = curcpu();
	spc = &ci->ci_schedstate;
	tci = NULL;
	mci = NULL;

	/*
	 * Handle LWP migrations off this CPU to another.  If there is a
	 * migration to do then remember the CPU the LWP was sent to, and
	 * don't steal the LWP back from that CPU below.
	 */
	if (spc->spc_migrating != NULL) {
		mci = sched_idle_migrate();
	}

	/* If this CPU is offline, or we have an LWP to run, we're done. */
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
		return;
	}

	/* Deal with SMT. */
	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
		/* Try to help our siblings out. */
		tci = ci->ci_sibling[CPUREL_CORE];
		while (tci != ci) {
			if (tci != mci && sched_steal(ci, tci)) {
				return;
			}
			tci = tci->ci_sibling[CPUREL_CORE];
		}
		/*
		 * If not the first SMT in the core, and in the default
		 * processor set, the search ends here.
		 */
		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
		    spc->spc_psid == PS_NONE) {
			return;
		}
	}

	/*
	 * Find something to run, unless this CPU exceeded the rate limit.
	 * Start looking on the current package to maximise L2/L3 cache
	 * locality.  Then expand to looking at the rest of the system.
	 *
	 * XXX Should probably look at 2nd class CPUs first, but they will
	 * shed jobs via preempt() anyway.
	 */
	if (spc->spc_nextskim > getticks()) {
		return;
	}
	spc->spc_nextskim = getticks() + mstohz(skim_interval);

	/* In the outer loop scroll through all CPU packages, starting here. */
	first = ci->ci_package1st;
	outer = first;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		inner = outer;
		do {
			/* Don't hit the locks unless needed. */
			tspc = &inner->ci_schedstate;
			if (ci == inner || ci == mci ||
			    spc->spc_psid != tspc->spc_psid ||
			    atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
				continue;
			}
			spc_dlock(ci, inner);
			l = sched_catchlwp(inner);
			spc_unlock(ci);
			if (l != NULL) {
				/* Got it! */
				return;
			}
		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
		    inner != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != first);
}

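/*
 * How the pieces below fit together for a vfork() child (a summary of
 * code elsewhere in this file, kept here for orientation): sched_takecpu()
 * initially parks a timeshared vfork() child near its parent;
 * sched_vforkexec() sets LP_TELEPORT and calls preempt() when the child
 * execve()s and needs to move; and sched_preempted() then notices
 * LP_TELEPORT and scatters the LWP with sched_bestcpu(l, sched_nextpkg()),
 * matching the distribution other new LWPs get at LSIDL time.
 */
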
/*
 * Called from mi_switch() when an LWP has been preempted / has yielded.
 * The LWP is presently in the CPU's run queue.  Here we look for a better
 * CPU to teleport the LWP to; there may not be one.
 */
void
sched_preempted(struct lwp *l)
{
	struct schedstate_percpu *tspc;
	struct cpu_info *ci, *tci;

	ci = l->l_cpu;
	tspc = &ci->ci_schedstate;

	KASSERT(tspc->spc_count >= 1);

	/*
	 * Try to select another CPU if:
	 *
	 * - there is no migration pending already
	 * - and this LWP is running on a 2nd class CPU
	 * - or this LWP is a child of vfork() that has just done execve()
	 */
	if (l->l_target_cpu != NULL ||
	    (cpu_is_1stclass(ci) &&
	     (l->l_pflag & LP_TELEPORT) == 0)) {
		return;
	}

	/*
	 * Fast path: if the first SMT in the core is idle, send it back
	 * there, because the cache is shared (cheap) and we want all LWPs
	 * to be clustered on 1st class CPUs (either running there or on
	 * their runqueues).
	 */
	tci = ci->ci_sibling[CPUREL_CORE];
	while (tci != ci) {
		tspc = &tci->ci_schedstate;
		if (cpu_is_idle_1stclass(tci) && sched_migratable(l, tci)) {
			l->l_target_cpu = tci;
			l->l_pflag &= ~LP_TELEPORT;
			return;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	}

	if ((l->l_pflag & LP_TELEPORT) != 0) {
		/*
		 * A child of vfork(): now that the parent is released,
		 * scatter far and wide, to match the LSIDL distribution
		 * done in sched_takecpu().
		 */
		l->l_pflag &= ~LP_TELEPORT;
		tci = sched_bestcpu(l, sched_nextpkg());
		if (tci != ci) {
			l->l_target_cpu = tci;
		}
	} else {
		/*
		 * Try to find a better CPU to take it, but don't move to
		 * another 2nd class CPU, and don't move to a non-idle CPU,
		 * because that would prevent SMT being used to maximise
		 * throughput.
		 *
		 * Search in the current CPU package in order to try and
		 * keep L2/L3 cache locality, but expand to include the
		 * whole system if needed.
		 */
		tci = sched_bestcpu(l, l->l_cpu);
		if (tci != ci && cpu_is_idle_1stclass(tci)) {
			l->l_target_cpu = tci;
		}
	}
}

/*
 * Called during execve() by a child of vfork().  Does two things:
 *
 * - If the parent has been awoken and put back on curcpu then give the
 *   CPU back to the parent.
 *
 * - If curlwp is not on a 1st class CPU then find somewhere else to run,
 *   since it dodged the distribution in sched_takecpu() when first set
 *   runnable.
 */
void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
	if ((samecpu && ncpu > 1) || !cpu_is_1stclass(l->l_cpu)) {
		l->l_pflag |= LP_TELEPORT;
		preempt();
	}
}

#else

/*
 * stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}

void
sched_preempted(struct lwp *l)
{

}

void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
}

#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Flag the thread as CPU-bound if its summed run time exceeds its
	 * summed sleep time, but only report it as batch if it was already
	 * flagged CPU-bound on the previous check.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;

	/* Update the last run time on switch */
	l = curlwp;
	l->l_rticksum += (getticks() - l->l_rticks);

	/* Return to the idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;

	/* Return to the idle LWP if there is no runnable job */
	if (__predict_false(spc->spc_count == 0))
		return NULL;

	/* Take the highest priority thread */
	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = getticks();

	return l;
}

/*
 * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
 */
bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
	rv |= (ci->ci_data.cpu_softints != 0);
#endif
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ms)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "skim_interval",
		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
		NULL, 0, &skim_interval, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

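/*
 * The tunables above land under kern.sched (the "sched" node is created
 * beneath CTL_KERN), so on a running system they can be inspected and
 * adjusted with sysctl(8), e.g. (illustrative values, not
 * recommendations):
 *
 *	sysctl kern.sched.skim_interval
 *	sysctl -w kern.sched.min_catch=2
 */
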
1180 */ 1181 1182 #ifdef DDB 1183 1184 void 1185 sched_print_runqueue(void (*pr)(const char *, ...)) 1186 { 1187 struct cpu_info *ci, *tci; 1188 struct schedstate_percpu *spc; 1189 struct lwp *l; 1190 struct proc *p; 1191 CPU_INFO_ITERATOR cii; 1192 1193 for (CPU_INFO_FOREACH(cii, ci)) { 1194 int i; 1195 1196 spc = &ci->ci_schedstate; 1197 1198 (*pr)("Run-queue (CPU = %u):\n", ci->ci_index); 1199 (*pr)(" pid.lid = %d.%d, r_count = %u, " 1200 "maxpri = %d, mlwp = %p\n", 1201 #ifdef MULTIPROCESSOR 1202 ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid, 1203 #else 1204 curlwp->l_proc->p_pid, curlwp->l_lid, 1205 #endif 1206 spc->spc_count, spc->spc_maxpriority, 1207 spc->spc_migrating); 1208 i = (PRI_COUNT >> BITMAP_SHIFT) - 1; 1209 do { 1210 uint32_t q; 1211 q = spc->spc_bitmap[i]; 1212 (*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q); 1213 } while (i--); 1214 } 1215 1216 (*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n", 1217 "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS"); 1218 1219 PROCLIST_FOREACH(p, &allproc) { 1220 (*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm); 1221 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1222 ci = l->l_cpu; 1223 tci = l->l_target_cpu; 1224 (*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n", 1225 (int)l->l_lid, l->l_priority, lwp_eprio(l), 1226 l->l_flag, l->l_stat == LSRUN ? "RQ" : 1227 (l->l_stat == LSSLEEP ? "SQ" : "-"), 1228 l, ci->ci_index, (tci ? tci->ci_index : -1), 1229 (u_int)(getticks() - l->l_rticks)); 1230 } 1231 } 1232 } 1233 1234 #endif 1235