/*	$NetBSD: kern_runq.c,v 1.64 2020/03/26 19:25:07 ad Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.64 2020/03/26 19:25:07 ad Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

const int	schedppq = 1;

static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
#endif

/*
 * Preemption control.
 */
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;	/* Cache hotness time */
static u_int	min_catch;	/* Minimal LWP count for catching */
static u_int	skim_interval;	/* Rate limit for stealing LWPs */

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Pulling from remote packages, LWP must not have run for 10ms. */
	cacheht_time = 10;

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Steal from other CPUs at most every 10ms. */
	skim_interval = 10;
}

void
sched_cpuattach(struct cpu_info *ci)
{
	struct schedstate_percpu *spc;
	size_t size;
	void *p;
	u_int i;

	spc = &ci->ci_schedstate;
	spc->spc_nextpkg = ci;

	if (spc->spc_lwplock == NULL) {
		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = spc->spc_lwplock;
	}
	if (spc->spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
	    coherency_unit;
	p = kmem_alloc(size, KM_SLEEP);
	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);

	/* Initialize run queues */
	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_COUNT; i++)
		TAILQ_INIT(&spc->spc_queue[i]);
}

/*
 * Control of the runqueue.
 */
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return &spc->spc_queue[prio];
}

/*
 * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
 * l_cpu.
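 *
 * The run queue is an array of TAILQs, one per priority level, and
 * spc_bitmap records which levels are non-empty, so the highest runnable
 * priority can be found without scanning empty queues.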
 */
void
sched_enqueue(struct lwp *l)
{
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Enqueue the thread */
	q_head = sched_getrq(spc, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) == 0);
		spc->spc_bitmap[i] |= q;
	}
	/* Preempted SCHED_RR and SCHED_FIFO LWPs go to the queue head. */
	if (l->l_class != SCHED_OTHER && (l->l_pflag & LP_PREEMPTING) != 0) {
		TAILQ_INSERT_HEAD(q_head, l, l_runq);
	} else {
		TAILQ_INSERT_TAIL(q_head, l, l_runq);
	}
	spc->spc_flags &= ~SPCF_IDLE;
	spc->spc_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		spc->spc_mcount++;

	/*
	 * Update the value of highest priority in the runqueue,
	 * if priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);
}

/*
 * Remove an LWP from the run queue it's on.  The LWP must be in state
 * LSRUN.
 */
void
sched_dequeue(struct lwp *l)
{
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;

	KASSERT(lwp_locked(l, spc->spc_mutex));
	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(spc->spc_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	spc->spc_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		spc->spc_mcount--;

	q_head = sched_getrq(spc, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) != 0);
		spc->spc_bitmap[i] &= ~q;

		/*
		 * Update the value of highest priority in the runqueue, in
		 * case this was the last thread in the queue of highest
		 * priority.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (spc->spc_bitmap[i] != 0) {
				q = ffs(spc->spc_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Cause a preemption on the given CPU, if the priority "pri" is higher
 * than that of the running LWP.  If "unlock" is specified, and ideally it
 * will be for concurrency reasons, spc_mutex will be dropped before return.
 */
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
	struct schedstate_percpu *spc;
	u_int o, n, f;
	lwp_t *l;

	spc = &ci->ci_schedstate;

	KASSERT(mutex_owned(spc->spc_mutex));

	/*
	 * If the priority level we're evaluating wouldn't cause a new LWP
	 * to be run on the CPU, then we have nothing to do.
	 */
	if (pri <= spc->spc_curpriority || !mp_online) {
		if (__predict_true(unlock)) {
			spc_unlock(ci);
		}
		return;
	}

	/*
	 * Figure out what kind of preemption we should do.
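	 *
	 * - If the CPU is running its idle LWP, RESCHED_IDLE | RESCHED_UPREEMPT
	 *   is sufficient.
	 * - If "pri" reaches sched_kpreempt_pri and the running LWP is not a
	 *   soft interrupt, request kernel preemption (RESCHED_KPREEMPT) where
	 *   __HAVE_PREEMPTION is available, otherwise fall back to user
	 *   preemption.
	 * - Otherwise request user preemption (RESCHED_UPREEMPT).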
	 */
	l = ci->ci_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		f = RESCHED_IDLE | RESCHED_UPREEMPT;
	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
		/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
		f = RESCHED_KPREEMPT;
#else
		/* Leave door open for test: set kpreempt_pri with sysctl. */
		f = RESCHED_UPREEMPT;
#endif
		/*
		 * l_dopreempt must be set with the CPU locked to sync with
		 * mi_switch().  It must also be set with an atomic to sync
		 * with kpreempt().
		 */
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
	} else {
		f = RESCHED_UPREEMPT;
	}
	if (ci != curcpu()) {
		f |= RESCHED_REMOTE;
	}

	/*
	 * Things start as soon as we touch ci_want_resched: x86 for example
	 * has an instruction that monitors the memory cell it's in.  We
	 * want to drop the schedstate lock in advance, otherwise the remote
	 * CPU can awaken and immediately block on the lock.
	 */
	if (__predict_true(unlock)) {
		spc_unlock(ci);
	}

	/*
	 * The caller will always have a second scheduler lock held: either
	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
	 * keeps preemption disabled, which among other things ensures all
	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
	 */
	KASSERT(kpreempt_disabled());

	for (o = 0;; o = n) {
		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
		if (__predict_true(o == n)) {
			/*
			 * We're the first.  If we're in process context on
			 * the same CPU, we can avoid the visit to trap().
			 */
			if (l != curlwp || cpu_intr_p()) {
				cpu_need_resched(ci, l, f);
			}
			break;
		}
		if (__predict_true(
		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
			/* Already in progress, nothing to do. */
			break;
		}
	}
}

/*
 * Cause a preemption on the given CPU, if the priority of LWP "l", which
 * must be in state LSRUN, is higher than that of the running LWP.  If
 * "unlock" is specified, and ideally it will be for concurrency reasons,
 * spc_mutex will be dropped before return.
 */
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
	KASSERT(l->l_stat == LSRUN);

	sched_resched_cpu(ci, lwp_eprio(l), unlock);
}

/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/*
 * Estimate if LWP is cache-hot.
 */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	/* Leave new LWPs in peace, determination has already been made. */
	if (l->l_stat == LSIDL)
		return true;

	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks < mstohz(cacheht_time));
}

/*
 * Check if LWP can migrate to the chosen CPU.
 */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * A small helper to do round robin through CPU packages.
 */
static struct cpu_info *
sched_nextpkg(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	spc->spc_nextpkg =
	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];

	return spc->spc_nextpkg;
}

/*
 * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
 * thread.  In case of equal priority, prefer first class CPUs, and amongst
 * the remainder choose the CPU with the fewest runqueue entries.
 *
 * Begin the search in the CPU package which "pivot" is a member of.
 */
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
	struct cpu_info *bestci, *curci, *outer;
	struct schedstate_percpu *bestspc, *curspc;
	pri_t bestpri, curpri;

	/*
	 * If this fails (it shouldn't), run on the given CPU.  This also
	 * gives us a weak preference for "pivot" to begin with.
	 */
	bestci = pivot;
	bestspc = &bestci->ci_schedstate;
	bestpri = MAX(bestspc->spc_curpriority, bestspc->spc_maxpriority);

	/* In the outer loop scroll through all CPU packages. */
	pivot = pivot->ci_package1st;
	outer = pivot;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		curci = outer;
		do {
			if (!sched_migratable(l, curci)) {
				continue;
			}

			curspc = &curci->ci_schedstate;

			/* If this CPU is idle and 1st class, we're done. */
			if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
			    (SPCF_IDLE | SPCF_1STCLASS)) {
				return curci;
			}

			curpri = MAX(curspc->spc_curpriority,
			    curspc->spc_maxpriority);

			if (curpri > bestpri) {
				continue;
			}
			if (curpri == bestpri) {
				/* Prefer first class CPUs over others. */
				if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
				    (bestspc->spc_flags & SPCF_1STCLASS) != 0) {
					continue;
				}
				/*
				 * Pick the least busy CPU.  Make sure this is
				 * not <=, otherwise it defeats the above
				 * preference.
				 */
				if (bestspc->spc_count < curspc->spc_count) {
					continue;
				}
			}

			bestpri = curpri;
			bestci = curci;
			bestspc = curspc;

		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
		    curci != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != pivot);

	return bestci;
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Pick and return the CPU, if migration is needed.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct schedstate_percpu *spc, *tspc;
	struct cpu_info *ci, *curci, *tci;
	pri_t eprio;
	int flags;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	eprio = lwp_eprio(l);

	/*
	 * Handle new LWPs.  For vfork() with a timeshared child, make it
	 * run on the same CPU as the parent if no other LWPs in queue.
	 * Otherwise scatter far and wide - try for an even distribution
	 * across all CPU packages and CPUs.
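	 *
	 * The "flags" value chosen below (SPCF_IDLE, optionally also
	 * SPCF_1STCLASS) selects which sibling CPUs are acceptable in the
	 * core scans that follow.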
	 */
	if (l->l_stat == LSIDL) {
		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
			if (sched_migratable(l, curlwp->l_cpu) && eprio >
			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
				return curlwp->l_cpu;
			}
		} else {
			return sched_bestcpu(l, sched_nextpkg());
		}
		flags = SPCF_IDLE;
	} else {
		flags = SPCF_IDLE | SPCF_1STCLASS;
	}

	/*
	 * Try to send the LWP back to the first CPU in the same core if
	 * idle.  This keeps LWPs clustered in the run queues of 1st class
	 * CPUs.  This implies stickiness.  If we didn't find a home for
	 * a vfork() child above, try to use any SMT sibling to help out.
	 */
	tci = ci;
	do {
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != ci);

	/*
	 * Otherwise the LWP is "sticky", i.e. generally preferring to stay
	 * on the same CPU.
	 */
	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
		return ci;
	}

	/*
	 * If the current CPU core is idle, run there and avoid the
	 * expensive scan of CPUs below.
	 */
	curci = curcpu();
	tci = curci;
	do {
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != curci);

	/*
	 * Didn't find a new home above - happens infrequently.  Start the
	 * search in the last CPU package that the LWP ran in, but expand
	 * to include the whole system if needed.
	 */
	return sched_bestcpu(l, l->l_cpu);
}

/*
 * Try to catch an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;
	bool gentle;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;

	/*
	 * Be more aggressive if this CPU is first class, and the other
	 * is not.
	 */
	gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
	    (spc->spc_flags & SPCF_1STCLASS) != 0);

	if (spc->spc_mcount < (gentle ? min_catch : 1) ||
	    curspc->spc_psid != spc->spc_psid) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) ||
		    (gentle && lwp_cache_hot(l)) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			/* XXX Gap: could walk down priority list. */
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);
		l->l_cpu = curci;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Called from sched_idle() to handle migration.
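 *
 * Both the local and target run queues must be locked.  The locks are
 * taken in cpu_info address order where possible; otherwise the target
 * lock is try-locked and, on failure, the local lock is dropped, both
 * locks are reacquired in order and the situation is re-checked.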
 */
static void
sched_idle_migrate(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	bool dlock = false;

	spc = &ci->ci_schedstate;
	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by other CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again.. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l);
		sched_resched_lwp(l, true);
		/* tci now unlocked */
		spc_unlock(ci);
		return;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);
}

/*
 * Try to steal an LWP from "tci".
 */
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
	struct schedstate_percpu *spc, *tspc;
	lwp_t *l;

	spc = &ci->ci_schedstate;
	tspc = &tci->ci_schedstate;
	if (tspc->spc_mcount != 0 && spc->spc_psid == tspc->spc_psid) {
		spc_dlock(ci, tci);
		l = sched_catchlwp(tci);
		spc_unlock(ci);
		if (l != NULL) {
			return true;
		}
	}
	return false;
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *inner, *outer, *first, *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	struct lwp *l;

	spc = &ci->ci_schedstate;

	/*
	 * Handle LWP migrations off this CPU to another.  If there is a
	 * migration to do then go idle afterwards (we'll wake again soon),
	 * as we don't want to instantly steal back the LWP we just moved
	 * out.
	 */
	if (spc->spc_migrating != NULL) {
		sched_idle_migrate();
		return;
	}

	/* If this CPU is offline, or we have an LWP to run, we're done. */
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
		return;
	}

	/* Deal with SMT. */
	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
		/* Try to help our siblings out. */
		tci = ci->ci_sibling[CPUREL_CORE];
		while (tci != ci) {
			if (sched_steal(ci, tci)) {
				return;
			}
			tci = tci->ci_sibling[CPUREL_CORE];
		}
		/*
		 * If not the first SMT in the core, and in the default
		 * processor set, the search ends here.
		 */
		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
		    spc->spc_psid == PS_NONE) {
			return;
		}
	}

	/*
	 * Find something to run, unless this CPU exceeded the rate limit.
	 * Start looking on the current package to maximise L2/L3 cache
	 * locality.  Then expand to looking at the rest of the system.
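	 * The spc_nextskim timestamp enforces the skim_interval rate limit
	 * between scans.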
	 *
	 * XXX Should probably look at 2nd class CPUs first, but they will
	 * shed jobs via preempt() anyway.
	 */
	if (spc->spc_nextskim > hardclock_ticks) {
		return;
	}
	spc->spc_nextskim = hardclock_ticks + mstohz(skim_interval);

	/* In the outer loop scroll through all CPU packages, starting here. */
	first = ci->ci_package1st;
	outer = first;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		inner = outer;
		do {
			/* Don't hit the locks unless needed. */
			tspc = &inner->ci_schedstate;
			if (ci == inner || spc->spc_psid != tspc->spc_psid ||
			    tspc->spc_mcount < min_catch) {
				continue;
			}
			spc_dlock(ci, inner);
			l = sched_catchlwp(inner);
			spc_unlock(ci);
			if (l != NULL) {
				/* Got it! */
				return;
			}
		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
		    inner != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != first);
}

/*
 * Called from mi_switch() when an LWP has been preempted / has yielded.
 * The LWP is presently in the CPU's run queue.  Here we look for a better
 * CPU to teleport the LWP to; there may not be one.
 */
void
sched_preempted(struct lwp *l)
{
	struct schedstate_percpu *tspc;
	struct cpu_info *ci, *tci;

	ci = l->l_cpu;
	tspc = &ci->ci_schedstate;

	KASSERT(tspc->spc_count >= 1);

	/*
	 * Try to select another CPU if:
	 *
	 * - there is no migration pending already
	 * - and this LWP is running on a 2nd class CPU
	 * - or this LWP is a child of vfork() that has just done execve()
	 */
	if (l->l_target_cpu != NULL ||
	    ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
	    (l->l_pflag & LP_TELEPORT) == 0)) {
		return;
	}

	/*
	 * Fast path: if the first SMT in the core is idle, send it back
	 * there, because the cache is shared (cheap) and we want all LWPs
	 * to be clustered on 1st class CPUs (either running there or on
	 * their runqueues).
	 */
	tci = ci->ci_sibling[CPUREL_CORE];
	while (tci != ci) {
		const int flags = SPCF_IDLE | SPCF_1STCLASS;
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			l->l_target_cpu = tci;
			l->l_pflag &= ~LP_TELEPORT;
			return;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	}

	if ((l->l_pflag & LP_TELEPORT) != 0) {
		/*
		 * A child of vfork(): now that the parent is released,
		 * scatter far and wide, to match the LSIDL distribution
		 * done in sched_takecpu().
		 */
		l->l_pflag &= ~LP_TELEPORT;
		tci = sched_bestcpu(l, sched_nextpkg());
		if (tci != ci) {
			l->l_target_cpu = tci;
		}
	} else {
		/*
		 * Try to find a better CPU to take it, but don't move to
		 * another 2nd class CPU; there's not much point.
		 *
		 * Search in the current CPU package in order to try and
		 * keep L2/L3 cache locality, but expand to include the
		 * whole system if needed.
		 */
		tci = sched_bestcpu(l, l->l_cpu);
		if (tci != ci &&
		    (tci->ci_schedstate.spc_flags & SPCF_1STCLASS) != 0) {
			l->l_target_cpu = tci;
		}
	}
}

/*
 * Called during execve() by a child of vfork().  Does two things:
 *
 * - If the parent has been awoken and put back on curcpu then give the
 *   CPU back to the parent.
 *
 * - If curlwp is not on a 1st class CPU then find somewhere else to run,
 *   since it dodged the distribution in sched_takecpu() when first set
 *   runnable.
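 *
 * Both cases are handled by setting LP_TELEPORT and calling preempt();
 * sched_preempted() then picks the new CPU.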
 */
void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
	if ((samecpu && ncpu > 1) ||
	    (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
		l->l_pflag |= LP_TELEPORT;
		preempt();
	}
}

#else

/*
 * stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}

void
sched_preempted(struct lwp *l)
{

}

void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
}

#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Flag the thread as CPU-bound if its accumulated run time exceeds
	 * its accumulated sleep time, but only report it to the scheduler
	 * hook as a batch thread once it has been CPU-bound for more than
	 * one interval.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;

	/* Update the last run time on switch */
	l = curlwp;
	l->l_rticksum += (hardclock_ticks - l->l_rticks);

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;

	/* Return to idle LWP if there is no runnable job */
	if (__predict_false(spc->spc_count == 0))
		return NULL;

	/* Take the highest priority thread */
	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
	rv |= (ci->ci_data.cpu_softints != 0);
#endif
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
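 *
 * These tunables are created under kern.sched and can be changed at
 * run time.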
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ms)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "skim_interval",
		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
		NULL, 0, &skim_interval, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    spc->spc_count, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = spc->spc_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif