/*	$NetBSD: kern_runq.c,v 1.47 2017/06/01 02:45:13 chs Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.47 2017/06/01 02:45:13 chs Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

/*
 * Structures, runqueue.
 */

const int	schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	u_int		r_avgcount;	/* Average count of threads (* 256) */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;
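/*
 * Summary of the per-CPU runqueue layout above: each CPU keeps one queue
 * per priority level, split into the time-sharing range (r_ts_queue,
 * priorities 0..PRI_HIGHEST_TS) and the real-time range (r_rt_queue,
 * everything above).  r_bitmap has one bit per priority and marks the
 * queues that are non-empty, so the highest runnable priority can be
 * recovered with a handful of ffs() calls.  r_count, r_mcount and
 * r_avgcount feed the migration and balancing code further below.
 */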
static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
 */
int		sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */
static u_int	balance_period;		/* Balance period */
static u_int	average_weight;		/* Weight old thread count average */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/* ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;
	/* Weight of historical average */
	average_weight = 50;			/* 0.5 */

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}
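/*
 * Note on the bitmap arithmetic used by sched_enqueue() and
 * sched_dequeue() below: a priority p maps to word p >> BITMAP_SHIFT of
 * r_bitmap and to mask BITMAP_MSB >> (p & BITMAP_MASK) within that word.
 * For example, priority 42 lands in word 1 with the mask
 * 0x80000000 >> 10 == 0x00200000.  Higher priorities occupy less
 * significant bits within a word, so the highest priority present in
 * word i is (i << BITMAP_SHIFT) + (BITMAP_BITS - ffs(word)).
 */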
void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the runqueue's highest priority if this thread's
	 * priority is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the runqueue's highest priority, in case this
		 * was the last thread in the highest-priority queue.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If nothing was found, set the lowest value */
		spc->spc_maxpriority = 0;
	}
}
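/*
 * In short: sched_enqueue() escalates its wakeup in three steps based on
 * the thresholds defined earlier in this file, requesting
 * RESCHED_KPREEMPT at or above sched_kpreempt_pri, RESCHED_IMMED at or
 * above sched_upreempt_pri, and RESCHED_LAZY otherwise.  r_mcount counts
 * only LWPs without LP_BOUND set, i.e. those that the balancer and
 * sched_catchlwp() below are allowed to move to another CPU.
 */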
/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Return the chosen CPU (which may be the current one).
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *pivot, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If the thread is strictly bound, do not consider other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that the thread is in the appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If this thread's CPU is idling, run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if the thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on the current CPU if the thread's priority is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with the fewest threads.
	 */
	pivot = l->l_cpu;
	ci = pivot;
	tci = pivot;
	lpri = PRI_COUNT;
	do {
		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
			/* Reached the end, start from the beginning. */
			next = cpu_lookup(0);
		}
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != pivot);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}
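/*
 * To summarize sched_takecpu()'s policy: a bound LWP always stays put;
 * otherwise it stays on its own CPU if that CPU is idle, or if the LWP
 * is still cache-hot and has at least the priority of whatever runs
 * there now; it moves to the current CPU if it beats the priority
 * running here and is allowed to migrate; and as a last resort all CPUs
 * are scanned for the lowest-priority (on a tie, least-loaded) CPU that
 * the LWP may migrate to.
 */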
/*
 * Try to catch (steal) an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and subsequent entries in the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move it to the local run queue */
		sched_dequeue(l);

		/*
		 * If the LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;
	u_int weight;

	/* Sanitize the sysctl value */
	weight = MIN(average_weight, 100);

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/*
		 * Average count of the threads
		 *
		 * The average is computed as a fixpoint number with
		 * 8 fractional bits.
		 */
		ci_rq->r_avgcount = (
			weight * ci_rq->r_avgcount + (100 - weight) * 256 * ci_rq->r_mcount
			) / 100;

		/* Look for the CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}
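/*
 * Worked example of the average above, with the default weight of 50:
 * each pass computes avg' = (50 * avg + 50 * 256 * r_mcount) / 100,
 * i.e. an exponential moving average in fixed point with 8 fractional
 * bits.  A CPU that has held 3 migratable LWPs for a while converges
 * towards 3 * 256 = 768; after one balance period with an empty queue
 * that drops to roughly 384, and so on.
 */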
/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If this is a second attempt and the target CPU has
		 * changed, drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if the destination has changed to the
		 * local CPU, or if the migration was already done by
		 * another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * Stubs for !MULTIPROCESSOR.
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound if its accumulated run time
	 * exceeds its accumulated sleep time.  Treat it as a batch
	 * thread only if this is not the first such period.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * If the thread is CPU-bound and never sleeps, it will occupy
	 * the CPU.  In that case reset the last-sleep counter and check
	 * it later; if it is still zero, arrange a migration and clear
	 * the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}
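/*
 * In other words, a thread becomes a "batch" candidate only after two
 * consecutive stats periods in which it ran more ticks than it slept;
 * one period where sleep dominates clears LW_BATCH again.  The batch
 * flag is passed to the scheduler-specific hook above, and a
 * persistently non-sleeping batch thread may be given a target CPU so
 * that the migration machinery moves it away later.
 */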
/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to the idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If the runqueue is empty, try to catch a thread from another CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return true if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0);
	kpreempt_enable();

	return rv;
}
/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "average_weight",
		SYSCTL_DESCR("Thread count averaging weight (in percent)"),
		NULL, 0, &average_weight, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}
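/*
 * The knobs above are attached under the kern.sched node, so they can be
 * inspected and tuned at run time with sysctl(8), for example
 * (hypothetical session; default values are set in runq_init()):
 *
 *	# sysctl kern.sched.min_catch
 *	kern.sched.min_catch = 1
 *	# sysctl -w kern.sched.min_catch=2
 */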
/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif