/*	$NetBSD: kern_runq.c,v 1.46 2016/12/22 14:11:58 mlelstv Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.46 2016/12/22 14:11:58 mlelstv Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

/*
 * Structures, runqueue.
 */

const int	schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	u_int		r_avgcount;	/* Average count of threads (* 256) */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;

static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
 */
int		sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */
static u_int	balance_period;		/* Balance period */
static u_int	average_weight;		/* Weight old thread count average */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/* ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;
	/* Weight of historical average */
	average_weight = 50;			/* 0.5 */

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}

void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the value of highest priority in the runqueue,
	 * if priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the value of highest priority in the runqueue, in
		 * case this was the last thread in the queue of highest
		 * priority.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Pick and return the CPU, if migration is needed.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *pivot, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that thread is in appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If CPU of this thread is idling - run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on current CPU if priority of thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with the fewest threads.
	 */
	pivot = l->l_cpu;
	ci = pivot;
	tci = pivot;
	lpri = PRI_COUNT;
	do {
		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
			/* Reached the end, start from the beginning. */
			next = cpu_lookup(0);
		}
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != pivot);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}

/*
 * Tries to catch an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);

		/*
		 * If LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;
	u_int weight;

	/* Sanitize the sysctl value */
	weight = MIN(average_weight, 100);

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/*
		 * Average count of the threads
		 *
		 * The average is computed as a fixpoint number with
		 * 8 fractional bits.
		 */
		ci_rq->r_avgcount = (
			weight * ci_rq->r_avgcount +
			(100 - weight) * 256 * ci_rq->r_mcount
			) / 100;

		/* Look for CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * Stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound if its accumulated run time exceeds
	 * its accumulated sleep time.  Treat it as a batch thread only if it
	 * was already marked CPU-bound on the previous pass.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * A CPU-bound thread that never sleeps would monopolize the CPU.
	 * In that case reset the last sleep value and check it again later;
	 * if it is still zero, pick a migration target and clear the batch
	 * flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If runqueue is empty, try to catch some thread from other CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0) ? true : false;
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "average_weight",
		SYSCTL_DESCR("Thread count averaging weight (in percent)"),
		NULL, 0, &average_weight, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ?
			    tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif