/*	$NetBSD: kern_runq.c,v 1.44 2015/10/07 00:32:34 christos Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.44 2015/10/07 00:32:34 christos Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

/*
 * Structures, runqueue.
 */

const int	schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	u_int		r_avgcount;	/* Average count of threads */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;
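/*
 * Illustrative note on the layout above: time-sharing priorities
 * (0 .. PRI_HIGHEST_TS) are kept in r_ts_queue[] and real-time priorities
 * in r_rt_queue[] (see sched_getrq()).  Queue occupancy is tracked
 * MSB-first in r_bitmap[]: priority p sets bit
 * (BITMAP_MSB >> (p & BITMAP_MASK)) of word (p >> BITMAP_SHIFT).
 * For example, p = 37 sets 0x80000000 >> 5 == 0x04000000 in r_bitmap[1].
 */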
static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
 */
int		sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */
static u_int	balance_period;		/* Balance period */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/*   ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}
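/*
 * sched_enqueue: insert the LWP into the runqueue of its CPU.  Descriptive
 * note: unless the LWP is yielding (swtch), a preemption is requested on
 * that CPU when the new priority beats the running one; the strength of
 * the request depends on the thresholds below:
 *	eprio >= sched_kpreempt_pri	-> RESCHED_KPREEMPT
 *	eprio >= sched_upreempt_pri	-> RESCHED_IMMED
 *	otherwise			-> RESCHED_LAZY
 */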
void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the value of the highest priority in the runqueue,
	 * if the priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the value of the highest priority in the runqueue,
		 * in case this was the last thread in the highest-priority
		 * queue.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}
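/*
 * Note on spc_maxpriority maintenance: sched_enqueue() only raises it,
 * while sched_dequeue() re-derives it from the bitmap when the last LWP
 * of the top-priority queue leaves.  Because priorities are encoded
 * MSB-first, ffs() on a bitmap word locates the highest priority present
 * in that word, which is then (i << BITMAP_SHIFT) + (BITMAP_BITS - ffs(w)).
 */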
/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Return the chosen CPU, which may be the current one.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *pivot, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that thread is in appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If CPU of this thread is idling - run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on current CPU if priority of thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with the fewest threads.
	 */
	pivot = l->l_cpu;
	ci = pivot;
	tci = pivot;
	lpri = PRI_COUNT;
	do {
		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
			/* Reached the end, start from the beginning. */
			next = cpu_lookup(0);
		}
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != pivot);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}
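/*
 * Summary of the sched_takecpu() decision order (descriptive note):
 * 1. Strictly bound (LP_BOUND) LWPs always stay on l->l_cpu.
 * 2. When the processor-set matches, stay on l->l_cpu if its runqueue is
 *    empty, or if the LWP is still cache-hot and its priority is not
 *    below the one currently running there.
 * 3. Otherwise prefer curcpu() when the LWP would preempt it and is
 *    allowed to migrate there.
 * 4. Fall back to scanning all CPUs for the lowest
 *    MAX(spc_curpriority, spc_maxpriority), breaking ties by the smaller
 *    runqueue length and skipping CPUs the LWP cannot migrate to.
 */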
/*
 * Try to catch (steal) an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move it to the local run queue */
		sched_dequeue(l);

		/*
		 * If LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}
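/*
 * Balancing note: r_avgcount is a decaying average of the number of
 * migratable LWPs, updated as avg = (avg + r_mcount) / 2 on each
 * balancing pass.  For example, with a constant r_mcount of 8 and a
 * starting value of 0 it evolves 4, 6, 7, 7, ... (integer division keeps
 * it just below the steady value).
 */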
/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/* Average count of the threads */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by other CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again.. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * Stubs for !MULTIPROCESSOR.
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as more CPU-bound if its accumulated run time
	 * exceeds its accumulated sleep time.  Treat it as a batch thread
	 * only if it was already marked CPU-bound on the previous check.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * If the thread is CPU-bound and never sleeps, it would occupy the
	 * CPU.  In that case reset the last-sleep counter and check it
	 * again later; if it is still zero, choose a target CPU for
	 * migration and clear the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}
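/*
 * Note: with the logic above an LWP becomes a migration candidate only
 * after two consecutive periods of being CPU-bound with no sleep
 * activity; sched_takecpu() then selects the CPU recorded in
 * l_target_cpu, and sched_idle() later moves LWPs parked in
 * spc_migrating onto their target runqueue.
 */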
/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If runqueue is empty, try to catch some thread from another CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return true if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0) ? true : false;
	kpreempt_enable();

	return rv;
}
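/*
 * The nodes below are created under CTL_KERN, so the knobs are typically
 * reachable from userland as kern.sched.*, for example (illustrative):
 *	sysctl -w kern.sched.min_catch=4
 */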
/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif