/*	$NetBSD: kern_runq.c,v 1.36 2013/02/09 00:31:21 christos Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.36 2013/02/09 00:31:21 christos Exp $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

/*
 * Runqueue structures.
 */

const int	schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	u_int		r_avgcount;	/* Average count of threads */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;

static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
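 *
 * When an enqueued LWP has a higher priority than the one currently
 * running on its CPU, the kind of reschedule requested depends on the
 * thresholds below: kernel preemption at or above sched_kpreempt_pri,
 * immediate user preemption at or above sched_upreempt_pri, otherwise
 * a lazy reschedule (see sched_enqueue()).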
 */
int		sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */
static u_int	balance_period;		/* Balance period */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/*   ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	   cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	   cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	   cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	   cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}
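
/*
 * sched_enqueue: insert an LWP into the runqueue of its CPU and, unless
 * the LWP is merely yielding (swtch), request a preemption when it has
 * a higher priority than the LWP currently running on that CPU.
 */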
void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the value of the highest priority in the runqueue,
	 * if the priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the value of the highest priority in the runqueue,
		 * in case this was the last thread at that priority.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Migration and balancing.
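 *
 * The balancer (run every balance_period and on demand from CPUs with
 * an empty runqueue) keeps a decayed average of migratable LWPs per
 * CPU and records the busiest CPU in worker_ci.  CPUs with nothing to
 * run then try to pull ("catch") runnable LWPs from that CPU.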
 */

#ifdef MULTIPROCESSOR

/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is the CPU in the same processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * Estimate whether the LWP should migrate to another CPU and return
 * the chosen CPU.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *first, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that thread is in appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If CPU of this thread is idling - run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on the current CPU if the priority of the thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with the fewest threads.
	 */
	first = l->l_cpu;
	ci = first;
	tci = first;
	lpri = PRI_COUNT;
	do {
		next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = max(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != first);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}

/*
 * Try to catch an LWP from the runqueue of another CPU.
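 *
 * Expects the runqueues of both the current and the target CPU to be
 * locked; the target CPU's runqueue lock is released before returning.
 * Only LWPs that are not bound, not cache-hot and allowed to run on
 * the current CPU are taken.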
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);

		/*
		 * If LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/* Average count of the threads */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for the CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If this is a second attempt and the target CPU has
		 * changed, drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if the destination has changed to the
		 * local CPU, or the migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
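		 *
		 * To avoid deadlock, the locks are taken in order of
		 * ascending CPU address; when the target CPU is the
		 * lower one, try-lock it and, on failure, drop the
		 * local lock, take both in order and re-check the
		 * migration state.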
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again.. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * Stubs for !MULTIPROCESSOR.
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * The thread is considered CPU-bound ("batch") if its run time
	 * exceeds its sleep time.  Require that it was already marked
	 * CPU-bound in the previous period before reporting it as batch.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * If the thread is CPU-bound and never sleeps, it would monopolize
	 * the CPU.  In that case clear the last-sleep counter and check it
	 * again later: if it is still zero, pick a new CPU for the thread
	 * and clear the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
}

/*
 * Scheduler mill.
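 *
 * sched_nextlwp() returns the highest priority LWP from the local
 * runqueue or, if that queue is empty, tries to steal one from the
 * busiest CPU in the same processor-set.  It returns NULL when there
 * is nothing suitable to run.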
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to the idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If the runqueue is empty, try to catch a thread from another CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return true if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0) ? true : false;
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
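 *
 * The migration and preemption tunables above (cacheht_time,
 * balance_period, min_catch, sched_kpreempt_pri, sched_upreempt_pri)
 * are exported as read-write nodes under kern.sched.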
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif