/*	$NetBSD: sched_m2.c,v 1.13 2007/12/05 07:06:54 ad Exp $	*/

/*
 * Copyright (c) 2007, Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.13 2007/12/05 07:06:54 ad Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority-related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)
#define	PRI_DEFAULT	(NPRI_USER >> 1)

const int schedppq = 1;

/*
 * Bits per bitmap word.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

/*
 * Migration and balancing.
 */
#ifdef MULTIPROCESSOR
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */

static u_int	balance_period;		/* Balance period */
static struct callout balance_ch;	/* Callout of balancer */

static struct cpu_info * volatile worker_ci;

#define	CACHE_HOT(sil)	((sil)->sl_lrtime &&				\
    (hardclock_ticks - (sil)->sl_lrtime < cacheht_time))

#endif
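/*
 * Overview of the MP balancing machinery implemented below (summary only):
 * sched_balance() runs periodically from a callout, keeps a decaying
 * average of the migratable thread count of each CPU and publishes the
 * busiest CPU in worker_ci.  A CPU whose own runqueue is empty then calls
 * sched_catchlwp() to steal a thread from worker_ci, provided at least
 * min_catch threads are queued there, skipping bound, system and
 * cache-hot LWPs.
 */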
/*
 * Structures, runqueue.
 */

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Lock and bitmap */
	kmutex_t	r_rq_mutex;
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	pri_t		r_highest_pri;	/* Highest priority */
	u_int		r_avgcount;	/* Average count of threads */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
} runqueue_t;

typedef struct {
	u_int		sl_flags;
	u_int		sl_timeslice;	/* Time-slice of thread */
	u_int		sl_slept;	/* Saved sleep time for sleep sum */
	u_int		sl_slpsum;	/* Sum of sleep time */
	u_int		sl_rtime;	/* Saved start time of run */
	u_int		sl_rtsum;	/* Sum of the run time */
	u_int		sl_lrtime;	/* Last run time */
} sched_info_lwp_t;

/* Flags */
#define	SL_BATCH	0x01

/* Pool of the scheduler-specific structures for threads */
static struct pool	sil_pool;

/*
 * Prototypes.
 */

static inline void *	sched_getrq(runqueue_t *, const pri_t);
static inline void	sched_newts(struct lwp *);
static void		sched_precalcts(void);

#ifdef MULTIPROCESSOR
static struct lwp *	sched_catchlwp(void);
static void		sched_balance(void *);
#endif

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);			/* ~50ms */
	max_ts = mstohz(150);			/* ~150ms */
	rt_ts = mstohz(100);			/* ~100ms */
	sched_precalcts();

#ifdef MULTIPROCESSOR
	/* Balancing */
	worker_ci = ci;
	cacheht_time = mstohz(5);		/* ~5 ms */
	balance_period = mstohz(300);		/* ~300ms */
	min_catch = ~0;
#endif

	/* Pool of the scheduler-specific structures */
	pool_init(&sil_pool, sizeof(sched_info_lwp_t), 0, 0, 0,
	    "lwpsd", &pool_allocator_nointr, IPL_NONE);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	/* Initialize the scheduler structure of the primary LWP */
	lwp0.l_mutex = &ci->ci_schedstate.spc_lwplock;
	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

void
sched_setup(void)
{

#ifdef MULTIPROCESSOR
	/* Minimal count of LWPs for catching: log2(count of CPUs), capped at 4 */
	min_catch = min(ilog2(ncpu), 4);

	/* Initialize balancing callout and run it */
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	/*
	 * Allocate the run queue.
	 * XXX: Estimate cache behaviour more..
	 */
	size = roundup(sizeof(runqueue_t), CACHE_LINE_SIZE) + CACHE_LINE_SIZE;
	rq_ptr = kmem_zalloc(size, KM_NOSLEEP);
	if (rq_ptr == NULL) {
		panic("scheduler: could not allocate the runqueue");
	}
	/* XXX: Save the original pointer for future.. */
	ci_rq = (void *)(roundup((intptr_t)(rq_ptr), CACHE_LINE_SIZE));

	/* Initialize run queues */
	mutex_init(&ci_rq->r_rq_mutex, MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
	ci_rq->r_highest_pri = 0;

	ci->ci_schedstate.spc_sched_info = ci_rq;
	ci->ci_schedstate.spc_mutex = &ci_rq->r_rq_mutex;
}
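/*
 * Illustration (assuming hz == 100, so that the defaults above give
 * min_ts == 5 and max_ts == 15 ticks): sched_precalcts() below maps
 * priority 0 to the full max_ts, PRI_HIGHEST_TS to min_ts, and
 * interpolates linearly in between, so lower-priority (more CPU-bound)
 * threads run with longer time-slices.  high_pri[] maps each
 * time-sharing priority into the top PRI_HTS_RANGE band, which is used
 * to boost threads that slept or waited for a long time.
 */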
/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_get(&sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
	if (l2->l_priority <= PRI_HIGHEST_TS)	/* XXX: For now only.. */
		l2->l_priority = PRI_DEFAULT;
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_put(&sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	int nprio;
	struct lwp *l;

	KASSERT(mutex_owned(&p->p_smutex));

	p->p_nice = prio;
	nprio = max(min(PRI_DEFAULT + p->p_nice, PRI_HIGHEST_TS), 0);

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		lwp_changepri(l, nprio);
		lwp_unlock(l);
	}
}

/* Recalculate the time-slice */
static inline void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}
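/*
 * Illustration of the priority bitmap kept in r_bitmap[] and maintained
 * by sched_enqueue()/sched_dequeue() below: priority e lives in word
 * e >> BITMAP_SHIFT, at bit BITMAP_MSB >> (e & BITMAP_MASK).  For example,
 * eprio 37 maps to r_bitmap[1], mask 0x04000000; ffs() on that word
 * returns 27, and (1 << BITMAP_SHIFT) + (BITMAP_BITS - 27) recovers 37
 * when searching for the new r_highest_pri.
 */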
void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	sched_info_lwp_t *sil = l->l_sched_info;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);

	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true)) {
		sil->sl_lrtime = hardclock_ticks;
		sil->sl_rtsum += (hardclock_ticks - sil->sl_rtime);
	} else if (sil->sl_lrtime == 0)
		sil->sl_lrtime = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_flag & LW_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the value of the highest priority in the runqueue,
	 * if the priority of this thread is higher.
	 */
	if (eprio > ci_rq->r_highest_pri)
		ci_rq->r_highest_pri = eprio;

	sched_newts(l);
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);

	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
	KASSERT(eprio <= ci_rq->r_highest_pri);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	ci_rq->r_count--;
	if ((l->l_flag & LW_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the value of the highest priority in the runqueue,
		 * in case this was the last thread in the highest-priority
		 * queue.
		 */
		if (eprio != ci_rq->r_highest_pri)
			return;

		do {
			q = ffs(ci_rq->r_bitmap[i]);
			if (q) {
				ci_rq->r_highest_pri =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If no bit is set - use the lowest value */
		ci_rq->r_highest_pri = 0;
	}
}

void
sched_slept(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Save the time when the thread went to sleep */
	sil->sl_slept = hardclock_ticks;

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority and run with a shorter
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (sil->sl_flags & SL_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Update sleep time delta */
	sil->sl_slpsum += (l->l_slptime == 0) ?
	    (hardclock_ticks - sil->sl_slept) : hz;

	/* If the thread slept for a second or more - set a high priority */
	if (l->l_slptime > 1 || (hardclock_ticks - sil->sl_slept) >= hz)
		l->l_priority = high_pri[l->l_priority];

	/* Also, consider looking for a better CPU to wake up on */
	if ((l->l_flag & (LW_BOUND | LW_SYSTEM)) == 0)
		l->l_cpu = sched_takecpu(l);
}
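/*
 * sched_pstats_hook() is invoked from the periodic sched_pstats() pass
 * (roughly once per second).  It compares the run-time and sleep-time
 * sums accumulated since the previous pass: a thread whose run time
 * dominates is flagged SL_BATCH, but its priority is lowered only if it
 * was already flagged in the previous interval, so one busy second does
 * not penalize an otherwise interactive thread.
 */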
void
sched_pstats_hook(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	pri_t prio;
	bool batch;

	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound (batch) if the sum of its run time
	 * exceeds the sum of its sleep time.  Note whether this is the
	 * first interval in which it appeared CPU-bound.
	 */
	batch = (sil->sl_rtsum > sil->sl_slpsum);
	if (batch) {
		if ((sil->sl_flags & SL_BATCH) == 0)
			batch = false;
		sil->sl_flags |= SL_BATCH;
	} else
		sil->sl_flags &= ~SL_BATCH;

	/* Reset the time sums */
	sil->sl_slpsum = 0;
	sil->sl_rtsum = 0;

	/* Adjust threads in the time-sharing queue only */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;

	/* If it is CPU-bound not for the first time - decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more - set a high priority */
	if (l->l_stat == LSRUN) {
		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
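/*
 * Summary of the time-sharing dynamics implemented by the hooks above and
 * by sched_tick() below: a SCHED_OTHER thread starts at PRI_DEFAULT; each
 * voluntary sleep of a non-batch thread raises its priority by one
 * (sched_slept), and sleeping or sitting on a runqueue for a second or
 * more jumps it into the high_pri[] band, while every expired quantum
 * (sched_tick) and every CPU-bound interval (sched_pstats_hook) lowers it.
 * Since ts_map[] gives low priorities the longest time-slices, interactive
 * threads converge on short slices near PRI_HIGHEST_TS and batch threads
 * on long slices near priority zero.
 */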
/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, const struct cpu_info *ci)
{

	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
		return false;

	if ((l->l_flag & LW_BOUND) == 0)
		return true;
#if 0
	return cpu_in_pset(ci, l->l_psid);
#else
	return false;
#endif
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Return the chosen CPU.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci = NULL;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq;
	sched_info_lwp_t *sil;
	CPU_INFO_ITERATOR cii;
	pri_t eprio, lpri;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* The CPU of this thread is idling - run there */
	if (ci_rq->r_count == 0)
		return ci;

	eprio = lwp_eprio(l);
	sil = l->l_sched_info;

	/* Stay if the thread is cache-hot */
	if (l->l_stat == LSSLEEP && l->l_slptime <= 1 &&
	    CACHE_HOT(sil) && eprio >= spc->spc_curpriority)
		return ci;

	/* Run on the current CPU if the priority of the thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
		return ci;

	/*
	 * Look for the CPU with the lowest-priority thread.  If the
	 * priorities are equal, prefer the CPU with fewer threads.
	 */
	lpri = PRI_COUNT;
	for (CPU_INFO_FOREACH(cii, ci)) {
		runqueue_t *ici_rq;
		pri_t pri;

		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = max(spc->spc_curpriority, ici_rq->r_highest_pri);
		if (pri > lpri)
			continue;

		if (pri == lpri && tci && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (sched_migratable(l, ci) == false)
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	}

	KASSERT(tci != NULL);
	return tci;
}

/*
 * Try to catch an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(void)
{
	struct cpu_info *curci = curcpu(), *ci = worker_ci;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	if (curci == ci)
		return NULL;

	/* Lockless check */
	ci_rq = ci->ci_schedstate.spc_sched_info;
	if (ci_rq->r_count < min_catch)
		return NULL;

	/*
	 * Double-lock the runqueues: take the locks in the order of the
	 * CPU structure addresses to avoid deadlock.  If the remote lock
	 * cannot be taken without blocking, drop our own lock and retake
	 * both in that order.
	 */
	if (curci < ci) {
		spc_lock(ci);
	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;

		spc_unlock(curci);
		spc_lock(ci);
		spc_lock(curci);

		if (cur_rq->r_count) {
			spc_unlock(ci);
			return NULL;
		}
	}

	if (ci_rq->r_count < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		sched_info_lwp_t *sil;

		/* Stop if the queue is exhausted */
		if (l == NULL)
			break;

		/* Look only at threads that are allowed to migrate */
		sil = l->l_sched_info;
		if ((l->l_flag & LW_SYSTEM) || CACHE_HOT(sil) ||
		    sched_migratable(l, curci) == false) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}
		/* Recheck if the chosen thread is still on the runqueue */
		if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM)) {
			sched_dequeue(l);
			l->l_cpu = curci;
			lwp_setlock(l, curci->ci_schedstate.spc_mutex);
			sched_enqueue(l, false);
			break;
		}
		l = TAILQ_NEXT(l, l_runq);
	}
	spc_unlock(ci);

	return l;
}

/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/* Average count of the threads */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for the CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}
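/*
 * Note: the averaging above is an exponentially decaying mean with
 * weight 1/2, i.e. r_avgcount = (r_avgcount + r_mcount) / 2.  For
 * example, a CPU whose migratable count drops to zero halves its
 * average on every balance_period (~300 ms by default) until another
 * CPU takes over as worker_ci.
 */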
#else

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

#endif	/* MULTIPROCESSOR */

/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	sched_info_lwp_t *sil;
	runqueue_t *ci_rq;
	struct lwp *l;

	spc = &ci->ci_schedstate;
	ci_rq = ci->ci_schedstate.spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If the runqueue is empty, try to catch some thread from another CPU */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
			return NULL;
	} else if (ci_rq->r_count == 0) {
		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);

		/* The re-locking will be done inside */
		return sched_catchlwp();
	}
#else
	if (ci_rq->r_count == 0)
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[ci_rq->r_highest_pri >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	/* Update the counters */
	sil = l->l_sched_info;
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	spc->spc_ticks = sil->sl_timeslice;
	sil->sl_rtime = hardclock_ticks;

	return l;
}

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci = curcpu();
	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints)
		return true;
#endif

	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
		return (ci_rq->r_count - ci_rq->r_mcount);

	return ci_rq->r_count;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * Update the time-quantum and continue running,
		 * if the thread runs with the FIFO real-time policy.
		 */
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease its
		 * priority and run with a longer time-quantum.
		 */
		if (l->l_priority > PRI_HIGHEST_TS)
			break;
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher-priority threads, or threads at the same
	 * priority in the runqueue, mark that the thread should yield;
	 * otherwise, continue running.
	 */
	if (lwp_eprio(l) <= ci_rq->r_highest_pri) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}
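/*
 * Note on the quantum plumbing: sched_nextlwp() loads spc_ticks with the
 * thread's sl_timeslice; the clock interrupt path (outside this file) is
 * expected to count it down and call sched_tick() once per expired
 * quantum, which either grants a fresh quantum or sets SPCF_SHOULDYIELD
 * as above.
 */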
/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);

#ifdef MULTIPROCESSOR
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
#endif
}
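/*
 * Usage example (illustrative only): the time-quantum bounds can be tuned
 * at run time with e.g.
 *
 *	sysctl -w kern.sched.maxts=200
 *	sysctl -w kern.sched.mints=20
 *
 * subject to the range checks in the handlers above.
 */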
/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	sched_info_lwp_t *sil;
	struct lwp *l;
	struct proc *p;
	int i;

	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		(*pr)("Run-queue (CPU = %d):\n", ci->ci_cpuid);
		(*pr)(" pid.lid = %d.%d, threads count = %u, "
		    "avgcount = %u, highest pri = %d\n",
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
		    ci_rq->r_count, ci_rq->r_avgcount, ci_rq->r_highest_pri);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %4s %11s %3s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "TS", "LWP", "CPU", "LRTIME");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			sil = l->l_sched_info;
			ci = l->l_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %4u %11p %3d "
			    "%u ST=%d RT=%d %d\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    sil->sl_timeslice, l, ci->ci_cpuid,
			    (u_int)(hardclock_ticks - sil->sl_lrtime),
			    sil->sl_slpsum, sil->sl_rtsum, sil->sl_flags);
		}
	}
}

#endif	/* defined(DDB) */