/* $OpenBSD: kern_sched.c,v 1.24 2011/10/12 18:30:09 miod Exp $ */
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


/* Deferred callback (see sched_init_cpu) that forks a cpu's idle kthread. */
void sched_kthreads_create(void *);

/* Heuristic cost of running proc `p' on cpu `ci'; lower is better. */
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
/* Pull a runnable, non-pegged proc off another cpu's run queue. */
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 *
 * sched_idle_cpus:   cpus currently inside the idle loop (sched_idle).
 * sched_queued_cpus: cpus with at least one proc on a run queue
 *                    (maintained by setrunqueue/remrunqueue).
 * sched_all_cpus:    cpus that may be chosen as a migration target.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting.
NULL for the old proc simply 54 * means "don't bother saving old state". 55 * 56 * cpu_switchto is supposed to atomically load the new state of the process 57 * including the pcb, pmap and setting curproc, the p_cpu pointer in the 58 * proc and p_stat to SONPROC. Atomically with respect to interrupts, other 59 * cpus in the system must not depend on this state being consistent. 60 * Therefore no locking is necessary in cpu_switchto other than blocking 61 * interrupts during the context switch. 62 */ 63 64 /* 65 * sched_init_cpu is called from main() for the boot cpu, then it's the 66 * responsibility of the MD code to call it for all other cpus. 67 */ 68 void 69 sched_init_cpu(struct cpu_info *ci) 70 { 71 struct schedstate_percpu *spc = &ci->ci_schedstate; 72 int i; 73 74 for (i = 0; i < SCHED_NQS; i++) 75 TAILQ_INIT(&spc->spc_qs[i]); 76 77 spc->spc_idleproc = NULL; 78 79 kthread_create_deferred(sched_kthreads_create, ci); 80 81 LIST_INIT(&spc->spc_deadproc); 82 83 /* 84 * Slight hack here until the cpuset code handles cpu_info 85 * structures. 86 */ 87 cpuset_init_cpu(ci); 88 cpuset_add(&sched_all_cpus, ci); 89 } 90 91 void 92 sched_kthreads_create(void *v) 93 { 94 struct cpu_info *ci = v; 95 struct schedstate_percpu *spc = &ci->ci_schedstate; 96 static int num; 97 98 if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num)) 99 panic("fork idle"); 100 101 num++; 102 } 103 104 void 105 sched_idle(void *v) 106 { 107 struct schedstate_percpu *spc; 108 struct proc *p = curproc; 109 struct cpu_info *ci = v; 110 int s; 111 112 KERNEL_UNLOCK(); 113 114 spc = &ci->ci_schedstate; 115 116 /* 117 * First time we enter here, we're not supposed to idle, 118 * just go away for a while. 
119 */ 120 SCHED_LOCK(s); 121 cpuset_add(&sched_idle_cpus, ci); 122 p->p_stat = SSLEEP; 123 p->p_cpu = ci; 124 atomic_setbits_int(&p->p_flag, P_CPUPEG); 125 mi_switch(); 126 cpuset_del(&sched_idle_cpus, ci); 127 SCHED_UNLOCK(s); 128 129 KASSERT(ci == curcpu()); 130 KASSERT(curproc == spc->spc_idleproc); 131 132 while (1) { 133 while (!curcpu_is_idle()) { 134 struct proc *dead; 135 136 SCHED_LOCK(s); 137 p->p_stat = SSLEEP; 138 mi_switch(); 139 SCHED_UNLOCK(s); 140 141 while ((dead = LIST_FIRST(&spc->spc_deadproc))) { 142 LIST_REMOVE(dead, p_hash); 143 exit2(dead); 144 } 145 } 146 147 splassert(IPL_NONE); 148 149 cpuset_add(&sched_idle_cpus, ci); 150 cpu_idle_enter(); 151 while (spc->spc_whichqs == 0) { 152 if (spc->spc_schedflags & SPCF_SHOULDHALT && 153 (spc->spc_schedflags & SPCF_HALTED) == 0) { 154 cpuset_del(&sched_idle_cpus, ci); 155 SCHED_LOCK(s); 156 atomic_setbits_int(&spc->spc_schedflags, 157 spc->spc_whichqs ? 0 : SPCF_HALTED); 158 SCHED_UNLOCK(s); 159 wakeup(spc); 160 } 161 cpu_idle_cycle(); 162 } 163 cpu_idle_leave(); 164 cpuset_del(&sched_idle_cpus, ci); 165 } 166 } 167 168 /* 169 * To free our address space we have to jump through a few hoops. 170 * The freeing is done by the reaper, but until we have one reaper 171 * per cpu, we have no way of putting this proc on the deadproc list 172 * and waking up the reaper without risking having our address space and 173 * stack torn from under us before we manage to switch to another proc. 174 * Therefore we have a per-cpu list of dead processes where we put this 175 * proc and have idle clean up that list and move it to the reaper list. 176 * All this will be unnecessary once we can bind the reaper this cpu 177 * and not risk having it switch to another in case it sleeps. 
178 */ 179 void 180 sched_exit(struct proc *p) 181 { 182 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 183 struct timeval tv; 184 struct proc *idle; 185 int s; 186 187 microuptime(&tv); 188 timersub(&tv, &spc->spc_runtime, &tv); 189 timeradd(&p->p_rtime, &tv, &p->p_rtime); 190 191 LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash); 192 193 /* This process no longer needs to hold the kernel lock. */ 194 KERNEL_UNLOCK(); 195 196 SCHED_LOCK(s); 197 idle = spc->spc_idleproc; 198 idle->p_stat = SRUN; 199 cpu_switchto(NULL, idle); 200 panic("cpu_switchto returned"); 201 } 202 203 /* 204 * Run queue management. 205 */ 206 void 207 sched_init_runqueues(void) 208 { 209 } 210 211 void 212 setrunqueue(struct proc *p) 213 { 214 struct schedstate_percpu *spc; 215 int queue = p->p_priority >> 2; 216 217 SCHED_ASSERT_LOCKED(); 218 spc = &p->p_cpu->ci_schedstate; 219 spc->spc_nrun++; 220 221 TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); 222 spc->spc_whichqs |= (1 << queue); 223 cpuset_add(&sched_queued_cpus, p->p_cpu); 224 225 if (cpuset_isset(&sched_idle_cpus, p->p_cpu)) 226 cpu_unidle(p->p_cpu); 227 } 228 229 void 230 remrunqueue(struct proc *p) 231 { 232 struct schedstate_percpu *spc; 233 int queue = p->p_priority >> 2; 234 235 SCHED_ASSERT_LOCKED(); 236 spc = &p->p_cpu->ci_schedstate; 237 spc->spc_nrun--; 238 239 TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); 240 if (TAILQ_EMPTY(&spc->spc_qs[queue])) { 241 spc->spc_whichqs &= ~(1 << queue); 242 if (spc->spc_whichqs == 0) 243 cpuset_del(&sched_queued_cpus, p->p_cpu); 244 } 245 } 246 247 struct proc * 248 sched_chooseproc(void) 249 { 250 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 251 struct proc *p; 252 int queue; 253 254 SCHED_ASSERT_LOCKED(); 255 256 if (spc->spc_schedflags & SPCF_SHOULDHALT) { 257 if (spc->spc_whichqs) { 258 for (queue = 0; queue < SCHED_NQS; queue++) { 259 TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { 260 remrunqueue(p); 261 p->p_cpu = sched_choosecpu(p); 262 setrunqueue(p); 263 
} 264 } 265 } 266 p = spc->spc_idleproc; 267 KASSERT(p); 268 KASSERT(p->p_wchan == NULL); 269 p->p_stat = SRUN; 270 return (p); 271 } 272 273 again: 274 if (spc->spc_whichqs) { 275 queue = ffs(spc->spc_whichqs) - 1; 276 p = TAILQ_FIRST(&spc->spc_qs[queue]); 277 remrunqueue(p); 278 KASSERT(p->p_stat == SRUN); 279 } else if ((p = sched_steal_proc(curcpu())) == NULL) { 280 p = spc->spc_idleproc; 281 if (p == NULL) { 282 int s; 283 /* 284 * We get here if someone decides to switch during 285 * boot before forking kthreads, bleh. 286 * This is kind of like a stupid idle loop. 287 */ 288 #ifdef MULTIPROCESSOR 289 __mp_unlock(&sched_lock); 290 #endif 291 spl0(); 292 delay(10); 293 SCHED_LOCK(s); 294 goto again; 295 } 296 KASSERT(p); 297 p->p_stat = SRUN; 298 } 299 300 KASSERT(p->p_wchan == NULL); 301 return (p); 302 } 303 304 uint64_t sched_nmigrations; 305 uint64_t sched_noidle; 306 uint64_t sched_stolen; 307 308 uint64_t sched_choose; 309 uint64_t sched_wasidle; 310 uint64_t sched_nomigrations; 311 312 struct cpu_info * 313 sched_choosecpu_fork(struct proc *parent, int flags) 314 { 315 struct cpu_info *choice = NULL; 316 fixpt_t load, best_load = ~0; 317 int run, best_run = INT_MAX; 318 struct cpu_info *ci; 319 struct cpuset set; 320 321 #if 0 322 /* 323 * XXX 324 * Don't do this until we have a painless way to move the cpu in exec. 325 * Preferably when nuking the old pmap and getting a new one on a 326 * new cpu. 327 */ 328 /* 329 * PPWAIT forks are simple. We know that the parent will not 330 * run until we exec and choose another cpu, so we just steal its 331 * cpu. 332 */ 333 if (flags & FORK_PPWAIT) 334 return (parent->p_cpu); 335 #endif 336 337 /* 338 * Look at all cpus that are currently idle and have nothing queued. 339 * If there are none, pick the one with least queued procs first, 340 * then the one with lowest load average. 
341 */ 342 cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); 343 if (cpuset_first(&set) == NULL) 344 cpuset_copy(&set, &sched_all_cpus); 345 346 while ((ci = cpuset_first(&set)) != NULL) { 347 cpuset_del(&set, ci); 348 349 load = ci->ci_schedstate.spc_ldavg; 350 run = ci->ci_schedstate.spc_nrun; 351 352 if (choice == NULL || run < best_run || 353 (run == best_run &&load < best_load)) { 354 choice = ci; 355 best_load = load; 356 best_run = run; 357 } 358 } 359 360 return (choice); 361 } 362 363 struct cpu_info * 364 sched_choosecpu(struct proc *p) 365 { 366 struct cpu_info *choice = NULL; 367 int last_cost = INT_MAX; 368 struct cpu_info *ci; 369 struct cpuset set; 370 371 /* 372 * If pegged to a cpu, don't allow it to move. 373 */ 374 if (p->p_flag & P_CPUPEG) 375 return (p->p_cpu); 376 377 sched_choose++; 378 379 /* 380 * Look at all cpus that are currently idle and have nothing queued. 381 * If there are none, pick the cheapest of those. 382 * (idle + queued could mean that the cpu is handling an interrupt 383 * at this moment and haven't had time to leave idle yet). 384 */ 385 cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); 386 387 /* 388 * First, just check if our current cpu is in that set, if it is, 389 * this is simple. 390 * Also, our cpu might not be idle, but if it's the current cpu 391 * and it has nothing else queued and we're curproc, take it. 
392 */ 393 if (cpuset_isset(&set, p->p_cpu) || 394 (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 && 395 curproc == p)) { 396 sched_wasidle++; 397 return (p->p_cpu); 398 } 399 400 if (cpuset_first(&set) == NULL) 401 cpuset_copy(&set, &sched_all_cpus); 402 403 while ((ci = cpuset_first(&set)) != NULL) { 404 int cost = sched_proc_to_cpu_cost(ci, p); 405 406 if (choice == NULL || cost < last_cost) { 407 choice = ci; 408 last_cost = cost; 409 } 410 cpuset_del(&set, ci); 411 } 412 413 if (p->p_cpu != choice) 414 sched_nmigrations++; 415 else 416 sched_nomigrations++; 417 418 return (choice); 419 } 420 421 /* 422 * Attempt to steal a proc from some cpu. 423 */ 424 struct proc * 425 sched_steal_proc(struct cpu_info *self) 426 { 427 struct schedstate_percpu *spc; 428 struct proc *best = NULL; 429 int bestcost = INT_MAX; 430 struct cpu_info *ci; 431 struct cpuset set; 432 433 cpuset_copy(&set, &sched_queued_cpus); 434 435 while ((ci = cpuset_first(&set)) != NULL) { 436 struct proc *p; 437 int queue; 438 int cost; 439 440 cpuset_del(&set, ci); 441 442 spc = &ci->ci_schedstate; 443 444 queue = ffs(spc->spc_whichqs) - 1; 445 TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { 446 if (p->p_flag & P_CPUPEG) 447 continue; 448 449 cost = sched_proc_to_cpu_cost(self, p); 450 451 if (best == NULL || cost < bestcost) { 452 best = p; 453 bestcost = cost; 454 } 455 } 456 } 457 if (best == NULL) 458 return (NULL); 459 460 spc = &best->p_cpu->ci_schedstate; 461 remrunqueue(best); 462 best->p_cpu = self; 463 464 sched_stolen++; 465 466 return (best); 467 } 468 469 /* 470 * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). 471 */ 472 static int 473 log2(unsigned int i) 474 { 475 int ret = 0; 476 477 while (i >>= 1) 478 ret++; 479 480 return (ret); 481 } 482 483 /* 484 * Calculate the cost of moving the proc to this cpu. 485 * 486 * What we want is some guesstimate of how much "performance" it will 487 * cost us to move the proc here. 
Not just for caches and TLBs and NUMA 488 * memory, but also for the proc itself. A highly loaded cpu might not 489 * be the best candidate for this proc since it won't get run. 490 * 491 * Just total guesstimates for now. 492 */ 493 494 int sched_cost_load = 1; 495 int sched_cost_priority = 1; 496 int sched_cost_runnable = 3; 497 int sched_cost_resident = 1; 498 499 int 500 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) 501 { 502 struct schedstate_percpu *spc; 503 int l2resident = 0; 504 int cost; 505 506 spc = &ci->ci_schedstate; 507 508 cost = 0; 509 510 /* 511 * First, account for the priority of the proc we want to move. 512 * More willing to move, the lower the priority of the destination 513 * and the higher the priority of the proc. 514 */ 515 if (!cpuset_isset(&sched_idle_cpus, ci)) { 516 cost += (p->p_priority - spc->spc_curpriority) * 517 sched_cost_priority; 518 cost += sched_cost_runnable; 519 } 520 if (cpuset_isset(&sched_queued_cpus, ci)) { 521 cost += spc->spc_nrun * sched_cost_runnable; 522 } 523 524 /* 525 * Higher load on the destination means we don't want to go there. 526 */ 527 cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); 528 529 /* 530 * If the proc is on this cpu already, lower the cost by how much 531 * it has been running and an estimate of its footprint. 532 */ 533 if (p->p_cpu == ci && p->p_slptime == 0) { 534 l2resident = 535 log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); 536 cost -= l2resident * sched_cost_resident; 537 } 538 539 return (cost); 540 } 541 542 /* 543 * Peg a proc to a cpu. 
544 */ 545 void 546 sched_peg_curproc(struct cpu_info *ci) 547 { 548 struct proc *p = curproc; 549 int s; 550 551 SCHED_LOCK(s); 552 p->p_priority = p->p_usrpri; 553 p->p_stat = SRUN; 554 p->p_cpu = ci; 555 atomic_setbits_int(&p->p_flag, P_CPUPEG); 556 setrunqueue(p); 557 p->p_stats->p_ru.ru_nvcsw++; 558 mi_switch(); 559 SCHED_UNLOCK(s); 560 } 561 562 #ifdef MULTIPROCESSOR 563 564 void 565 sched_start_secondary_cpus(void) 566 { 567 CPU_INFO_ITERATOR cii; 568 struct cpu_info *ci; 569 570 CPU_INFO_FOREACH(cii, ci) { 571 struct schedstate_percpu *spc = &ci->ci_schedstate; 572 573 if (CPU_IS_PRIMARY(ci)) 574 continue; 575 cpuset_add(&sched_all_cpus, ci); 576 atomic_clearbits_int(&spc->spc_schedflags, 577 SPCF_SHOULDHALT | SPCF_HALTED); 578 } 579 } 580 581 void 582 sched_stop_secondary_cpus(void) 583 { 584 CPU_INFO_ITERATOR cii; 585 struct cpu_info *ci; 586 587 /* 588 * Make sure we stop the secondary CPUs. 589 */ 590 CPU_INFO_FOREACH(cii, ci) { 591 struct schedstate_percpu *spc = &ci->ci_schedstate; 592 593 if (CPU_IS_PRIMARY(ci)) 594 continue; 595 cpuset_del(&sched_all_cpus, ci); 596 atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT); 597 } 598 CPU_INFO_FOREACH(cii, ci) { 599 struct schedstate_percpu *spc = &ci->ci_schedstate; 600 struct sleep_state sls; 601 602 if (CPU_IS_PRIMARY(ci)) 603 continue; 604 while ((spc->spc_schedflags & SPCF_HALTED) == 0) { 605 sleep_setup(&sls, spc, PZERO, "schedstate"); 606 sleep_finish(&sls, 607 (spc->spc_schedflags & SPCF_HALTED) == 0); 608 } 609 } 610 } 611 612 #endif 613 614 /* 615 * Functions to manipulate cpu sets. 
616 */ 617 struct cpu_info *cpuset_infos[MAXCPUS]; 618 static struct cpuset cpuset_all; 619 620 void 621 cpuset_init_cpu(struct cpu_info *ci) 622 { 623 cpuset_add(&cpuset_all, ci); 624 cpuset_infos[CPU_INFO_UNIT(ci)] = ci; 625 } 626 627 void 628 cpuset_clear(struct cpuset *cs) 629 { 630 memset(cs, 0, sizeof(*cs)); 631 } 632 633 void 634 cpuset_add(struct cpuset *cs, struct cpu_info *ci) 635 { 636 unsigned int num = CPU_INFO_UNIT(ci); 637 atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 638 } 639 640 void 641 cpuset_del(struct cpuset *cs, struct cpu_info *ci) 642 { 643 unsigned int num = CPU_INFO_UNIT(ci); 644 atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 645 } 646 647 int 648 cpuset_isset(struct cpuset *cs, struct cpu_info *ci) 649 { 650 unsigned int num = CPU_INFO_UNIT(ci); 651 return (cs->cs_set[num/32] & (1 << (num % 32))); 652 } 653 654 void 655 cpuset_add_all(struct cpuset *cs) 656 { 657 cpuset_copy(cs, &cpuset_all); 658 } 659 660 void 661 cpuset_copy(struct cpuset *to, struct cpuset *from) 662 { 663 memcpy(to, from, sizeof(*to)); 664 } 665 666 struct cpu_info * 667 cpuset_first(struct cpuset *cs) 668 { 669 int i; 670 671 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 672 if (cs->cs_set[i]) 673 return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]); 674 675 return (NULL); 676 } 677 678 void 679 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b) 680 { 681 int i; 682 683 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 684 to->cs_set[i] = a->cs_set[i] | b->cs_set[i]; 685 } 686 687 void 688 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b) 689 { 690 int i; 691 692 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 693 to->cs_set[i] = a->cs_set[i] & b->cs_set[i]; 694 } 695 696 void 697 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b) 698 { 699 int i; 700 701 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 702 to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i]; 703 } 704