/*	$OpenBSD: kern_sched.c,v 1.19 2010/04/23 21:34:38 deraadt Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of which cpus are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc, however, can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This only has to be atomic with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}
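/*
 * Per-cpu idle loop.  The first mi_switch() below only parks this thread
 * until the cpu really has nothing to run; after that we sit in
 * cpu_idle_enter()/cpu_idle_cycle()/cpu_idle_leave() until something
 * shows up on our run queue.  The idle proc also reaps dead procs that
 * sched_exit() left on spc_deadproc, and it sets SPCF_HALTED when asked
 * to halt via SPCF_SHOULDHALT so sched_stop_secondary_cpus() can wait
 * for this cpu to park.
 */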
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
			if (spc->spc_schedflags & SPCF_SHOULDHALT) {
				atomic_setbits_int(&spc->spc_schedflags,
				    SPCF_HALTED);
				wakeup(spc);
			}
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another cpu in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_PROC_UNLOCK(p);

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
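/*
 * Each cpu has SCHED_NQS run queues, one for every four priority levels
 * (p_priority >> 2).  spc_whichqs has one bit set per non-empty queue,
 * so the best queue can be found with ffs(), and sched_queued_cpus
 * tracks which cpus have anything queued at all.
 */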
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		p = spc->spc_idleproc;
		KASSERT(p);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}
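/*
 * Statistics about cpu selection and migration: how often a cpu was
 * chosen, how often the proc could simply stay where it was, how often
 * procs migrated between cpus (or stayed put), and how often a proc was
 * stolen from another cpu's run queue.  They are only ever incremented
 * here, never consumed; they exist so the behaviour can be inspected.
 */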
uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus.  Either way, pick the
	 * cpu with the fewest queued procs and break ties on the lowest
	 * load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int.  Returns 0 for 0 (yes, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
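/*
 * Rough example with the default weights above: a priority 50 proc
 * looking at a busy cpu that currently runs at priority 30, has two
 * procs queued and a load average of 1.0 is charged
 *	(50 - 30) * 1 + 3 + 2 * 3 + 1 = 30,
 * while an idle cpu with an empty queue only costs its load term,
 * minus the residency bonus if it happens to be the proc's current cpu.
 */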
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority of the destination and the higher the
	 * priority of the proc, the more willing we are to move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
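/*
 * A cpuset is a bitmask indexed by CPU_INFO_UNIT(ci): cpu "unit" lives
 * in bit (unit % 32) of word (unit / 32).  cpuset_infos[] maps a unit
 * number back to its cpu_info so cpuset_first() can return a pointer
 * instead of an index.
 */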
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}