/*	$OpenBSD: kern_sched.c,v 1.23 2011/07/06 21:41:37 art Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments: the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc, however, can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch. (An illustrative sketch
 * follows sched_init_cpu below.)
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
        struct schedstate_percpu *spc = &ci->ci_schedstate;
        int i;

        for (i = 0; i < SCHED_NQS; i++)
                TAILQ_INIT(&spc->spc_qs[i]);

        spc->spc_idleproc = NULL;

        kthread_create_deferred(sched_kthreads_create, ci);

        LIST_INIT(&spc->spc_deadproc);

        /*
         * Slight hack here until the cpuset code handles cpu_info
         * structures.
         */
        cpuset_init_cpu(ci);
        cpuset_add(&sched_all_cpus, ci);
}
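/*
 * Illustrative only: a C-level sketch of the cpu_switchto contract
 * described above, not any real MD implementation (those are written
 * in assembly and differ per architecture). The md_save_context() and
 * md_load_context() helpers and the ci_curproc field name are
 * hypothetical, used purely to spell out the required ordering.
 */
#if 0
void
cpu_switchto(struct proc *old, struct proc *new)
{
        int s = splhigh();              /* block interrupts; no other locking */

        if (old != NULL)                /* NULL means "don't save old state" */
                md_save_context(old);   /* hypothetical: registers -> pcb */

        /* Load the new state; atomic only with respect to interrupts. */
        pmap_activate(new);             /* switch to new's pmap */
        new->p_cpu = curcpu();
        new->p_stat = SONPROC;
        curcpu()->ci_curproc = new;     /* hypothetical field: set curproc */

        md_load_context(new);           /* hypothetical: resume from new's pcb */
        splx(s);
}
#endif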
void
sched_kthreads_create(void *v)
{
        struct cpu_info *ci = v;
        struct schedstate_percpu *spc = &ci->ci_schedstate;
        static int num;

        if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
                panic("fork idle");

        num++;
}

void
sched_idle(void *v)
{
        struct schedstate_percpu *spc;
        struct proc *p = curproc;
        struct cpu_info *ci = v;
        int s;

        KERNEL_UNLOCK();

        spc = &ci->ci_schedstate;

        /*
         * First time we enter here, we're not supposed to idle,
         * just go away for a while.
         */
        SCHED_LOCK(s);
        cpuset_add(&sched_idle_cpus, ci);
        p->p_stat = SSLEEP;
        p->p_cpu = ci;
        atomic_setbits_int(&p->p_flag, P_CPUPEG);
        mi_switch();
        cpuset_del(&sched_idle_cpus, ci);
        SCHED_UNLOCK(s);

        KASSERT(ci == curcpu());
        KASSERT(curproc == spc->spc_idleproc);

        while (1) {
                while (!curcpu_is_idle()) {
                        struct proc *dead;

                        SCHED_LOCK(s);
                        p->p_stat = SSLEEP;
                        mi_switch();
                        SCHED_UNLOCK(s);

                        while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
                                LIST_REMOVE(dead, p_hash);
                                exit2(dead);
                        }
                }

                splassert(IPL_NONE);

                cpuset_add(&sched_idle_cpus, ci);
                cpu_idle_enter();
                while (spc->spc_whichqs == 0) {
                        if (spc->spc_schedflags & SPCF_SHOULDHALT &&
                            (spc->spc_schedflags & SPCF_HALTED) == 0) {
                                cpuset_del(&sched_idle_cpus, ci);
                                SCHED_LOCK(s);
                                atomic_setbits_int(&spc->spc_schedflags,
                                    spc->spc_whichqs ? 0 : SPCF_HALTED);
                                SCHED_UNLOCK(s);
                                wakeup(spc);
                        }
                        cpu_idle_cycle();
                }
                cpu_idle_leave();
                cpuset_del(&sched_idle_cpus, ci);
        }
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another cpu in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
        struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
        struct timeval tv;
        struct proc *idle;
        int s;

        microuptime(&tv);
        timersub(&tv, &spc->spc_runtime, &tv);
        timeradd(&p->p_rtime, &tv, &p->p_rtime);

        LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

        /* This process no longer needs to hold the kernel lock. */
        KERNEL_UNLOCK();

        SCHED_LOCK(s);
        idle = spc->spc_idleproc;
        idle->p_stat = SRUN;
        cpu_switchto(NULL, idle);
        panic("cpu_switchto returned");
}
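/*
 * Illustrative only: the dead-proc hand-off described above, step by
 * step. The exiting proc parks itself on the per-cpu list and switches
 * straight to the idle proc, which later feeds the list to the reaper.
 *
 *	exiting proc				idle proc
 *	------------				---------
 *	LIST_INSERT_HEAD(&spc_deadproc, p)
 *	cpu_switchto(NULL, idle)	-->	mi_switch() returns
 *						LIST_REMOVE(dead, p_hash)
 *						exit2(dead)  (reaper takes over)
 *	(stack and address space are now safe to tear down)
 */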
/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct proc *p)
{
        struct schedstate_percpu *spc;
        int queue = p->p_priority >> 2;

        SCHED_ASSERT_LOCKED();
        spc = &p->p_cpu->ci_schedstate;
        spc->spc_nrun++;

        TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
        spc->spc_whichqs |= (1 << queue);
        cpuset_add(&sched_queued_cpus, p->p_cpu);

        if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
                cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
        struct schedstate_percpu *spc;
        int queue = p->p_priority >> 2;

        SCHED_ASSERT_LOCKED();
        spc = &p->p_cpu->ci_schedstate;
        spc->spc_nrun--;

        TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
        if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
                spc->spc_whichqs &= ~(1 << queue);
                if (spc->spc_whichqs == 0)
                        cpuset_del(&sched_queued_cpus, p->p_cpu);
        }
}

struct proc *
sched_chooseproc(void)
{
        struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
        struct proc *p;
        int queue;

        SCHED_ASSERT_LOCKED();

        if (spc->spc_schedflags & SPCF_SHOULDHALT) {
                if (spc->spc_whichqs) {
                        for (queue = 0; queue < SCHED_NQS; queue++) {
                                TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
                                        remrunqueue(p);
                                        p->p_cpu = sched_choosecpu(p);
                                        setrunqueue(p);
                                }
                        }
                }
                p = spc->spc_idleproc;
                KASSERT(p);
                p->p_stat = SRUN;
                return (p);
        }

again:
        if (spc->spc_whichqs) {
                queue = ffs(spc->spc_whichqs) - 1;
                p = TAILQ_FIRST(&spc->spc_qs[queue]);
                remrunqueue(p);
        } else if ((p = sched_steal_proc(curcpu())) == NULL) {
                p = spc->spc_idleproc;
                if (p == NULL) {
                        int s;
                        /*
                         * We get here if someone decides to switch during
                         * boot before forking kthreads, bleh.
                         * This is kind of like a stupid idle loop.
                         */
#ifdef MULTIPROCESSOR
                        __mp_unlock(&sched_lock);
#endif
                        spl0();
                        delay(10);
                        SCHED_LOCK(s);
                        goto again;
                }
                KASSERT(p);
                p->p_stat = SRUN;
        }

        return (p);
}

/* Statistics on cpu choice and proc stealing. */
uint64_t sched_nmigrations;	/* chosen cpu differed from p_cpu */
uint64_t sched_noidle;
uint64_t sched_stolen;		/* procs taken from another cpu's queue */

uint64_t sched_choose;		/* calls to sched_choosecpu() */
uint64_t sched_wasidle;		/* proc kept its cpu via the fast path */
uint64_t sched_nomigrations;	/* chosen cpu was already p_cpu */
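/*
 * Illustrative only: how the spc_whichqs bitmask gives the O(1) queue
 * selection used in sched_chooseproc() above. Bit q is set iff
 * spc_qs[q] is non-empty, queues are indexed by p_priority >> 2 (so a
 * lower bit means a higher priority), and ffs() finds the lowest set
 * bit. This assumes SCHED_NQS queues fit in the 32-bit mask.
 */
#if 0
        /* After queueing procs with priorities 17 and 40: */
        spc->spc_whichqs == ((1 << (17 >> 2)) | (1 << (40 >> 2)));
                                                /* == (1 << 4) | (1 << 10) */
        queue = ffs(spc->spc_whichqs) - 1;      /* == 4, the best queue */
        p = TAILQ_FIRST(&spc->spc_qs[queue]);   /* the priority-17 proc */
#endif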
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
        struct cpu_info *choice = NULL;
        fixpt_t load, best_load = ~0;
        int run, best_run = INT_MAX;
        struct cpu_info *ci;
        struct cpuset set;

#if 0
        /*
         * XXX
         * Don't do this until we have a painless way to move the cpu in exec.
         * Preferably when nuking the old pmap and getting a new one on a
         * new cpu.
         */
        /*
         * PPWAIT forks are simple. We know that the parent will not
         * run until we exec and choose another cpu, so we just steal its
         * cpu.
         */
        if (flags & FORK_PPWAIT)
                return (parent->p_cpu);
#endif

        /*
         * Look at all cpus that are currently idle and have nothing queued;
         * if there are none, consider all cpus. Pick the one with the
         * fewest queued procs first, then the one with the lowest load
         * average.
         */
        cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
        if (cpuset_first(&set) == NULL)
                cpuset_copy(&set, &sched_all_cpus);

        while ((ci = cpuset_first(&set)) != NULL) {
                cpuset_del(&set, ci);

                load = ci->ci_schedstate.spc_ldavg;
                run = ci->ci_schedstate.spc_nrun;

                if (choice == NULL || run < best_run ||
                    (run == best_run && load < best_load)) {
                        choice = ci;
                        best_load = load;
                        best_run = run;
                }
        }

        return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
        struct cpu_info *choice = NULL;
        int last_cost = INT_MAX;
        struct cpu_info *ci;
        struct cpuset set;

        /*
         * If pegged to a cpu, don't allow it to move.
         */
        if (p->p_flag & P_CPUPEG)
                return (p->p_cpu);

        sched_choose++;

        /*
         * Look at all cpus that are currently idle and have nothing queued.
         * If there are none, fall back to all cpus and pick the cheapest.
         * (idle + queued could mean that the cpu is handling an interrupt
         * at this moment and hasn't had time to leave idle yet.)
         */
        cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

        /*
         * First, just check if our current cpu is in that set, if it is,
         * this is simple.
         * Also, our cpu might not be idle, but if it's the current cpu
         * and it has nothing else queued and we're curproc, take it.
         */
        if (cpuset_isset(&set, p->p_cpu) ||
            (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
            curproc == p)) {
                sched_wasidle++;
                return (p->p_cpu);
        }

        if (cpuset_first(&set) == NULL)
                cpuset_copy(&set, &sched_all_cpus);

        while ((ci = cpuset_first(&set)) != NULL) {
                int cost = sched_proc_to_cpu_cost(ci, p);

                if (choice == NULL || cost < last_cost) {
                        choice = ci;
                        last_cost = cost;
                }
                cpuset_del(&set, ci);
        }

        if (p->p_cpu != choice)
                sched_nmigrations++;
        else
                sched_nomigrations++;

        return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
        struct schedstate_percpu *spc;
        struct proc *best = NULL;
        int bestcost = INT_MAX;
        struct cpu_info *ci;
        struct cpuset set;

        cpuset_copy(&set, &sched_queued_cpus);

        while ((ci = cpuset_first(&set)) != NULL) {
                struct proc *p;
                int queue;
                int cost;

                cpuset_del(&set, ci);

                spc = &ci->ci_schedstate;

                queue = ffs(spc->spc_whichqs) - 1;
                TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
                        if (p->p_flag & P_CPUPEG)
                                continue;

                        cost = sched_proc_to_cpu_cost(self, p);

                        if (best == NULL || cost < bestcost) {
                                best = p;
                                bestcost = cost;
                        }
                }
        }
        if (best == NULL)
                return (NULL);

        spc = &best->p_cpu->ci_schedstate;
        remrunqueue(best);
        best->p_cpu = self;

        sched_stolen++;

        return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
        int ret = 0;

        while (i >>= 1)
                ret++;

        return (ret);
}
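/*
 * Illustrative only: log2() above computes floor(lg(i)), e.g.
 * log2(1) == 0, log2(1024) == 10, log2(1500) == 10. It is used in
 * sched_proc_to_cpu_cost() below to turn a pmap's resident page count
 * into a small, slowly-growing estimate of the proc's cache footprint.
 */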
/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now. (A worked example appears after the
 * MULTIPROCESSOR section below.)
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
        struct schedstate_percpu *spc;
        int l2resident = 0;
        int cost;

        spc = &ci->ci_schedstate;

        cost = 0;

        /*
         * First, account for the priority of the proc we want to move.
         * The lower the priority of the destination and the higher the
         * priority of the proc, the more willing we are to move it.
         */
        if (!cpuset_isset(&sched_idle_cpus, ci)) {
                cost += (p->p_priority - spc->spc_curpriority) *
                    sched_cost_priority;
                cost += sched_cost_runnable;
        }
        if (cpuset_isset(&sched_queued_cpus, ci)) {
                cost += spc->spc_nrun * sched_cost_runnable;
        }

        /*
         * Higher load on the destination means we don't want to go there.
         */
        cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

        /*
         * If the proc is on this cpu already, lower the cost by how much
         * it has been running and an estimate of its footprint.
         */
        if (p->p_cpu == ci && p->p_slptime == 0) {
                l2resident =
                    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
                cost -= l2resident * sched_cost_resident;
        }

        return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
        struct proc *p = curproc;
        int s;

        SCHED_LOCK(s);
        p->p_priority = p->p_usrpri;
        p->p_stat = SRUN;
        p->p_cpu = ci;
        atomic_setbits_int(&p->p_flag, P_CPUPEG);
        setrunqueue(p);
        p->p_stats->p_ru.ru_nvcsw++;
        mi_switch();
        SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        CPU_INFO_FOREACH(cii, ci) {
                struct schedstate_percpu *spc = &ci->ci_schedstate;

                if (CPU_IS_PRIMARY(ci))
                        continue;
                cpuset_add(&sched_all_cpus, ci);
                atomic_clearbits_int(&spc->spc_schedflags,
                    SPCF_SHOULDHALT | SPCF_HALTED);
        }
}

void
sched_stop_secondary_cpus(void)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        /*
         * Make sure we stop the secondary CPUs.
         */
        CPU_INFO_FOREACH(cii, ci) {
                struct schedstate_percpu *spc = &ci->ci_schedstate;

                if (CPU_IS_PRIMARY(ci))
                        continue;
                cpuset_del(&sched_all_cpus, ci);
                atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
        }
        CPU_INFO_FOREACH(cii, ci) {
                struct schedstate_percpu *spc = &ci->ci_schedstate;
                struct sleep_state sls;

                if (CPU_IS_PRIMARY(ci))
                        continue;
                while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
                        sleep_setup(&sls, spc, PZERO, "schedstate");
                        sleep_finish(&sls,
                            (spc->spc_schedflags & SPCF_HALTED) == 0);
                }
        }
}

#endif
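/*
 * Illustrative only: a worked example of sched_proc_to_cpu_cost() with
 * the default weights. Say ci is busy (not in the idle set) with two
 * procs queued, spc_curpriority 30, spc_ldavg about 1.0 in fixpt form
 * (1 << FSHIFT), and we ask about a proc with p_priority 50 that last
 * ran elsewhere:
 *
 *	cost  = (50 - 30) * 1			priority mismatch   = 20
 *	      + 3				destination is busy =  3
 *	      + 2 * 3				queued procs        =  6
 *	      + (1 * (1 << FSHIFT)) >> FSHIFT	load average        =  1
 *	      = 30
 *
 * An idle cpu with an empty queue only pays the load-average term, so
 * it wins unless the proc still has a warm footprint on its old cpu.
 */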
/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
        cpuset_add(&cpuset_all, ci);
        cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
        memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
        unsigned int num = CPU_INFO_UNIT(ci);

        atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
        unsigned int num = CPU_INFO_UNIT(ci);

        atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
        unsigned int num = CPU_INFO_UNIT(ci);

        return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
        cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
        memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
        int i;

        for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
                if (cs->cs_set[i])
                        return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

        return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
        int i;

        for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
                to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
        int i;

        for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
                to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

/*
 * to = b & ~a, i.e. the members of b that are not in a.
 */
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
        int i;

        for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
                to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
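/*
 * Illustrative only: the destructive-iteration idiom the scheduler
 * uses with these sets (a cpuset is just one bitmask word per 32 cpus,
 * so copies are cheap and iteration runs in cpu-number order).
 */
#if 0
        struct cpuset set;
        struct cpu_info *ci;

        /* Walk every cpu that is idle and has nothing queued. */
        cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
        while ((ci = cpuset_first(&set)) != NULL) {
                cpuset_del(&set, ci);   /* consume the copy, not the original */
                /* ... examine ci ... */
        }
#endif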