/*	$OpenBSD: kern_sched.c,v 1.29 2013/06/03 16:55:22 guenther Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_UNLOCK();

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
					remrunqueue(p);
					p->p_cpu = sched_choosecpu(p);
					setrunqueue(p);
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		KASSERT(p->p_stat == SRUN);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with least queued procs first,
	 * then the one with lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We are more willing to move the lower the priority of the
	 * destination and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_add(&sched_all_cpus, ci);
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}