/*	$OpenBSD: kern_sched.c,v 1.30 2013/06/06 13:09:37 haesbaert Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choosing which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
	cpuset_add(&sched_all_cpus, ci);
}
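
/*
 * The per-cpu run queues initialized above are indexed by priority; the
 * numbers here assume the usual SCHED_NQS of 32 and priorities 0-127.
 * setrunqueue() below derives the queue index from the proc's priority,
 * so four adjacent priorities share one queue:
 *
 *	queue = p->p_priority >> 2;	e.g. priority 50 -> queue 12
 *
 * which lets sched_chooseproc() find the highest-priority (lowest
 * numbered) non-empty queue with a single ffs(spc_whichqs) - 1.
 */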

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, 0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SIGHAND|FORK_IDLE, NULL, 0, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/*
	 * Mark it as a system process.
	 */
	atomic_setbits_int(&spc->spc_idleproc->p_flag, P_SYSTEM);

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_UNLOCK();

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}
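
/*
 * In rough outline, the late-exit handoff implemented by sched_exit()
 * and the dead proc loop in sched_idle() above works like this:
 *
 *	exiting proc			idle proc (sched_idle)
 *	------------			----------------------
 *	sched_exit(p)
 *	  put p on spc_deadproc
 *	  cpu_switchto(NULL, idle) -->	dead = LIST_FIRST(&spc->spc_deadproc)
 *					exit2(dead)	(reaper frees it)
 *
 * Since the old proc argument to cpu_switchto is NULL, nothing is written
 * back into the dying proc's state once the switch has started.
 */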

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
					remrunqueue(p);
					p->p_cpu = sched_choosecpu(p);
					setrunqueue(p);
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		KASSERT(p->p_stat == SRUN);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}
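
/*
 * Both cpu-choosing functions below start from the set of cpus that are
 * idle and have nothing queued.  With cpuset_complement(to, a, b)
 * computing "b & ~a", a made-up example with four cpus:
 *
 *	sched_idle_cpus   = 0b1011	cpus 0, 1 and 3 are idle
 *	sched_queued_cpus = 0b0010	cpu 1 has something queued
 *	complement        = 0b1001	cpus 0 and 3: idle, nothing queued
 *
 * The intersection with sched_all_cpus then drops any cpus halted by
 * sched_stop_secondary_cpus().
 */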

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the one with
	 * the fewest queued procs first, then the one with the lowest load
	 * average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the cheapest one.
	 * (idle + queued can happen when the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}
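
/*
 * A worked example of the cost estimate computed by
 * sched_proc_to_cpu_cost() below, using the default weights and made-up
 * numbers: the candidate cpu is busy at curpriority 30 with two procs
 * queued and a load average of about 1, and the proc being costed has
 * p_priority 50 and last ran there with roughly 1024 resident pages:
 *
 *	not idle, priority:	(50 - 30) * 1 + 3	= 23
 *	queued procs:		+ 2 * 3			= 29
 *	load average:		+ (ldavg >> FSHIFT) ~ 1	= 30
 *	recent residency:	- log2(1024) * 1	= 20
 *
 * Lower is better, so recent residency can outweigh a moderate load.
 */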

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We're more willing to move the lower the priority of the
	 * destination and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_add(&sched_all_cpus, ci);
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif
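
/*
 * The halt handshake used by sched_stop_secondary_cpus() above: setting
 * SPCF_SHOULDHALT makes sched_chooseproc() push any queued procs to other
 * cpus and hand back the idle proc; the idle loop in sched_idle() then
 * sets SPCF_HALTED and does wakeup(spc), which is what the sleep_setup()/
 * sleep_finish() loop above is waiting for.
 */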

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
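
/*
 * The cpuset representation used above is a plain bitmap: cpu "num"
 * (its CPU_INFO_UNIT) lives in 32-bit word num / 32, bit num % 32, and
 * cpuset_infos[] maps unit numbers back to cpu_info pointers.  For
 * example, a cpu with unit number 35 is bit 3 of word 1, so
 * cpuset_first() returns cpuset_infos[1 * 32 + ffs(1 << 3) - 1], i.e.
 * cpuset_infos[35].
 */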