/*	$OpenBSD: kern_sched.c,v 1.33 2014/07/13 21:44:58 matthew Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of cpus which are currently idle and cpus which have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, set curproc and the p_cpu pointer in the
 * proc, and set p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_SIGHAND|FORK_IDLE, NULL, 0, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * The first time we enter here we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another cpu in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_UNLOCK();

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

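/*
 * Each cpu has SCHED_NQS run queues.  A proc maps to a queue by its
 * priority (queue = p_priority >> 2), so each queue covers four adjacent
 * priority levels.  spc_whichqs keeps one bit per non-empty queue, which
 * lets sched_chooseproc() use ffs() to find the lowest-numbered (i.e.
 * highest priority) queue with something runnable in it.
 */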
void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

/*
 * Pick the next proc for the current cpu to run: the highest priority
 * proc on our own run queues, failing that a proc stolen from another
 * cpu, failing that our idle proc.  If this cpu is being halted, first
 * push everything we have queued over to other cpus.
 */
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					p->p_cpu = sched_choosecpu(p);
					setrunqueue(p);
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		KASSERT(p->p_stat == SRUN);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

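/*
 * Pick a cpu for a newly forked proc.  Both this function and
 * sched_choosecpu() below prefer cpus that are idle with nothing queued;
 * note that cpuset_complement(to, a, b) computes b & ~a, so the call with
 * (&sched_queued_cpus, &sched_idle_cpus) yields exactly that set, which
 * is then restricted to sched_all_cpus.
 */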
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus.  Pick the one with the
	 * fewest queued procs first, then the one with the lowest load
	 * average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus and pick the cheapest.
	 * (Idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int.  Returns 0 for 0 (yes, yes, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We are more willing to move it the lower the priority of whatever
	 * is currently running on the destination and the higher the
	 * priority of the proc itself.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

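/*
 * As an illustration, with the default weights above: moving a proc with
 * p_priority 50 to a non-idle cpu whose curpriority is 30, with two procs
 * queued and a negligible load average, costs roughly
 * (50 - 30) * 1 + 3 + 2 * 3 = 29, minus the residency bonus if the proc
 * last ran on that cpu and hasn't slept since.
 */
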
/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_add(&sched_all_cpus, ci);
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}