/*	$OpenBSD: kern_sched.c,v 1.35 2014/09/24 09:13:20 mpi Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void	sched_kthreads_create(void *);

int	sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */
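/*
 * A rough caller-side sketch of the contract above, for illustration
 * only (the details live in mi_switch() and the MD code, not here):
 * with the scheduler lock held, the MI side picks the next proc and
 * hands both procs to cpu_switchto():
 *
 *	nextproc = sched_chooseproc();
 *	...
 *	cpu_switchto(p, nextproc);
 *
 * cpu_switchto() then saves p's state (skipped when p is NULL), loads
 * nextproc's pcb and pmap, and sets curproc, p_cpu and SONPROC, as
 * described above.
 */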
/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_SIGHAND|FORK_IDLE, NULL, 0, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_UNLOCK();

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}
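/*
 * A worked example of the queue mapping above, with illustrative
 * numbers (assuming SCHED_NQS == 32, i.e. four priority values per
 * queue): a proc with p_priority 50 goes on queue 50 >> 2 == 12 and
 * setrunqueue() sets bit 12 in spc_whichqs; once remrunqueue() empties
 * queue 12 the bit is cleared again, and when no bits remain the cpu
 * is dropped from sched_queued_cpus.
 */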
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					p->p_cpu = sched_choosecpu(p);
					KASSERT(p->p_cpu != curcpu());
					setrunqueue(p);
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}
#endif

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		KASSERT(p->p_stat == SRUN);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}
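/*
 * An illustrative note on the queue scan above: ffs() yields the
 * lowest set bit, so with spc_whichqs == 0x1010 (queues 4 and 12
 * non-empty, made-up numbers) ffs() returns 5 and queue 4, the
 * highest-priority non-empty queue, is served first.
 */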
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with the fewest queued procs first,
	 * then the one with the lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority of the destination and the higher the
	 * priority of the proc, the more willing we are to move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}
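/*
 * A worked example with made-up numbers, using the default weights
 * above: moving a proc with p_priority 60 to a non-idle cpu whose
 * spc_curpriority is 40 and which has two procs queued costs
 * (60 - 40) * sched_cost_priority + sched_cost_runnable +
 * 2 * sched_cost_runnable = 20 + 3 + 6 = 29, plus the scaled load
 * average term.  If the proc instead stays on the cpu it last ran on
 * (and hasn't slept) with about 1024 resident pages, that cpu's cost
 * is lowered by log2(1024) * sched_cost_resident = 10.
 */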
/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_add(&sched_all_cpus, ci);
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
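/*
 * A small worked example of the layout used by the cpuset routines
 * above (numbers are illustrative): a cpu with CPU_INFO_UNIT(ci) == 35
 * is tracked in word 35 / 32 == 1 at bit 35 % 32 == 3, so cpuset_add()
 * sets cs_set[1] |= 0x8.  cpuset_complement(to, a, b) keeps the cpus
 * that are in b but not in a, which is how sched_choosecpu() builds
 * its preferred set of cpus that are idle with nothing queued.
 */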