/*	$OpenBSD: kern_sched.c,v 1.15 2009/11/25 11:01:14 kettenis Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>
#include <machine/atomic.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);
void sched_idle(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This only needs to be atomic with respect
 * to interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
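	 * cpuset_init_cpu() records this cpu in the global cpuset_all
	 * set and in the cpuset_infos[] table used by cpuset_first().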
	 */
	cpuset_init_cpu(ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		if (spc->spc_schedflags & SPCF_SHOULDHALT) {
			spc->spc_schedflags |= SPCF_HALTED;
			wakeup(spc);
		}

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0)
			cpu_idle_cycle();
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_PROC_UNLOCK(p);

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
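 *
 * Each cpu has its own array of SCHED_NQS run queues in its
 * schedstate_percpu. A proc goes on queue p_priority >> 2 and
 * spc_whichqs keeps one bit per non-empty queue, so the highest
 * priority runnable proc is found with ffs(spc_whichqs).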
 */
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		p = spc->spc_idleproc;
		KASSERT(p);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}

uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with least queued procs first,
	 * then the one with lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
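 *
 * For illustration only (assumed numbers): with the default weights
 * below, moving a proc with p_priority 50 to a non-idle cpu that is
 * running at spc_curpriority 30, has 2 procs queued and a load average
 * of 1.0 costs roughly (50 - 30) * 1 + 3 + 2 * 3 + 1 = 30.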
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority of the destination and the higher the
	 * priority of the proc, the more willing we are to move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * Functions to manipulate cpu sets.
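 *
 * A cpuset is a bitmask with one bit per cpu, indexed by
 * CPU_INFO_UNIT(ci) and stored in 32-bit words; cpuset_add() and
 * cpuset_del() use atomic bit operations on the containing word.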
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}