/*	$OpenBSD: kern_sched.c,v 1.12 2009/04/20 08:48:17 art Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>
#include <machine/atomic.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>

void sched_kthreads_create(void *);
void sched_idle(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */
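
/*
 * Illustrative sketch only (pseudocode, not the MD implementation):
 * based on the notes above, cpu_switchto(old, new) is expected to do
 * roughly the following, with interrupts blocked for the whole sequence:
 *
 *	if (old != NULL)
 *		save old's register state into its pcb;
 *	new->p_cpu = this cpu;
 *	new->p_stat = SONPROC;
 *	make new the curproc of this cpu;
 *	activate new's pmap and resume from its saved pcb;
 *
 * The steps above are descriptive pseudocode, not MD identifiers.
 */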

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0)
			cpu_idle_cycle();
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

#ifdef MULTIPROCESSOR
	KASSERT(__mp_lock_held(&kernel_lock) == 0);
#endif

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}
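
/*
 * Worked example (illustrative; assumes SCHED_NQS == 32 run queues):
 * a proc with p_priority 50 maps to queue 50 >> 2 == 12, so
 * setrunqueue() appends it to spc_qs[12] and sets bit 12 in
 * spc_whichqs.  sched_chooseproc() below picks the lowest set bit
 * with ffs(), i.e. the non-empty queue with the numerically lowest
 * (and thus strongest) priority.
 */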

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}

uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with least queued procs first,
	 * then the one with lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We are more willing to move the lower the priority of the
	 * destination and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}
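
/*
 * Worked example (illustrative numbers only, using the default weights
 * above): moving a proc with p_priority 60 to a non-idle cpu that runs
 * at spc_curpriority 40, has two procs queued and a load average of
 * 1.0 costs roughly (60 - 40) * 1 + 3 + 2 * 3 + 1 == 30.  An idle cpu
 * with nothing queued only pays the load term, and if the proc is
 * still warm on the candidate cpu the resident-set bonus is subtracted
 * on top of that.
 */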

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
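
/*
 * Layout note (illustrative; assumes the cs_set[] words are 32 bits
 * wide, as the num / 32 and num % 32 arithmetic above implies):
 * cpu unit 5 lives in word 0 as bit 5, cpu unit 37 lives in word 1 as
 * bit 5.  cpuset_first() scans the words in order and uses ffs(), so
 * it returns the member with the lowest unit number, e.g. the cpu for
 * unit 5 before the one for unit 37.
 */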