1 /* $OpenBSD: kern_sched.c,v 1.14 2009/10/05 17:43:08 deraadt Exp $ */ 2 /* 3 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <sys/param.h> 19 20 #include <sys/sched.h> 21 #include <sys/proc.h> 22 #include <sys/kthread.h> 23 #include <sys/systm.h> 24 #include <sys/resourcevar.h> 25 #include <sys/signalvar.h> 26 #include <sys/mutex.h> 27 #include <machine/atomic.h> 28 29 #include <uvm/uvm_extern.h> 30 31 #include <sys/malloc.h> 32 33 34 void sched_kthreads_create(void *); 35 void sched_idle(void *); 36 37 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); 38 struct proc *sched_steal_proc(struct cpu_info *); 39 40 /* 41 * To help choosing which cpu should run which process we keep track 42 * of cpus which are currently idle and which cpus have processes 43 * queued. 44 */ 45 struct cpuset sched_idle_cpus; 46 struct cpuset sched_queued_cpus; 47 48 /* 49 * A few notes about cpu_switchto that is implemented in MD code. 50 * 51 * cpu_switchto takes two arguments, the old proc and the proc 52 * it should switch to. The new proc will never be NULL, so we always have 53 * a saved state that we need to switch to. The old proc however can 54 * be NULL if the process is exiting. NULL for the old proc simply 55 * means "don't bother saving old state". 56 * 57 * cpu_switchto is supposed to atomically load the new state of the process 58 * including the pcb, pmap and setting curproc, the p_cpu pointer in the 59 * proc and p_stat to SONPROC. Atomically with respect to interrupts, other 60 * cpus in the system must not depend on this state being consistent. 61 * Therefore no locking is necessary in cpu_switchto other than blocking 62 * interrupts during the context switch. 63 */ 64 65 /* 66 * sched_init_cpu is called from main() for the boot cpu, then it's the 67 * responsibility of the MD code to call it for all other cpus. 68 */ 69 void 70 sched_init_cpu(struct cpu_info *ci) 71 { 72 struct schedstate_percpu *spc = &ci->ci_schedstate; 73 int i; 74 75 for (i = 0; i < SCHED_NQS; i++) 76 TAILQ_INIT(&spc->spc_qs[i]); 77 78 spc->spc_idleproc = NULL; 79 80 kthread_create_deferred(sched_kthreads_create, ci); 81 82 LIST_INIT(&spc->spc_deadproc); 83 84 /* 85 * Slight hack here until the cpuset code handles cpu_info 86 * structures. 87 */ 88 cpuset_init_cpu(ci); 89 } 90 91 void 92 sched_kthreads_create(void *v) 93 { 94 struct cpu_info *ci = v; 95 struct schedstate_percpu *spc = &ci->ci_schedstate; 96 static int num; 97 98 if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num)) 99 panic("fork idle"); 100 101 num++; 102 } 103 104 void 105 sched_idle(void *v) 106 { 107 struct schedstate_percpu *spc; 108 struct proc *p = curproc; 109 struct cpu_info *ci = v; 110 int s; 111 112 KERNEL_PROC_UNLOCK(p); 113 114 spc = &ci->ci_schedstate; 115 116 /* 117 * First time we enter here, we're not supposed to idle, 118 * just go away for a while. 119 */ 120 SCHED_LOCK(s); 121 cpuset_add(&sched_idle_cpus, ci); 122 p->p_stat = SSLEEP; 123 p->p_cpu = ci; 124 atomic_setbits_int(&p->p_flag, P_CPUPEG); 125 mi_switch(); 126 cpuset_del(&sched_idle_cpus, ci); 127 SCHED_UNLOCK(s); 128 129 KASSERT(ci == curcpu()); 130 KASSERT(curproc == spc->spc_idleproc); 131 132 while (1) { 133 while (!curcpu_is_idle()) { 134 struct proc *dead; 135 136 SCHED_LOCK(s); 137 p->p_stat = SSLEEP; 138 mi_switch(); 139 SCHED_UNLOCK(s); 140 141 while ((dead = LIST_FIRST(&spc->spc_deadproc))) { 142 LIST_REMOVE(dead, p_hash); 143 exit2(dead); 144 } 145 } 146 147 splassert(IPL_NONE); 148 149 cpuset_add(&sched_idle_cpus, ci); 150 cpu_idle_enter(); 151 while (spc->spc_whichqs == 0) 152 cpu_idle_cycle(); 153 cpu_idle_leave(); 154 cpuset_del(&sched_idle_cpus, ci); 155 } 156 } 157 158 /* 159 * To free our address space we have to jump through a few hoops. 160 * The freeing is done by the reaper, but until we have one reaper 161 * per cpu, we have no way of putting this proc on the deadproc list 162 * and waking up the reaper without risking having our address space and 163 * stack torn from under us before we manage to switch to another proc. 164 * Therefore we have a per-cpu list of dead processes where we put this 165 * proc and have idle clean up that list and move it to the reaper list. 166 * All this will be unnecessary once we can bind the reaper this cpu 167 * and not risk having it switch to another in case it sleeps. 168 */ 169 void 170 sched_exit(struct proc *p) 171 { 172 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 173 struct timeval tv; 174 struct proc *idle; 175 int s; 176 177 microuptime(&tv); 178 timersub(&tv, &spc->spc_runtime, &tv); 179 timeradd(&p->p_rtime, &tv, &p->p_rtime); 180 181 LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash); 182 183 /* This process no longer needs to hold the kernel lock. */ 184 KERNEL_PROC_UNLOCK(p); 185 186 SCHED_LOCK(s); 187 idle = spc->spc_idleproc; 188 idle->p_stat = SRUN; 189 cpu_switchto(NULL, idle); 190 panic("cpu_switchto returned"); 191 } 192 193 /* 194 * Run queue management. 195 */ 196 void 197 sched_init_runqueues(void) 198 { 199 #ifdef MULTIPROCESSOR 200 __mp_lock_init(&sched_lock); 201 #endif 202 } 203 204 void 205 setrunqueue(struct proc *p) 206 { 207 struct schedstate_percpu *spc; 208 int queue = p->p_priority >> 2; 209 210 SCHED_ASSERT_LOCKED(); 211 spc = &p->p_cpu->ci_schedstate; 212 spc->spc_nrun++; 213 214 TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); 215 spc->spc_whichqs |= (1 << queue); 216 cpuset_add(&sched_queued_cpus, p->p_cpu); 217 218 if (cpuset_isset(&sched_idle_cpus, p->p_cpu)) 219 cpu_unidle(p->p_cpu); 220 } 221 222 void 223 remrunqueue(struct proc *p) 224 { 225 struct schedstate_percpu *spc; 226 int queue = p->p_priority >> 2; 227 228 SCHED_ASSERT_LOCKED(); 229 spc = &p->p_cpu->ci_schedstate; 230 spc->spc_nrun--; 231 232 TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); 233 if (TAILQ_EMPTY(&spc->spc_qs[queue])) { 234 spc->spc_whichqs &= ~(1 << queue); 235 if (spc->spc_whichqs == 0) 236 cpuset_del(&sched_queued_cpus, p->p_cpu); 237 } 238 } 239 240 struct proc * 241 sched_chooseproc(void) 242 { 243 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 244 struct proc *p; 245 int queue; 246 247 SCHED_ASSERT_LOCKED(); 248 249 again: 250 if (spc->spc_whichqs) { 251 queue = ffs(spc->spc_whichqs) - 1; 252 p = TAILQ_FIRST(&spc->spc_qs[queue]); 253 remrunqueue(p); 254 } else if ((p = sched_steal_proc(curcpu())) == NULL) { 255 p = spc->spc_idleproc; 256 if (p == NULL) { 257 int s; 258 /* 259 * We get here if someone decides to switch during 260 * boot before forking kthreads, bleh. 261 * This is kind of like a stupid idle loop. 262 */ 263 #ifdef MULTIPROCESSOR 264 __mp_unlock(&sched_lock); 265 #endif 266 spl0(); 267 delay(10); 268 SCHED_LOCK(s); 269 goto again; 270 } 271 KASSERT(p); 272 p->p_stat = SRUN; 273 } 274 275 return (p); 276 } 277 278 uint64_t sched_nmigrations; 279 uint64_t sched_noidle; 280 uint64_t sched_stolen; 281 282 uint64_t sched_choose; 283 uint64_t sched_wasidle; 284 uint64_t sched_nomigrations; 285 286 struct cpu_info * 287 sched_choosecpu_fork(struct proc *parent, int flags) 288 { 289 struct cpu_info *choice = NULL; 290 fixpt_t load, best_load = ~0; 291 int run, best_run = INT_MAX; 292 struct cpu_info *ci; 293 struct cpuset set; 294 295 #if 0 296 /* 297 * XXX 298 * Don't do this until we have a painless way to move the cpu in exec. 299 * Preferably when nuking the old pmap and getting a new one on a 300 * new cpu. 301 */ 302 /* 303 * PPWAIT forks are simple. We know that the parent will not 304 * run until we exec and choose another cpu, so we just steal its 305 * cpu. 306 */ 307 if (flags & FORK_PPWAIT) 308 return (parent->p_cpu); 309 #endif 310 311 /* 312 * Look at all cpus that are currently idle and have nothing queued. 313 * If there are none, pick the one with least queued procs first, 314 * then the one with lowest load average. 315 */ 316 cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); 317 if (cpuset_first(&set) == NULL) 318 cpuset_add_all(&set); 319 320 while ((ci = cpuset_first(&set)) != NULL) { 321 cpuset_del(&set, ci); 322 323 load = ci->ci_schedstate.spc_ldavg; 324 run = ci->ci_schedstate.spc_nrun; 325 326 if (choice == NULL || run < best_run || 327 (run == best_run &&load < best_load)) { 328 choice = ci; 329 best_load = load; 330 best_run = run; 331 } 332 } 333 334 return (choice); 335 } 336 337 struct cpu_info * 338 sched_choosecpu(struct proc *p) 339 { 340 struct cpu_info *choice = NULL; 341 int last_cost = INT_MAX; 342 struct cpu_info *ci; 343 struct cpuset set; 344 345 /* 346 * If pegged to a cpu, don't allow it to move. 347 */ 348 if (p->p_flag & P_CPUPEG) 349 return (p->p_cpu); 350 351 sched_choose++; 352 353 /* 354 * Look at all cpus that are currently idle and have nothing queued. 355 * If there are none, pick the cheapest of those. 356 * (idle + queued could mean that the cpu is handling an interrupt 357 * at this moment and haven't had time to leave idle yet). 358 */ 359 cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); 360 361 /* 362 * First, just check if our current cpu is in that set, if it is, 363 * this is simple. 364 * Also, our cpu might not be idle, but if it's the current cpu 365 * and it has nothing else queued and we're curproc, take it. 366 */ 367 if (cpuset_isset(&set, p->p_cpu) || 368 (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 && 369 curproc == p)) { 370 sched_wasidle++; 371 return (p->p_cpu); 372 } 373 374 if (cpuset_first(&set) == NULL) 375 cpuset_add_all(&set); 376 377 while ((ci = cpuset_first(&set)) != NULL) { 378 int cost = sched_proc_to_cpu_cost(ci, p); 379 380 if (choice == NULL || cost < last_cost) { 381 choice = ci; 382 last_cost = cost; 383 } 384 cpuset_del(&set, ci); 385 } 386 387 if (p->p_cpu != choice) 388 sched_nmigrations++; 389 else 390 sched_nomigrations++; 391 392 return (choice); 393 } 394 395 /* 396 * Attempt to steal a proc from some cpu. 397 */ 398 struct proc * 399 sched_steal_proc(struct cpu_info *self) 400 { 401 struct schedstate_percpu *spc; 402 struct proc *best = NULL; 403 int bestcost = INT_MAX; 404 struct cpu_info *ci; 405 struct cpuset set; 406 407 cpuset_copy(&set, &sched_queued_cpus); 408 409 while ((ci = cpuset_first(&set)) != NULL) { 410 struct proc *p; 411 int queue; 412 int cost; 413 414 cpuset_del(&set, ci); 415 416 spc = &ci->ci_schedstate; 417 418 queue = ffs(spc->spc_whichqs) - 1; 419 TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { 420 if (p->p_flag & P_CPUPEG) 421 continue; 422 423 cost = sched_proc_to_cpu_cost(self, p); 424 425 if (best == NULL || cost < bestcost) { 426 best = p; 427 bestcost = cost; 428 } 429 } 430 } 431 if (best == NULL) 432 return (NULL); 433 434 spc = &best->p_cpu->ci_schedstate; 435 remrunqueue(best); 436 best->p_cpu = self; 437 438 sched_stolen++; 439 440 return (best); 441 } 442 443 /* 444 * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). 445 */ 446 static int 447 log2(unsigned int i) 448 { 449 int ret = 0; 450 451 while (i >>= 1) 452 ret++; 453 454 return (ret); 455 } 456 457 /* 458 * Calculate the cost of moving the proc to this cpu. 459 * 460 * What we want is some guesstimate of how much "performance" it will 461 * cost us to move the proc here. Not just for caches and TLBs and NUMA 462 * memory, but also for the proc itself. A highly loaded cpu might not 463 * be the best candidate for this proc since it won't get run. 464 * 465 * Just total guesstimates for now. 466 */ 467 468 int sched_cost_load = 1; 469 int sched_cost_priority = 1; 470 int sched_cost_runnable = 3; 471 int sched_cost_resident = 1; 472 473 int 474 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) 475 { 476 struct schedstate_percpu *spc; 477 int l2resident = 0; 478 int cost; 479 480 spc = &ci->ci_schedstate; 481 482 cost = 0; 483 484 /* 485 * First, account for the priority of the proc we want to move. 486 * More willing to move, the lower the priority of the destination 487 * and the higher the priority of the proc. 488 */ 489 if (!cpuset_isset(&sched_idle_cpus, ci)) { 490 cost += (p->p_priority - spc->spc_curpriority) * 491 sched_cost_priority; 492 cost += sched_cost_runnable; 493 } 494 if (cpuset_isset(&sched_queued_cpus, ci)) { 495 cost += spc->spc_nrun * sched_cost_runnable; 496 } 497 498 /* 499 * Higher load on the destination means we don't want to go there. 500 */ 501 cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); 502 503 /* 504 * If the proc is on this cpu already, lower the cost by how much 505 * it has been running and an estimate of its footprint. 506 */ 507 if (p->p_cpu == ci && p->p_slptime == 0) { 508 l2resident = 509 log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); 510 cost -= l2resident * sched_cost_resident; 511 } 512 513 return (cost); 514 } 515 516 /* 517 * Peg a proc to a cpu. 518 */ 519 void 520 sched_peg_curproc(struct cpu_info *ci) 521 { 522 struct proc *p = curproc; 523 int s; 524 525 SCHED_LOCK(s); 526 p->p_priority = p->p_usrpri; 527 p->p_stat = SRUN; 528 p->p_cpu = ci; 529 atomic_setbits_int(&p->p_flag, P_CPUPEG); 530 setrunqueue(p); 531 p->p_stats->p_ru.ru_nvcsw++; 532 mi_switch(); 533 SCHED_UNLOCK(s); 534 } 535 536 /* 537 * Functions to manipulate cpu sets. 538 */ 539 struct cpu_info *cpuset_infos[MAXCPUS]; 540 static struct cpuset cpuset_all; 541 542 void 543 cpuset_init_cpu(struct cpu_info *ci) 544 { 545 cpuset_add(&cpuset_all, ci); 546 cpuset_infos[CPU_INFO_UNIT(ci)] = ci; 547 } 548 549 void 550 cpuset_clear(struct cpuset *cs) 551 { 552 memset(cs, 0, sizeof(*cs)); 553 } 554 555 /* 556 * XXX - implement it on SP architectures too 557 */ 558 #ifndef CPU_INFO_UNIT 559 #define CPU_INFO_UNIT 0 560 #endif 561 562 void 563 cpuset_add(struct cpuset *cs, struct cpu_info *ci) 564 { 565 unsigned int num = CPU_INFO_UNIT(ci); 566 atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 567 } 568 569 void 570 cpuset_del(struct cpuset *cs, struct cpu_info *ci) 571 { 572 unsigned int num = CPU_INFO_UNIT(ci); 573 atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 574 } 575 576 int 577 cpuset_isset(struct cpuset *cs, struct cpu_info *ci) 578 { 579 unsigned int num = CPU_INFO_UNIT(ci); 580 return (cs->cs_set[num/32] & (1 << (num % 32))); 581 } 582 583 void 584 cpuset_add_all(struct cpuset *cs) 585 { 586 cpuset_copy(cs, &cpuset_all); 587 } 588 589 void 590 cpuset_copy(struct cpuset *to, struct cpuset *from) 591 { 592 memcpy(to, from, sizeof(*to)); 593 } 594 595 struct cpu_info * 596 cpuset_first(struct cpuset *cs) 597 { 598 int i; 599 600 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 601 if (cs->cs_set[i]) 602 return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]); 603 604 return (NULL); 605 } 606 607 void 608 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b) 609 { 610 int i; 611 612 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 613 to->cs_set[i] = a->cs_set[i] | b->cs_set[i]; 614 } 615 616 void 617 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b) 618 { 619 int i; 620 621 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 622 to->cs_set[i] = a->cs_set[i] & b->cs_set[i]; 623 } 624 625 void 626 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b) 627 { 628 int i; 629 630 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 631 to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i]; 632 } 633