/*	$OpenBSD: kern_sched.c,v 1.13 2009/04/22 08:35:54 art Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>
#include <machine/atomic.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);
void sched_idle(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of cpus which are currently idle and of cpus which have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. The switch is atomic only with respect to
 * interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
}
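/*
 * Create the idle thread for this cpu. Called via the
 * kthread_create_deferred() hook registered in sched_init_cpu() once
 * it is safe to fork kernel threads.
 */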
void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0)
			cpu_idle_cycle();
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have the idle proc clean up that list and move it to the
 * reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

#ifdef MULTIPROCESSOR
	KASSERT(__mp_lock_held(&kernel_lock) == 0);
#endif

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}
/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}
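/*
 * Statistics on cpu choice, migration and proc stealing, kept for
 * debugging and tuning of the code below.
 */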
uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus instead. Out of the
	 * candidates, pick the one with the fewest queued procs first,
	 * then the one with the lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}
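/*
 * For example, log2(1) == 0, log2(8) == 3 and log2(1000) == 9; only
 * whole right-shifts are counted. This is used below to estimate a
 * proc's footprint from its resident page count.
 */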
/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We're more willing to move the lower the priority of the
	 * destination and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}
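/*
 * As a rough worked example with the default weights above: moving a
 * proc with p_priority 50 to a non-idle cpu whose current proc runs at
 * spc_curpriority 40, with two procs queued and a load average of 1.0,
 * costs (50 - 40) * 1 + 3 + 2 * 3 + 1 = 20. If the proc last ran on
 * that cpu and hasn't slept since, roughly 1000 resident pages take
 * log2(1000) * 1 = 9 off again.
 */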
/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
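/*
 * A sketch of the usual way these sets are consumed (this mirrors
 * sched_steal_proc() above): take a private copy and pull cpus off it
 * one at a time.
 *
 *	struct cpuset set;
 *	struct cpu_info *ci;
 *
 *	cpuset_copy(&set, &sched_queued_cpus);
 *	while ((ci = cpuset_first(&set)) != NULL) {
 *		cpuset_del(&set, ci);
 *		... inspect ci ...
 *	}
 */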