1 /* $OpenBSD: kern_sched.c,v 1.10 2009/04/03 09:29:15 art Exp $ */ 2 /* 3 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <sys/param.h> 19 20 #include <sys/sched.h> 21 #include <sys/proc.h> 22 #include <sys/kthread.h> 23 #include <sys/systm.h> 24 #include <sys/resourcevar.h> 25 #include <sys/signalvar.h> 26 #include <sys/mutex.h> 27 #include <machine/atomic.h> 28 29 #include <uvm/uvm_extern.h> 30 31 #include <sys/malloc.h> 32 33 34 void sched_kthreads_create(void *); 35 void sched_idle(void *); 36 37 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); 38 struct proc *sched_steal_proc(struct cpu_info *); 39 40 /* 41 * To help choosing which cpu should run which process we keep track 42 * of cpus which are currently idle and which cpus have processes 43 * queued. 44 */ 45 struct cpuset sched_idle_cpus; 46 struct cpuset sched_queued_cpus; 47 48 /* 49 * A few notes about cpu_switchto that is implemented in MD code. 50 * 51 * cpu_switchto takes two arguments, the old proc and the proc 52 * it should switch to. The new proc will never be NULL, so we always have 53 * a saved state that we need to switch to. The old proc however can 54 * be NULL if the process is exiting. NULL for the old proc simply 55 * means "don't bother saving old state". 56 * 57 * cpu_switchto is supposed to atomically load the new state of the process 58 * including the pcb, pmap and setting curproc, the p_cpu pointer in the 59 * proc and p_stat to SONPROC. Atomically with respect to interrupts, other 60 * cpus in the system must not depend on this state being consistent. 61 * Therefore no locking is necessary in cpu_switchto other than blocking 62 * interrupts during the context switch. 63 */ 64 65 /* 66 * sched_init_cpu is called from main() for the boot cpu, then it's the 67 * responsibility of the MD code to call it for all other cpus. 68 */ 69 void 70 sched_init_cpu(struct cpu_info *ci) 71 { 72 struct schedstate_percpu *spc = &ci->ci_schedstate; 73 int i; 74 75 for (i = 0; i < SCHED_NQS; i++) 76 TAILQ_INIT(&spc->spc_qs[i]); 77 78 spc->spc_idleproc = NULL; 79 80 kthread_create_deferred(sched_kthreads_create, ci); 81 82 LIST_INIT(&spc->spc_deadproc); 83 84 /* 85 * Slight hack here until the cpuset code handles cpu_info 86 * structures. 87 */ 88 cpuset_init_cpu(ci); 89 } 90 91 void 92 sched_kthreads_create(void *v) 93 { 94 struct cpu_info *ci = v; 95 struct schedstate_percpu *spc = &ci->ci_schedstate; 96 static int num; 97 98 if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num)) 99 panic("fork idle"); 100 101 num++; 102 } 103 104 void 105 sched_idle(void *v) 106 { 107 struct schedstate_percpu *spc; 108 struct proc *p = curproc; 109 struct cpu_info *ci = v; 110 int s; 111 112 KERNEL_PROC_UNLOCK(p); 113 114 spc = &ci->ci_schedstate; 115 116 /* 117 * First time we enter here, we're not supposed to idle, 118 * just go away for a while. 119 */ 120 SCHED_LOCK(s); 121 cpuset_add(&sched_idle_cpus, ci); 122 p->p_stat = SSLEEP; 123 mi_switch(); 124 cpuset_del(&sched_idle_cpus, ci); 125 SCHED_UNLOCK(s); 126 127 KASSERT(ci == curcpu()); 128 KASSERT(curproc == spc->spc_idleproc); 129 130 while (1) { 131 while (!curcpu_is_idle()) { 132 struct proc *dead; 133 134 SCHED_LOCK(s); 135 p->p_stat = SSLEEP; 136 mi_switch(); 137 SCHED_UNLOCK(s); 138 139 while ((dead = LIST_FIRST(&spc->spc_deadproc))) { 140 LIST_REMOVE(dead, p_hash); 141 exit2(dead); 142 } 143 } 144 145 splassert(IPL_NONE); 146 147 cpuset_add(&sched_idle_cpus, ci); 148 cpu_idle_enter(); 149 while (spc->spc_whichqs == 0) 150 cpu_idle_cycle(); 151 cpu_idle_leave(); 152 cpuset_del(&sched_idle_cpus, ci); 153 } 154 } 155 156 /* 157 * To free our address space we have to jump through a few hoops. 158 * The freeing is done by the reaper, but until we have one reaper 159 * per cpu, we have no way of putting this proc on the deadproc list 160 * and waking up the reaper without risking having our address space and 161 * stack torn from under us before we manage to switch to another proc. 162 * Therefore we have a per-cpu list of dead processes where we put this 163 * proc and have idle clean up that list and move it to the reaper list. 164 * All this will be unnecessary once we can bind the reaper this cpu 165 * and not risk having it switch to another in case it sleeps. 166 */ 167 void 168 sched_exit(struct proc *p) 169 { 170 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 171 struct timeval tv; 172 struct proc *idle; 173 int s; 174 175 microuptime(&tv); 176 timersub(&tv, &spc->spc_runtime, &tv); 177 timeradd(&p->p_rtime, &tv, &p->p_rtime); 178 179 LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash); 180 181 #ifdef MULTIPROCESSOR 182 KASSERT(__mp_lock_held(&kernel_lock) == 0); 183 #endif 184 185 SCHED_LOCK(s); 186 idle = spc->spc_idleproc; 187 idle->p_stat = SRUN; 188 cpu_switchto(NULL, idle); 189 panic("cpu_switchto returned"); 190 } 191 192 /* 193 * Run queue management. 194 */ 195 void 196 sched_init_runqueues(void) 197 { 198 #ifdef MULTIPROCESSOR 199 __mp_lock_init(&sched_lock); 200 #endif 201 } 202 203 void 204 setrunqueue(struct proc *p) 205 { 206 struct schedstate_percpu *spc; 207 int queue = p->p_priority >> 2; 208 209 SCHED_ASSERT_LOCKED(); 210 sched_choosecpu(p); 211 spc = &p->p_cpu->ci_schedstate; 212 spc->spc_nrun++; 213 214 TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); 215 spc->spc_whichqs |= (1 << queue); 216 cpuset_add(&sched_queued_cpus, p->p_cpu); 217 218 if (p->p_cpu != curcpu()) 219 cpu_unidle(p->p_cpu); 220 } 221 222 void 223 remrunqueue(struct proc *p) 224 { 225 struct schedstate_percpu *spc; 226 int queue = p->p_priority >> 2; 227 228 SCHED_ASSERT_LOCKED(); 229 spc = &p->p_cpu->ci_schedstate; 230 spc->spc_nrun--; 231 232 TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); 233 if (TAILQ_EMPTY(&spc->spc_qs[queue])) { 234 spc->spc_whichqs &= ~(1 << queue); 235 if (spc->spc_whichqs == 0) 236 cpuset_del(&sched_queued_cpus, p->p_cpu); 237 } 238 } 239 240 struct proc * 241 sched_chooseproc(void) 242 { 243 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 244 struct proc *p; 245 int queue; 246 247 SCHED_ASSERT_LOCKED(); 248 249 again: 250 if (spc->spc_whichqs) { 251 queue = ffs(spc->spc_whichqs) - 1; 252 p = TAILQ_FIRST(&spc->spc_qs[queue]); 253 remrunqueue(p); 254 } else if ((p = sched_steal_proc(curcpu())) == NULL) { 255 p = spc->spc_idleproc; 256 if (p == NULL) { 257 int s; 258 /* 259 * We get here if someone decides to switch during 260 * boot before forking kthreads, bleh. 261 * This is kind of like a stupid idle loop. 262 */ 263 #ifdef MULTIPROCESSOR 264 __mp_unlock(&sched_lock); 265 #endif 266 spl0(); 267 delay(10); 268 SCHED_LOCK(s); 269 goto again; 270 } 271 KASSERT(p); 272 p->p_stat = SRUN; 273 } 274 275 return (p); 276 } 277 278 uint64_t sched_nmigrations; 279 uint64_t sched_noidle; 280 uint64_t sched_stolen; 281 282 uint64_t sched_choose; 283 uint64_t sched_wasidle; 284 uint64_t sched_nomigrations; 285 286 void 287 sched_choosecpu(struct proc *p) 288 { 289 struct cpu_info *choice = NULL; 290 int last_cost = INT_MAX; 291 struct cpu_info *ci; 292 struct cpuset set; 293 294 /* 295 * If pegged to a cpu, don't allow it to move. 296 */ 297 if (p->p_flag & P_CPUPEG) 298 return; 299 300 sched_choose++; 301 302 /* 303 * The simplest case. Our cpu of choice was idle. This happens 304 * when we were sleeping and something woke us up. 305 * 306 * We also need to check sched_queued_cpus to make sure that 307 * we're not thundering herding one cpu that hasn't managed to 308 * get out of the idle loop yet. 309 */ 310 if (p->p_cpu && cpuset_isset(&sched_idle_cpus, p->p_cpu) && 311 !cpuset_isset(&sched_queued_cpus, p->p_cpu)) { 312 sched_wasidle++; 313 return; 314 } 315 316 #if 0 317 318 /* Most likely, this is broken. don't do it. */ 319 /* 320 * Second case. (shouldn't be necessary in the future) 321 * If our cpu is not idle, but has nothing else queued (which 322 * means that we are curproc and roundrobin asks us to reschedule). 323 */ 324 if (p->p_cpu && p->p_cpu->ci_schedstate.spc_nrun == 0) 325 return; 326 #endif 327 328 /* 329 * Look at all cpus that are currently idle. Pick the cheapest of 330 * those. 331 */ 332 cpuset_copy(&set, &sched_idle_cpus); 333 while ((ci = cpuset_first(&set)) != NULL) { 334 int cost = sched_proc_to_cpu_cost(ci, p); 335 336 if (choice == NULL || cost < last_cost) { 337 choice = ci; 338 last_cost = cost; 339 } 340 cpuset_del(&set, ci); 341 } 342 343 /* 344 * All cpus are busy. Pick one. 345 */ 346 if (choice == NULL) { 347 CPU_INFO_ITERATOR cii; 348 349 sched_noidle++; 350 351 /* 352 * Not curproc, pick the cpu with the lowest cost to switch to. 353 */ 354 CPU_INFO_FOREACH(cii, ci) { 355 int cost = sched_proc_to_cpu_cost(ci, p); 356 357 if (choice == NULL || cost < last_cost) { 358 choice = ci; 359 last_cost = cost; 360 } 361 } 362 } 363 364 KASSERT(choice); 365 366 if (p->p_cpu && p->p_cpu != choice) 367 sched_nmigrations++; 368 else if (p->p_cpu != NULL) 369 sched_nomigrations++; 370 371 p->p_cpu = choice; 372 } 373 374 /* 375 * Attempt to steal a proc from some cpu. 376 */ 377 struct proc * 378 sched_steal_proc(struct cpu_info *self) 379 { 380 struct schedstate_percpu *spc; 381 struct proc *best = NULL; 382 int bestcost = INT_MAX; 383 struct cpu_info *ci; 384 struct cpuset set; 385 386 cpuset_copy(&set, &sched_queued_cpus); 387 388 while ((ci = cpuset_first(&set)) != NULL) { 389 struct proc *p; 390 int cost; 391 392 cpuset_del(&set, ci); 393 394 spc = &ci->ci_schedstate; 395 396 p = TAILQ_FIRST(&spc->spc_qs[ffs(spc->spc_whichqs) - 1]); 397 KASSERT(p); 398 cost = sched_proc_to_cpu_cost(self, p); 399 400 if (best == NULL || cost < bestcost) { 401 best = p; 402 bestcost = cost; 403 } 404 } 405 if (best == NULL) 406 return (NULL); 407 408 spc = &best->p_cpu->ci_schedstate; 409 remrunqueue(best); 410 best->p_cpu = self; 411 412 sched_stolen++; 413 414 return (best); 415 } 416 417 /* 418 * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). 419 */ 420 static int 421 log2(unsigned int i) 422 { 423 int ret = 0; 424 425 while (i >>= 1) 426 ret++; 427 428 return (ret); 429 } 430 431 /* 432 * Calculate the cost of moving the proc to this cpu. 433 * 434 * What we want is some guesstimate of how much "performance" it will 435 * cost us to move the proc here. Not just for caches and TLBs and NUMA 436 * memory, but also for the proc itself. A highly loaded cpu might not 437 * be the best candidate for this proc since it won't get run. 438 * 439 * Just total guesstimates for now. 440 */ 441 442 int sched_cost_load = 1; 443 int sched_cost_priority = 1; 444 int sched_cost_runnable = 3; 445 int sched_cost_resident = 1; 446 447 int 448 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) 449 { 450 struct schedstate_percpu *spc; 451 int l2resident = 0; 452 int cost; 453 454 spc = &ci->ci_schedstate; 455 456 cost = 0; 457 458 /* 459 * First, account for the priority of the proc we want to move. 460 * More willing to move, the lower the priority of the destination 461 * and the higher the priority of the proc. 462 */ 463 if (!cpuset_isset(&sched_idle_cpus, ci)) { 464 cost += (p->p_priority - spc->spc_curpriority) * 465 sched_cost_priority; 466 cost += sched_cost_runnable; 467 } 468 if (cpuset_isset(&sched_queued_cpus, ci)) { 469 cost += spc->spc_nrun * sched_cost_runnable; 470 } 471 472 /* 473 * Higher load on the destination means we don't want to go there. 474 */ 475 cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); 476 477 /* 478 * If the proc is on this cpu already, lower the cost by how much 479 * it has been running and an estimate of its footprint. 480 */ 481 if (p->p_cpu == ci && p->p_slptime == 0) { 482 l2resident = 483 log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); 484 cost -= l2resident * sched_cost_resident; 485 } 486 487 return (cost); 488 } 489 490 /* 491 * Peg a proc to a cpu. 492 */ 493 void 494 sched_peg_curproc(struct cpu_info *ci) 495 { 496 struct proc *p = curproc; 497 int s; 498 499 SCHED_LOCK(s); 500 p->p_priority = p->p_usrpri; 501 p->p_stat = SRUN; 502 p->p_cpu = ci; 503 atomic_setbits_int(&p->p_flag, P_CPUPEG); 504 setrunqueue(p); 505 p->p_stats->p_ru.ru_nvcsw++; 506 mi_switch(); 507 SCHED_UNLOCK(s); 508 } 509 510 /* 511 * Functions to manipulate cpu sets. 512 */ 513 struct cpu_info *cpuset_infos[MAXCPUS]; 514 static struct cpuset cpuset_all; 515 516 void 517 cpuset_init_cpu(struct cpu_info *ci) 518 { 519 cpuset_add(&cpuset_all, ci); 520 cpuset_infos[CPU_INFO_UNIT(ci)] = ci; 521 } 522 523 void 524 cpuset_clear(struct cpuset *cs) 525 { 526 memset(cs, 0, sizeof(*cs)); 527 } 528 529 /* 530 * XXX - implement it on SP architectures too 531 */ 532 #ifndef CPU_INFO_UNIT 533 #define CPU_INFO_UNIT 0 534 #endif 535 536 void 537 cpuset_add(struct cpuset *cs, struct cpu_info *ci) 538 { 539 unsigned int num = CPU_INFO_UNIT(ci); 540 atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 541 } 542 543 void 544 cpuset_del(struct cpuset *cs, struct cpu_info *ci) 545 { 546 unsigned int num = CPU_INFO_UNIT(ci); 547 atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 548 } 549 550 int 551 cpuset_isset(struct cpuset *cs, struct cpu_info *ci) 552 { 553 unsigned int num = CPU_INFO_UNIT(ci); 554 return (cs->cs_set[num/32] & (1 << (num % 32))); 555 } 556 557 void 558 cpuset_add_all(struct cpuset *cs) 559 { 560 cpuset_copy(cs, &cpuset_all); 561 } 562 563 void 564 cpuset_copy(struct cpuset *to, struct cpuset *from) 565 { 566 memcpy(to, from, sizeof(*to)); 567 } 568 569 struct cpu_info * 570 cpuset_first(struct cpuset *cs) 571 { 572 int i; 573 574 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 575 if (cs->cs_set[i]) 576 return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]); 577 578 return (NULL); 579 } 580