1 /* $OpenBSD: kern_sched.c,v 1.9 2009/03/23 13:25:11 art Exp $ */ 2 /* 3 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <sys/param.h> 19 20 #include <sys/sched.h> 21 #include <sys/proc.h> 22 #include <sys/kthread.h> 23 #include <sys/systm.h> 24 #include <sys/resourcevar.h> 25 #include <sys/signalvar.h> 26 #include <sys/mutex.h> 27 #include <machine/atomic.h> 28 29 #include <uvm/uvm_extern.h> 30 31 #include <sys/malloc.h> 32 33 34 void sched_kthreads_create(void *); 35 void sched_idle(void *); 36 37 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); 38 struct proc *sched_steal_proc(struct cpu_info *); 39 40 /* 41 * To help choosing which cpu should run which process we keep track 42 * of cpus which are currently idle and which cpus have processes 43 * queued. 44 */ 45 struct cpuset sched_idle_cpus; 46 struct cpuset sched_queued_cpus; 47 48 /* 49 * A few notes about cpu_switchto that is implemented in MD code. 50 * 51 * cpu_switchto takes two arguments, the old proc and the proc 52 * it should switch to. The new proc will never be NULL, so we always have 53 * a saved state that we need to switch to. The old proc however can 54 * be NULL if the process is exiting. NULL for the old proc simply 55 * means "don't bother saving old state". 56 * 57 * cpu_switchto is supposed to atomically load the new state of the process 58 * including the pcb, pmap and setting curproc, the p_cpu pointer in the 59 * proc and p_stat to SONPROC. Atomically with respect to interrupts, other 60 * cpus in the system must not depend on this state being consistent. 61 * Therefore no locking is necessary in cpu_switchto other than blocking 62 * interrupts during the context switch. 63 */ 64 65 /* 66 * sched_init_cpu is called from main() for the boot cpu, then it's the 67 * responsibility of the MD code to call it for all other cpus. 68 */ 69 void 70 sched_init_cpu(struct cpu_info *ci) 71 { 72 struct schedstate_percpu *spc = &ci->ci_schedstate; 73 int i; 74 75 for (i = 0; i < SCHED_NQS; i++) 76 TAILQ_INIT(&spc->spc_qs[i]); 77 78 spc->spc_idleproc = NULL; 79 80 kthread_create_deferred(sched_kthreads_create, ci); 81 82 LIST_INIT(&spc->spc_deadproc); 83 84 /* 85 * Slight hack here until the cpuset code handles cpu_info 86 * structures. 87 */ 88 cpuset_init_cpu(ci); 89 } 90 91 void 92 sched_kthreads_create(void *v) 93 { 94 struct cpu_info *ci = v; 95 struct schedstate_percpu *spc = &ci->ci_schedstate; 96 static int num; 97 98 if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num)) 99 panic("fork idle"); 100 101 num++; 102 } 103 104 void 105 sched_idle(void *v) 106 { 107 struct schedstate_percpu *spc; 108 struct proc *p = curproc; 109 struct cpu_info *ci = v; 110 int s; 111 112 KERNEL_PROC_UNLOCK(p); 113 114 spc = &ci->ci_schedstate; 115 116 /* 117 * First time we enter here, we're not supposed to idle, 118 * just go away for a while. 119 */ 120 SCHED_LOCK(s); 121 cpuset_add(&sched_idle_cpus, ci); 122 p->p_stat = SSLEEP; 123 mi_switch(); 124 cpuset_del(&sched_idle_cpus, ci); 125 SCHED_UNLOCK(s); 126 127 KASSERT(ci == curcpu()); 128 KASSERT(curproc == spc->spc_idleproc); 129 130 while (1) { 131 while (!curcpu_is_idle()) { 132 struct proc *dead; 133 134 SCHED_LOCK(s); 135 p->p_stat = SSLEEP; 136 mi_switch(); 137 SCHED_UNLOCK(s); 138 139 while ((dead = LIST_FIRST(&spc->spc_deadproc))) { 140 LIST_REMOVE(dead, p_hash); 141 exit2(dead); 142 } 143 } 144 145 splassert(IPL_NONE); 146 147 cpuset_add(&sched_idle_cpus, ci); 148 cpu_idle_enter(); 149 while (spc->spc_whichqs == 0) 150 cpu_idle_cycle(); 151 cpu_idle_leave(); 152 cpuset_del(&sched_idle_cpus, ci); 153 } 154 } 155 156 /* 157 * To free our address space we have to jump through a few hoops. 158 * The freeing is done by the reaper, but until we have one reaper 159 * per cpu, we have no way of putting this proc on the deadproc list 160 * and waking up the reaper without risking having our address space and 161 * stack torn from under us before we manage to switch to another proc. 162 * Therefore we have a per-cpu list of dead processes where we put this 163 * proc and have idle clean up that list and move it to the reaper list. 164 * All this will be unnecessary once we can bind the reaper this cpu 165 * and not risk having it switch to another in case it sleeps. 166 */ 167 void 168 sched_exit(struct proc *p) 169 { 170 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 171 struct timeval tv; 172 struct proc *idle; 173 int s; 174 175 microuptime(&tv); 176 timersub(&tv, &spc->spc_runtime, &tv); 177 timeradd(&p->p_rtime, &tv, &p->p_rtime); 178 179 LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash); 180 181 #ifdef MULTIPROCESSOR 182 KASSERT(__mp_lock_held(&kernel_lock) == 0); 183 #endif 184 185 SCHED_LOCK(s); 186 idle = spc->spc_idleproc; 187 idle->p_stat = SRUN; 188 cpu_switchto(NULL, idle); 189 panic("cpu_switchto returned"); 190 } 191 192 /* 193 * Run queue management. 194 */ 195 void 196 sched_init_runqueues(void) 197 { 198 #ifdef MULTIPROCESSOR 199 __mp_lock_init(&sched_lock); 200 #endif 201 } 202 203 void 204 setrunqueue(struct proc *p) 205 { 206 struct schedstate_percpu *spc; 207 int queue = p->p_priority >> 2; 208 209 SCHED_ASSERT_LOCKED(); 210 sched_choosecpu(p); 211 spc = &p->p_cpu->ci_schedstate; 212 spc->spc_nrun++; 213 214 TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); 215 spc->spc_whichqs |= (1 << queue); 216 cpuset_add(&sched_queued_cpus, p->p_cpu); 217 218 if (p->p_cpu != curcpu()) 219 cpu_unidle(p->p_cpu); 220 } 221 222 void 223 remrunqueue(struct proc *p) 224 { 225 struct schedstate_percpu *spc; 226 int queue = p->p_priority >> 2; 227 228 SCHED_ASSERT_LOCKED(); 229 spc = &p->p_cpu->ci_schedstate; 230 spc->spc_nrun--; 231 232 TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); 233 if (TAILQ_EMPTY(&spc->spc_qs[queue])) { 234 spc->spc_whichqs &= ~(1 << queue); 235 if (spc->spc_whichqs == 0) 236 cpuset_del(&sched_queued_cpus, p->p_cpu); 237 } 238 } 239 240 struct proc * 241 sched_chooseproc(void) 242 { 243 struct schedstate_percpu *spc = &curcpu()->ci_schedstate; 244 struct proc *p; 245 int queue; 246 247 SCHED_ASSERT_LOCKED(); 248 249 again: 250 if (spc->spc_whichqs) { 251 queue = ffs(spc->spc_whichqs) - 1; 252 p = TAILQ_FIRST(&spc->spc_qs[queue]); 253 remrunqueue(p); 254 } else if ((p = sched_steal_proc(curcpu())) == NULL) { 255 p = spc->spc_idleproc; 256 if (p == NULL) { 257 int s; 258 /* 259 * We get here if someone decides to switch during 260 * boot before forking kthreads, bleh. 261 * This is kind of like a stupid idle loop. 262 */ 263 #ifdef MULTIPROCESSOR 264 __mp_unlock(&sched_lock); 265 #endif 266 spl0(); 267 delay(10); 268 SCHED_LOCK(s); 269 goto again; 270 } 271 KASSERT(p); 272 p->p_stat = SRUN; 273 } 274 275 return (p); 276 } 277 278 uint64_t sched_nmigrations; 279 uint64_t sched_noidle; 280 uint64_t sched_stolen; 281 282 uint64_t sched_choose; 283 uint64_t sched_wasidle; 284 uint64_t sched_nomigrations; 285 286 void 287 sched_choosecpu(struct proc *p) 288 { 289 struct cpu_info *choice = NULL; 290 int last_cost = INT_MAX; 291 struct cpu_info *ci; 292 struct cpuset set; 293 294 sched_choose++; 295 296 /* 297 * The simplest case. Our cpu of choice was idle. This happens 298 * when we were sleeping and something woke us up. 299 * 300 * We also need to check sched_queued_cpus to make sure that 301 * we're not thundering herding one cpu that hasn't managed to 302 * get out of the idle loop yet. 303 */ 304 if (p->p_cpu && cpuset_isset(&sched_idle_cpus, p->p_cpu) && 305 !cpuset_isset(&sched_queued_cpus, p->p_cpu)) { 306 sched_wasidle++; 307 return; 308 } 309 310 #if 0 311 312 /* Most likely, this is broken. don't do it. */ 313 /* 314 * Second case. (shouldn't be necessary in the future) 315 * If our cpu is not idle, but has nothing else queued (which 316 * means that we are curproc and roundrobin asks us to reschedule). 317 */ 318 if (p->p_cpu && p->p_cpu->ci_schedstate.spc_nrun == 0) 319 return; 320 #endif 321 322 /* 323 * Look at all cpus that are currently idle. Pick the cheapest of 324 * those. 325 */ 326 cpuset_copy(&set, &sched_idle_cpus); 327 while ((ci = cpuset_first(&set)) != NULL) { 328 int cost = sched_proc_to_cpu_cost(ci, p); 329 330 if (choice == NULL || cost < last_cost) { 331 choice = ci; 332 last_cost = cost; 333 } 334 cpuset_del(&set, ci); 335 } 336 337 /* 338 * All cpus are busy. Pick one. 339 */ 340 if (choice == NULL) { 341 CPU_INFO_ITERATOR cii; 342 343 sched_noidle++; 344 345 /* 346 * Not curproc, pick the cpu with the lowest cost to switch to. 347 */ 348 CPU_INFO_FOREACH(cii, ci) { 349 int cost = sched_proc_to_cpu_cost(ci, p); 350 351 if (choice == NULL || cost < last_cost) { 352 choice = ci; 353 last_cost = cost; 354 } 355 } 356 } 357 358 KASSERT(choice); 359 360 if (p->p_cpu && p->p_cpu != choice) 361 sched_nmigrations++; 362 else if (p->p_cpu != NULL) 363 sched_nomigrations++; 364 365 p->p_cpu = choice; 366 } 367 368 /* 369 * Attempt to steal a proc from some cpu. 370 */ 371 struct proc * 372 sched_steal_proc(struct cpu_info *self) 373 { 374 struct schedstate_percpu *spc; 375 struct proc *best = NULL; 376 int bestcost = INT_MAX; 377 struct cpu_info *ci; 378 struct cpuset set; 379 380 cpuset_copy(&set, &sched_queued_cpus); 381 382 while ((ci = cpuset_first(&set)) != NULL) { 383 struct proc *p; 384 int cost; 385 386 cpuset_del(&set, ci); 387 388 spc = &ci->ci_schedstate; 389 390 p = TAILQ_FIRST(&spc->spc_qs[ffs(spc->spc_whichqs) - 1]); 391 KASSERT(p); 392 cost = sched_proc_to_cpu_cost(self, p); 393 394 if (best == NULL || cost < bestcost) { 395 best = p; 396 bestcost = cost; 397 } 398 } 399 if (best == NULL) 400 return (NULL); 401 402 spc = &best->p_cpu->ci_schedstate; 403 remrunqueue(best); 404 best->p_cpu = self; 405 406 sched_stolen++; 407 408 return (best); 409 } 410 411 /* 412 * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). 413 */ 414 static int 415 log2(unsigned int i) 416 { 417 int ret = 0; 418 419 while (i >>= 1) 420 ret++; 421 422 return (ret); 423 } 424 425 /* 426 * Calculate the cost of moving the proc to this cpu. 427 * 428 * What we want is some guesstimate of how much "performance" it will 429 * cost us to move the proc here. Not just for caches and TLBs and NUMA 430 * memory, but also for the proc itself. A highly loaded cpu might not 431 * be the best candidate for this proc since it won't get run. 432 * 433 * Just total guesstimates for now. 434 */ 435 436 int sched_cost_load = 1; 437 int sched_cost_priority = 1; 438 int sched_cost_runnable = 3; 439 int sched_cost_resident = 1; 440 441 int 442 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) 443 { 444 struct schedstate_percpu *spc; 445 int l2resident = 0; 446 int cost; 447 448 spc = &ci->ci_schedstate; 449 450 cost = 0; 451 452 /* 453 * First, account for the priority of the proc we want to move. 454 * More willing to move, the lower the priority of the destination 455 * and the higher the priority of the proc. 456 */ 457 if (!cpuset_isset(&sched_idle_cpus, ci)) { 458 cost += (p->p_priority - spc->spc_curpriority) * 459 sched_cost_priority; 460 cost += sched_cost_runnable; 461 } 462 if (cpuset_isset(&sched_queued_cpus, ci)) { 463 cost += spc->spc_nrun * sched_cost_runnable; 464 } 465 466 /* 467 * Higher load on the destination means we don't want to go there. 468 */ 469 cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); 470 471 /* 472 * If the proc is on this cpu already, lower the cost by how much 473 * it has been running and an estimate of its footprint. 474 */ 475 if (p->p_cpu == ci && p->p_slptime == 0) { 476 l2resident = 477 log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); 478 cost -= l2resident * sched_cost_resident; 479 } 480 481 return (cost); 482 } 483 484 /* 485 * Functions to manipulate cpu sets. 486 */ 487 struct cpu_info *cpuset_infos[MAXCPUS]; 488 static struct cpuset cpuset_all; 489 490 void 491 cpuset_init_cpu(struct cpu_info *ci) 492 { 493 cpuset_add(&cpuset_all, ci); 494 cpuset_infos[CPU_INFO_UNIT(ci)] = ci; 495 } 496 497 void 498 cpuset_clear(struct cpuset *cs) 499 { 500 memset(cs, 0, sizeof(*cs)); 501 } 502 503 /* 504 * XXX - implement it on SP architectures too 505 */ 506 #ifndef CPU_INFO_UNIT 507 #define CPU_INFO_UNIT 0 508 #endif 509 510 void 511 cpuset_add(struct cpuset *cs, struct cpu_info *ci) 512 { 513 unsigned int num = CPU_INFO_UNIT(ci); 514 atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 515 } 516 517 void 518 cpuset_del(struct cpuset *cs, struct cpu_info *ci) 519 { 520 unsigned int num = CPU_INFO_UNIT(ci); 521 atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32))); 522 } 523 524 int 525 cpuset_isset(struct cpuset *cs, struct cpu_info *ci) 526 { 527 unsigned int num = CPU_INFO_UNIT(ci); 528 return (cs->cs_set[num/32] & (1 << (num % 32))); 529 } 530 531 void 532 cpuset_add_all(struct cpuset *cs) 533 { 534 cpuset_copy(cs, &cpuset_all); 535 } 536 537 void 538 cpuset_copy(struct cpuset *to, struct cpuset *from) 539 { 540 memcpy(to, from, sizeof(*to)); 541 } 542 543 struct cpu_info * 544 cpuset_first(struct cpuset *cs) 545 { 546 int i; 547 548 for (i = 0; i < CPUSET_ASIZE(ncpus); i++) 549 if (cs->cs_set[i]) 550 return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]); 551 552 return (NULL); 553 } 554