xref: /openbsd-src/sys/kern/kern_sched.c (revision 2b0358df1d88d06ef4139321dd05bd5e05d91eaf)
1 /*	$OpenBSD: kern_sched.c,v 1.10 2009/04/03 09:29:15 art Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 #include <machine/atomic.h>
28 
29 #include <uvm/uvm_extern.h>
30 
31 #include <sys/malloc.h>
32 
33 
/* Creation hook and body of the per-cpu idle procs. */
void sched_kthreads_create(void *v);
void sched_idle(void *v);

/* Helpers used when matching procs to cpus. */
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *self);
39 
/*
 * To help choose which cpu should run which process we keep track
 * of the cpus that are currently idle and the cpus that have processes
 * queued.
 */
/* Cpus whose idle proc has marked them idle (see sched_idle). */
struct cpuset sched_idle_cpus;
/* Cpus with a non-empty run queue (see setrunqueue/remrunqueue). */
struct cpuset sched_queued_cpus;
47 
48 /*
49  * A few notes about cpu_switchto that is implemented in MD code.
50  *
51  * cpu_switchto takes two arguments, the old proc and the proc
52  * it should switch to. The new proc will never be NULL, so we always have
53  * a saved state that we need to switch to. The old proc however can
54  * be NULL if the process is exiting. NULL for the old proc simply
55  * means "don't bother saving old state".
56  *
57  * cpu_switchto is supposed to atomically load the new state of the process
58  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
59  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
60  * cpus in the system must not depend on this state being consistent.
61  * Therefore no locking is necessary in cpu_switchto other than blocking
62  * interrupts during the context switch.
63  */
64 
65 /*
66  * sched_init_cpu is called from main() for the boot cpu, then it's the
67  * responsibility of the MD code to call it for all other cpus.
68  */
69 void
70 sched_init_cpu(struct cpu_info *ci)
71 {
72 	struct schedstate_percpu *spc = &ci->ci_schedstate;
73 	int i;
74 
75 	for (i = 0; i < SCHED_NQS; i++)
76 		TAILQ_INIT(&spc->spc_qs[i]);
77 
78 	spc->spc_idleproc = NULL;
79 
80 	kthread_create_deferred(sched_kthreads_create, ci);
81 
82 	LIST_INIT(&spc->spc_deadproc);
83 
84 	/*
85 	 * Slight hack here until the cpuset code handles cpu_info
86 	 * structures.
87 	 */
88 	cpuset_init_cpu(ci);
89 }
90 
91 void
92 sched_kthreads_create(void *v)
93 {
94 	struct cpu_info *ci = v;
95 	struct schedstate_percpu *spc = &ci->ci_schedstate;
96 	static int num;
97 
98 	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
99 		panic("fork idle");
100 
101 	num++;
102 }
103 
104 void
105 sched_idle(void *v)
106 {
107 	struct schedstate_percpu *spc;
108 	struct proc *p = curproc;
109 	struct cpu_info *ci = v;
110 	int s;
111 
112 	KERNEL_PROC_UNLOCK(p);
113 
114 	spc = &ci->ci_schedstate;
115 
116 	/*
117 	 * First time we enter here, we're not supposed to idle,
118 	 * just go away for a while.
119 	 */
120 	SCHED_LOCK(s);
121 	cpuset_add(&sched_idle_cpus, ci);
122 	p->p_stat = SSLEEP;
123 	mi_switch();
124 	cpuset_del(&sched_idle_cpus, ci);
125 	SCHED_UNLOCK(s);
126 
127 	KASSERT(ci == curcpu());
128 	KASSERT(curproc == spc->spc_idleproc);
129 
130 	while (1) {
131 		while (!curcpu_is_idle()) {
132 			struct proc *dead;
133 
134 			SCHED_LOCK(s);
135 			p->p_stat = SSLEEP;
136 			mi_switch();
137 			SCHED_UNLOCK(s);
138 
139 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
140 				LIST_REMOVE(dead, p_hash);
141 				exit2(dead);
142 			}
143 		}
144 
145 		splassert(IPL_NONE);
146 
147 		cpuset_add(&sched_idle_cpus, ci);
148 		cpu_idle_enter();
149 		while (spc->spc_whichqs == 0)
150 			cpu_idle_cycle();
151 		cpu_idle_leave();
152 		cpuset_del(&sched_idle_cpus, ci);
153 	}
154 }
155 
156 /*
157  * To free our address space we have to jump through a few hoops.
158  * The freeing is done by the reaper, but until we have one reaper
159  * per cpu, we have no way of putting this proc on the deadproc list
160  * and waking up the reaper without risking having our address space and
161  * stack torn from under us before we manage to switch to another proc.
162  * Therefore we have a per-cpu list of dead processes where we put this
163  * proc and have idle clean up that list and move it to the reaper list.
164  * All this will be unnecessary once we can bind the reaper this cpu
165  * and not risk having it switch to another in case it sleeps.
166  */
167 void
168 sched_exit(struct proc *p)
169 {
170 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
171 	struct timeval tv;
172 	struct proc *idle;
173 	int s;
174 
175 	microuptime(&tv);
176 	timersub(&tv, &spc->spc_runtime, &tv);
177 	timeradd(&p->p_rtime, &tv, &p->p_rtime);
178 
179 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
180 
181 #ifdef MULTIPROCESSOR
182 	KASSERT(__mp_lock_held(&kernel_lock) == 0);
183 #endif
184 
185 	SCHED_LOCK(s);
186 	idle = spc->spc_idleproc;
187 	idle->p_stat = SRUN;
188 	cpu_switchto(NULL, idle);
189 	panic("cpu_switchto returned");
190 }
191 
192 /*
193  * Run queue management.
194  */
/*
 * Initialize the scheduler lock. The lock is only set up when the
 * kernel is built with MULTIPROCESSOR; otherwise this does nothing.
 */
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}
202 
203 void
204 setrunqueue(struct proc *p)
205 {
206 	struct schedstate_percpu *spc;
207 	int queue = p->p_priority >> 2;
208 
209 	SCHED_ASSERT_LOCKED();
210 	sched_choosecpu(p);
211 	spc = &p->p_cpu->ci_schedstate;
212 	spc->spc_nrun++;
213 
214 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
215 	spc->spc_whichqs |= (1 << queue);
216 	cpuset_add(&sched_queued_cpus, p->p_cpu);
217 
218 	if (p->p_cpu != curcpu())
219 		cpu_unidle(p->p_cpu);
220 }
221 
222 void
223 remrunqueue(struct proc *p)
224 {
225 	struct schedstate_percpu *spc;
226 	int queue = p->p_priority >> 2;
227 
228 	SCHED_ASSERT_LOCKED();
229 	spc = &p->p_cpu->ci_schedstate;
230 	spc->spc_nrun--;
231 
232 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
233 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
234 		spc->spc_whichqs &= ~(1 << queue);
235 		if (spc->spc_whichqs == 0)
236 			cpuset_del(&sched_queued_cpus, p->p_cpu);
237 	}
238 }
239 
240 struct proc *
241 sched_chooseproc(void)
242 {
243 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
244 	struct proc *p;
245 	int queue;
246 
247 	SCHED_ASSERT_LOCKED();
248 
249 again:
250 	if (spc->spc_whichqs) {
251 		queue = ffs(spc->spc_whichqs) - 1;
252 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
253 		remrunqueue(p);
254 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
255 		p = spc->spc_idleproc;
256 		if (p == NULL) {
257                         int s;
258 			/*
259 			 * We get here if someone decides to switch during
260 			 * boot before forking kthreads, bleh.
261 			 * This is kind of like a stupid idle loop.
262 			 */
263 #ifdef MULTIPROCESSOR
264 			__mp_unlock(&sched_lock);
265 #endif
266 			spl0();
267 			delay(10);
268 			SCHED_LOCK(s);
269 			goto again;
270                 }
271 		KASSERT(p);
272 		p->p_stat = SRUN;
273 	}
274 
275 	return (p);
276 }
277 
/* Statistics counters maintained by sched_choosecpu and sched_steal_proc. */
uint64_t sched_nmigrations;	/* choosecpu picked a different cpu */
uint64_t sched_noidle;		/* choosecpu found no idle cpu */
uint64_t sched_stolen;		/* procs taken by sched_steal_proc */

uint64_t sched_choose;		/* choosecpu calls for non-pegged procs */
uint64_t sched_wasidle;		/* previous cpu was idle and was kept */
uint64_t sched_nomigrations;	/* choosecpu picked the same cpu again */
285 
286 void
287 sched_choosecpu(struct proc *p)
288 {
289 	struct cpu_info *choice = NULL;
290 	int last_cost = INT_MAX;
291 	struct cpu_info *ci;
292 	struct cpuset set;
293 
294 	/*
295 	 * If pegged to a cpu, don't allow it to move.
296 	 */
297 	if (p->p_flag & P_CPUPEG)
298 		return;
299 
300 	sched_choose++;
301 
302 	/*
303 	 * The simplest case. Our cpu of choice was idle. This happens
304 	 * when we were sleeping and something woke us up.
305 	 *
306 	 * We also need to check sched_queued_cpus to make sure that
307 	 * we're not thundering herding one cpu that hasn't managed to
308 	 * get out of the idle loop yet.
309 	 */
310 	if (p->p_cpu && cpuset_isset(&sched_idle_cpus, p->p_cpu) &&
311 	    !cpuset_isset(&sched_queued_cpus, p->p_cpu)) {
312 		sched_wasidle++;
313 		return;
314 	}
315 
316 #if 0
317 
318 		/* Most likely, this is broken. don't do it. */
319 	/*
320 	 * Second case. (shouldn't be necessary in the future)
321 	 * If our cpu is not idle, but has nothing else queued (which
322 	 * means that we are curproc and roundrobin asks us to reschedule).
323 	 */
324 	if (p->p_cpu && p->p_cpu->ci_schedstate.spc_nrun == 0)
325 		return;
326 #endif
327 
328 	/*
329 	 * Look at all cpus that are currently idle. Pick the cheapest of
330 	 * those.
331 	 */
332 	cpuset_copy(&set, &sched_idle_cpus);
333 	while ((ci = cpuset_first(&set)) != NULL) {
334 		int cost = sched_proc_to_cpu_cost(ci, p);
335 
336 		if (choice == NULL || cost < last_cost) {
337 			choice = ci;
338 			last_cost = cost;
339 		}
340 		cpuset_del(&set, ci);
341 	}
342 
343 	/*
344 	 * All cpus are busy. Pick one.
345 	 */
346 	if (choice == NULL) {
347 		CPU_INFO_ITERATOR cii;
348 
349 		sched_noidle++;
350 
351 		/*
352 		 * Not curproc, pick the cpu with the lowest cost to switch to.
353 		 */
354 		CPU_INFO_FOREACH(cii, ci) {
355 			int cost = sched_proc_to_cpu_cost(ci, p);
356 
357 			if (choice == NULL || cost < last_cost) {
358 				choice = ci;
359 				last_cost = cost;
360 			}
361 		}
362 	}
363 
364 	KASSERT(choice);
365 
366 	if (p->p_cpu && p->p_cpu != choice)
367 		sched_nmigrations++;
368 	else if (p->p_cpu != NULL)
369 		sched_nomigrations++;
370 
371 	p->p_cpu = choice;
372 }
373 
374 /*
375  * Attempt to steal a proc from some cpu.
376  */
377 struct proc *
378 sched_steal_proc(struct cpu_info *self)
379 {
380 	struct schedstate_percpu *spc;
381 	struct proc *best = NULL;
382 	int bestcost = INT_MAX;
383 	struct cpu_info *ci;
384 	struct cpuset set;
385 
386 	cpuset_copy(&set, &sched_queued_cpus);
387 
388 	while ((ci = cpuset_first(&set)) != NULL) {
389 		struct proc *p;
390 		int cost;
391 
392 		cpuset_del(&set, ci);
393 
394 		spc = &ci->ci_schedstate;
395 
396 		p = TAILQ_FIRST(&spc->spc_qs[ffs(spc->spc_whichqs) - 1]);
397 		KASSERT(p);
398 		cost = sched_proc_to_cpu_cost(self, p);
399 
400 		if (best == NULL || cost < bestcost) {
401 			best = p;
402 			bestcost = cost;
403 		}
404 	}
405 	if (best == NULL)
406 		return (NULL);
407 
408 	spc = &best->p_cpu->ci_schedstate;
409 	remrunqueue(best);
410 	best->p_cpu = self;
411 
412 	sched_stolen++;
413 
414 	return (best);
415 }
416 
/*
 * Integer base 2 logarithm, rounded down: the position of the highest
 * set bit of i. By convention the result for 0 is 0 (same as for 1).
 */
static int
log2(unsigned int i)
{
	int bits;

	for (bits = 0; (i >>= 1) != 0; bits++)
		continue;

	return (bits);
}
430 
431 /*
432  * Calculate the cost of moving the proc to this cpu.
433  *
434  * What we want is some guesstimate of how much "performance" it will
435  * cost us to move the proc here. Not just for caches and TLBs and NUMA
436  * memory, but also for the proc itself. A highly loaded cpu might not
437  * be the best candidate for this proc since it won't get run.
438  *
439  * Just total guesstimates for now.
440  */
441 
/* Weights applied by sched_proc_to_cpu_cost(). */
int sched_cost_load = 1;	/* multiplier for the cpu's spc_ldavg */
int sched_cost_priority = 1;	/* multiplier for the priority difference */
int sched_cost_runnable = 3;	/* charge per queued proc / for a busy cpu */
int sched_cost_resident = 1;	/* multiplier for the log2 resident credit */
446 
447 int
448 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
449 {
450 	struct schedstate_percpu *spc;
451 	int l2resident = 0;
452 	int cost;
453 
454 	spc = &ci->ci_schedstate;
455 
456 	cost = 0;
457 
458 	/*
459 	 * First, account for the priority of the proc we want to move.
460 	 * More willing to move, the lower the priority of the destination
461 	 * and the higher the priority of the proc.
462 	 */
463 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
464 		cost += (p->p_priority - spc->spc_curpriority) *
465 		    sched_cost_priority;
466 		cost += sched_cost_runnable;
467 	}
468 	if (cpuset_isset(&sched_queued_cpus, ci)) {
469 		cost += spc->spc_nrun * sched_cost_runnable;
470 	}
471 
472 	/*
473 	 * Higher load on the destination means we don't want to go there.
474 	 */
475 	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
476 
477 	/*
478 	 * If the proc is on this cpu already, lower the cost by how much
479 	 * it has been running and an estimate of its footprint.
480 	 */
481 	if (p->p_cpu == ci && p->p_slptime == 0) {
482 		l2resident =
483 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
484 		cost -= l2resident * sched_cost_resident;
485 	}
486 
487 	return (cost);
488 }
489 
490 /*
491  * Peg a proc to a cpu.
492  */
493 void
494 sched_peg_curproc(struct cpu_info *ci)
495 {
496 	struct proc *p = curproc;
497 	int s;
498 
499 	SCHED_LOCK(s);
500 	p->p_priority = p->p_usrpri;
501 	p->p_stat = SRUN;
502 	p->p_cpu = ci;
503 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
504 	setrunqueue(p);
505 	p->p_stats->p_ru.ru_nvcsw++;
506 	mi_switch();
507 	SCHED_UNLOCK(s);
508 }
509 
510 /*
511  * Functions to manipulate cpu sets.
512  */
513 struct cpu_info *cpuset_infos[MAXCPUS];
514 static struct cpuset cpuset_all;
515 
516 void
517 cpuset_init_cpu(struct cpu_info *ci)
518 {
519 	cpuset_add(&cpuset_all, ci);
520 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
521 }
522 
523 void
524 cpuset_clear(struct cpuset *cs)
525 {
526 	memset(cs, 0, sizeof(*cs));
527 }
528 
529 /*
530  * XXX - implement it on SP architectures too
531  */
532 #ifndef CPU_INFO_UNIT
533 #define CPU_INFO_UNIT 0
534 #endif
535 
536 void
537 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
538 {
539 	unsigned int num = CPU_INFO_UNIT(ci);
540 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
541 }
542 
543 void
544 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
545 {
546 	unsigned int num = CPU_INFO_UNIT(ci);
547 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
548 }
549 
550 int
551 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
552 {
553 	unsigned int num = CPU_INFO_UNIT(ci);
554 	return (cs->cs_set[num/32] & (1 << (num % 32)));
555 }
556 
557 void
558 cpuset_add_all(struct cpuset *cs)
559 {
560 	cpuset_copy(cs, &cpuset_all);
561 }
562 
563 void
564 cpuset_copy(struct cpuset *to, struct cpuset *from)
565 {
566 	memcpy(to, from, sizeof(*to));
567 }
568 
569 struct cpu_info *
570 cpuset_first(struct cpuset *cs)
571 {
572 	int i;
573 
574 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
575 		if (cs->cs_set[i])
576 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
577 
578 	return (NULL);
579 }
580