/*	$OpenBSD: kern_sched.c,v 1.15 2009/11/25 11:01:14 kettenis Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>
#include <machine/atomic.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);
void sched_idle(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of the cpus that are currently idle and the cpus that have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, set curproc, and set the p_cpu pointer in the
 * proc and p_stat to SONPROC. The atomicity is only with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */
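
/*
 * Illustrative sketch (not part of the original file): the contract above
 * spelled out as pseudo-C. A real cpu_switchto is MD code, usually
 * assembly; md_save_context(), md_activate_pmap() and md_load_context()
 * below are hypothetical placeholders for the MD pieces.
 */
#if 0
void
cpu_switchto(struct proc *old, struct proc *new)
{
	int s = splhigh();		/* blocking interrupts is the only locking */

	if (old != NULL)
		md_save_context(old);	/* NULL old means "don't bother saving" */

	/*
	 * Load the new state; atomic only with respect to interrupts on
	 * this cpu, other cpus must not rely on it being consistent.
	 */
	md_activate_pmap(new->p_vmspace->vm_map.pmap);
	new->p_cpu = curcpu();
	new->p_stat = SONPROC;
	curcpu()->ci_curproc = new;
	md_load_context(new);		/* resumes in the new proc's context */

	splx(s);
}
#endif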

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
}
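
/*
 * Illustrative sketch (not part of the original file): how MD bring-up
 * code might satisfy the responsibility described above for a secondary
 * cpu. cpu_boot_secondary() is a hypothetical MD routine; the only real
 * requirement is that sched_init_cpu() runs once per cpu before that cpu
 * enters the scheduler.
 */
#if 0
void
cpu_boot_secondary(struct cpu_info *ci)
{
	sched_init_cpu(ci);	/* set up run queues and the idle kthread */
	/* ... MD hatching: start the cpu and have it enter sched_idle(). */
}
#endif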

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * The first time we enter here, we're not supposed to idle;
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		if (spc->spc_schedflags & SPCF_SHOULDHALT) {
			spc->spc_schedflags |= SPCF_HALTED;
			wakeup(spc);
		}

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0)
			cpu_idle_cycle();
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc; the idle loop cleans up that list and moves the procs to the
 * reaper list. All this will be unnecessary once we can bind the reaper
 * to this cpu and not risk having it switch to another cpu in case it
 * sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_PROC_UNLOCK(p);

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}
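
/*
 * Added note (not in the original file): the dead-proc handshake above
 * works in two steps. sched_exit() queues the exiting proc on the per-cpu
 * spc_deadproc list and switches to the idle proc with a NULL "old" proc,
 * so nothing is saved onto the stack that is about to be torn down. The
 * idle loop in sched_idle() later pops that list and calls exit2(), which
 * passes each proc on to the reaper, as described in the comment above.
 */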

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
#ifdef MULTIPROCESSOR
	__mp_lock_init(&sched_lock);
#endif
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}
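
/*
 * Added worked example (not in the original file): priorities are folded
 * into SCHED_NQS queues (32 at this revision) by p_priority >> 2, so
 * priorities 0-3 share queue 0, 4-7 share queue 1, and so on. spc_whichqs
 * keeps one bit per non-empty queue, which is why sched_chooseproc() can
 * find the best queue with a single ffs():
 *
 *	p_priority = 50  ->  queue = 50 >> 2 = 12  ->  whichqs bit (1 << 12)
 *	ffs(spc_whichqs) - 1 then yields the lowest-numbered (best) queue.
 */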

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		p = spc->spc_idleproc;
		KASSERT(p);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}

uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the one with the
	 * fewest queued procs first, then the one with the lowest load
	 * average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu,
	 * it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}
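
/*
 * Added worked example (not in the original file): log2() computes the
 * floor of the base 2 logarithm, so log2(1) == 0, log2(4096) == 12 and
 * log2(5000) == 12. sched_proc_to_cpu_cost() below feeds it the resident
 * page count of the proc's pmap, turning a footprint of ~4096 pages
 * (16MB with 4KB pages) into a cost rebate of 12 * sched_cost_resident.
 */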

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We're more willing to move the lower the priority of whatever
	 * currently runs on the destination and the higher the priority
	 * of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}
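
/*
 * Added worked example (not in the original file), using the default cost
 * knobs above. Consider a busy destination cpu (not idle, procs queued)
 * with spc_curpriority 40, spc_nrun 2 and a load average of 1.0
 * (spc_ldavg == 1 << FSHIFT), evaluated for a proc with p_priority 50
 * that last ran elsewhere:
 *
 *	priority term:	(50 - 40) * 1		= 10
 *	not idle:				+  3
 *	queued procs:	2 * 3			+  6
 *	load:		(1 * ldavg) >> FSHIFT	+  1
 *	total					= 20
 *
 * An idle cpu with nothing queued and zero load would cost 0, so it wins.
 */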

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}
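
/*
 * Illustrative sketch (not part of the original file): a kernel thread
 * that must run on one particular cpu, e.g. to touch per-cpu MD state,
 * can peg itself as its first action. my_percpu_thread() and
 * do_percpu_work() are hypothetical.
 */
#if 0
void
my_percpu_thread(void *v)
{
	struct cpu_info *ci = v;

	sched_peg_curproc(ci);		/* we resume on ci with P_CPUPEG set */

	for (;;)
		do_percpu_work(ci);	/* now guaranteed to stay on ci */
}
#endif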

/*
 * Functions to manipulate cpu sets.
 */

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}
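
/*
 * Added worked example (not in the original file): each word of cs_set
 * holds 32 cpus, so cpu "num" lives at bit (num % 32) of word (num / 32).
 * cpuset_first() walks the words and uses ffs() to find the lowest set
 * bit; e.g. if only cpu 37 is in the set, word 1 equals (1 << 5), so
 * ffs() returns 6 and 1 * 32 + 6 - 1 recovers cpu number 37.
 */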

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
645