xref: /openbsd-src/sys/kern/kern_sched.c (revision 8f15e6a4ddcf78e1c1320bc27380937fb93b1657)
1 /*	$OpenBSD: kern_sched.c,v 1.26 2012/03/23 15:51:26 guenther Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 
28 #include <uvm/uvm_extern.h>
29 
30 #include <sys/malloc.h>
31 
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choosing which cpu should run which process we keep track
40  * of cpus which are currently idle and which cpus have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
45 struct cpuset sched_all_cpus;
46 
47 /*
48  * Some general scheduler counters.
49  */
50 uint64_t sched_nmigrations;	/* Cpu migration counter */
51 uint64_t sched_nomigrations;	/* Cpu no migration counter */
52 uint64_t sched_noidle;		/* Times we didn't pick the idle task */
53 uint64_t sched_stolen;		/* Times we stole proc from other cpus */
54 uint64_t sched_choose;		/* Times we chose a cpu */
55 uint64_t sched_wasidle;		/* Times we came out of idle */
56 
57 /*
58  * A few notes about cpu_switchto that is implemented in MD code.
59  *
60  * cpu_switchto takes two arguments, the old proc and the proc
61  * it should switch to. The new proc will never be NULL, so we always have
62  * a saved state that we need to switch to. The old proc however can
63  * be NULL if the process is exiting. NULL for the old proc simply
64  * means "don't bother saving old state".
65  *
66  * cpu_switchto is supposed to atomically load the new state of the process
67  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
68  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
69  * cpus in the system must not depend on this state being consistent.
70  * Therefore no locking is necessary in cpu_switchto other than blocking
71  * interrupts during the context switch.
72  */
73 
74 /*
75  * sched_init_cpu is called from main() for the boot cpu, then it's the
76  * responsibility of the MD code to call it for all other cpus.
77  */
78 void
79 sched_init_cpu(struct cpu_info *ci)
80 {
81 	struct schedstate_percpu *spc = &ci->ci_schedstate;
82 	int i;
83 
84 	for (i = 0; i < SCHED_NQS; i++)
85 		TAILQ_INIT(&spc->spc_qs[i]);
86 
87 	spc->spc_idleproc = NULL;
88 
89 	kthread_create_deferred(sched_kthreads_create, ci);
90 
91 	LIST_INIT(&spc->spc_deadproc);
92 
93 	/*
94 	 * Slight hack here until the cpuset code handles cpu_info
95 	 * structures.
96 	 */
97 	cpuset_init_cpu(ci);
98 	cpuset_add(&sched_all_cpus, ci);
99 }
100 
101 void
102 sched_kthreads_create(void *v)
103 {
104 	struct cpu_info *ci = v;
105 	struct schedstate_percpu *spc = &ci->ci_schedstate;
106 	static int num;
107 
108 	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
109 		panic("fork idle");
110 
111 	num++;
112 }
113 
114 void
115 sched_idle(void *v)
116 {
117 	struct schedstate_percpu *spc;
118 	struct proc *p = curproc;
119 	struct cpu_info *ci = v;
120 	int s;
121 
122 	KERNEL_UNLOCK();
123 
124 	spc = &ci->ci_schedstate;
125 
126 	/*
127 	 * First time we enter here, we're not supposed to idle,
128 	 * just go away for a while.
129 	 */
130 	SCHED_LOCK(s);
131 	cpuset_add(&sched_idle_cpus, ci);
132 	p->p_stat = SSLEEP;
133 	p->p_cpu = ci;
134 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
135 	mi_switch();
136 	cpuset_del(&sched_idle_cpus, ci);
137 	SCHED_UNLOCK(s);
138 
139 	KASSERT(ci == curcpu());
140 	KASSERT(curproc == spc->spc_idleproc);
141 
142 	while (1) {
143 		while (!curcpu_is_idle()) {
144 			struct proc *dead;
145 
146 			SCHED_LOCK(s);
147 			p->p_stat = SSLEEP;
148 			mi_switch();
149 			SCHED_UNLOCK(s);
150 
151 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
152 				LIST_REMOVE(dead, p_hash);
153 				exit2(dead);
154 			}
155 		}
156 
157 		splassert(IPL_NONE);
158 
159 		cpuset_add(&sched_idle_cpus, ci);
160 		cpu_idle_enter();
161 		while (spc->spc_whichqs == 0) {
162 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
163 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
164 				cpuset_del(&sched_idle_cpus, ci);
165 				SCHED_LOCK(s);
166 				atomic_setbits_int(&spc->spc_schedflags,
167 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
168 				SCHED_UNLOCK(s);
169 				wakeup(spc);
170 			}
171 			cpu_idle_cycle();
172 		}
173 		cpu_idle_leave();
174 		cpuset_del(&sched_idle_cpus, ci);
175 	}
176 }
177 
178 /*
179  * To free our address space we have to jump through a few hoops.
180  * The freeing is done by the reaper, but until we have one reaper
181  * per cpu, we have no way of putting this proc on the deadproc list
182  * and waking up the reaper without risking having our address space and
183  * stack torn from under us before we manage to switch to another proc.
184  * Therefore we have a per-cpu list of dead processes where we put this
185  * proc and have idle clean up that list and move it to the reaper list.
186  * All this will be unnecessary once we can bind the reaper this cpu
187  * and not risk having it switch to another in case it sleeps.
188  */
189 void
190 sched_exit(struct proc *p)
191 {
192 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
193 	struct timeval tv;
194 	struct proc *idle;
195 	int s;
196 
197 	microuptime(&tv);
198 	timersub(&tv, &spc->spc_runtime, &tv);
199 	timeradd(&p->p_rtime, &tv, &p->p_rtime);
200 
201 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
202 
203 	/* This process no longer needs to hold the kernel lock. */
204 	KERNEL_UNLOCK();
205 
206 	SCHED_LOCK(s);
207 	idle = spc->spc_idleproc;
208 	idle->p_stat = SRUN;
209 	cpu_switchto(NULL, idle);
210 	panic("cpu_switchto returned");
211 }
212 
/*
 * Run queue management.
 */

/*
 * Intentionally empty: the per-cpu run queues are initialized in
 * sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
	return;
}
220 
221 void
222 setrunqueue(struct proc *p)
223 {
224 	struct schedstate_percpu *spc;
225 	int queue = p->p_priority >> 2;
226 
227 	SCHED_ASSERT_LOCKED();
228 	spc = &p->p_cpu->ci_schedstate;
229 	spc->spc_nrun++;
230 
231 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
232 	spc->spc_whichqs |= (1 << queue);
233 	cpuset_add(&sched_queued_cpus, p->p_cpu);
234 
235 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
236 		cpu_unidle(p->p_cpu);
237 }
238 
239 void
240 remrunqueue(struct proc *p)
241 {
242 	struct schedstate_percpu *spc;
243 	int queue = p->p_priority >> 2;
244 
245 	SCHED_ASSERT_LOCKED();
246 	spc = &p->p_cpu->ci_schedstate;
247 	spc->spc_nrun--;
248 
249 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
250 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
251 		spc->spc_whichqs &= ~(1 << queue);
252 		if (spc->spc_whichqs == 0)
253 			cpuset_del(&sched_queued_cpus, p->p_cpu);
254 	}
255 }
256 
257 struct proc *
258 sched_chooseproc(void)
259 {
260 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
261 	struct proc *p;
262 	int queue;
263 
264 	SCHED_ASSERT_LOCKED();
265 
266 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
267 		if (spc->spc_whichqs) {
268 			for (queue = 0; queue < SCHED_NQS; queue++) {
269 				TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
270 					remrunqueue(p);
271 					p->p_cpu = sched_choosecpu(p);
272 					setrunqueue(p);
273 				}
274 			}
275 		}
276 		p = spc->spc_idleproc;
277 		KASSERT(p);
278 		KASSERT(p->p_wchan == NULL);
279 		p->p_stat = SRUN;
280 		return (p);
281 	}
282 
283 again:
284 	if (spc->spc_whichqs) {
285 		queue = ffs(spc->spc_whichqs) - 1;
286 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
287 		remrunqueue(p);
288 		sched_noidle++;
289 		KASSERT(p->p_stat == SRUN);
290 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
291 		p = spc->spc_idleproc;
292 		if (p == NULL) {
293                         int s;
294 			/*
295 			 * We get here if someone decides to switch during
296 			 * boot before forking kthreads, bleh.
297 			 * This is kind of like a stupid idle loop.
298 			 */
299 #ifdef MULTIPROCESSOR
300 			__mp_unlock(&sched_lock);
301 #endif
302 			spl0();
303 			delay(10);
304 			SCHED_LOCK(s);
305 			goto again;
306                 }
307 		KASSERT(p);
308 		p->p_stat = SRUN;
309 	}
310 
311 	KASSERT(p->p_wchan == NULL);
312 	return (p);
313 }
314 
315 struct cpu_info *
316 sched_choosecpu_fork(struct proc *parent, int flags)
317 {
318 	struct cpu_info *choice = NULL;
319 	fixpt_t load, best_load = ~0;
320 	int run, best_run = INT_MAX;
321 	struct cpu_info *ci;
322 	struct cpuset set;
323 
324 #if 0
325 	/*
326 	 * XXX
327 	 * Don't do this until we have a painless way to move the cpu in exec.
328 	 * Preferably when nuking the old pmap and getting a new one on a
329 	 * new cpu.
330 	 */
331 	/*
332 	 * PPWAIT forks are simple. We know that the parent will not
333 	 * run until we exec and choose another cpu, so we just steal its
334 	 * cpu.
335 	 */
336 	if (flags & FORK_PPWAIT)
337 		return (parent->p_cpu);
338 #endif
339 
340 	/*
341 	 * Look at all cpus that are currently idle and have nothing queued.
342 	 * If there are none, pick the one with least queued procs first,
343 	 * then the one with lowest load average.
344 	 */
345 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
346 	if (cpuset_first(&set) == NULL)
347 		cpuset_copy(&set, &sched_all_cpus);
348 
349 	while ((ci = cpuset_first(&set)) != NULL) {
350 		cpuset_del(&set, ci);
351 
352 		load = ci->ci_schedstate.spc_ldavg;
353 		run = ci->ci_schedstate.spc_nrun;
354 
355 		if (choice == NULL || run < best_run ||
356 		    (run == best_run &&load < best_load)) {
357 			choice = ci;
358 			best_load = load;
359 			best_run = run;
360 		}
361 	}
362 
363 	return (choice);
364 }
365 
366 struct cpu_info *
367 sched_choosecpu(struct proc *p)
368 {
369 	struct cpu_info *choice = NULL;
370 	int last_cost = INT_MAX;
371 	struct cpu_info *ci;
372 	struct cpuset set;
373 
374 	/*
375 	 * If pegged to a cpu, don't allow it to move.
376 	 */
377 	if (p->p_flag & P_CPUPEG)
378 		return (p->p_cpu);
379 
380 	sched_choose++;
381 
382 	/*
383 	 * Look at all cpus that are currently idle and have nothing queued.
384 	 * If there are none, pick the cheapest of those.
385 	 * (idle + queued could mean that the cpu is handling an interrupt
386 	 * at this moment and haven't had time to leave idle yet).
387 	 */
388 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
389 
390 	/*
391 	 * First, just check if our current cpu is in that set, if it is,
392 	 * this is simple.
393 	 * Also, our cpu might not be idle, but if it's the current cpu
394 	 * and it has nothing else queued and we're curproc, take it.
395 	 */
396 	if (cpuset_isset(&set, p->p_cpu) ||
397 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
398 	    curproc == p)) {
399 		sched_wasidle++;
400 		return (p->p_cpu);
401 	}
402 
403 	if (cpuset_first(&set) == NULL)
404 		cpuset_copy(&set, &sched_all_cpus);
405 
406 	while ((ci = cpuset_first(&set)) != NULL) {
407 		int cost = sched_proc_to_cpu_cost(ci, p);
408 
409 		if (choice == NULL || cost < last_cost) {
410 			choice = ci;
411 			last_cost = cost;
412 		}
413 		cpuset_del(&set, ci);
414 	}
415 
416 	if (p->p_cpu != choice)
417 		sched_nmigrations++;
418 	else
419 		sched_nomigrations++;
420 
421 	return (choice);
422 }
423 
424 /*
425  * Attempt to steal a proc from some cpu.
426  */
427 struct proc *
428 sched_steal_proc(struct cpu_info *self)
429 {
430 	struct schedstate_percpu *spc;
431 	struct proc *best = NULL;
432 	int bestcost = INT_MAX;
433 	struct cpu_info *ci;
434 	struct cpuset set;
435 
436 	cpuset_copy(&set, &sched_queued_cpus);
437 
438 	while ((ci = cpuset_first(&set)) != NULL) {
439 		struct proc *p;
440 		int queue;
441 		int cost;
442 
443 		cpuset_del(&set, ci);
444 
445 		spc = &ci->ci_schedstate;
446 
447 		queue = ffs(spc->spc_whichqs) - 1;
448 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
449 			if (p->p_flag & P_CPUPEG)
450 				continue;
451 
452 			cost = sched_proc_to_cpu_cost(self, p);
453 
454 			if (best == NULL || cost < bestcost) {
455 				best = p;
456 				bestcost = cost;
457 			}
458 		}
459 	}
460 	if (best == NULL)
461 		return (NULL);
462 
463 	spc = &best->p_cpu->ci_schedstate;
464 	remrunqueue(best);
465 	best->p_cpu = self;
466 
467 	sched_stolen++;
468 
469 	return (best);
470 }
471 
/*
 * Integer base 2 logarithm, rounded down: the position of the highest
 * set bit of `i'.  Both 0 and 1 yield 0.
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Count how many halvings it takes to get down to 1 (or 0). */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
485 
486 /*
487  * Calculate the cost of moving the proc to this cpu.
488  *
489  * What we want is some guesstimate of how much "performance" it will
490  * cost us to move the proc here. Not just for caches and TLBs and NUMA
491  * memory, but also for the proc itself. A highly loaded cpu might not
492  * be the best candidate for this proc since it won't get run.
493  *
494  * Just total guesstimates for now.
495  */
496 
497 int sched_cost_load = 1;
498 int sched_cost_priority = 1;
499 int sched_cost_runnable = 3;
500 int sched_cost_resident = 1;
501 
502 int
503 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
504 {
505 	struct schedstate_percpu *spc;
506 	int l2resident = 0;
507 	int cost;
508 
509 	spc = &ci->ci_schedstate;
510 
511 	cost = 0;
512 
513 	/*
514 	 * First, account for the priority of the proc we want to move.
515 	 * More willing to move, the lower the priority of the destination
516 	 * and the higher the priority of the proc.
517 	 */
518 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
519 		cost += (p->p_priority - spc->spc_curpriority) *
520 		    sched_cost_priority;
521 		cost += sched_cost_runnable;
522 	}
523 	if (cpuset_isset(&sched_queued_cpus, ci))
524 		cost += spc->spc_nrun * sched_cost_runnable;
525 
526 	/*
527 	 * Higher load on the destination means we don't want to go there.
528 	 */
529 	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
530 
531 	/*
532 	 * If the proc is on this cpu already, lower the cost by how much
533 	 * it has been running and an estimate of its footprint.
534 	 */
535 	if (p->p_cpu == ci && p->p_slptime == 0) {
536 		l2resident =
537 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
538 		cost -= l2resident * sched_cost_resident;
539 	}
540 
541 	return (cost);
542 }
543 
544 /*
545  * Peg a proc to a cpu.
546  */
547 void
548 sched_peg_curproc(struct cpu_info *ci)
549 {
550 	struct proc *p = curproc;
551 	int s;
552 
553 	SCHED_LOCK(s);
554 	p->p_priority = p->p_usrpri;
555 	p->p_stat = SRUN;
556 	p->p_cpu = ci;
557 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
558 	setrunqueue(p);
559 	p->p_ru.ru_nvcsw++;
560 	mi_switch();
561 	SCHED_UNLOCK(s);
562 }
563 
564 #ifdef MULTIPROCESSOR
565 
566 void
567 sched_start_secondary_cpus(void)
568 {
569 	CPU_INFO_ITERATOR cii;
570 	struct cpu_info *ci;
571 
572 	CPU_INFO_FOREACH(cii, ci) {
573 		struct schedstate_percpu *spc = &ci->ci_schedstate;
574 
575 		if (CPU_IS_PRIMARY(ci))
576 			continue;
577 		cpuset_add(&sched_all_cpus, ci);
578 		atomic_clearbits_int(&spc->spc_schedflags,
579 		    SPCF_SHOULDHALT | SPCF_HALTED);
580 	}
581 }
582 
583 void
584 sched_stop_secondary_cpus(void)
585 {
586 	CPU_INFO_ITERATOR cii;
587 	struct cpu_info *ci;
588 
589 	/*
590 	 * Make sure we stop the secondary CPUs.
591 	 */
592 	CPU_INFO_FOREACH(cii, ci) {
593 		struct schedstate_percpu *spc = &ci->ci_schedstate;
594 
595 		if (CPU_IS_PRIMARY(ci))
596 			continue;
597 		cpuset_del(&sched_all_cpus, ci);
598 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
599 	}
600 	CPU_INFO_FOREACH(cii, ci) {
601 		struct schedstate_percpu *spc = &ci->ci_schedstate;
602 		struct sleep_state sls;
603 
604 		if (CPU_IS_PRIMARY(ci))
605 			continue;
606 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
607 			sleep_setup(&sls, spc, PZERO, "schedstate");
608 			sleep_finish(&sls,
609 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
610 		}
611 	}
612 }
613 
614 #endif
615 
616 /*
617  * Functions to manipulate cpu sets.
618  */
619 struct cpu_info *cpuset_infos[MAXCPUS];
620 static struct cpuset cpuset_all;
621 
622 void
623 cpuset_init_cpu(struct cpu_info *ci)
624 {
625 	cpuset_add(&cpuset_all, ci);
626 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
627 }
628 
629 void
630 cpuset_clear(struct cpuset *cs)
631 {
632 	memset(cs, 0, sizeof(*cs));
633 }
634 
635 void
636 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
637 {
638 	unsigned int num = CPU_INFO_UNIT(ci);
639 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
640 }
641 
642 void
643 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
644 {
645 	unsigned int num = CPU_INFO_UNIT(ci);
646 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
647 }
648 
649 int
650 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
651 {
652 	unsigned int num = CPU_INFO_UNIT(ci);
653 	return (cs->cs_set[num/32] & (1 << (num % 32)));
654 }
655 
656 void
657 cpuset_add_all(struct cpuset *cs)
658 {
659 	cpuset_copy(cs, &cpuset_all);
660 }
661 
662 void
663 cpuset_copy(struct cpuset *to, struct cpuset *from)
664 {
665 	memcpy(to, from, sizeof(*to));
666 }
667 
668 struct cpu_info *
669 cpuset_first(struct cpuset *cs)
670 {
671 	int i;
672 
673 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
674 		if (cs->cs_set[i])
675 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
676 
677 	return (NULL);
678 }
679 
680 void
681 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
682 {
683 	int i;
684 
685 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
686 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
687 }
688 
689 void
690 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
691 {
692 	int i;
693 
694 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
695 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
696 }
697 
698 void
699 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
700 {
701 	int i;
702 
703 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
704 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
705 }
706