xref: /openbsd-src/sys/kern/kern_sched.c (revision 1a90c3d64e1f00092f47e9fc533235225576ebe1)
1 /*	$OpenBSD: kern_sched.c,v 1.24 2011/10/12 18:30:09 miod Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 
28 #include <uvm/uvm_extern.h>
29 
30 #include <sys/malloc.h>
31 
32 
/* Callback handed to kthread_create_deferred(); forks a cpu's idle thread. */
void	sched_kthreads_create(void *);

/* Helpers for the cpu-selection and proc-stealing code further down. */
int	sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choosing which cpu should run which process we keep track
40  * of cpus which are currently idle and which cpus have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
45 struct cpuset sched_all_cpus;
46 
/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. "Atomically" here means atomically with
 * respect to interrupts only; other cpus in the system must not depend on
 * this state being consistent.
 * Therefore no locking is necessary in cpu_switchto other than blocking
 * interrupts during the context switch.
 */
63 
64 /*
65  * sched_init_cpu is called from main() for the boot cpu, then it's the
66  * responsibility of the MD code to call it for all other cpus.
67  */
68 void
69 sched_init_cpu(struct cpu_info *ci)
70 {
71 	struct schedstate_percpu *spc = &ci->ci_schedstate;
72 	int i;
73 
74 	for (i = 0; i < SCHED_NQS; i++)
75 		TAILQ_INIT(&spc->spc_qs[i]);
76 
77 	spc->spc_idleproc = NULL;
78 
79 	kthread_create_deferred(sched_kthreads_create, ci);
80 
81 	LIST_INIT(&spc->spc_deadproc);
82 
83 	/*
84 	 * Slight hack here until the cpuset code handles cpu_info
85 	 * structures.
86 	 */
87 	cpuset_init_cpu(ci);
88 	cpuset_add(&sched_all_cpus, ci);
89 }
90 
91 void
92 sched_kthreads_create(void *v)
93 {
94 	struct cpu_info *ci = v;
95 	struct schedstate_percpu *spc = &ci->ci_schedstate;
96 	static int num;
97 
98 	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
99 		panic("fork idle");
100 
101 	num++;
102 }
103 
104 void
105 sched_idle(void *v)
106 {
107 	struct schedstate_percpu *spc;
108 	struct proc *p = curproc;
109 	struct cpu_info *ci = v;
110 	int s;
111 
112 	KERNEL_UNLOCK();
113 
114 	spc = &ci->ci_schedstate;
115 
116 	/*
117 	 * First time we enter here, we're not supposed to idle,
118 	 * just go away for a while.
119 	 */
120 	SCHED_LOCK(s);
121 	cpuset_add(&sched_idle_cpus, ci);
122 	p->p_stat = SSLEEP;
123 	p->p_cpu = ci;
124 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
125 	mi_switch();
126 	cpuset_del(&sched_idle_cpus, ci);
127 	SCHED_UNLOCK(s);
128 
129 	KASSERT(ci == curcpu());
130 	KASSERT(curproc == spc->spc_idleproc);
131 
132 	while (1) {
133 		while (!curcpu_is_idle()) {
134 			struct proc *dead;
135 
136 			SCHED_LOCK(s);
137 			p->p_stat = SSLEEP;
138 			mi_switch();
139 			SCHED_UNLOCK(s);
140 
141 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
142 				LIST_REMOVE(dead, p_hash);
143 				exit2(dead);
144 			}
145 		}
146 
147 		splassert(IPL_NONE);
148 
149 		cpuset_add(&sched_idle_cpus, ci);
150 		cpu_idle_enter();
151 		while (spc->spc_whichqs == 0) {
152 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
153 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
154 				cpuset_del(&sched_idle_cpus, ci);
155 				SCHED_LOCK(s);
156 				atomic_setbits_int(&spc->spc_schedflags,
157 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
158 				SCHED_UNLOCK(s);
159 				wakeup(spc);
160 			}
161 			cpu_idle_cycle();
162 		}
163 		cpu_idle_leave();
164 		cpuset_del(&sched_idle_cpus, ci);
165 	}
166 }
167 
168 /*
169  * To free our address space we have to jump through a few hoops.
170  * The freeing is done by the reaper, but until we have one reaper
171  * per cpu, we have no way of putting this proc on the deadproc list
172  * and waking up the reaper without risking having our address space and
173  * stack torn from under us before we manage to switch to another proc.
174  * Therefore we have a per-cpu list of dead processes where we put this
175  * proc and have idle clean up that list and move it to the reaper list.
176  * All this will be unnecessary once we can bind the reaper this cpu
177  * and not risk having it switch to another in case it sleeps.
178  */
179 void
180 sched_exit(struct proc *p)
181 {
182 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
183 	struct timeval tv;
184 	struct proc *idle;
185 	int s;
186 
187 	microuptime(&tv);
188 	timersub(&tv, &spc->spc_runtime, &tv);
189 	timeradd(&p->p_rtime, &tv, &p->p_rtime);
190 
191 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
192 
193 	/* This process no longer needs to hold the kernel lock. */
194 	KERNEL_UNLOCK();
195 
196 	SCHED_LOCK(s);
197 	idle = spc->spc_idleproc;
198 	idle->p_stat = SRUN;
199 	cpu_switchto(NULL, idle);
200 	panic("cpu_switchto returned");
201 }
202 
/*
 * Run queue management.
 */

/*
 * Intentionally empty: the per-cpu run queues are initialized in
 * sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
	return;
}
210 
211 void
212 setrunqueue(struct proc *p)
213 {
214 	struct schedstate_percpu *spc;
215 	int queue = p->p_priority >> 2;
216 
217 	SCHED_ASSERT_LOCKED();
218 	spc = &p->p_cpu->ci_schedstate;
219 	spc->spc_nrun++;
220 
221 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
222 	spc->spc_whichqs |= (1 << queue);
223 	cpuset_add(&sched_queued_cpus, p->p_cpu);
224 
225 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
226 		cpu_unidle(p->p_cpu);
227 }
228 
229 void
230 remrunqueue(struct proc *p)
231 {
232 	struct schedstate_percpu *spc;
233 	int queue = p->p_priority >> 2;
234 
235 	SCHED_ASSERT_LOCKED();
236 	spc = &p->p_cpu->ci_schedstate;
237 	spc->spc_nrun--;
238 
239 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
240 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
241 		spc->spc_whichqs &= ~(1 << queue);
242 		if (spc->spc_whichqs == 0)
243 			cpuset_del(&sched_queued_cpus, p->p_cpu);
244 	}
245 }
246 
247 struct proc *
248 sched_chooseproc(void)
249 {
250 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
251 	struct proc *p;
252 	int queue;
253 
254 	SCHED_ASSERT_LOCKED();
255 
256 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
257 		if (spc->spc_whichqs) {
258 			for (queue = 0; queue < SCHED_NQS; queue++) {
259 				TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
260 					remrunqueue(p);
261 					p->p_cpu = sched_choosecpu(p);
262 					setrunqueue(p);
263 				}
264 			}
265 		}
266 		p = spc->spc_idleproc;
267 		KASSERT(p);
268 		KASSERT(p->p_wchan == NULL);
269 		p->p_stat = SRUN;
270 		return (p);
271 	}
272 
273 again:
274 	if (spc->spc_whichqs) {
275 		queue = ffs(spc->spc_whichqs) - 1;
276 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
277 		remrunqueue(p);
278 		KASSERT(p->p_stat == SRUN);
279 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
280 		p = spc->spc_idleproc;
281 		if (p == NULL) {
282                         int s;
283 			/*
284 			 * We get here if someone decides to switch during
285 			 * boot before forking kthreads, bleh.
286 			 * This is kind of like a stupid idle loop.
287 			 */
288 #ifdef MULTIPROCESSOR
289 			__mp_unlock(&sched_lock);
290 #endif
291 			spl0();
292 			delay(10);
293 			SCHED_LOCK(s);
294 			goto again;
295                 }
296 		KASSERT(p);
297 		p->p_stat = SRUN;
298 	}
299 
300 	KASSERT(p->p_wchan == NULL);
301 	return (p);
302 }
303 
/*
 * Scheduler statistics counters.
 */
uint64_t sched_nmigrations;	/* sched_choosecpu() picked a different cpu */
uint64_t sched_noidle;		/* not updated anywhere in this file */
uint64_t sched_stolen;		/* procs taken by sched_steal_proc() */

uint64_t sched_choose;		/* sched_choosecpu() calls for unpegged procs */
uint64_t sched_wasidle;		/* sched_choosecpu() kept the proc's own cpu */
uint64_t sched_nomigrations;	/* cost search picked the proc's own cpu */
311 
312 struct cpu_info *
313 sched_choosecpu_fork(struct proc *parent, int flags)
314 {
315 	struct cpu_info *choice = NULL;
316 	fixpt_t load, best_load = ~0;
317 	int run, best_run = INT_MAX;
318 	struct cpu_info *ci;
319 	struct cpuset set;
320 
321 #if 0
322 	/*
323 	 * XXX
324 	 * Don't do this until we have a painless way to move the cpu in exec.
325 	 * Preferably when nuking the old pmap and getting a new one on a
326 	 * new cpu.
327 	 */
328 	/*
329 	 * PPWAIT forks are simple. We know that the parent will not
330 	 * run until we exec and choose another cpu, so we just steal its
331 	 * cpu.
332 	 */
333 	if (flags & FORK_PPWAIT)
334 		return (parent->p_cpu);
335 #endif
336 
337 	/*
338 	 * Look at all cpus that are currently idle and have nothing queued.
339 	 * If there are none, pick the one with least queued procs first,
340 	 * then the one with lowest load average.
341 	 */
342 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
343 	if (cpuset_first(&set) == NULL)
344 		cpuset_copy(&set, &sched_all_cpus);
345 
346 	while ((ci = cpuset_first(&set)) != NULL) {
347 		cpuset_del(&set, ci);
348 
349 		load = ci->ci_schedstate.spc_ldavg;
350 		run = ci->ci_schedstate.spc_nrun;
351 
352 		if (choice == NULL || run < best_run ||
353 		    (run == best_run &&load < best_load)) {
354 			choice = ci;
355 			best_load = load;
356 			best_run = run;
357 		}
358 	}
359 
360 	return (choice);
361 }
362 
363 struct cpu_info *
364 sched_choosecpu(struct proc *p)
365 {
366 	struct cpu_info *choice = NULL;
367 	int last_cost = INT_MAX;
368 	struct cpu_info *ci;
369 	struct cpuset set;
370 
371 	/*
372 	 * If pegged to a cpu, don't allow it to move.
373 	 */
374 	if (p->p_flag & P_CPUPEG)
375 		return (p->p_cpu);
376 
377 	sched_choose++;
378 
379 	/*
380 	 * Look at all cpus that are currently idle and have nothing queued.
381 	 * If there are none, pick the cheapest of those.
382 	 * (idle + queued could mean that the cpu is handling an interrupt
383 	 * at this moment and haven't had time to leave idle yet).
384 	 */
385 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
386 
387 	/*
388 	 * First, just check if our current cpu is in that set, if it is,
389 	 * this is simple.
390 	 * Also, our cpu might not be idle, but if it's the current cpu
391 	 * and it has nothing else queued and we're curproc, take it.
392 	 */
393 	if (cpuset_isset(&set, p->p_cpu) ||
394 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
395 	    curproc == p)) {
396 		sched_wasidle++;
397 		return (p->p_cpu);
398 	}
399 
400 	if (cpuset_first(&set) == NULL)
401 		cpuset_copy(&set, &sched_all_cpus);
402 
403 	while ((ci = cpuset_first(&set)) != NULL) {
404 		int cost = sched_proc_to_cpu_cost(ci, p);
405 
406 		if (choice == NULL || cost < last_cost) {
407 			choice = ci;
408 			last_cost = cost;
409 		}
410 		cpuset_del(&set, ci);
411 	}
412 
413 	if (p->p_cpu != choice)
414 		sched_nmigrations++;
415 	else
416 		sched_nomigrations++;
417 
418 	return (choice);
419 }
420 
421 /*
422  * Attempt to steal a proc from some cpu.
423  */
424 struct proc *
425 sched_steal_proc(struct cpu_info *self)
426 {
427 	struct schedstate_percpu *spc;
428 	struct proc *best = NULL;
429 	int bestcost = INT_MAX;
430 	struct cpu_info *ci;
431 	struct cpuset set;
432 
433 	cpuset_copy(&set, &sched_queued_cpus);
434 
435 	while ((ci = cpuset_first(&set)) != NULL) {
436 		struct proc *p;
437 		int queue;
438 		int cost;
439 
440 		cpuset_del(&set, ci);
441 
442 		spc = &ci->ci_schedstate;
443 
444 		queue = ffs(spc->spc_whichqs) - 1;
445 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
446 			if (p->p_flag & P_CPUPEG)
447 				continue;
448 
449 			cost = sched_proc_to_cpu_cost(self, p);
450 
451 			if (best == NULL || cost < bestcost) {
452 				best = p;
453 				bestcost = cost;
454 			}
455 		}
456 	}
457 	if (best == NULL)
458 		return (NULL);
459 
460 	spc = &best->p_cpu->ci_schedstate;
461 	remrunqueue(best);
462 	best->p_cpu = self;
463 
464 	sched_stolen++;
465 
466 	return (best);
467 }
468 
/*
 * Integer base 2 logarithm, rounded down: the index of the highest set
 * bit of i.  Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Count how many times i can be halved before it reaches 1 (or 0). */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
482 
483 /*
484  * Calculate the cost of moving the proc to this cpu.
485  *
486  * What we want is some guesstimate of how much "performance" it will
487  * cost us to move the proc here. Not just for caches and TLBs and NUMA
488  * memory, but also for the proc itself. A highly loaded cpu might not
489  * be the best candidate for this proc since it won't get run.
490  *
491  * Just total guesstimates for now.
492  */
493 
494 int sched_cost_load = 1;
495 int sched_cost_priority = 1;
496 int sched_cost_runnable = 3;
497 int sched_cost_resident = 1;
498 
499 int
500 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
501 {
502 	struct schedstate_percpu *spc;
503 	int l2resident = 0;
504 	int cost;
505 
506 	spc = &ci->ci_schedstate;
507 
508 	cost = 0;
509 
510 	/*
511 	 * First, account for the priority of the proc we want to move.
512 	 * More willing to move, the lower the priority of the destination
513 	 * and the higher the priority of the proc.
514 	 */
515 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
516 		cost += (p->p_priority - spc->spc_curpriority) *
517 		    sched_cost_priority;
518 		cost += sched_cost_runnable;
519 	}
520 	if (cpuset_isset(&sched_queued_cpus, ci)) {
521 		cost += spc->spc_nrun * sched_cost_runnable;
522 	}
523 
524 	/*
525 	 * Higher load on the destination means we don't want to go there.
526 	 */
527 	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
528 
529 	/*
530 	 * If the proc is on this cpu already, lower the cost by how much
531 	 * it has been running and an estimate of its footprint.
532 	 */
533 	if (p->p_cpu == ci && p->p_slptime == 0) {
534 		l2resident =
535 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
536 		cost -= l2resident * sched_cost_resident;
537 	}
538 
539 	return (cost);
540 }
541 
542 /*
543  * Peg a proc to a cpu.
544  */
545 void
546 sched_peg_curproc(struct cpu_info *ci)
547 {
548 	struct proc *p = curproc;
549 	int s;
550 
551 	SCHED_LOCK(s);
552 	p->p_priority = p->p_usrpri;
553 	p->p_stat = SRUN;
554 	p->p_cpu = ci;
555 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
556 	setrunqueue(p);
557 	p->p_stats->p_ru.ru_nvcsw++;
558 	mi_switch();
559 	SCHED_UNLOCK(s);
560 }
561 
562 #ifdef MULTIPROCESSOR
563 
564 void
565 sched_start_secondary_cpus(void)
566 {
567 	CPU_INFO_ITERATOR cii;
568 	struct cpu_info *ci;
569 
570 	CPU_INFO_FOREACH(cii, ci) {
571 		struct schedstate_percpu *spc = &ci->ci_schedstate;
572 
573 		if (CPU_IS_PRIMARY(ci))
574 			continue;
575 		cpuset_add(&sched_all_cpus, ci);
576 		atomic_clearbits_int(&spc->spc_schedflags,
577 		    SPCF_SHOULDHALT | SPCF_HALTED);
578 	}
579 }
580 
581 void
582 sched_stop_secondary_cpus(void)
583 {
584 	CPU_INFO_ITERATOR cii;
585 	struct cpu_info *ci;
586 
587 	/*
588 	 * Make sure we stop the secondary CPUs.
589 	 */
590 	CPU_INFO_FOREACH(cii, ci) {
591 		struct schedstate_percpu *spc = &ci->ci_schedstate;
592 
593 		if (CPU_IS_PRIMARY(ci))
594 			continue;
595 		cpuset_del(&sched_all_cpus, ci);
596 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
597 	}
598 	CPU_INFO_FOREACH(cii, ci) {
599 		struct schedstate_percpu *spc = &ci->ci_schedstate;
600 		struct sleep_state sls;
601 
602 		if (CPU_IS_PRIMARY(ci))
603 			continue;
604 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
605 			sleep_setup(&sls, spc, PZERO, "schedstate");
606 			sleep_finish(&sls,
607 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
608 		}
609 	}
610 }
611 
612 #endif
613 
614 /*
615  * Functions to manipulate cpu sets.
616  */
617 struct cpu_info *cpuset_infos[MAXCPUS];
618 static struct cpuset cpuset_all;
619 
620 void
621 cpuset_init_cpu(struct cpu_info *ci)
622 {
623 	cpuset_add(&cpuset_all, ci);
624 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
625 }
626 
627 void
628 cpuset_clear(struct cpuset *cs)
629 {
630 	memset(cs, 0, sizeof(*cs));
631 }
632 
633 void
634 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
635 {
636 	unsigned int num = CPU_INFO_UNIT(ci);
637 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
638 }
639 
640 void
641 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
642 {
643 	unsigned int num = CPU_INFO_UNIT(ci);
644 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
645 }
646 
647 int
648 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
649 {
650 	unsigned int num = CPU_INFO_UNIT(ci);
651 	return (cs->cs_set[num/32] & (1 << (num % 32)));
652 }
653 
654 void
655 cpuset_add_all(struct cpuset *cs)
656 {
657 	cpuset_copy(cs, &cpuset_all);
658 }
659 
660 void
661 cpuset_copy(struct cpuset *to, struct cpuset *from)
662 {
663 	memcpy(to, from, sizeof(*to));
664 }
665 
666 struct cpu_info *
667 cpuset_first(struct cpuset *cs)
668 {
669 	int i;
670 
671 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
672 		if (cs->cs_set[i])
673 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
674 
675 	return (NULL);
676 }
677 
678 void
679 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
680 {
681 	int i;
682 
683 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
684 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
685 }
686 
687 void
688 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
689 {
690 	int i;
691 
692 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
693 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
694 }
695 
696 void
697 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
698 {
699 	int i;
700 
701 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
702 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
703 }
704