/*	$OpenBSD: kern_sched.c,v 1.20 2010/05/14 18:47:56 kettenis Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

#include <sys/malloc.h>


void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track of
 * cpus which are currently idle and of cpus which have processes queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. This only needs to be atomic with respect to
 * interrupts; other cpus in the system must not depend on this state being
 * consistent. Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
}

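/*
 * Deferred from sched_init_cpu(): fork the per-cpu idle kthread once
 * kthread creation is possible.
 */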
void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
		panic("fork idle");

	num++;
}

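/*
 * Body of the per-cpu idle proc. While there is work on the run queue it
 * keeps yielding the cpu and reaps procs queued on spc_deadproc by
 * sched_exit(); once the cpu is truly idle it marks itself in
 * sched_idle_cpus and spins in cpu_idle_cycle() until new work arrives
 * or the cpu is asked to halt.
 */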
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_PROC_UNLOCK(p);

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
			if (spc->spc_schedflags & SPCF_SHOULDHALT) {
				atomic_setbits_int(&spc->spc_schedflags,
				    SPCF_HALTED);
				wakeup(spc);
			}
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timeval tv;
	struct proc *idle;
	int s;

	microuptime(&tv);
	timersub(&tv, &spc->spc_runtime, &tv);
	timeradd(&p->p_rtime, &tv, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_PROC_UNLOCK(p);

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

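/*
 * Put p on the run queue of the cpu recorded in p->p_cpu. Priorities are
 * grouped four per queue (p_priority >> 2) and spc_whichqs keeps a bit
 * set for every non-empty queue. If the target cpu is idle, kick it.
 */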
void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

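/*
 * Take p off its run queue, clearing the queue's bit in spc_whichqs when
 * the queue drains and the cpu's bit in sched_queued_cpus when nothing
 * at all is left queued on it.
 */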
void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

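/*
 * Pick the next proc to run on this cpu: when the cpu is being halted,
 * hand back the idle proc; otherwise take the head of the highest-priority
 * non-empty queue, failing that try to steal a proc from another cpu, and
 * as a last resort fall back to the idle proc.
 */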
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		p = spc->spc_idleproc;
		KASSERT(p);
		p->p_stat = SRUN;
		return (p);
	}

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	return (p);
}

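/*
 * Bookkeeping counters for the cpu placement decisions below.
 */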
uint64_t sched_nmigrations;
uint64_t sched_noidle;
uint64_t sched_stolen;

uint64_t sched_choose;
uint64_t sched_wasidle;
uint64_t sched_nomigrations;

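/*
 * Pick a cpu for a newly forked proc: prefer a cpu that is idle with
 * nothing queued; otherwise take the one with the fewest runnable procs,
 * breaking ties by load average.
 */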
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus instead. Pick the one with
	 * the fewest queued procs first, then the one with the lowest load
	 * average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
}

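/*
 * Pick a cpu for an already existing proc, weighing each candidate with
 * sched_proc_to_cpu_cost() so that the cost of migrating away from the
 * proc's current cpu is taken into account.
 */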
struct cpu_info *
sched_choosecpu(struct proc *p)
{
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest of
	 * those.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Our cpu might not be idle, but if it's the current cpu, nothing
	 * else is queued on it and we are curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_add_all(&set);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct schedstate_percpu *spc;
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}

/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *spc;
	int l2resident = 0;
	int cost;

	spc = &ci->ci_schedstate;

	cost = 0;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority of the destination and the higher the
	 * priority of the proc, the more willing we are to move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci)) {
		cost += spc->spc_nrun * sched_cost_runnable;
	}

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}

	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

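/*
 * Let the secondary cpus schedule again by clearing the halt flags set
 * by sched_stop_secondary_cpus().
 */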
void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

/*
 * XXX - implement it on SP architectures too
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci)	0
#endif

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

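/*
 * Return the lowest-numbered cpu in the set, or NULL if the set is empty.
 */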
struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

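/*
 * Note the argument order: this computes "b and not a", i.e. the members
 * of b that are not in a.
 */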
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
691