xref: /openbsd-src/sys/kern/kern_sched.c (revision 971e1bb644cc0d78fb53db085bc59be6450a74f1)
1 /*	$OpenBSD: kern_sched.c,v 1.23 2011/07/06 21:41:37 art Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 
28 #include <uvm/uvm_extern.h>
29 
30 #include <sys/malloc.h>
31 
32 
/* Callback handed to kthread_create_deferred(); forks one cpu's idle proc. */
void sched_kthreads_create(void *);

/* Guesstimate of the cost of running proc p on cpu ci (lower is better). */
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
/* Try to take a non-pegged proc off another cpu's run queue. */
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choosing which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 *
 * sched_idle_cpus:   cpus whose idle proc is in (or entering) its idle loop;
 *                    maintained by sched_idle().
 * sched_queued_cpus: cpus with at least one proc on a run queue; maintained
 *                    by setrunqueue() and remrunqueue().
 * sched_all_cpus:    cpus the scheduler may pick; filled by sched_init_cpu()
 *                    and adjusted when secondary cpus are stopped/started.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;
46 
47 /*
48  * A few notes about cpu_switchto that is implemented in MD code.
49  *
50  * cpu_switchto takes two arguments, the old proc and the proc
51  * it should switch to. The new proc will never be NULL, so we always have
52  * a saved state that we need to switch to. The old proc however can
53  * be NULL if the process is exiting. NULL for the old proc simply
54  * means "don't bother saving old state".
55  *
56  * cpu_switchto is supposed to atomically load the new state of the process
57  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
58  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
59  * cpus in the system must not depend on this state being consistent.
60  * Therefore no locking is necessary in cpu_switchto other than blocking
61  * interrupts during the context switch.
62  */
63 
64 /*
65  * sched_init_cpu is called from main() for the boot cpu, then it's the
66  * responsibility of the MD code to call it for all other cpus.
67  */
68 void
69 sched_init_cpu(struct cpu_info *ci)
70 {
71 	struct schedstate_percpu *spc = &ci->ci_schedstate;
72 	int i;
73 
74 	for (i = 0; i < SCHED_NQS; i++)
75 		TAILQ_INIT(&spc->spc_qs[i]);
76 
77 	spc->spc_idleproc = NULL;
78 
79 	kthread_create_deferred(sched_kthreads_create, ci);
80 
81 	LIST_INIT(&spc->spc_deadproc);
82 
83 	/*
84 	 * Slight hack here until the cpuset code handles cpu_info
85 	 * structures.
86 	 */
87 	cpuset_init_cpu(ci);
88 	cpuset_add(&sched_all_cpus, ci);
89 }
90 
91 void
92 sched_kthreads_create(void *v)
93 {
94 	struct cpu_info *ci = v;
95 	struct schedstate_percpu *spc = &ci->ci_schedstate;
96 	static int num;
97 
98 	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
99 		panic("fork idle");
100 
101 	num++;
102 }
103 
104 void
105 sched_idle(void *v)
106 {
107 	struct schedstate_percpu *spc;
108 	struct proc *p = curproc;
109 	struct cpu_info *ci = v;
110 	int s;
111 
112 	KERNEL_UNLOCK();
113 
114 	spc = &ci->ci_schedstate;
115 
116 	/*
117 	 * First time we enter here, we're not supposed to idle,
118 	 * just go away for a while.
119 	 */
120 	SCHED_LOCK(s);
121 	cpuset_add(&sched_idle_cpus, ci);
122 	p->p_stat = SSLEEP;
123 	p->p_cpu = ci;
124 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
125 	mi_switch();
126 	cpuset_del(&sched_idle_cpus, ci);
127 	SCHED_UNLOCK(s);
128 
129 	KASSERT(ci == curcpu());
130 	KASSERT(curproc == spc->spc_idleproc);
131 
132 	while (1) {
133 		while (!curcpu_is_idle()) {
134 			struct proc *dead;
135 
136 			SCHED_LOCK(s);
137 			p->p_stat = SSLEEP;
138 			mi_switch();
139 			SCHED_UNLOCK(s);
140 
141 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
142 				LIST_REMOVE(dead, p_hash);
143 				exit2(dead);
144 			}
145 		}
146 
147 		splassert(IPL_NONE);
148 
149 		cpuset_add(&sched_idle_cpus, ci);
150 		cpu_idle_enter();
151 		while (spc->spc_whichqs == 0) {
152 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
153 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
154 				cpuset_del(&sched_idle_cpus, ci);
155 				SCHED_LOCK(s);
156 				atomic_setbits_int(&spc->spc_schedflags,
157 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
158 				SCHED_UNLOCK(s);
159 				wakeup(spc);
160 			}
161 			cpu_idle_cycle();
162 		}
163 		cpu_idle_leave();
164 		cpuset_del(&sched_idle_cpus, ci);
165 	}
166 }
167 
168 /*
169  * To free our address space we have to jump through a few hoops.
170  * The freeing is done by the reaper, but until we have one reaper
171  * per cpu, we have no way of putting this proc on the deadproc list
172  * and waking up the reaper without risking having our address space and
173  * stack torn from under us before we manage to switch to another proc.
174  * Therefore we have a per-cpu list of dead processes where we put this
175  * proc and have idle clean up that list and move it to the reaper list.
176  * All this will be unnecessary once we can bind the reaper this cpu
177  * and not risk having it switch to another in case it sleeps.
178  */
179 void
180 sched_exit(struct proc *p)
181 {
182 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
183 	struct timeval tv;
184 	struct proc *idle;
185 	int s;
186 
187 	microuptime(&tv);
188 	timersub(&tv, &spc->spc_runtime, &tv);
189 	timeradd(&p->p_rtime, &tv, &p->p_rtime);
190 
191 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
192 
193 	/* This process no longer needs to hold the kernel lock. */
194 	KERNEL_UNLOCK();
195 
196 	SCHED_LOCK(s);
197 	idle = spc->spc_idleproc;
198 	idle->p_stat = SRUN;
199 	cpu_switchto(NULL, idle);
200 	panic("cpu_switchto returned");
201 }
202 
203 /*
204  * Run queue management.
205  */
/*
 * Intentionally empty: the per-cpu run queues are initialized in
 * sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
	/* nothing to do */
}
210 
211 void
212 setrunqueue(struct proc *p)
213 {
214 	struct schedstate_percpu *spc;
215 	int queue = p->p_priority >> 2;
216 
217 	SCHED_ASSERT_LOCKED();
218 	spc = &p->p_cpu->ci_schedstate;
219 	spc->spc_nrun++;
220 
221 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
222 	spc->spc_whichqs |= (1 << queue);
223 	cpuset_add(&sched_queued_cpus, p->p_cpu);
224 
225 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
226 		cpu_unidle(p->p_cpu);
227 }
228 
229 void
230 remrunqueue(struct proc *p)
231 {
232 	struct schedstate_percpu *spc;
233 	int queue = p->p_priority >> 2;
234 
235 	SCHED_ASSERT_LOCKED();
236 	spc = &p->p_cpu->ci_schedstate;
237 	spc->spc_nrun--;
238 
239 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
240 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
241 		spc->spc_whichqs &= ~(1 << queue);
242 		if (spc->spc_whichqs == 0)
243 			cpuset_del(&sched_queued_cpus, p->p_cpu);
244 	}
245 }
246 
247 struct proc *
248 sched_chooseproc(void)
249 {
250 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
251 	struct proc *p;
252 	int queue;
253 
254 	SCHED_ASSERT_LOCKED();
255 
256 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
257 		if (spc->spc_whichqs) {
258 			for (queue = 0; queue < SCHED_NQS; queue++) {
259 				TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
260 					remrunqueue(p);
261 					p->p_cpu = sched_choosecpu(p);
262 					setrunqueue(p);
263 				}
264 			}
265 		}
266 		p = spc->spc_idleproc;
267 		KASSERT(p);
268 		p->p_stat = SRUN;
269 		return (p);
270 	}
271 
272 again:
273 	if (spc->spc_whichqs) {
274 		queue = ffs(spc->spc_whichqs) - 1;
275 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
276 		remrunqueue(p);
277 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
278 		p = spc->spc_idleproc;
279 		if (p == NULL) {
280                         int s;
281 			/*
282 			 * We get here if someone decides to switch during
283 			 * boot before forking kthreads, bleh.
284 			 * This is kind of like a stupid idle loop.
285 			 */
286 #ifdef MULTIPROCESSOR
287 			__mp_unlock(&sched_lock);
288 #endif
289 			spl0();
290 			delay(10);
291 			SCHED_LOCK(s);
292 			goto again;
293                 }
294 		KASSERT(p);
295 		p->p_stat = SRUN;
296 	}
297 
298 	return (p);
299 }
300 
/*
 * Scheduler statistics counters.
 */
uint64_t sched_nmigrations;	/* sched_choosecpu() picked a different cpu */
uint64_t sched_noidle;		/* not updated in this file */
uint64_t sched_stolen;		/* procs taken by sched_steal_proc() */

uint64_t sched_choose;		/* sched_choosecpu() calls for non-pegged procs */
uint64_t sched_wasidle;		/* sched_choosecpu() kept the proc's cpu early */
uint64_t sched_nomigrations;	/* sched_choosecpu() search picked the same cpu */
308 
309 struct cpu_info *
310 sched_choosecpu_fork(struct proc *parent, int flags)
311 {
312 	struct cpu_info *choice = NULL;
313 	fixpt_t load, best_load = ~0;
314 	int run, best_run = INT_MAX;
315 	struct cpu_info *ci;
316 	struct cpuset set;
317 
318 #if 0
319 	/*
320 	 * XXX
321 	 * Don't do this until we have a painless way to move the cpu in exec.
322 	 * Preferably when nuking the old pmap and getting a new one on a
323 	 * new cpu.
324 	 */
325 	/*
326 	 * PPWAIT forks are simple. We know that the parent will not
327 	 * run until we exec and choose another cpu, so we just steal its
328 	 * cpu.
329 	 */
330 	if (flags & FORK_PPWAIT)
331 		return (parent->p_cpu);
332 #endif
333 
334 	/*
335 	 * Look at all cpus that are currently idle and have nothing queued.
336 	 * If there are none, pick the one with least queued procs first,
337 	 * then the one with lowest load average.
338 	 */
339 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
340 	if (cpuset_first(&set) == NULL)
341 		cpuset_copy(&set, &sched_all_cpus);
342 
343 	while ((ci = cpuset_first(&set)) != NULL) {
344 		cpuset_del(&set, ci);
345 
346 		load = ci->ci_schedstate.spc_ldavg;
347 		run = ci->ci_schedstate.spc_nrun;
348 
349 		if (choice == NULL || run < best_run ||
350 		    (run == best_run &&load < best_load)) {
351 			choice = ci;
352 			best_load = load;
353 			best_run = run;
354 		}
355 	}
356 
357 	return (choice);
358 }
359 
360 struct cpu_info *
361 sched_choosecpu(struct proc *p)
362 {
363 	struct cpu_info *choice = NULL;
364 	int last_cost = INT_MAX;
365 	struct cpu_info *ci;
366 	struct cpuset set;
367 
368 	/*
369 	 * If pegged to a cpu, don't allow it to move.
370 	 */
371 	if (p->p_flag & P_CPUPEG)
372 		return (p->p_cpu);
373 
374 	sched_choose++;
375 
376 	/*
377 	 * Look at all cpus that are currently idle and have nothing queued.
378 	 * If there are none, pick the cheapest of those.
379 	 * (idle + queued could mean that the cpu is handling an interrupt
380 	 * at this moment and haven't had time to leave idle yet).
381 	 */
382 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
383 
384 	/*
385 	 * First, just check if our current cpu is in that set, if it is,
386 	 * this is simple.
387 	 * Also, our cpu might not be idle, but if it's the current cpu
388 	 * and it has nothing else queued and we're curproc, take it.
389 	 */
390 	if (cpuset_isset(&set, p->p_cpu) ||
391 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
392 	    curproc == p)) {
393 		sched_wasidle++;
394 		return (p->p_cpu);
395 	}
396 
397 	if (cpuset_first(&set) == NULL)
398 		cpuset_copy(&set, &sched_all_cpus);
399 
400 	while ((ci = cpuset_first(&set)) != NULL) {
401 		int cost = sched_proc_to_cpu_cost(ci, p);
402 
403 		if (choice == NULL || cost < last_cost) {
404 			choice = ci;
405 			last_cost = cost;
406 		}
407 		cpuset_del(&set, ci);
408 	}
409 
410 	if (p->p_cpu != choice)
411 		sched_nmigrations++;
412 	else
413 		sched_nomigrations++;
414 
415 	return (choice);
416 }
417 
418 /*
419  * Attempt to steal a proc from some cpu.
420  */
421 struct proc *
422 sched_steal_proc(struct cpu_info *self)
423 {
424 	struct schedstate_percpu *spc;
425 	struct proc *best = NULL;
426 	int bestcost = INT_MAX;
427 	struct cpu_info *ci;
428 	struct cpuset set;
429 
430 	cpuset_copy(&set, &sched_queued_cpus);
431 
432 	while ((ci = cpuset_first(&set)) != NULL) {
433 		struct proc *p;
434 		int queue;
435 		int cost;
436 
437 		cpuset_del(&set, ci);
438 
439 		spc = &ci->ci_schedstate;
440 
441 		queue = ffs(spc->spc_whichqs) - 1;
442 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
443 			if (p->p_flag & P_CPUPEG)
444 				continue;
445 
446 			cost = sched_proc_to_cpu_cost(self, p);
447 
448 			if (best == NULL || cost < bestcost) {
449 				best = p;
450 				bestcost = cost;
451 			}
452 		}
453 	}
454 	if (best == NULL)
455 		return (NULL);
456 
457 	spc = &best->p_cpu->ci_schedstate;
458 	remrunqueue(best);
459 	best->p_cpu = self;
460 
461 	sched_stolen++;
462 
463 	return (best);
464 }
465 
/*
 * Integer base 2 logarithm, rounded down.
 * By convention returns 0 for an argument of 0 (and, correctly, for 1).
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Count how many times i can be halved before reaching 1 (or 0). */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
479 
480 /*
481  * Calculate the cost of moving the proc to this cpu.
482  *
483  * What we want is some guesstimate of how much "performance" it will
484  * cost us to move the proc here. Not just for caches and TLBs and NUMA
485  * memory, but also for the proc itself. A highly loaded cpu might not
486  * be the best candidate for this proc since it won't get run.
487  *
488  * Just total guesstimates for now.
489  */
490 
491 int sched_cost_load = 1;
492 int sched_cost_priority = 1;
493 int sched_cost_runnable = 3;
494 int sched_cost_resident = 1;
495 
496 int
497 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
498 {
499 	struct schedstate_percpu *spc;
500 	int l2resident = 0;
501 	int cost;
502 
503 	spc = &ci->ci_schedstate;
504 
505 	cost = 0;
506 
507 	/*
508 	 * First, account for the priority of the proc we want to move.
509 	 * More willing to move, the lower the priority of the destination
510 	 * and the higher the priority of the proc.
511 	 */
512 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
513 		cost += (p->p_priority - spc->spc_curpriority) *
514 		    sched_cost_priority;
515 		cost += sched_cost_runnable;
516 	}
517 	if (cpuset_isset(&sched_queued_cpus, ci)) {
518 		cost += spc->spc_nrun * sched_cost_runnable;
519 	}
520 
521 	/*
522 	 * Higher load on the destination means we don't want to go there.
523 	 */
524 	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
525 
526 	/*
527 	 * If the proc is on this cpu already, lower the cost by how much
528 	 * it has been running and an estimate of its footprint.
529 	 */
530 	if (p->p_cpu == ci && p->p_slptime == 0) {
531 		l2resident =
532 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
533 		cost -= l2resident * sched_cost_resident;
534 	}
535 
536 	return (cost);
537 }
538 
539 /*
540  * Peg a proc to a cpu.
541  */
542 void
543 sched_peg_curproc(struct cpu_info *ci)
544 {
545 	struct proc *p = curproc;
546 	int s;
547 
548 	SCHED_LOCK(s);
549 	p->p_priority = p->p_usrpri;
550 	p->p_stat = SRUN;
551 	p->p_cpu = ci;
552 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
553 	setrunqueue(p);
554 	p->p_stats->p_ru.ru_nvcsw++;
555 	mi_switch();
556 	SCHED_UNLOCK(s);
557 }
558 
559 #ifdef MULTIPROCESSOR
560 
561 void
562 sched_start_secondary_cpus(void)
563 {
564 	CPU_INFO_ITERATOR cii;
565 	struct cpu_info *ci;
566 
567 	CPU_INFO_FOREACH(cii, ci) {
568 		struct schedstate_percpu *spc = &ci->ci_schedstate;
569 
570 		if (CPU_IS_PRIMARY(ci))
571 			continue;
572 		cpuset_add(&sched_all_cpus, ci);
573 		atomic_clearbits_int(&spc->spc_schedflags,
574 		    SPCF_SHOULDHALT | SPCF_HALTED);
575 	}
576 }
577 
578 void
579 sched_stop_secondary_cpus(void)
580 {
581 	CPU_INFO_ITERATOR cii;
582 	struct cpu_info *ci;
583 
584 	/*
585 	 * Make sure we stop the secondary CPUs.
586 	 */
587 	CPU_INFO_FOREACH(cii, ci) {
588 		struct schedstate_percpu *spc = &ci->ci_schedstate;
589 
590 		if (CPU_IS_PRIMARY(ci))
591 			continue;
592 		cpuset_del(&sched_all_cpus, ci);
593 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
594 	}
595 	CPU_INFO_FOREACH(cii, ci) {
596 		struct schedstate_percpu *spc = &ci->ci_schedstate;
597 		struct sleep_state sls;
598 
599 		if (CPU_IS_PRIMARY(ci))
600 			continue;
601 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
602 			sleep_setup(&sls, spc, PZERO, "schedstate");
603 			sleep_finish(&sls,
604 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
605 		}
606 	}
607 }
608 
609 #endif
610 
611 /*
612  * Functions to manipulate cpu sets.
613  */
614 struct cpu_info *cpuset_infos[MAXCPUS];
615 static struct cpuset cpuset_all;
616 
617 void
618 cpuset_init_cpu(struct cpu_info *ci)
619 {
620 	cpuset_add(&cpuset_all, ci);
621 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
622 }
623 
624 void
625 cpuset_clear(struct cpuset *cs)
626 {
627 	memset(cs, 0, sizeof(*cs));
628 }
629 
630 void
631 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
632 {
633 	unsigned int num = CPU_INFO_UNIT(ci);
634 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
635 }
636 
637 void
638 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
639 {
640 	unsigned int num = CPU_INFO_UNIT(ci);
641 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
642 }
643 
644 int
645 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
646 {
647 	unsigned int num = CPU_INFO_UNIT(ci);
648 	return (cs->cs_set[num/32] & (1 << (num % 32)));
649 }
650 
651 void
652 cpuset_add_all(struct cpuset *cs)
653 {
654 	cpuset_copy(cs, &cpuset_all);
655 }
656 
657 void
658 cpuset_copy(struct cpuset *to, struct cpuset *from)
659 {
660 	memcpy(to, from, sizeof(*to));
661 }
662 
663 struct cpu_info *
664 cpuset_first(struct cpuset *cs)
665 {
666 	int i;
667 
668 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
669 		if (cs->cs_set[i])
670 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
671 
672 	return (NULL);
673 }
674 
675 void
676 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
677 {
678 	int i;
679 
680 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
681 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
682 }
683 
684 void
685 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
686 {
687 	int i;
688 
689 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
690 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
691 }
692 
693 void
694 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
695 {
696 	int i;
697 
698 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
699 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
700 }
701