1 /*	$OpenBSD: kern_sched.c,v 1.19 2010/04/23 21:34:38 deraadt Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 
28 #include <uvm/uvm_extern.h>
29 
30 #include <sys/malloc.h>
31 
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choosing which cpu should run which process we keep track
40  * of cpus which are currently idle and which cpus have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
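/*
 * Invariant, maintained under the scheduler lock by setrunqueue() and
 * remrunqueue() below: a cpu is a member of sched_queued_cpus exactly
 * when its spc_whichqs bitmask is non-zero.
 */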
45 
46 /*
47  * A few notes about cpu_switchto, which is implemented in MD code.
48  *
49  * cpu_switchto takes two arguments, the old proc and the proc
50  * it should switch to. The new proc will never be NULL, so we always have
51  * a saved state that we need to switch to. The old proc, however, can
52  * be NULL if the process is exiting. NULL for the old proc simply
53  * means "don't bother saving old state".
54  *
55  * cpu_switchto is supposed to atomically load the new state of the process,
56  * including the pcb and pmap, set curproc, and set the p_cpu pointer in the
57  * proc and p_stat to SONPROC. Atomic here means with respect to interrupts;
58  * other cpus in the system must not depend on this state being consistent.
59  * Therefore no locking is necessary in cpu_switchto other than blocking
60  * interrupts during the context switch.
61  */
62 
63 /*
64  * sched_init_cpu is called from main() for the boot cpu; it is then the
65  * responsibility of the MD code to call it for all other cpus.
66  */
67 void
68 sched_init_cpu(struct cpu_info *ci)
69 {
70 	struct schedstate_percpu *spc = &ci->ci_schedstate;
71 	int i;
72 
73 	for (i = 0; i < SCHED_NQS; i++)
74 		TAILQ_INIT(&spc->spc_qs[i]);
75 
76 	spc->spc_idleproc = NULL;
77 
78 	kthread_create_deferred(sched_kthreads_create, ci);
79 
80 	LIST_INIT(&spc->spc_deadproc);
81 
82 	/*
83 	 * Slight hack here until the cpuset code handles cpu_info
84 	 * structures.
85 	 */
86 	cpuset_init_cpu(ci);
87 }
88 
89 void
90 sched_kthreads_create(void *v)
91 {
92 	struct cpu_info *ci = v;
93 	struct schedstate_percpu *spc = &ci->ci_schedstate;
94 	static int num;
95 
96 	if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
97 		panic("fork idle");
98 
99 	num++;
100 }
101 
102 void
103 sched_idle(void *v)
104 {
105 	struct schedstate_percpu *spc;
106 	struct proc *p = curproc;
107 	struct cpu_info *ci = v;
108 	int s;
109 
110 	KERNEL_PROC_UNLOCK(p);
111 
112 	spc = &ci->ci_schedstate;
113 
114 	/*
115 	 * The first time we enter here we're not supposed to idle;
116 	 * just go away for a while.
117 	 */
118 	SCHED_LOCK(s);
119 	cpuset_add(&sched_idle_cpus, ci);
120 	p->p_stat = SSLEEP;
121 	p->p_cpu = ci;
122 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
123 	mi_switch();
124 	cpuset_del(&sched_idle_cpus, ci);
125 	SCHED_UNLOCK(s);
126 
127 	KASSERT(ci == curcpu());
128 	KASSERT(curproc == spc->spc_idleproc);
129 
130 	while (1) {
131 		while (!curcpu_is_idle()) {
132 			struct proc *dead;
133 
134 			SCHED_LOCK(s);
135 			p->p_stat = SSLEEP;
136 			mi_switch();
137 			SCHED_UNLOCK(s);
138 
139 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
140 				LIST_REMOVE(dead, p_hash);
141 				exit2(dead);
142 			}
143 		}
144 
145 		splassert(IPL_NONE);
146 
147 		cpuset_add(&sched_idle_cpus, ci);
148 		cpu_idle_enter();
149 		while (spc->spc_whichqs == 0) {
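			/*
			 * sched_stop_secondary_cpus() sets SPCF_SHOULDHALT and
			 * then sleeps on spc; acknowledge by marking this cpu
			 * SPCF_HALTED and waking it up.
			 */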
150 			if (spc->spc_schedflags & SPCF_SHOULDHALT) {
151 				atomic_setbits_int(&spc->spc_schedflags,
152 				    SPCF_HALTED);
153 				wakeup(spc);
154 			}
155 			cpu_idle_cycle();
156 		}
157 		cpu_idle_leave();
158 		cpuset_del(&sched_idle_cpus, ci);
159 	}
160 }
161 
162 /*
163  * To free our address space we have to jump through a few hoops.
164  * The freeing is done by the reaper, but until we have one reaper
165  * per cpu, we have no way of putting this proc on the deadproc list
166  * and waking up the reaper without risking having our address space and
167  * stack torn from under us before we manage to switch to another proc.
168  * Therefore we have a per-cpu list of dead processes where we put this
169  * proc and have the idle proc clean up that list and move it to the reaper
170  * list. All this will be unnecessary once we can bind the reaper to this
171  * cpu and not risk having it switch to another in case it sleeps.
172  */
173 void
174 sched_exit(struct proc *p)
175 {
176 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
177 	struct timeval tv;
178 	struct proc *idle;
179 	int s;
180 
181 	microuptime(&tv);
182 	timersub(&tv, &spc->spc_runtime, &tv);
183 	timeradd(&p->p_rtime, &tv, &p->p_rtime);
184 
185 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
186 
187 	/* This process no longer needs to hold the kernel lock. */
188 	KERNEL_PROC_UNLOCK(p);
189 
190 	SCHED_LOCK(s);
191 	idle = spc->spc_idleproc;
192 	idle->p_stat = SRUN;
193 	cpu_switchto(NULL, idle);
194 	panic("cpu_switchto returned");
195 }
196 
197 /*
198  * Run queue management.
199  */
200 void
201 sched_init_runqueues(void)
202 {
203 #ifdef MULTIPROCESSOR
204 	__mp_lock_init(&sched_lock);
205 #endif
206 }
207 
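/*
 * Each per-cpu run queue covers four adjacent priority values (hence the
 * p_priority >> 2 mapping into the SCHED_NQS queues), and spc_whichqs has
 * one bit per non-empty queue so sched_chooseproc() can find the best
 * queue with ffs().
 */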
208 void
209 setrunqueue(struct proc *p)
210 {
211 	struct schedstate_percpu *spc;
212 	int queue = p->p_priority >> 2;
213 
214 	SCHED_ASSERT_LOCKED();
215 	spc = &p->p_cpu->ci_schedstate;
216 	spc->spc_nrun++;
217 
218 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
219 	spc->spc_whichqs |= (1 << queue);
220 	cpuset_add(&sched_queued_cpus, p->p_cpu);
221 
222 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
223 		cpu_unidle(p->p_cpu);
224 }
225 
226 void
227 remrunqueue(struct proc *p)
228 {
229 	struct schedstate_percpu *spc;
230 	int queue = p->p_priority >> 2;
231 
232 	SCHED_ASSERT_LOCKED();
233 	spc = &p->p_cpu->ci_schedstate;
234 	spc->spc_nrun--;
235 
236 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
237 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
238 		spc->spc_whichqs &= ~(1 << queue);
239 		if (spc->spc_whichqs == 0)
240 			cpuset_del(&sched_queued_cpus, p->p_cpu);
241 	}
242 }
243 
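/*
 * Pick the next proc for this cpu to run: if the cpu is being halted,
 * hand back the idle proc; otherwise take the head of the highest-priority
 * local queue, failing that try to steal a proc from another cpu, and
 * finally fall back to the idle proc.
 */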
244 struct proc *
245 sched_chooseproc(void)
246 {
247 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
248 	struct proc *p;
249 	int queue;
250 
251 	SCHED_ASSERT_LOCKED();
252 
253 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
254 		p = spc->spc_idleproc;
255 		KASSERT(p);
256 		p->p_stat = SRUN;
257 		return (p);
258 	}
259 
260 again:
261 	if (spc->spc_whichqs) {
262 		queue = ffs(spc->spc_whichqs) - 1;
263 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
264 		remrunqueue(p);
265 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
266 		p = spc->spc_idleproc;
267 		if (p == NULL) {
268 			int s;
269 			/*
270 			 * We get here if someone decides to switch during
271 			 * boot before forking kthreads, bleh.
272 			 * This is kind of like a stupid idle loop.
273 			 */
274 #ifdef MULTIPROCESSOR
275 			__mp_unlock(&sched_lock);
276 #endif
277 			spl0();
278 			delay(10);
279 			SCHED_LOCK(s);
280 			goto again;
281 		}
282 		KASSERT(p);
283 		p->p_stat = SRUN;
284 	}
285 
286 	return (p);
287 }
288 
289 uint64_t sched_nmigrations;
290 uint64_t sched_noidle;
291 uint64_t sched_stolen;
292 
293 uint64_t sched_choose;
294 uint64_t sched_wasidle;
295 uint64_t sched_nomigrations;
296 
297 struct cpu_info *
298 sched_choosecpu_fork(struct proc *parent, int flags)
299 {
300 	struct cpu_info *choice = NULL;
301 	fixpt_t load, best_load = ~0;
302 	int run, best_run = INT_MAX;
303 	struct cpu_info *ci;
304 	struct cpuset set;
305 
306 #if 0
307 	/*
308 	 * XXX
309 	 * Don't do this until we have a painless way to move the cpu in exec.
310 	 * Preferably when nuking the old pmap and getting a new one on a
311 	 * new cpu.
312 	 */
313 	/*
314 	 * PPWAIT forks are simple. We know that the parent will not
315 	 * run until we exec and choose another cpu, so we just steal its
316 	 * cpu.
317 	 */
318 	if (flags & FORK_PPWAIT)
319 		return (parent->p_cpu);
320 #endif
321 
322 	/*
323 	 * Look at all cpus that are currently idle and have nothing queued.
324 	 * If there are none, consider all cpus. Pick the cpu with the fewest
325 	 * queued procs, breaking ties by the lowest load average.
326 	 */
327 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
328 	if (cpuset_first(&set) == NULL)
329 		cpuset_add_all(&set);
330 
331 	while ((ci = cpuset_first(&set)) != NULL) {
332 		cpuset_del(&set, ci);
333 
334 		load = ci->ci_schedstate.spc_ldavg;
335 		run = ci->ci_schedstate.spc_nrun;
336 
337 		if (choice == NULL || run < best_run ||
338 		    (run == best_run && load < best_load)) {
339 			choice = ci;
340 			best_load = load;
341 			best_run = run;
342 		}
343 	}
344 
345 	return (choice);
346 }
347 
348 struct cpu_info *
349 sched_choosecpu(struct proc *p)
350 {
351 	struct cpu_info *choice = NULL;
352 	int last_cost = INT_MAX;
353 	struct cpu_info *ci;
354 	struct cpuset set;
355 
356 	/*
357 	 * If pegged to a cpu, don't allow it to move.
358 	 */
359 	if (p->p_flag & P_CPUPEG)
360 		return (p->p_cpu);
361 
362 	sched_choose++;
363 
364 	/*
365 	 * Look at all cpus that are currently idle and have nothing queued.
366 	 * If there are none, pick the cheapest of all cpus.
367 	 * (idle + queued could mean that the cpu is handling an interrupt
368 	 * at this moment and hasn't had time to leave idle yet).
369 	 */
370 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
371 
372 	/*
373 	 * First, just check if our current cpu is in that set; if it is,
374 	 * this is simple.
375 	 * Also, our cpu might not be idle, but if it's the current cpu
376 	 * and it has nothing else queued and we're curproc, take it.
377 	 */
378 	if (cpuset_isset(&set, p->p_cpu) ||
379 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
380 	    curproc == p)) {
381 		sched_wasidle++;
382 		return (p->p_cpu);
383 	}
384 
385 	if (cpuset_first(&set) == NULL)
386 		cpuset_add_all(&set);
387 
388 	while ((ci = cpuset_first(&set)) != NULL) {
389 		int cost = sched_proc_to_cpu_cost(ci, p);
390 
391 		if (choice == NULL || cost < last_cost) {
392 			choice = ci;
393 			last_cost = cost;
394 		}
395 		cpuset_del(&set, ci);
396 	}
397 
398 	if (p->p_cpu != choice)
399 		sched_nmigrations++;
400 	else
401 		sched_nomigrations++;
402 
403 	return (choice);
404 }
405 
406 /*
407  * Attempt to steal a proc from some cpu.
408  */
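/*
 * Only the highest-priority non-empty queue of each cpu with queued procs
 * is scanned, and procs pegged to their cpu (P_CPUPEG) are never stolen.
 */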
409 struct proc *
410 sched_steal_proc(struct cpu_info *self)
411 {
412 	struct schedstate_percpu *spc;
413 	struct proc *best = NULL;
414 	int bestcost = INT_MAX;
415 	struct cpu_info *ci;
416 	struct cpuset set;
417 
418 	cpuset_copy(&set, &sched_queued_cpus);
419 
420 	while ((ci = cpuset_first(&set)) != NULL) {
421 		struct proc *p;
422 		int queue;
423 		int cost;
424 
425 		cpuset_del(&set, ci);
426 
427 		spc = &ci->ci_schedstate;
428 
429 		queue = ffs(spc->spc_whichqs) - 1;
430 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
431 			if (p->p_flag & P_CPUPEG)
432 				continue;
433 
434 			cost = sched_proc_to_cpu_cost(self, p);
435 
436 			if (best == NULL || cost < bestcost) {
437 				best = p;
438 				bestcost = cost;
439 			}
440 		}
441 	}
442 	if (best == NULL)
443 		return (NULL);
444 
445 	spc = &best->p_cpu->ci_schedstate;
446 	remrunqueue(best);
447 	best->p_cpu = self;
448 
449 	sched_stolen++;
450 
451 	return (best);
452 }
453 
454 /*
455  * Base 2 logarithm of an int. Returns 0 for 0 (yes, yes, I know).
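 * Truncates, e.g. log2(1) == 0, log2(8) == 3 and log2(9) == 3.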
456  */
457 static int
458 log2(unsigned int i)
459 {
460 	int ret = 0;
461 
462 	while (i >>= 1)
463 		ret++;
464 
465 	return (ret);
466 }
467 
468 /*
469  * Calculate the cost of moving the proc to this cpu.
470  *
471  * What we want is some guesstimate of how much "performance" it will
472  * cost us to move the proc here. Not just for caches and TLBs and NUMA
473  * memory, but also for the proc itself. A highly loaded cpu might not
474  * be the best candidate for this proc since it won't get run.
475  *
476  * Just total guesstimates for now.
477  */
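/*
 * Illustrative example with made-up numbers: moving a proc with p_priority
 * 50 to a non-idle cpu running at curpriority 40, with 2 procs queued and
 * a load average of 1.0 (spc_ldavg == FSCALE), costs roughly
 *   (50 - 40) * sched_cost_priority + sched_cost_runnable   (10 + 3)
 *   + 2 * sched_cost_runnable                                (+ 6)
 *   + (sched_cost_load * spc_ldavg) >> FSHIFT                (+ 1)
 * for a total of 20. If the proc is already on that cpu and has not slept,
 * the cost is further reduced by log2(resident pages) * sched_cost_resident.
 */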
478 
479 int sched_cost_load = 1;
480 int sched_cost_priority = 1;
481 int sched_cost_runnable = 3;
482 int sched_cost_resident = 1;
483 
484 int
485 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
486 {
487 	struct schedstate_percpu *spc;
488 	int l2resident = 0;
489 	int cost;
490 
491 	spc = &ci->ci_schedstate;
492 
493 	cost = 0;
494 
495 	/*
496 	 * First, account for the priority of the proc we want to move.
497 	 * The lower the priority of what runs on the destination and the
498 	 * higher the priority of the proc, the more willing we are to move.
499 	 */
500 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
501 		cost += (p->p_priority - spc->spc_curpriority) *
502 		    sched_cost_priority;
503 		cost += sched_cost_runnable;
504 	}
505 	if (cpuset_isset(&sched_queued_cpus, ci)) {
506 		cost += spc->spc_nrun * sched_cost_runnable;
507 	}
508 
509 	/*
510 	 * Higher load on the destination means we don't want to go there.
511 	 */
512 	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
513 
514 	/*
515 	 * If the proc is on this cpu already and has run recently (it has
516 	 * not slept since), lower the cost by an estimate of its footprint.
517 	 */
518 	if (p->p_cpu == ci && p->p_slptime == 0) {
519 		l2resident =
520 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
521 		cost -= l2resident * sched_cost_resident;
522 	}
523 
524 	return (cost);
525 }
526 
527 /*
528  * Peg a proc to a cpu.
529  */
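/*
 * This works by marking curproc P_CPUPEG (which sched_choosecpu() and
 * sched_steal_proc() honor), putting it on the target cpu's run queue and
 * switching away; the proc resumes execution on the target cpu.
 */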
530 void
531 sched_peg_curproc(struct cpu_info *ci)
532 {
533 	struct proc *p = curproc;
534 	int s;
535 
536 	SCHED_LOCK(s);
537 	p->p_priority = p->p_usrpri;
538 	p->p_stat = SRUN;
539 	p->p_cpu = ci;
540 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
541 	setrunqueue(p);
542 	p->p_stats->p_ru.ru_nvcsw++;
543 	mi_switch();
544 	SCHED_UNLOCK(s);
545 }
546 
547 #ifdef MULTIPROCESSOR
548 
549 void
550 sched_start_secondary_cpus(void)
551 {
552 	CPU_INFO_ITERATOR cii;
553 	struct cpu_info *ci;
554 
555 	CPU_INFO_FOREACH(cii, ci) {
556 		struct schedstate_percpu *spc = &ci->ci_schedstate;
557 
558 		if (CPU_IS_PRIMARY(ci))
559 			continue;
560 		atomic_clearbits_int(&spc->spc_schedflags,
561 		    SPCF_SHOULDHALT | SPCF_HALTED);
562 	}
563 }
564 
565 void
566 sched_stop_secondary_cpus(void)
567 {
568 	CPU_INFO_ITERATOR cii;
569 	struct cpu_info *ci;
570 
571 	/*
572 	 * Make sure we stop the secondary CPUs.
573 	 */
574 	CPU_INFO_FOREACH(cii, ci) {
575 		struct schedstate_percpu *spc = &ci->ci_schedstate;
576 
577 		if (CPU_IS_PRIMARY(ci))
578 			continue;
579 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
580 	}
581 	CPU_INFO_FOREACH(cii, ci) {
582 		struct schedstate_percpu *spc = &ci->ci_schedstate;
583 		struct sleep_state sls;
584 
585 		if (CPU_IS_PRIMARY(ci))
586 			continue;
587 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
588 			sleep_setup(&sls, spc, PZERO, "schedstate");
589 			sleep_finish(&sls,
590 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
591 		}
592 	}
593 }
594 
595 #endif
596 
597 /*
598  * Functions to manipulate cpu sets.
599  */
600 struct cpu_info *cpuset_infos[MAXCPUS];
601 static struct cpuset cpuset_all;
602 
603 void
604 cpuset_init_cpu(struct cpu_info *ci)
605 {
606 	cpuset_add(&cpuset_all, ci);
607 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
608 }
609 
610 void
611 cpuset_clear(struct cpuset *cs)
612 {
613 	memset(cs, 0, sizeof(*cs));
614 }
615 
616 /*
617  * XXX - implement it on SP architectures too
618  */
619 #ifndef CPU_INFO_UNIT
620 #define CPU_INFO_UNIT(ci)	0
621 #endif
622 
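/*
 * A cpuset is a bitmask indexed by CPU_INFO_UNIT: cpu N lives in 32-bit
 * word N / 32, bit N % 32, so the operations below can use the atomic
 * single-word bit routines.
 */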
623 void
624 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
625 {
626 	unsigned int num = CPU_INFO_UNIT(ci);
627 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
628 }
629 
630 void
631 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
632 {
633 	unsigned int num = CPU_INFO_UNIT(ci);
634 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
635 }
636 
637 int
638 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
639 {
640 	unsigned int num = CPU_INFO_UNIT(ci);
641 	return (cs->cs_set[num/32] & (1 << (num % 32)));
642 }
643 
644 void
645 cpuset_add_all(struct cpuset *cs)
646 {
647 	cpuset_copy(cs, &cpuset_all);
648 }
649 
650 void
651 cpuset_copy(struct cpuset *to, struct cpuset *from)
652 {
653 	memcpy(to, from, sizeof(*to));
654 }
655 
656 struct cpu_info *
657 cpuset_first(struct cpuset *cs)
658 {
659 	int i;
660 
661 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
662 		if (cs->cs_set[i])
663 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
664 
665 	return (NULL);
666 }
667 
668 void
669 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
670 {
671 	int i;
672 
673 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
674 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
675 }
676 
677 void
678 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
679 {
680 	int i;
681 
682 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
683 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
684 }
685 
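/*
 * Note the argument order: the result is b with the members of a removed
 * (b & ~a); e.g. complement(to, queued, idle) yields the idle cpus that
 * have nothing queued.
 */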
686 void
687 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
688 {
689 	int i;
690 
691 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
692 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
693 }
694