1 /*	$OpenBSD: kern_sched.c,v 1.93 2023/10/24 13:20:11 claudio Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/time.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choose which cpu should run which process, we keep track
40  * of the cpus that are currently idle and the cpus that have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
45 struct cpuset sched_all_cpus;
46 
47 /*
48  * Some general scheduler counters.
49  */
50 uint64_t sched_nmigrations;	/* Times we migrated a proc to another cpu */
51 uint64_t sched_nomigrations;	/* Times a proc stayed on the same cpu */
52 uint64_t sched_noidle;		/* Times we didn't pick the idle task */
53 uint64_t sched_stolen;		/* Times we stole a proc from another cpu */
54 uint64_t sched_choose;		/* Times we chose a cpu */
55 uint64_t sched_wasidle;		/* Times we came out of idle */
56 
57 int sched_smt;
58 
59 /*
60  * A few notes about cpu_switchto that is implemented in MD code.
61  *
62  * cpu_switchto takes two arguments, the old proc and the proc
63  * it should switch to. The new proc will never be NULL, so we always have
64  * a saved state that we need to switch to. The old proc however can
65  * be NULL if the process is exiting. NULL for the old proc simply
66  * means "don't bother saving old state".
67  *
68  * cpu_switchto is supposed to atomically load the new state of the process,
69  * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
70  * proc and p_stat to SONPROC. "Atomically" here only means with respect to
71  * interrupts; other cpus in the system must not depend on this state being
72  * consistent. Therefore no locking is necessary in cpu_switchto other than
73  * blocking interrupts during the context switch.
74  */
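
/*
 * Purely illustrative, architecture-neutral sketch of that contract.  The
 * md_save_context()/md_load_context() helpers are hypothetical placeholders,
 * not real MD interfaces:
 *
 *	void
 *	cpu_switchto(struct proc *old, struct proc *new)
 *	{
 *		block interrupts;
 *
 *		if (old != NULL)
 *			md_save_context(old);		save registers/pcb
 *
 *		new->p_cpu = curcpu();
 *		curcpu()->ci_curproc = new;		what curproc reads
 *		new->p_stat = SONPROC;
 *		md_load_context(new);			load pcb, switch pmap
 *
 *		restore interrupts;
 *	}
 */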
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu, then it's the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	spc->spc_itimer = clockintr_establish(ci, itimer_update, NULL);
92 	if (spc->spc_itimer == NULL)
93 		panic("%s: clockintr_establish itimer_update", __func__);
94 	spc->spc_profclock = clockintr_establish(ci, profclock, NULL);
95 	if (spc->spc_profclock == NULL)
96 		panic("%s: clockintr_establish profclock", __func__);
97 	spc->spc_roundrobin = clockintr_establish(ci, roundrobin, NULL);
98 	if (spc->spc_roundrobin == NULL)
99 		panic("%s: clockintr_establish roundrobin", __func__);
100 	spc->spc_statclock = clockintr_establish(ci, statclock, NULL);
101 	if (spc->spc_statclock == NULL)
102 		panic("%s: clockintr_establish statclock", __func__);
103 
104 	kthread_create_deferred(sched_kthreads_create, ci);
105 
106 	LIST_INIT(&spc->spc_deadproc);
107 	SIMPLEQ_INIT(&spc->spc_deferred);
108 
109 	/*
110 	 * Slight hack here until the cpuset code handles cpu_info
111 	 * structures.
112 	 */
113 	cpuset_init_cpu(ci);
114 
115 #ifdef __HAVE_CPU_TOPOLOGY
116 	if (!sched_smt && ci->ci_smt_id > 0)
117 		return;
118 #endif
119 	cpuset_add(&sched_all_cpus, ci);
120 }
121 
122 void
123 sched_kthreads_create(void *v)
124 {
125 	struct cpu_info *ci = v;
126 	struct schedstate_percpu *spc = &ci->ci_schedstate;
127 	static int num;
128 
129 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
130 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
131 	    &spc->spc_idleproc))
132 		panic("fork idle");
133 
134 	/* Name this cpu's idle proc. */
135 	snprintf(spc->spc_idleproc->p_p->ps_comm,
136 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
137 	    "idle%d", num);
138 
139 	num++;
140 }
141 
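/*
 * Per-cpu idle thread.  It switches away whenever the cpu has work to do
 * (cpu_is_idle() is false), reaps any procs left on spc_deadproc by
 * sched_exit(), and otherwise spins in cpu_idle_enter()/cpu_idle_cycle()/
 * cpu_idle_leave() until a queue bit shows up in spc_whichqs.  Under
 * MULTIPROCESSOR it also acknowledges SPCF_SHOULDHALT by setting
 * SPCF_HALTED and waking the proc sleeping in sched_stop_secondary_cpus().
 */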
142 void
143 sched_idle(void *v)
144 {
145 	struct schedstate_percpu *spc;
146 	struct proc *p = curproc;
147 	struct cpu_info *ci = v;
148 	int s;
149 
150 	KERNEL_UNLOCK();
151 
152 	spc = &ci->ci_schedstate;
153 
154 	/*
155 	 * The first time we enter here we're not supposed to idle,
156 	 * just go away for a while.
157 	 */
158 	SCHED_LOCK(s);
159 	cpuset_add(&sched_idle_cpus, ci);
160 	p->p_stat = SSLEEP;
161 	p->p_cpu = ci;
162 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
163 	mi_switch();
164 	cpuset_del(&sched_idle_cpus, ci);
165 	SCHED_UNLOCK(s);
166 
167 	KASSERT(ci == curcpu());
168 	KASSERT(curproc == spc->spc_idleproc);
169 
170 	while (1) {
171 		while (!cpu_is_idle(curcpu())) {
172 			struct proc *dead;
173 
174 			SCHED_LOCK(s);
175 			p->p_stat = SSLEEP;
176 			mi_switch();
177 			SCHED_UNLOCK(s);
178 
179 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
180 				LIST_REMOVE(dead, p_hash);
181 				exit2(dead);
182 			}
183 		}
184 
185 		splassert(IPL_NONE);
186 
187 		smr_idle();
188 
189 		cpuset_add(&sched_idle_cpus, ci);
190 		cpu_idle_enter();
191 		while (spc->spc_whichqs == 0) {
192 #ifdef MULTIPROCESSOR
193 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
194 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
195 				cpuset_del(&sched_idle_cpus, ci);
196 				SCHED_LOCK(s);
197 				atomic_setbits_int(&spc->spc_schedflags,
198 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
199 				SCHED_UNLOCK(s);
200 				wakeup(spc);
201 			}
202 #endif
203 			cpu_idle_cycle();
204 		}
205 		cpu_idle_leave();
206 		cpuset_del(&sched_idle_cpus, ci);
207 	}
208 }
209 
210 /*
211  * To free our address space we have to jump through a few hoops.
212  * The freeing is done by the reaper, but until we have one reaper
213  * per cpu, we have no way of putting this proc on the deadproc list
214  * and waking up the reaper without risking having our address space and
215  * stack torn from under us before we manage to switch to another proc.
216  * Therefore we have a per-cpu list of dead processes where we put this
217  * proc and have idle clean up that list and move it to the reaper list.
218  * All this will be unnecessary once we can bind the reaper to this cpu
219  * and not risk having it switch to another cpu in case it sleeps.
220  */
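
/*
 * Sketch of the path an exiting proc takes through this code:
 *
 *	sched_exit(p)        puts p on this cpu's spc_deadproc list and
 *	  sched_toidle()     switches to the idle proc without saving p;
 *	sched_idle()         later drains spc_deadproc and calls
 *	  exit2(p)           which hands p to the reaper for the actual
 *	                     freeing of the address space and stack.
 */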
221 void
222 sched_exit(struct proc *p)
223 {
224 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
225 
226 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
227 
228 	KERNEL_ASSERT_LOCKED();
229 	sched_toidle();
230 }
231 
232 void
233 sched_toidle(void)
234 {
235 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
236 	struct proc *idle;
237 	int s;
238 
239 #ifdef MULTIPROCESSOR
240 	/* This process no longer needs to hold the kernel lock. */
241 	if (_kernel_lock_held())
242 		__mp_release_all(&kernel_lock);
243 #endif
244 
245 	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
246 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
247 		clockintr_cancel(spc->spc_itimer);
248 	}
249 	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
250 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
251 		clockintr_cancel(spc->spc_profclock);
252 	}
253 
254 	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
255 
256 	SCHED_LOCK(s);
257 
258 	idle = spc->spc_idleproc;
259 	idle->p_stat = SRUN;
260 
261 	uvmexp.swtch++;
262 	TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
263 	    idle->p_p->ps_pid);
264 	cpu_switchto(NULL, idle);
265 	panic("cpu_switchto returned");
266 }
267 
268 /*
269  * Run queue management.
270  */
271 void
272 sched_init_runqueues(void)
273 {
274 }
275 
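/*
 * Illustration of the queue mapping (assuming SCHED_NQS == 32 and
 * priorities in the range 0-127): setrunqueue() folds four adjacent
 * priorities onto one queue, so prio 0-3 land in queue 0, prio 50 in
 * queue 12 and prio 127 in queue 31, and it marks that queue in the
 * spc_whichqs bitmask; sched_chooseproc() later serves the
 * lowest-numbered (best priority) non-empty queue via ffs().
 */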
276 void
277 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
278 {
279 	struct schedstate_percpu *spc;
280 	int queue = prio >> 2;
281 
282 	if (ci == NULL)
283 		ci = sched_choosecpu(p);
284 
285 	KASSERT(ci != NULL);
286 	SCHED_ASSERT_LOCKED();
287 	KASSERT(p->p_wchan == NULL);
288 
289 	p->p_cpu = ci;
290 	p->p_stat = SRUN;
291 	p->p_runpri = prio;
292 
293 	spc = &p->p_cpu->ci_schedstate;
294 	spc->spc_nrun++;
295 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
296 	    p->p_p->ps_pid);
297 
298 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
299 	spc->spc_whichqs |= (1U << queue);
300 	cpuset_add(&sched_queued_cpus, p->p_cpu);
301 
302 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
303 		cpu_unidle(p->p_cpu);
304 
305 	if (prio < spc->spc_curpriority)
306 		need_resched(ci);
307 }
308 
309 void
310 remrunqueue(struct proc *p)
311 {
312 	struct schedstate_percpu *spc;
313 	int queue = p->p_runpri >> 2;
314 
315 	SCHED_ASSERT_LOCKED();
316 	spc = &p->p_cpu->ci_schedstate;
317 	spc->spc_nrun--;
318 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
319 	    p->p_p->ps_pid);
320 
321 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
322 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
323 		spc->spc_whichqs &= ~(1U << queue);
324 		if (spc->spc_whichqs == 0)
325 			cpuset_del(&sched_queued_cpus, p->p_cpu);
326 	}
327 }
328 
329 struct proc *
330 sched_chooseproc(void)
331 {
332 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
333 	struct proc *p;
334 	int queue;
335 
336 	SCHED_ASSERT_LOCKED();
337 
338 #ifdef MULTIPROCESSOR
339 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
340 		if (spc->spc_whichqs) {
341 			for (queue = 0; queue < SCHED_NQS; queue++) {
342 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
343 					remrunqueue(p);
344 					setrunqueue(NULL, p, p->p_runpri);
345 					if (p->p_cpu == curcpu()) {
346 						KASSERT(p->p_flag & P_CPUPEG);
347 						goto again;
348 					}
349 				}
350 			}
351 		}
352 		p = spc->spc_idleproc;
353 		if (p == NULL)
354 			panic("no idleproc set on CPU%d",
355 			    CPU_INFO_UNIT(curcpu()));
356 		p->p_stat = SRUN;
357 		KASSERT(p->p_wchan == NULL);
358 		return (p);
359 	}
360 again:
361 #endif
362 
363 	if (spc->spc_whichqs) {
364 		queue = ffs(spc->spc_whichqs) - 1;
365 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
366 		remrunqueue(p);
367 		sched_noidle++;
368 		if (p->p_stat != SRUN)
369 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
370 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
371 		p = spc->spc_idleproc;
372 		if (p == NULL)
373 			panic("no idleproc set on CPU%d",
374 			    CPU_INFO_UNIT(curcpu()));
375 		p->p_stat = SRUN;
376 	}
377 
378 	KASSERT(p->p_wchan == NULL);
379 	return (p);
380 }
381 
382 struct cpu_info *
383 sched_choosecpu_fork(struct proc *parent, int flags)
384 {
385 #ifdef MULTIPROCESSOR
386 	struct cpu_info *choice = NULL;
387 	int run, best_run = INT_MAX;
388 	struct cpu_info *ci;
389 	struct cpuset set;
390 
391 #if 0
392 	/*
393 	 * XXX
394 	 * Don't do this until we have a painless way to move the cpu in exec.
395 	 * Preferably when nuking the old pmap and getting a new one on a
396 	 * new cpu.
397 	 */
398 	/*
399 	 * PPWAIT forks are simple. We know that the parent will not
400 	 * run until we exec and choose another cpu, so we just steal its
401 	 * cpu.
402 	 */
403 	if (flags & FORK_PPWAIT)
404 		return (parent->p_cpu);
405 #endif
406 
407 	/*
408 	 * Look at all cpus that are currently idle and have nothing queued.
409 	 * If there are none, fall back to all cpus and pick the one with
410 	 * the fewest queued procs.
411 	 */
412 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
413 	cpuset_intersection(&set, &set, &sched_all_cpus);
414 	if (cpuset_first(&set) == NULL)
415 		cpuset_copy(&set, &sched_all_cpus);
416 
417 	while ((ci = cpuset_first(&set)) != NULL) {
418 		cpuset_del(&set, ci);
419 
420 		run = ci->ci_schedstate.spc_nrun;
421 
422 		if (choice == NULL || run < best_run) {
423 			choice = ci;
424 			best_run = run;
425 		}
426 	}
427 
428 	return (choice);
429 #else
430 	return (curcpu());
431 #endif
432 }
433 
434 struct cpu_info *
435 sched_choosecpu(struct proc *p)
436 {
437 #ifdef MULTIPROCESSOR
438 	struct cpu_info *choice = NULL;
439 	int last_cost = INT_MAX;
440 	struct cpu_info *ci;
441 	struct cpuset set;
442 
443 	/*
444 	 * If pegged to a cpu, don't allow it to move.
445 	 */
446 	if (p->p_flag & P_CPUPEG)
447 		return (p->p_cpu);
448 
449 	sched_choose++;
450 
451 	/*
452 	 * Look at all cpus that are currently idle and have nothing queued.
453 	 * If there are none, fall back to all cpus and pick the cheapest one.
454 	 * (A cpu being both idle and queued can mean that it is handling an
455 	 * interrupt at this moment and hasn't had time to leave idle yet.)
456 	 */
457 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
458 	cpuset_intersection(&set, &set, &sched_all_cpus);
459 
460 	/*
461 	 * First, check whether our current cpu is in that set; if it is,
462 	 * this is simple.
463 	 * Also, our cpu might not be idle, but if it is the current cpu,
464 	 * has nothing else queued and we are curproc, take it anyway.
465 	 */
466 	if (cpuset_isset(&set, p->p_cpu) ||
467 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
468 	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
469 	    curproc == p)) {
470 		sched_wasidle++;
471 		return (p->p_cpu);
472 	}
473 
474 	if (cpuset_first(&set) == NULL)
475 		cpuset_copy(&set, &sched_all_cpus);
476 
477 	while ((ci = cpuset_first(&set)) != NULL) {
478 		int cost = sched_proc_to_cpu_cost(ci, p);
479 
480 		if (choice == NULL || cost < last_cost) {
481 			choice = ci;
482 			last_cost = cost;
483 		}
484 		cpuset_del(&set, ci);
485 	}
486 
487 	if (p->p_cpu != choice)
488 		sched_nmigrations++;
489 	else
490 		sched_nomigrations++;
491 
492 	return (choice);
493 #else
494 	return (curcpu());
495 #endif
496 }
497 
498 /*
499  * Attempt to steal a proc from some cpu.
500  */
501 struct proc *
502 sched_steal_proc(struct cpu_info *self)
503 {
504 	struct proc *best = NULL;
505 #ifdef MULTIPROCESSOR
506 	struct schedstate_percpu *spc;
507 	int bestcost = INT_MAX;
508 	struct cpu_info *ci;
509 	struct cpuset set;
510 
511 	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
512 
513 	/* Don't steal if we don't want to schedule processes on this CPU. */
514 	if (!cpuset_isset(&sched_all_cpus, self))
515 		return (NULL);
516 
517 	cpuset_copy(&set, &sched_queued_cpus);
518 
519 	while ((ci = cpuset_first(&set)) != NULL) {
520 		struct proc *p;
521 		int queue;
522 		int cost;
523 
524 		cpuset_del(&set, ci);
525 
526 		spc = &ci->ci_schedstate;
527 
528 		queue = ffs(spc->spc_whichqs) - 1;
529 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
530 			if (p->p_flag & P_CPUPEG)
531 				continue;
532 
533 			cost = sched_proc_to_cpu_cost(self, p);
534 
535 			if (best == NULL || cost < bestcost) {
536 				best = p;
537 				bestcost = cost;
538 			}
539 		}
540 	}
541 	if (best == NULL)
542 		return (NULL);
543 
544 	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
545 	    best->p_p->ps_pid, CPU_INFO_UNIT(self));
546 
547 	remrunqueue(best);
548 	best->p_cpu = self;
549 
550 	sched_stolen++;
551 #endif
552 	return (best);
553 }
554 
555 #ifdef MULTIPROCESSOR
556 /*
557  * Base 2 logarithm of an int.  Returns 0 for 0 (yes, yes, I know).
558  */
559 static int
560 log2(unsigned int i)
561 {
562 	int ret = 0;
563 
564 	while (i >>= 1)
565 		ret++;
566 
567 	return (ret);
568 }
569 
570 /*
571  * Calculate the cost of moving the proc to this cpu.
572  *
573  * What we want is some guesstimate of how much "performance" it will
574  * cost us to move the proc here. Not just for caches and TLBs and NUMA
575  * memory, but also for the proc itself. A highly loaded cpu might not
576  * be the best candidate for this proc since it won't get run.
577  *
578  * Just total guesstimates for now.
579  */
580 
581 int sched_cost_load = 1;
582 int sched_cost_priority = 1;
583 int sched_cost_runnable = 3;
584 int sched_cost_resident = 1;
585 #endif
586 
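/*
 * Worked example with the default weights above (hypothetical numbers):
 * moving a proc with p_usrpri 70 to a non-idle, non-primary cpu whose
 * spc_curpriority is 50 and which already has 2 procs queued costs
 * (70 - 50) * 1 + 3 + 2 * 3 = 29.  If the proc last ran on that cpu,
 * has not slept since and has 4096 resident pages, the cost drops by
 * log2(4096) * 1 = 12.
 */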
587 int
588 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
589 {
590 	int cost = 0;
591 #ifdef MULTIPROCESSOR
592 	struct schedstate_percpu *spc;
593 	int l2resident = 0;
594 
595 	spc = &ci->ci_schedstate;
596 
597 	/*
598 	 * First, account for the priority of the proc we want to move.
599 	 * We are more willing to move the lower the priority of what runs
600 	 * on the destination and the higher the priority of the proc.
601 	 */
602 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
603 		cost += (p->p_usrpri - spc->spc_curpriority) *
604 		    sched_cost_priority;
605 		cost += sched_cost_runnable;
606 	}
607 	if (cpuset_isset(&sched_queued_cpus, ci))
608 		cost += spc->spc_nrun * sched_cost_runnable;
609 
610 	/*
611 	 * Try to avoid the primary cpu as it handles hardware interrupts.
612 	 *
613 	 * XXX Needs to be revisited when we distribute interrupts
614 	 * over cpus.
615 	 */
616 	if (CPU_IS_PRIMARY(ci))
617 		cost += sched_cost_runnable;
618 
619 	/*
620 	 * If the proc already ran on this cpu recently, lower the cost
621 	 * by an estimate of its cache footprint (resident page count).
622 	 */
623 	if (p->p_cpu == ci && p->p_slptime == 0) {
624 		l2resident =
625 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
626 		cost -= l2resident * sched_cost_resident;
627 	}
628 #endif
629 	return (cost);
630 }
631 
632 /*
633  * Peg the current proc to a cpu.
634  */
635 void
636 sched_peg_curproc(struct cpu_info *ci)
637 {
638 	struct proc *p = curproc;
639 	int s;
640 
641 	SCHED_LOCK(s);
642 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
643 	setrunqueue(ci, p, p->p_usrpri);
644 	p->p_ru.ru_nvcsw++;
645 	mi_switch();
646 	SCHED_UNLOCK(s);
647 }
648 
649 #ifdef MULTIPROCESSOR
650 
651 void
652 sched_start_secondary_cpus(void)
653 {
654 	CPU_INFO_ITERATOR cii;
655 	struct cpu_info *ci;
656 
657 	CPU_INFO_FOREACH(cii, ci) {
658 		struct schedstate_percpu *spc = &ci->ci_schedstate;
659 
660 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
661 			continue;
662 		atomic_clearbits_int(&spc->spc_schedflags,
663 		    SPCF_SHOULDHALT | SPCF_HALTED);
664 #ifdef __HAVE_CPU_TOPOLOGY
665 		if (!sched_smt && ci->ci_smt_id > 0)
666 			continue;
667 #endif
668 		cpuset_add(&sched_all_cpus, ci);
669 	}
670 }
671 
672 void
673 sched_stop_secondary_cpus(void)
674 {
675 	CPU_INFO_ITERATOR cii;
676 	struct cpu_info *ci;
677 
678 	/*
679 	 * Make sure we stop the secondary CPUs.
680 	 */
681 	CPU_INFO_FOREACH(cii, ci) {
682 		struct schedstate_percpu *spc = &ci->ci_schedstate;
683 
684 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
685 			continue;
686 		cpuset_del(&sched_all_cpus, ci);
687 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
688 	}
689 	CPU_INFO_FOREACH(cii, ci) {
690 		struct schedstate_percpu *spc = &ci->ci_schedstate;
691 
692 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
693 			continue;
694 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
695 			sleep_setup(spc, PZERO, "schedstate");
696 			sleep_finish(0,
697 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
698 		}
699 	}
700 }
701 
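/*
 * sched_barrier(ci) does not return until a helper task has run on ci:
 * the task pegs itself to ci with sched_peg_curproc(), signals the
 * waiting caller and unpegs again.  A minimal usage sketch (the caller
 * and its hook are hypothetical):
 *
 *	disable_my_percpu_hook(ci);
 *	sched_barrier(ci);	ci has context switched at least once
 *				since the hook was disabled
 */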
702 struct sched_barrier_state {
703 	struct cpu_info *ci;
704 	struct cond cond;
705 };
706 
707 void
708 sched_barrier_task(void *arg)
709 {
710 	struct sched_barrier_state *sb = arg;
711 	struct cpu_info *ci = sb->ci;
712 
713 	sched_peg_curproc(ci);
714 	cond_signal(&sb->cond);
715 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
716 }
717 
718 void
719 sched_barrier(struct cpu_info *ci)
720 {
721 	struct sched_barrier_state sb;
722 	struct task task;
723 	CPU_INFO_ITERATOR cii;
724 
725 	if (ci == NULL) {
726 		CPU_INFO_FOREACH(cii, ci) {
727 			if (CPU_IS_PRIMARY(ci))
728 				break;
729 		}
730 	}
731 	KASSERT(ci != NULL);
732 
733 	if (ci == curcpu())
734 		return;
735 
736 	sb.ci = ci;
737 	cond_init(&sb.cond);
738 	task_set(&task, sched_barrier_task, &sb);
739 
740 	task_add(systqmp, &task);
741 	cond_wait(&sb.cond, "sbar");
742 }
743 
744 #else
745 
746 void
747 sched_barrier(struct cpu_info *ci)
748 {
749 }
750 
751 #endif
752 
753 /*
754  * Functions to manipulate cpu sets.
755  */
756 struct cpu_info *cpuset_infos[MAXCPUS];
757 static struct cpuset cpuset_all;
758 
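/*
 * Each cpuset is a bitmask indexed by CPU_INFO_UNIT(ci): cpu unit N
 * occupies bit (N % 32) of word cs_set[N / 32], so (to pick arbitrary
 * numbers) unit 5 is bit 5 of cs_set[0] and unit 37 is bit 5 of
 * cs_set[1].
 */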
759 void
760 cpuset_init_cpu(struct cpu_info *ci)
761 {
762 	cpuset_add(&cpuset_all, ci);
763 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
764 }
765 
766 void
767 cpuset_clear(struct cpuset *cs)
768 {
769 	memset(cs, 0, sizeof(*cs));
770 }
771 
772 void
773 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
774 {
775 	unsigned int num = CPU_INFO_UNIT(ci);
776 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
777 }
778 
779 void
780 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
781 {
782 	unsigned int num = CPU_INFO_UNIT(ci);
783 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
784 }
785 
786 int
787 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
788 {
789 	unsigned int num = CPU_INFO_UNIT(ci);
790 	return (cs->cs_set[num/32] & (1U << (num % 32)));
791 }
792 
793 void
794 cpuset_add_all(struct cpuset *cs)
795 {
796 	cpuset_copy(cs, &cpuset_all);
797 }
798 
799 void
800 cpuset_copy(struct cpuset *to, struct cpuset *from)
801 {
802 	memcpy(to, from, sizeof(*to));
803 }
804 
805 struct cpu_info *
806 cpuset_first(struct cpuset *cs)
807 {
808 	int i;
809 
810 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
811 		if (cs->cs_set[i])
812 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
813 
814 	return (NULL);
815 }
816 
817 void
818 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
819 {
820 	int i;
821 
822 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
823 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
824 }
825 
826 void
827 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
828 {
829 	int i;
830 
831 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
832 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
833 }
834 
835 void
836 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
837 {
838 	int i;
839 
840 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
841 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
842 }
843 
844 int
845 cpuset_cardinality(struct cpuset *cs)
846 {
847 	int cardinality, i, n;
848 
849 	cardinality = 0;
850 
851 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
852 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
853 			cardinality++;
854 
855 	return (cardinality);
856 }
857 
858 int
859 sysctl_hwncpuonline(void)
860 {
861 	return cpuset_cardinality(&sched_all_cpus);
862 }
863 
864 int
865 cpu_is_online(struct cpu_info *ci)
866 {
867 	return cpuset_isset(&sched_all_cpus, ci);
868 }
869 
870 #ifdef __HAVE_CPU_TOPOLOGY
871 
872 #include <sys/sysctl.h>
873 
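/*
 * Backend for the hw.smt sysctl.  Flipping it adds or removes the
 * secondary SMT siblings (ci_smt_id > 0) from sched_all_cpus, e.g.
 * from userland:
 *
 *	# sysctl hw.smt=1
 */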
874 int
875 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
876 {
877 	CPU_INFO_ITERATOR cii;
878 	struct cpu_info *ci;
879 	int err, newsmt;
880 
881 	newsmt = sched_smt;
882 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
883 	if (err)
884 		return err;
885 	if (newsmt == sched_smt)
886 		return 0;
887 
888 	sched_smt = newsmt;
889 	CPU_INFO_FOREACH(cii, ci) {
890 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
891 			continue;
892 		if (ci->ci_smt_id == 0)
893 			continue;
894 		if (sched_smt)
895 			cpuset_add(&sched_all_cpus, ci);
896 		else
897 			cpuset_del(&sched_all_cpus, ci);
898 	}
899 
900 	return 0;
901 }
902 
903 #endif
904