xref: /openbsd-src/sys/kern/kern_sched.c (revision a332869ab2345858bff8ccd1725674b106eafb6f)
1 /*	$OpenBSD: kern_sched.c,v 1.91 2023/09/14 22:07:11 cheloha Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/time.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choose which cpu should run which process, we keep track
40  * of the cpus which are currently idle and the cpus which have
41  * processes queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
45 struct cpuset sched_all_cpus;
46 
47 /*
48  * Some general scheduler counters.
49  */
50 uint64_t sched_nmigrations;	/* Times we moved a proc to another cpu */
51 uint64_t sched_nomigrations;	/* Times we kept a proc on its cpu */
52 uint64_t sched_noidle;		/* Times we didn't pick the idle task */
53 uint64_t sched_stolen;		/* Times we stole proc from other cpus */
54 uint64_t sched_choose;		/* Times we chose a cpu */
55 uint64_t sched_wasidle;		/* Times we came out of idle */
56 
57 int sched_smt;
58 
59 /*
60  * A few notes about cpu_switchto, which is implemented in MD code.
61  *
62  * cpu_switchto takes two arguments, the old proc and the proc
63  * it should switch to. The new proc will never be NULL, so we always have
64  * a saved state that we need to switch to. The old proc however can
65  * be NULL if the process is exiting. NULL for the old proc simply
66  * means "don't bother saving old state".
67  *
68  * cpu_switchto is supposed to atomically load the new state of the
69  * process, including the pcb and pmap, and to set curproc, the p_cpu
70  * pointer in the proc and p_stat to SONPROC. Atomic here means atomic
71  * only with respect to interrupts; other cpus in the system must not
72  * depend on this state being consistent. Therefore no locking is needed
73  * in cpu_switchto other than blocking interrupts during the context switch.
74  */
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu, then it's the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	spc->spc_itimer = clockintr_establish(ci, itimer_update, NULL);
92 	if (spc->spc_itimer == NULL)
93 		panic("%s: clockintr_establish itimer_update", __func__);
94 	spc->spc_profclock = clockintr_establish(ci, profclock, NULL);
95 	if (spc->spc_profclock == NULL)
96 		panic("%s: clockintr_establish profclock", __func__);
97 	spc->spc_roundrobin = clockintr_establish(ci, roundrobin, NULL);
98 	if (spc->spc_roundrobin == NULL)
99 		panic("%s: clockintr_establish roundrobin", __func__);
100 	spc->spc_statclock = clockintr_establish(ci, statclock, NULL);
101 	if (spc->spc_statclock == NULL)
102 		panic("%s: clockintr_establish statclock", __func__);
103 
104 	kthread_create_deferred(sched_kthreads_create, ci);
105 
106 	LIST_INIT(&spc->spc_deadproc);
107 	SIMPLEQ_INIT(&spc->spc_deferred);
108 
109 	/*
110 	 * Slight hack here until the cpuset code handles cpu_info
111 	 * structures.
112 	 */
113 	cpuset_init_cpu(ci);
114 
115 #ifdef __HAVE_CPU_TOPOLOGY
116 	if (!sched_smt && ci->ci_smt_id > 0)
117 		return;
118 #endif
119 	cpuset_add(&sched_all_cpus, ci);
120 }
121 
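/*
 * Fork the per-cpu idle thread for a cpu.  Called via
 * kthread_create_deferred() from sched_init_cpu().
 */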
122 void
123 sched_kthreads_create(void *v)
124 {
125 	struct cpu_info *ci = v;
126 	struct schedstate_percpu *spc = &ci->ci_schedstate;
127 	static int num;
128 
129 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
130 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
131 	    &spc->spc_idleproc))
132 		panic("fork idle");
133 
134 	/* Name it as specified. */
135 	snprintf(spc->spc_idleproc->p_p->ps_comm,
136 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
137 	    "idle%d", num);
138 
139 	num++;
140 }
141 
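/*
 * Per-cpu idle loop.  Reaps dead procs left on spc_deadproc, calls
 * smr_idle() and spins in cpu_idle_cycle() until something shows up on
 * the run queues.  Also parks the cpu when SPCF_SHOULDHALT is set.
 */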
142 void
143 sched_idle(void *v)
144 {
145 	struct schedstate_percpu *spc;
146 	struct proc *p = curproc;
147 	struct cpu_info *ci = v;
148 	int s;
149 
150 	KERNEL_UNLOCK();
151 
152 	spc = &ci->ci_schedstate;
153 
154 	/*
155 	 * First time we enter here, we're not supposed to idle,
156 	 * just go away for a while.
157 	 */
158 	SCHED_LOCK(s);
159 	cpuset_add(&sched_idle_cpus, ci);
160 	p->p_stat = SSLEEP;
161 	p->p_cpu = ci;
162 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
163 	mi_switch();
164 	cpuset_del(&sched_idle_cpus, ci);
165 	SCHED_UNLOCK(s);
166 
167 	KASSERT(ci == curcpu());
168 	KASSERT(curproc == spc->spc_idleproc);
169 
170 	while (1) {
171 		while (!cpu_is_idle(curcpu())) {
172 			struct proc *dead;
173 
174 			SCHED_LOCK(s);
175 			p->p_stat = SSLEEP;
176 			mi_switch();
177 			SCHED_UNLOCK(s);
178 
179 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
180 				LIST_REMOVE(dead, p_hash);
181 				exit2(dead);
182 			}
183 		}
184 
185 		splassert(IPL_NONE);
186 
187 		smr_idle();
188 
189 		cpuset_add(&sched_idle_cpus, ci);
190 		cpu_idle_enter();
191 		while (spc->spc_whichqs == 0) {
192 #ifdef MULTIPROCESSOR
193 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
194 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
195 				cpuset_del(&sched_idle_cpus, ci);
196 				SCHED_LOCK(s);
197 				atomic_setbits_int(&spc->spc_schedflags,
198 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
199 				SCHED_UNLOCK(s);
200 				wakeup(spc);
201 			}
202 #endif
203 			cpu_idle_cycle();
204 		}
205 		cpu_idle_leave();
206 		cpuset_del(&sched_idle_cpus, ci);
207 	}
208 }
209 
210 /*
211  * To free our address space we have to jump through a few hoops.
212  * The freeing is done by the reaper, but until we have one reaper
213  * per cpu, we have no way of putting this proc on the deadproc list
214  * and waking up the reaper without risking having our address space and
215  * stack torn from under us before we manage to switch to another proc.
216  * Therefore we have a per-cpu list of dead processes where we put this
217  * proc and have the idle thread clean up that list and move it to the
218  * reaper list. All this will be unnecessary once we can bind the reaper
219  * to this cpu and not risk having it switch to another in case it sleeps.
220  */
221 void
222 sched_exit(struct proc *p)
223 {
224 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
225 	struct proc *idle;
226 	int s;
227 
228 	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
229 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
230 		clockintr_cancel(spc->spc_itimer);
231 	}
232 	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
233 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
234 		clockintr_cancel(spc->spc_profclock);
235 	}
236 
237 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
238 
239 #ifdef MULTIPROCESSOR
240 	/* This process no longer needs to hold the kernel lock. */
241 	KERNEL_ASSERT_LOCKED();
242 	__mp_release_all(&kernel_lock);
243 #endif
244 
245 	SCHED_LOCK(s);
246 	idle = spc->spc_idleproc;
247 	idle->p_stat = SRUN;
248 	cpu_switchto(NULL, idle);
249 	panic("cpu_switchto returned");
250 }
251 
252 /*
253  * Run queue management.
254  */
255 void
256 sched_init_runqueues(void)
257 {
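	/*
	 * Nothing to do here: the run queues are per-cpu and are set up
	 * in sched_init_cpu().
	 */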
258 }
259 
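/*
 * Place a proc on the run queue of a cpu.  If ci is NULL a cpu is picked
 * with sched_choosecpu().  The priority is mapped onto one of the SCHED_NQS
 * queues (four priority levels per queue).  An idle target cpu is woken up
 * and a reschedule is requested if the new proc preempts whatever is
 * currently running there.  Must be called with the scheduler lock held.
 */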
260 void
261 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
262 {
263 	struct schedstate_percpu *spc;
264 	int queue = prio >> 2;
265 
266 	if (ci == NULL)
267 		ci = sched_choosecpu(p);
268 
269 	KASSERT(ci != NULL);
270 	SCHED_ASSERT_LOCKED();
271 
272 	p->p_cpu = ci;
273 	p->p_stat = SRUN;
274 	p->p_runpri = prio;
275 
276 	spc = &p->p_cpu->ci_schedstate;
277 	spc->spc_nrun++;
278 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
279 	    p->p_p->ps_pid);
280 
281 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
282 	spc->spc_whichqs |= (1U << queue);
283 	cpuset_add(&sched_queued_cpus, p->p_cpu);
284 
285 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
286 		cpu_unidle(p->p_cpu);
287 
288 	if (prio < spc->spc_curpriority)
289 		need_resched(ci);
290 }
291 
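/*
 * Remove a proc from the run queue of the cpu it is queued on and update
 * the queue bitmap and sched_queued_cpus accordingly.  Must be called with
 * the scheduler lock held.
 */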
292 void
293 remrunqueue(struct proc *p)
294 {
295 	struct schedstate_percpu *spc;
296 	int queue = p->p_runpri >> 2;
297 
298 	SCHED_ASSERT_LOCKED();
299 	spc = &p->p_cpu->ci_schedstate;
300 	spc->spc_nrun--;
301 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
302 	    p->p_p->ps_pid);
303 
304 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
305 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
306 		spc->spc_whichqs &= ~(1U << queue);
307 		if (spc->spc_whichqs == 0)
308 			cpuset_del(&sched_queued_cpus, p->p_cpu);
309 	}
310 }
311 
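/*
 * Pick the next proc for this cpu to run: the head of the highest-priority
 * non-empty run queue, else a proc stolen from another cpu, else the idle
 * proc.  If the cpu is being halted (SPCF_SHOULDHALT), queued procs are
 * first pushed to other cpus (pegged procs excepted) and the idle proc is
 * returned.  Must be called with the scheduler lock held.
 */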
312 struct proc *
313 sched_chooseproc(void)
314 {
315 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
316 	struct proc *p;
317 	int queue;
318 
319 	SCHED_ASSERT_LOCKED();
320 
321 #ifdef MULTIPROCESSOR
322 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
323 		if (spc->spc_whichqs) {
324 			for (queue = 0; queue < SCHED_NQS; queue++) {
325 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
326 					remrunqueue(p);
327 					setrunqueue(NULL, p, p->p_runpri);
328 					if (p->p_cpu == curcpu()) {
329 						KASSERT(p->p_flag & P_CPUPEG);
330 						goto again;
331 					}
332 				}
333 			}
334 		}
335 		p = spc->spc_idleproc;
336 		KASSERT(p);
337 		KASSERT(p->p_wchan == NULL);
338 		p->p_stat = SRUN;
339 		return (p);
340 	}
341 #endif
342 
343 again:
344 	if (spc->spc_whichqs) {
345 		queue = ffs(spc->spc_whichqs) - 1;
346 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
347 		remrunqueue(p);
348 		sched_noidle++;
349 		if (p->p_stat != SRUN)
350 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
351 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
352 		p = spc->spc_idleproc;
353 		if (p == NULL) {
354 			int s;
355 			/*
356 			 * We get here if someone decides to switch during
357 			 * boot before forking kthreads, bleh.
358 			 * This is kind of like a stupid idle loop.
359 			 */
360 #ifdef MULTIPROCESSOR
361 			__mp_unlock(&sched_lock);
362 #endif
363 			spl0();
364 			delay(10);
365 			SCHED_LOCK(s);
366 			goto again;
367 		}
368 		KASSERT(p);
369 		p->p_stat = SRUN;
370 	}
371 
372 	KASSERT(p->p_wchan == NULL);
373 	return (p);
374 }
375 
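/*
 * Choose the cpu where a newly forked proc should start out.
 */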
376 struct cpu_info *
377 sched_choosecpu_fork(struct proc *parent, int flags)
378 {
379 #ifdef MULTIPROCESSOR
380 	struct cpu_info *choice = NULL;
381 	int run, best_run = INT_MAX;
382 	struct cpu_info *ci;
383 	struct cpuset set;
384 
385 #if 0
386 	/*
387 	 * XXX
388 	 * Don't do this until we have a painless way to move the cpu in exec.
389 	 * Preferably when nuking the old pmap and getting a new one on a
390 	 * new cpu.
391 	 */
392 	/*
393 	 * PPWAIT forks are simple. We know that the parent will not
394 	 * run until we exec and choose another cpu, so we just steal its
395 	 * cpu.
396 	 */
397 	if (flags & FORK_PPWAIT)
398 		return (parent->p_cpu);
399 #endif
400 
401 	/*
402 	 * Look at all cpus that are currently idle and have nothing queued,
403 	 * falling back to all cpus if there are none, and pick the one with
404 	 * the fewest queued procs.
405 	 */
406 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
407 	cpuset_intersection(&set, &set, &sched_all_cpus);
408 	if (cpuset_first(&set) == NULL)
409 		cpuset_copy(&set, &sched_all_cpus);
410 
411 	while ((ci = cpuset_first(&set)) != NULL) {
412 		cpuset_del(&set, ci);
413 
414 		run = ci->ci_schedstate.spc_nrun;
415 
416 		if (choice == NULL || run < best_run) {
417 			choice = ci;
418 			best_run = run;
419 		}
420 	}
421 
422 	return (choice);
423 #else
424 	return (curcpu());
425 #endif
426 }
427 
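/*
 * Choose the cheapest cpu to run p on according to sched_proc_to_cpu_cost(),
 * preferring cpus that are idle with nothing queued.  Pegged procs stay on
 * their current cpu.
 */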
428 struct cpu_info *
429 sched_choosecpu(struct proc *p)
430 {
431 #ifdef MULTIPROCESSOR
432 	struct cpu_info *choice = NULL;
433 	int last_cost = INT_MAX;
434 	struct cpu_info *ci;
435 	struct cpuset set;
436 
437 	/*
438 	 * If pegged to a cpu, don't allow it to move.
439 	 */
440 	if (p->p_flag & P_CPUPEG)
441 		return (p->p_cpu);
442 
443 	sched_choose++;
444 
445 	/*
446 	 * Look at all cpus that are currently idle and have nothing queued;
447 	 * pick the cheapest of those, falling back to all cpus if there are
448 	 * none. (idle + queued could mean that the cpu is handling an
449 	 * interrupt at this moment and hasn't had time to leave idle yet.)
450 	 */
451 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
452 	cpuset_intersection(&set, &set, &sched_all_cpus);
453 
454 	/*
455 	 * First, just check if our current cpu is in that set, if it is,
456 	 * this is simple.
457 	 * Also, our cpu might not be idle, but if it's the current cpu
458 	 * and it has nothing else queued and we're curproc, take it.
459 	 */
460 	if (cpuset_isset(&set, p->p_cpu) ||
461 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
462 	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
463 	    curproc == p)) {
464 		sched_wasidle++;
465 		return (p->p_cpu);
466 	}
467 
468 	if (cpuset_first(&set) == NULL)
469 		cpuset_copy(&set, &sched_all_cpus);
470 
471 	while ((ci = cpuset_first(&set)) != NULL) {
472 		int cost = sched_proc_to_cpu_cost(ci, p);
473 
474 		if (choice == NULL || cost < last_cost) {
475 			choice = ci;
476 			last_cost = cost;
477 		}
478 		cpuset_del(&set, ci);
479 	}
480 
481 	if (p->p_cpu != choice)
482 		sched_nmigrations++;
483 	else
484 		sched_nomigrations++;
485 
486 	return (choice);
487 #else
488 	return (curcpu());
489 #endif
490 }
491 
492 /*
493  * Attempt to steal a proc from some cpu.
494  */
495 struct proc *
496 sched_steal_proc(struct cpu_info *self)
497 {
498 	struct proc *best = NULL;
499 #ifdef MULTIPROCESSOR
500 	struct schedstate_percpu *spc;
501 	int bestcost = INT_MAX;
502 	struct cpu_info *ci;
503 	struct cpuset set;
504 
505 	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
506 
507 	/* Don't steal if we don't want to schedule processes on this CPU. */
508 	if (!cpuset_isset(&sched_all_cpus, self))
509 		return (NULL);
510 
511 	cpuset_copy(&set, &sched_queued_cpus);
512 
513 	while ((ci = cpuset_first(&set)) != NULL) {
514 		struct proc *p;
515 		int queue;
516 		int cost;
517 
518 		cpuset_del(&set, ci);
519 
520 		spc = &ci->ci_schedstate;
521 
522 		queue = ffs(spc->spc_whichqs) - 1;
523 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
524 			if (p->p_flag & P_CPUPEG)
525 				continue;
526 
527 			cost = sched_proc_to_cpu_cost(self, p);
528 
529 			if (best == NULL || cost < bestcost) {
530 				best = p;
531 				bestcost = cost;
532 			}
533 		}
534 	}
535 	if (best == NULL)
536 		return (NULL);
537 
538 	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
539 	    best->p_p->ps_pid, CPU_INFO_UNIT(self));
540 
541 	remrunqueue(best);
542 	best->p_cpu = self;
543 
544 	sched_stolen++;
545 #endif
546 	return (best);
547 }
548 
549 #ifdef MULTIPROCESSOR
550 /*
551  * Base 2 logarithm of an int. Returns 0 for 0 (yes, yes, I know).
552  */
553 static int
554 log2(unsigned int i)
555 {
556 	int ret = 0;
557 
558 	while (i >>= 1)
559 		ret++;
560 
561 	return (ret);
562 }
563 
564 /*
565  * Calculate the cost of moving the proc to this cpu.
566  *
567  * What we want is some guesstimate of how much "performance" it will
568  * cost us to move the proc here. Not just for caches and TLBs and NUMA
569  * memory, but also for the proc itself. A highly loaded cpu might not
570  * be the best candidate for this proc since it won't get run.
571  *
572  * Just total guesstimates for now.
573  */
574 
575 int sched_cost_load = 1;
576 int sched_cost_priority = 1;
577 int sched_cost_runnable = 3;
578 int sched_cost_resident = 1;
579 #endif
580 
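/*
 * In short, the cost computed below for a candidate cpu ci is:
 *
 *	ci not idle:		+ (p_usrpri - spc_curpriority) * sched_cost_priority
 *				+ sched_cost_runnable
 *	ci has procs queued:	+ spc_nrun * sched_cost_runnable
 *	ci is the primary cpu:	+ sched_cost_runnable
 *	p last ran on ci and
 *	has not slept since:	- log2(resident pages) * sched_cost_resident
 */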
581 int
582 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
583 {
584 	int cost = 0;
585 #ifdef MULTIPROCESSOR
586 	struct schedstate_percpu *spc;
587 	int l2resident = 0;
588 
589 	spc = &ci->ci_schedstate;
590 
591 	/*
592 	 * First, account for the priority of the proc we want to move.
593 	 * We are more willing to move it the lower the priority currently
594 	 * running on the destination and the higher the priority of the proc.
595 	 */
596 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
597 		cost += (p->p_usrpri - spc->spc_curpriority) *
598 		    sched_cost_priority;
599 		cost += sched_cost_runnable;
600 	}
601 	if (cpuset_isset(&sched_queued_cpus, ci))
602 		cost += spc->spc_nrun * sched_cost_runnable;
603 
604 	/*
605 	 * Try to avoid the primary cpu as it handles hardware interrupts.
606 	 *
607 	 * XXX Needs to be revisited when we distribute interrupts
608 	 * over cpus.
609 	 */
610 	if (CPU_IS_PRIMARY(ci))
611 		cost += sched_cost_runnable;
612 
613 	/*
614 	 * If the proc is on this cpu already, lower the cost by how much
615 	 * it has been running and an estimate of its footprint.
616 	 */
617 	if (p->p_cpu == ci && p->p_slptime == 0) {
618 		l2resident =
619 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
620 		cost -= l2resident * sched_cost_resident;
621 	}
622 #endif
623 	return (cost);
624 }
625 
626 /*
627  * Peg a proc to a cpu.
628  */
629 void
630 sched_peg_curproc(struct cpu_info *ci)
631 {
632 	struct proc *p = curproc;
633 	int s;
634 
635 	SCHED_LOCK(s);
636 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
637 	setrunqueue(ci, p, p->p_usrpri);
638 	p->p_ru.ru_nvcsw++;
639 	mi_switch();
640 	SCHED_UNLOCK(s);
641 }
642 
643 #ifdef MULTIPROCESSOR
644 
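/*
 * Clear the halt flags on the secondary cpus and put them back into the
 * set of cpus the scheduler may use (respecting the sched_smt setting).
 */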
645 void
646 sched_start_secondary_cpus(void)
647 {
648 	CPU_INFO_ITERATOR cii;
649 	struct cpu_info *ci;
650 
651 	CPU_INFO_FOREACH(cii, ci) {
652 		struct schedstate_percpu *spc = &ci->ci_schedstate;
653 
654 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
655 			continue;
656 		atomic_clearbits_int(&spc->spc_schedflags,
657 		    SPCF_SHOULDHALT | SPCF_HALTED);
658 #ifdef __HAVE_CPU_TOPOLOGY
659 		if (!sched_smt && ci->ci_smt_id > 0)
660 			continue;
661 #endif
662 		cpuset_add(&sched_all_cpus, ci);
663 	}
664 }
665 
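/*
 * Remove the secondary cpus from the set the scheduler may use, ask them
 * to halt and wait until each one has parked in its idle loop.
 */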
666 void
667 sched_stop_secondary_cpus(void)
668 {
669 	CPU_INFO_ITERATOR cii;
670 	struct cpu_info *ci;
671 
672 	/*
673 	 * Make sure we stop the secondary CPUs.
674 	 */
675 	CPU_INFO_FOREACH(cii, ci) {
676 		struct schedstate_percpu *spc = &ci->ci_schedstate;
677 
678 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
679 			continue;
680 		cpuset_del(&sched_all_cpus, ci);
681 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
682 	}
683 	CPU_INFO_FOREACH(cii, ci) {
684 		struct schedstate_percpu *spc = &ci->ci_schedstate;
685 
686 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
687 			continue;
688 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
689 			sleep_setup(spc, PZERO, "schedstate");
690 			sleep_finish(0,
691 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
692 		}
693 	}
694 }
695 
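/*
 * sched_barrier() makes sure a cpu (the primary cpu if ci is NULL) has
 * passed through the scheduler: a task is queued that pegs itself to that
 * cpu and signals the caller once it is running there.
 */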
696 struct sched_barrier_state {
697 	struct cpu_info *ci;
698 	struct cond cond;
699 };
700 
701 void
702 sched_barrier_task(void *arg)
703 {
704 	struct sched_barrier_state *sb = arg;
705 	struct cpu_info *ci = sb->ci;
706 
707 	sched_peg_curproc(ci);
708 	cond_signal(&sb->cond);
709 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
710 }
711 
712 void
713 sched_barrier(struct cpu_info *ci)
714 {
715 	struct sched_barrier_state sb;
716 	struct task task;
717 	CPU_INFO_ITERATOR cii;
718 
719 	if (ci == NULL) {
720 		CPU_INFO_FOREACH(cii, ci) {
721 			if (CPU_IS_PRIMARY(ci))
722 				break;
723 		}
724 	}
725 	KASSERT(ci != NULL);
726 
727 	if (ci == curcpu())
728 		return;
729 
730 	sb.ci = ci;
731 	cond_init(&sb.cond);
732 	task_set(&task, sched_barrier_task, &sb);
733 
734 	task_add(systqmp, &task);
735 	cond_wait(&sb.cond, "sbar");
736 }
737 
738 #else
739 
740 void
741 sched_barrier(struct cpu_info *ci)
742 {
743 }
744 
745 #endif
746 
747 /*
748  * Functions to manipulate cpu sets.
749  */
750 struct cpu_info *cpuset_infos[MAXCPUS];
751 static struct cpuset cpuset_all;
752 
753 void
754 cpuset_init_cpu(struct cpu_info *ci)
755 {
756 	cpuset_add(&cpuset_all, ci);
757 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
758 }
759 
760 void
761 cpuset_clear(struct cpuset *cs)
762 {
763 	memset(cs, 0, sizeof(*cs));
764 }
765 
766 void
767 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
768 {
769 	unsigned int num = CPU_INFO_UNIT(ci);
770 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
771 }
772 
773 void
774 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
775 {
776 	unsigned int num = CPU_INFO_UNIT(ci);
777 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
778 }
779 
780 int
781 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
782 {
783 	unsigned int num = CPU_INFO_UNIT(ci);
784 	return (cs->cs_set[num/32] & (1U << (num % 32)));
785 }
786 
787 void
788 cpuset_add_all(struct cpuset *cs)
789 {
790 	cpuset_copy(cs, &cpuset_all);
791 }
792 
793 void
794 cpuset_copy(struct cpuset *to, struct cpuset *from)
795 {
796 	memcpy(to, from, sizeof(*to));
797 }
798 
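/*
 * Return the cpu with the lowest unit number in the set, or NULL if the
 * set is empty.
 */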
799 struct cpu_info *
800 cpuset_first(struct cpuset *cs)
801 {
802 	int i;
803 
804 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
805 		if (cs->cs_set[i])
806 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
807 
808 	return (NULL);
809 }
810 
811 void
812 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
813 {
814 	int i;
815 
816 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
817 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
818 }
819 
820 void
821 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
822 {
823 	int i;
824 
825 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
826 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
827 }
828 
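/*
 * Compute to = b \ a, i.e. the cpus that are in b but not in a.  Note the
 * argument order.
 */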
829 void
830 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
831 {
832 	int i;
833 
834 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
835 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
836 }
837 
838 int
839 cpuset_cardinality(struct cpuset *cs)
840 {
841 	int cardinality, i, n;
842 
843 	cardinality = 0;
844 
845 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
846 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
847 			cardinality++;
848 
849 	return (cardinality);
850 }
851 
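/*
 * Number of cpus the scheduler may currently use, for the hw.ncpuonline
 * sysctl.
 */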
852 int
853 sysctl_hwncpuonline(void)
854 {
855 	return cpuset_cardinality(&sched_all_cpus);
856 }
857 
858 int
859 cpu_is_online(struct cpu_info *ci)
860 {
861 	return cpuset_isset(&sched_all_cpus, ci);
862 }
863 
864 #ifdef __HAVE_CPU_TOPOLOGY
865 
866 #include <sys/sysctl.h>
867 
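/*
 * Handler for the hw.smt sysctl.  Enabling SMT adds the sibling threads
 * (ci_smt_id != 0) to the set of cpus the scheduler may use; disabling it
 * removes them.
 */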
868 int
869 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
870 {
871 	CPU_INFO_ITERATOR cii;
872 	struct cpu_info *ci;
873 	int err, newsmt;
874 
875 	newsmt = sched_smt;
876 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
877 	if (err)
878 		return err;
879 	if (newsmt == sched_smt)
880 		return 0;
881 
882 	sched_smt = newsmt;
883 	CPU_INFO_FOREACH(cii, ci) {
884 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
885 			continue;
886 		if (ci->ci_smt_id == 0)
887 			continue;
888 		if (sched_smt)
889 			cpuset_add(&sched_all_cpus, ci);
890 		else
891 			cpuset_del(&sched_all_cpus, ci);
892 	}
893 
894 	return 0;
895 }
896 
897 #endif
898