1 /*	$OpenBSD: kern_sched.c,v 1.84 2023/08/05 20:07:55 cheloha Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/time.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choose which cpu should run which process, we keep track
40  * of which cpus are currently idle and which cpus have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;
44 struct cpuset sched_queued_cpus;
45 struct cpuset sched_all_cpus;
46 
47 /*
48  * Some general scheduler counters.
49  */
50 uint64_t sched_nmigrations;	/* Times we migrated a proc to another cpu */
51 uint64_t sched_nomigrations;	/* Times we kept a proc on the same cpu */
52 uint64_t sched_noidle;		/* Times we didn't pick the idle task */
53 uint64_t sched_stolen;		/* Times we stole proc from other cpus */
54 uint64_t sched_choose;		/* Times we chose a cpu */
55 uint64_t sched_wasidle;		/* Times we came out of idle */
56 
57 int sched_smt;
58 
59 /*
60  * A few notes about cpu_switchto, which is implemented in MD code.
61  *
62  * cpu_switchto takes two arguments, the old proc and the proc
63  * it should switch to. The new proc will never be NULL, so we always have
64  * a saved state that we need to switch to. The old proc however can
65  * be NULL if the process is exiting. NULL for the old proc simply
66  * means "don't bother saving old state".
67  *
68  * cpu_switchto is supposed to atomically load the new state of the process,
69  * including the pcb and pmap, set curproc and the p_cpu pointer in the
70  * proc, and set p_stat to SONPROC. This is atomic only with respect to
71  * interrupts; other cpus in the system must not depend on this state
72  * being consistent. Therefore no locking is necessary in cpu_switchto
73  * other than blocking interrupts during the context switch.
74  */
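/*
 * A purely illustrative sketch of the contract described above.  It is
 * not any real MD implementation and every md_*() helper below is an
 * invented stand-in for whatever a given port actually does.
 */
#if 0
void
cpu_switchto(struct proc *old, struct proc *new)
{
	u_long s = md_intr_disable();	/* atomic wrt. local interrupts only */

	if (old != NULL)
		md_save_context(old);	/* NULL old: don't bother saving */

	md_activate_pmap(new);		/* switch to the new address space */
	curcpu()->ci_curproc = new;	/* new proc becomes curproc */
	new->p_cpu = curcpu();
	new->p_stat = SONPROC;

	md_load_context(new);		/* resume execution in the new proc */
	md_intr_restore(s);
}
#endif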
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu; it is then the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	if (spc->spc_itimer == NULL) {
92 		spc->spc_itimer = clockintr_establish(&ci->ci_queue,
93 		    itimer_update);
94 		if (spc->spc_itimer == NULL) {
95 			panic("%s: clockintr_establish itimer_update",
96 			    __func__);
97 		}
98 	}
99 	if (spc->spc_profclock == NULL) {
100 		spc->spc_profclock = clockintr_establish(&ci->ci_queue,
101 		    profclock);
102 		if (spc->spc_profclock == NULL)
103 			panic("%s: clockintr_establish profclock", __func__);
104 	}
105 
106 	kthread_create_deferred(sched_kthreads_create, ci);
107 
108 	LIST_INIT(&spc->spc_deadproc);
109 	SIMPLEQ_INIT(&spc->spc_deferred);
110 
111 	/*
112 	 * Slight hack here until the cpuset code handles cpu_info
113 	 * structures.
114 	 */
115 	cpuset_init_cpu(ci);
116 
117 #ifdef __HAVE_CPU_TOPOLOGY
118 	if (!sched_smt && ci->ci_smt_id > 0)
119 		return;
120 #endif
121 	cpuset_add(&sched_all_cpus, ci);
122 }
123 
124 void
125 sched_kthreads_create(void *v)
126 {
127 	struct cpu_info *ci = v;
128 	struct schedstate_percpu *spc = &ci->ci_schedstate;
129 	static int num;
130 
131 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
132 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
133 	    &spc->spc_idleproc))
134 		panic("fork idle");
135 
136 	/* Name the idle proc "idle<num>". */
137 	snprintf(spc->spc_idleproc->p_p->ps_comm,
138 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
139 	    "idle%d", num);
140 
141 	num++;
142 }
143 
144 void
145 sched_idle(void *v)
146 {
147 	struct schedstate_percpu *spc;
148 	struct proc *p = curproc;
149 	struct cpu_info *ci = v;
150 	int s;
151 
152 	KERNEL_UNLOCK();
153 
154 	spc = &ci->ci_schedstate;
155 
156 	/*
157 	 * First time we enter here, we're not supposed to idle,
158 	 * just go away for a while.
159 	 */
160 	SCHED_LOCK(s);
161 	cpuset_add(&sched_idle_cpus, ci);
162 	p->p_stat = SSLEEP;
163 	p->p_cpu = ci;
164 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
165 	mi_switch();
166 	cpuset_del(&sched_idle_cpus, ci);
167 	SCHED_UNLOCK(s);
168 
169 	KASSERT(ci == curcpu());
170 	KASSERT(curproc == spc->spc_idleproc);
171 
172 	while (1) {
173 		while (!cpu_is_idle(curcpu())) {
174 			struct proc *dead;
175 
176 			SCHED_LOCK(s);
177 			p->p_stat = SSLEEP;
178 			mi_switch();
179 			SCHED_UNLOCK(s);
180 
181 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
182 				LIST_REMOVE(dead, p_hash);
183 				exit2(dead);
184 			}
185 		}
186 
187 		splassert(IPL_NONE);
188 
189 		smr_idle();
190 
191 		cpuset_add(&sched_idle_cpus, ci);
192 		cpu_idle_enter();
193 		while (spc->spc_whichqs == 0) {
194 #ifdef MULTIPROCESSOR
195 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
196 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
197 				cpuset_del(&sched_idle_cpus, ci);
198 				SCHED_LOCK(s);
199 				atomic_setbits_int(&spc->spc_schedflags,
200 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
201 				SCHED_UNLOCK(s);
202 				wakeup(spc);
203 			}
204 #endif
205 			cpu_idle_cycle();
206 		}
207 		cpu_idle_leave();
208 		cpuset_del(&sched_idle_cpus, ci);
209 	}
210 }
211 
212 /*
213  * To free our address space we have to jump through a few hoops.
214  * The freeing is done by the reaper, but until we have one reaper
215  * per cpu, we have no way of putting this proc on the deadproc list
216  * and waking up the reaper without risking having our address space and
217  * stack torn from under us before we manage to switch to another proc.
218  * Therefore we have a per-cpu list of dead processes where we put this
219  * proc and have idle clean up that list and move it to the reaper list.
220  * All this will be unnecessary once we can bind the reaper to this cpu
221  * and not risk having it switch to another in case it sleeps.
222  */
223 void
224 sched_exit(struct proc *p)
225 {
226 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
227 	struct timespec ts;
228 	struct proc *idle;
229 	int s;
230 
231 	nanouptime(&ts);
232 	timespecsub(&ts, &spc->spc_runtime, &ts);
233 	timespecadd(&p->p_rtime, &ts, &p->p_rtime);
234 
235 	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
236 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
237 		clockintr_cancel(spc->spc_itimer);
238 	}
239 	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
240 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
241 		clockintr_cancel(spc->spc_profclock);
242 	}
243 
244 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
245 
246 #ifdef MULTIPROCESSOR
247 	/* This process no longer needs to hold the kernel lock. */
248 	KERNEL_ASSERT_LOCKED();
249 	__mp_release_all(&kernel_lock);
250 #endif
251 
252 	SCHED_LOCK(s);
253 	idle = spc->spc_idleproc;
254 	idle->p_stat = SRUN;
255 	cpu_switchto(NULL, idle);
256 	panic("cpu_switchto returned");
257 }
258 
259 /*
260  * Run queue management.
261  */
262 void
263 sched_init_runqueues(void)
264 {
265 }
266 
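/*
 * Put proc p on the run queue of cpu ci at the given priority; if ci is
 * NULL a cpu is picked with sched_choosecpu().  Called with the
 * scheduler lock held.
 */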
267 void
268 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
269 {
270 	struct schedstate_percpu *spc;
271 	int queue = prio >> 2;
272 
273 	if (ci == NULL)
274 		ci = sched_choosecpu(p);
275 
276 	KASSERT(ci != NULL);
277 	SCHED_ASSERT_LOCKED();
278 
279 	p->p_cpu = ci;
280 	p->p_stat = SRUN;
281 	p->p_runpri = prio;
282 
283 	spc = &p->p_cpu->ci_schedstate;
284 	spc->spc_nrun++;
285 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
286 	    p->p_p->ps_pid);
287 
288 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
289 	spc->spc_whichqs |= (1U << queue);
290 	cpuset_add(&sched_queued_cpus, p->p_cpu);
291 
292 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
293 		cpu_unidle(p->p_cpu);
294 
295 	if (prio < spc->spc_curpriority)
296 		need_resched(ci);
297 }
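/*
 * A minimal illustration of the priority-to-queue mapping used above,
 * assuming SCHED_NQS is 32 and priorities run from 0 to 127 (check
 * sys/sched.h): four adjacent priorities share a queue, and the best
 * (lowest numbered) non-empty queue is found with ffs() on spc_whichqs.
 */
#if 0
static void
example_prio_to_queue(void)
{
	uint8_t prio = 50;			/* example priority */
	int queue = prio >> 2;			/* 50 / 4 == 12 */
	uint32_t whichqs = 0;

	whichqs |= (1U << queue);		/* mark queue 12 as non-empty */
	KASSERT(ffs(whichqs) - 1 == 12);	/* lowest set bit wins */
}
#endif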
298 
299 void
300 remrunqueue(struct proc *p)
301 {
302 	struct schedstate_percpu *spc;
303 	int queue = p->p_runpri >> 2;
304 
305 	SCHED_ASSERT_LOCKED();
306 	spc = &p->p_cpu->ci_schedstate;
307 	spc->spc_nrun--;
308 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
309 	    p->p_p->ps_pid);
310 
311 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
312 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
313 		spc->spc_whichqs &= ~(1U << queue);
314 		if (spc->spc_whichqs == 0)
315 			cpuset_del(&sched_queued_cpus, p->p_cpu);
316 	}
317 }
318 
319 struct proc *
320 sched_chooseproc(void)
321 {
322 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
323 	struct proc *p;
324 	int queue;
325 
326 	SCHED_ASSERT_LOCKED();
327 
328 #ifdef MULTIPROCESSOR
329 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
330 		if (spc->spc_whichqs) {
331 			for (queue = 0; queue < SCHED_NQS; queue++) {
332 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
333 					remrunqueue(p);
334 					setrunqueue(NULL, p, p->p_runpri);
335 					if (p->p_cpu == curcpu()) {
336 						KASSERT(p->p_flag & P_CPUPEG);
337 						goto again;
338 					}
339 				}
340 			}
341 		}
342 		p = spc->spc_idleproc;
343 		KASSERT(p);
344 		KASSERT(p->p_wchan == NULL);
345 		p->p_stat = SRUN;
346 		return (p);
347 	}
348 #endif
349 
350 again:
351 	if (spc->spc_whichqs) {
352 		queue = ffs(spc->spc_whichqs) - 1;
353 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
354 		remrunqueue(p);
355 		sched_noidle++;
356 		if (p->p_stat != SRUN)
357 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
358 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
359 		p = spc->spc_idleproc;
360 		if (p == NULL) {
361 			int s;
362 			/*
363 			 * We get here if someone decides to switch during
364 			 * boot before forking kthreads, bleh.
365 			 * This is kind of like a stupid idle loop.
366 			 */
367 #ifdef MULTIPROCESSOR
368 			__mp_unlock(&sched_lock);
369 #endif
370 			spl0();
371 			delay(10);
372 			SCHED_LOCK(s);
373 			goto again;
374 		}
375 		KASSERT(p);
376 		p->p_stat = SRUN;
377 	}
378 
379 	KASSERT(p->p_wchan == NULL);
380 	return (p);
381 }
382 
383 struct cpu_info *
384 sched_choosecpu_fork(struct proc *parent, int flags)
385 {
386 #ifdef MULTIPROCESSOR
387 	struct cpu_info *choice = NULL;
388 	int run, best_run = INT_MAX;
389 	struct cpu_info *ci;
390 	struct cpuset set;
391 
392 #if 0
393 	/*
394 	 * XXX
395 	 * Don't do this until we have a painless way to move the cpu in exec.
396 	 * Preferably when nuking the old pmap and getting a new one on a
397 	 * new cpu.
398 	 */
399 	/*
400 	 * PPWAIT forks are simple. We know that the parent will not
401 	 * run until we exec and choose another cpu, so we just steal its
402 	 * cpu.
403 	 */
404 	if (flags & FORK_PPWAIT)
405 		return (parent->p_cpu);
406 #endif
407 
408 	/*
409 	 * Look at all cpus that are currently idle and have nothing queued.
410 	 * If there are none, fall back to all cpus. Either way, pick the
411 	 * candidate with the fewest queued procs.
412 	 */
413 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
414 	cpuset_intersection(&set, &set, &sched_all_cpus);
415 	if (cpuset_first(&set) == NULL)
416 		cpuset_copy(&set, &sched_all_cpus);
417 
418 	while ((ci = cpuset_first(&set)) != NULL) {
419 		cpuset_del(&set, ci);
420 
421 		run = ci->ci_schedstate.spc_nrun;
422 
423 		if (choice == NULL || run < best_run) {
424 			choice = ci;
425 			best_run = run;
426 		}
427 	}
428 
429 	return (choice);
430 #else
431 	return (curcpu());
432 #endif
433 }
434 
435 struct cpu_info *
436 sched_choosecpu(struct proc *p)
437 {
438 #ifdef MULTIPROCESSOR
439 	struct cpu_info *choice = NULL;
440 	int last_cost = INT_MAX;
441 	struct cpu_info *ci;
442 	struct cpuset set;
443 
444 	/*
445 	 * If pegged to a cpu, don't allow it to move.
446 	 */
447 	if (p->p_flag & P_CPUPEG)
448 		return (p->p_cpu);
449 
450 	sched_choose++;
451 
452 	/*
453 	 * Look at all cpus that are currently idle and have nothing queued.
454 	 * If there are none, fall back to all cpus and pick the cheapest one.
455 	 * (idle + queued could mean that the cpu is handling an interrupt
456 	 * at this moment and hasn't had time to leave idle yet).
457 	 */
458 	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
459 	cpuset_intersection(&set, &set, &sched_all_cpus);
460 
461 	/*
462 	 * First, just check if p's cpu is in that set; if it is, this is
463 	 * simple.
464 	 * Also, p's cpu might not be idle, but if it is the cpu we are
465 	 * running on, has nothing else queued and we are curproc, take it.
466 	 */
467 	if (cpuset_isset(&set, p->p_cpu) ||
468 	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
469 	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
470 	    curproc == p)) {
471 		sched_wasidle++;
472 		return (p->p_cpu);
473 	}
474 
475 	if (cpuset_first(&set) == NULL)
476 		cpuset_copy(&set, &sched_all_cpus);
477 
478 	while ((ci = cpuset_first(&set)) != NULL) {
479 		int cost = sched_proc_to_cpu_cost(ci, p);
480 
481 		if (choice == NULL || cost < last_cost) {
482 			choice = ci;
483 			last_cost = cost;
484 		}
485 		cpuset_del(&set, ci);
486 	}
487 
488 	if (p->p_cpu != choice)
489 		sched_nmigrations++;
490 	else
491 		sched_nomigrations++;
492 
493 	return (choice);
494 #else
495 	return (curcpu());
496 #endif
497 }
498 
499 /*
500  * Attempt to steal a proc from some cpu.
501  */
502 struct proc *
503 sched_steal_proc(struct cpu_info *self)
504 {
505 	struct proc *best = NULL;
506 #ifdef MULTIPROCESSOR
507 	struct schedstate_percpu *spc;
508 	int bestcost = INT_MAX;
509 	struct cpu_info *ci;
510 	struct cpuset set;
511 
512 	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
513 
514 	/* Don't steal if we don't want to schedule processes on this CPU. */
515 	if (!cpuset_isset(&sched_all_cpus, self))
516 		return (NULL);
517 
518 	cpuset_copy(&set, &sched_queued_cpus);
519 
520 	while ((ci = cpuset_first(&set)) != NULL) {
521 		struct proc *p;
522 		int queue;
523 		int cost;
524 
525 		cpuset_del(&set, ci);
526 
527 		spc = &ci->ci_schedstate;
528 
529 		queue = ffs(spc->spc_whichqs) - 1;
530 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
531 			if (p->p_flag & P_CPUPEG)
532 				continue;
533 
534 			cost = sched_proc_to_cpu_cost(self, p);
535 
536 			if (best == NULL || cost < bestcost) {
537 				best = p;
538 				bestcost = cost;
539 			}
540 		}
541 	}
542 	if (best == NULL)
543 		return (NULL);
544 
545 	remrunqueue(best);
546 	best->p_cpu = self;
547 
548 	sched_stolen++;
549 #endif
550 	return (best);
551 }
552 
553 #ifdef MULTIPROCESSOR
554 /*
555  * Base 2 logarithm of an int. Returns 0 for 0 (yes, yes, I know).
556  */
557 static int
558 log2(unsigned int i)
559 {
560 	int ret = 0;
561 
562 	while (i >>= 1)
563 		ret++;
564 
565 	return (ret);
566 }
567 
568 /*
569  * Calculate the cost of moving the proc to this cpu.
570  *
571  * What we want is some guesstimate of how much "performance" it will
572  * cost us to move the proc here. Not just for caches and TLBs and NUMA
573  * memory, but also for the proc itself. A highly loaded cpu might not
574  * be the best candidate for this proc since it won't get run.
575  *
576  * Just total guesstimates for now.
577  */
578 
579 int sched_cost_load = 1;
580 int sched_cost_priority = 1;
581 int sched_cost_runnable = 3;
582 int sched_cost_resident = 1;
583 #endif
584 
585 int
586 sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
587 {
588 	int cost = 0;
589 #ifdef MULTIPROCESSOR
590 	struct schedstate_percpu *spc;
591 	int l2resident = 0;
592 
593 	spc = &ci->ci_schedstate;
594 
595 	/*
596 	 * First, account for the priority of the proc we want to move.
597 	 * The lower the priority of what runs on the destination and the
598 	 * higher the priority of the proc, the more willing we are to move it.
599 	 */
600 	if (!cpuset_isset(&sched_idle_cpus, ci)) {
601 		cost += (p->p_usrpri - spc->spc_curpriority) *
602 		    sched_cost_priority;
603 		cost += sched_cost_runnable;
604 	}
605 	if (cpuset_isset(&sched_queued_cpus, ci))
606 		cost += spc->spc_nrun * sched_cost_runnable;
607 
608 	/*
609 	 * Try to avoid the primary cpu as it handles hardware interrupts.
610 	 *
611 	 * XXX Needs to be revisited when we distribute interrupts
612 	 * over cpus.
613 	 */
614 	if (CPU_IS_PRIMARY(ci))
615 		cost += sched_cost_runnable;
616 
617 	/*
618 	 * If the proc is already on this cpu and has run there recently,
619 	 * lower the cost by an estimate of its memory footprint.
620 	 */
621 	if (p->p_cpu == ci && p->p_slptime == 0) {
622 		l2resident =
623 		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
624 		cost -= l2resident * sched_cost_resident;
625 	}
626 #endif
627 	return (cost);
628 }
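/*
 * Worked example of the estimate above with the default weights and
 * made-up numbers: moving a proc with p_usrpri 50 to a non-idle cpu
 * that currently runs at priority 30 and has 2 procs queued costs
 * (50 - 30) * 1 + 3 + 2 * 3 = 29, plus another 3 if that cpu is the
 * primary.  If the proc instead stays on the cpu it recently ran on
 * and has 1024 resident pages, the cost drops by log2(1024) * 1 = 10.
 */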
629 
630 /*
631  * Peg a proc to a cpu.
632  */
633 void
634 sched_peg_curproc(struct cpu_info *ci)
635 {
636 	struct proc *p = curproc;
637 	int s;
638 
639 	SCHED_LOCK(s);
640 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
641 	setrunqueue(ci, p, p->p_usrpri);
642 	p->p_ru.ru_nvcsw++;
643 	mi_switch();
644 	SCHED_UNLOCK(s);
645 }
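/*
 * Illustrative use of sched_peg_curproc(): pin the current proc to a
 * cpu, do the per-cpu work, then clear P_CPUPEG so the scheduler may
 * move it again (the pattern sched_barrier_task() below follows).
 */
#if 0
static void
example_run_on_cpu(struct cpu_info *ci)
{
	sched_peg_curproc(ci);
	/* everything here runs on ci and nowhere else */
	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
}
#endif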
646 
647 #ifdef MULTIPROCESSOR
648 
649 void
650 sched_start_secondary_cpus(void)
651 {
652 	CPU_INFO_ITERATOR cii;
653 	struct cpu_info *ci;
654 
655 	CPU_INFO_FOREACH(cii, ci) {
656 		struct schedstate_percpu *spc = &ci->ci_schedstate;
657 
658 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
659 			continue;
660 		atomic_clearbits_int(&spc->spc_schedflags,
661 		    SPCF_SHOULDHALT | SPCF_HALTED);
662 #ifdef __HAVE_CPU_TOPOLOGY
663 		if (!sched_smt && ci->ci_smt_id > 0)
664 			continue;
665 #endif
666 		cpuset_add(&sched_all_cpus, ci);
667 	}
668 }
669 
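/*
 * Halt handshake with sched_idle() above: we set SPCF_SHOULDHALT on
 * each secondary cpu, its idle proc notices, sets SPCF_HALTED and
 * wakeup()s the schedstate, which ends the sleep loop below.
 */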
670 void
671 sched_stop_secondary_cpus(void)
672 {
673 	CPU_INFO_ITERATOR cii;
674 	struct cpu_info *ci;
675 
676 	/*
677 	 * Make sure we stop the secondary CPUs.
678 	 */
679 	CPU_INFO_FOREACH(cii, ci) {
680 		struct schedstate_percpu *spc = &ci->ci_schedstate;
681 
682 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
683 			continue;
684 		cpuset_del(&sched_all_cpus, ci);
685 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
686 	}
687 	CPU_INFO_FOREACH(cii, ci) {
688 		struct schedstate_percpu *spc = &ci->ci_schedstate;
689 
690 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
691 			continue;
692 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
693 			sleep_setup(spc, PZERO, "schedstate");
694 			sleep_finish(0,
695 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
696 		}
697 	}
698 }
699 
700 struct sched_barrier_state {
701 	struct cpu_info *ci;
702 	struct cond cond;
703 };
704 
705 void
706 sched_barrier_task(void *arg)
707 {
708 	struct sched_barrier_state *sb = arg;
709 	struct cpu_info *ci = sb->ci;
710 
711 	sched_peg_curproc(ci);
712 	cond_signal(&sb->cond);
713 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
714 }
715 
716 void
717 sched_barrier(struct cpu_info *ci)
718 {
719 	struct sched_barrier_state sb;
720 	struct task task;
721 	CPU_INFO_ITERATOR cii;
722 
723 	if (ci == NULL) {
724 		CPU_INFO_FOREACH(cii, ci) {
725 			if (CPU_IS_PRIMARY(ci))
726 				break;
727 		}
728 	}
729 	KASSERT(ci != NULL);
730 
731 	if (ci == curcpu())
732 		return;
733 
734 	sb.ci = ci;
735 	cond_init(&sb.cond);
736 	task_set(&task, sched_barrier_task, &sb);
737 
738 	task_add(systqmp, &task);
739 	cond_wait(&sb.cond, "sbar");
740 }
741 
742 #else
743 
744 void
745 sched_barrier(struct cpu_info *ci)
746 {
747 }
748 
749 #endif
750 
751 /*
752  * Functions to manipulate cpu sets.
753  */
754 struct cpu_info *cpuset_infos[MAXCPUS];
755 static struct cpuset cpuset_all;
756 
757 void
758 cpuset_init_cpu(struct cpu_info *ci)
759 {
760 	cpuset_add(&cpuset_all, ci);
761 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
762 }
763 
764 void
765 cpuset_clear(struct cpuset *cs)
766 {
767 	memset(cs, 0, sizeof(*cs));
768 }
769 
770 void
771 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
772 {
773 	unsigned int num = CPU_INFO_UNIT(ci);
774 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
775 }
776 
777 void
778 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
779 {
780 	unsigned int num = CPU_INFO_UNIT(ci);
781 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
782 }
783 
784 int
785 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
786 {
787 	unsigned int num = CPU_INFO_UNIT(ci);
788 	return (cs->cs_set[num/32] & (1U << (num % 32)));
789 }
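/*
 * Example of the word/bit mapping used by the cpuset routines above:
 * a cpu with CPU_INFO_UNIT() 37 lives in word 37 / 32 == 1, bit
 * 37 % 32 == 5, i.e. cs_set[1] & (1U << 5).
 */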
790 
791 void
792 cpuset_add_all(struct cpuset *cs)
793 {
794 	cpuset_copy(cs, &cpuset_all);
795 }
796 
797 void
798 cpuset_copy(struct cpuset *to, struct cpuset *from)
799 {
800 	memcpy(to, from, sizeof(*to));
801 }
802 
803 struct cpu_info *
804 cpuset_first(struct cpuset *cs)
805 {
806 	int i;
807 
808 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
809 		if (cs->cs_set[i])
810 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
811 
812 	return (NULL);
813 }
814 
815 void
816 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
817 {
818 	int i;
819 
820 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
821 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
822 }
823 
824 void
825 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
826 {
827 	int i;
828 
829 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
830 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
831 }
832 
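/*
 * Note the argument order: the result is b minus a, i.e. the cpus set
 * in b that are not set in a.  Callers use it as
 * cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus) to get
 * the idle cpus that have nothing queued.
 */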
833 void
834 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
835 {
836 	int i;
837 
838 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
839 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
840 }
841 
842 int
843 cpuset_cardinality(struct cpuset *cs)
844 {
845 	int cardinality, i, n;
846 
847 	cardinality = 0;
848 
849 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
850 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
851 			cardinality++;
852 
853 	return (cardinality);
854 }
855 
856 int
857 sysctl_hwncpuonline(void)
858 {
859 	return cpuset_cardinality(&sched_all_cpus);
860 }
861 
862 int
863 cpu_is_online(struct cpu_info *ci)
864 {
865 	return cpuset_isset(&sched_all_cpus, ci);
866 }
867 
868 #ifdef __HAVE_CPU_TOPOLOGY
869 
870 #include <sys/sysctl.h>
871 
872 int
873 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
874 {
875 	CPU_INFO_ITERATOR cii;
876 	struct cpu_info *ci;
877 	int err, newsmt;
878 
879 	newsmt = sched_smt;
880 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
881 	if (err)
882 		return err;
883 	if (newsmt == sched_smt)
884 		return 0;
885 
886 	sched_smt = newsmt;
887 	CPU_INFO_FOREACH(cii, ci) {
888 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
889 			continue;
890 		if (ci->ci_smt_id == 0)
891 			continue;
892 		if (sched_smt)
893 			cpuset_add(&sched_all_cpus, ci);
894 		else
895 			cpuset_del(&sched_all_cpus, ci);
896 	}
897 
898 	return 0;
899 }
900 
901 #endif
902