/*	$OpenBSD: kern_sched.c,v 1.92 2023/09/19 11:31:51 claudio Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/resourcevar.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

int sched_smt;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. "Atomically" here means with respect to
 * interrupts only; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	spc->spc_itimer = clockintr_establish(ci, itimer_update, NULL);
	if (spc->spc_itimer == NULL)
		panic("%s: clockintr_establish itimer_update", __func__);
	spc->spc_profclock = clockintr_establish(ci, profclock, NULL);
	if (spc->spc_profclock == NULL)
		panic("%s: clockintr_establish profclock", __func__);
	spc->spc_roundrobin = clockintr_establish(ci, roundrobin, NULL);
	if (spc->spc_roundrobin == NULL)
		panic("%s: clockintr_establish roundrobin", __func__);
	spc->spc_statclock = clockintr_establish(ci, statclock, NULL);
	if (spc->spc_statclock == NULL)
		panic("%s: clockintr_establish statclock", __func__);

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);
	SIMPLEQ_INIT(&spc->spc_deferred);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);

#ifdef __HAVE_CPU_TOPOLOGY
	if (!sched_smt && ci->ci_smt_id > 0)
		return;
#endif
	cpuset_add(&sched_all_cpus, ci);
}

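/*
 * Create the idle thread for a cpu.  Deferred from sched_init_cpu() via
 * kthread_create_deferred() so that fork1() can be used; the thread is
 * named "idle<N>".
 */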
void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_p->ps_comm,
	    sizeof(spc->spc_idleproc->p_p->ps_comm),
	    "idle%d", num);

	num++;
}

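/*
 * Per-cpu idle loop.  Reaps dead procs parked on spc_deadproc, runs
 * pending SMR callbacks and then spins in cpu_idle_cycle() until this
 * cpu has work queued or is asked to halt.
 */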
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * The first time we enter here we're not supposed to idle;
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have the idle loop clean up that list and move it to the
 * reaper list.  All this will be unnecessary once we can bind the reaper
 * to this cpu and not risk having it switch to another one in case it
 * sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *idle;
	int s;

	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(spc->spc_profclock);
	}

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	KERNEL_ASSERT_LOCKED();
	__mp_release_all(&kernel_lock);
#endif

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

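/*
 * Put p on the run queue of the given cpu (or on the cpu picked by
 * sched_choosecpu() if ci is NULL) at the given priority.  The priority
 * (0-127) is divided by four to select one of the SCHED_NQS run queues.
 * If the chosen cpu is idle or running something of lower priority, it
 * is kicked so it notices the new proc.
 */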
void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
	struct schedstate_percpu *spc;
	int queue = prio >> 2;

	if (ci == NULL)
		ci = sched_choosecpu(p);

	KASSERT(ci != NULL);
	SCHED_ASSERT_LOCKED();
	KASSERT(p->p_wchan == NULL);

	p->p_cpu = ci;
	p->p_stat = SRUN;
	p->p_runpri = prio;

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;
	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);

	if (prio < spc->spc_curpriority)
		need_resched(ci);
}

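/*
 * Remove p from the run queue of the cpu it is queued on and clear the
 * corresponding whichqs bit if that queue becomes empty.
 */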
void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_runpri >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;
	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

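/*
 * Pick the next proc for this cpu to run: the highest-priority proc on
 * our own run queues, failing that one stolen from another cpu, and as
 * a last resort the idle proc.  If this cpu is being halted, push all
 * queued procs to other cpus first.
 */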
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}
#endif

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

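/*
 * Choose a cpu for a newly forked proc: prefer a cpu that is idle and
 * has nothing queued, otherwise the one with the fewest runnable procs.
 */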
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus.  Among those, pick the one
	 * with the fewest queued procs.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run) {
			choice = ci;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

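/*
 * Choose a cpu to run p on, using sched_proc_to_cpu_cost() to compare
 * candidates.  A proc pegged to a cpu with P_CPUPEG is never moved.
 */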
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus.  Pick the cheapest of those.
	 * (Idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes on this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an unsigned int.  Returns 0 for 0 (yes, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We are more willing to move the lower the priority of what is
	 * currently running on the destination and the higher the priority
	 * of the proc we are moving.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is on this cpu already and has run recently
	 * (p_slptime == 0), lower the cost by an estimate of its cache
	 * footprint (log2 of its resident set size).
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

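/*
 * (Re)start scheduling on the secondary cpus: clear their halt flags and
 * put them back into sched_all_cpus (SMT siblings only if sched_smt is
 * enabled).
 */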
void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
		if (!sched_smt && ci->ci_smt_id > 0)
			continue;
#endif
		cpuset_add(&sched_all_cpus, ci);
	}
}

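/*
 * Stop scheduling on the secondary cpus: remove them from sched_all_cpus,
 * flag them with SPCF_SHOULDHALT and wait until each one's idle loop has
 * acknowledged with SPCF_HALTED.
 */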
void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

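/*
 * sched_barrier() guarantees that the given cpu has gone through the
 * scheduler at least once: a task is queued that pegs itself to that cpu
 * and signals a condition variable once it has run there.
 */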
struct sched_barrier_state {
	struct cpu_info *ci;
	struct cond cond;
};

void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
}

void
sched_barrier(struct cpu_info *ci)
{
	struct sched_barrier_state sb;
	struct task task;
	CPU_INFO_ITERATOR cii;

	if (ci == NULL) {
		CPU_INFO_FOREACH(cii, ci) {
			if (CPU_IS_PRIMARY(ci))
				break;
		}
	}
	KASSERT(ci != NULL);

	if (ci == curcpu())
		return;

	sb.ci = ci;
	cond_init(&sb.cond);
	task_set(&task, sched_barrier_task, &sb);

	task_add(systqmp, &task);
	cond_wait(&sb.cond, "sbar");
}

#else

void
sched_barrier(struct cpu_info *ci)
{
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

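/*
 * A cpuset is a bitmask indexed by CPU_INFO_UNIT(ci), 32 cpus per word.
 * Single cpus are added and removed with atomic operations.
 */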
void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

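/*
 * Return the lowest-numbered cpu in the set, or NULL if the set is empty.
 */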
struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

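/*
 * Note the argument order: the result is the cpus that are in b but not
 * in a (b \ a), not the other way around.
 */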
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

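/*
 * Count the number of cpus in the set by clearing the lowest set bit of
 * each word until none remain.
 */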
int
cpuset_cardinality(struct cpuset *cs)
{
	int cardinality, i, n;

	cardinality = 0;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
			cardinality++;

	return (cardinality);
}

int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}

int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}

#ifdef __HAVE_CPU_TOPOLOGY

#include <sys/sysctl.h>

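/*
 * hw.smt sysctl handler.  Disabling SMT removes all sibling threads
 * (ci_smt_id != 0) from sched_all_cpus so nothing is scheduled on them;
 * enabling it adds them back.
 */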
int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	int err, newsmt;

	newsmt = sched_smt;
	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
	if (err)
		return err;
	if (newsmt == sched_smt)
		return 0;

	sched_smt = newsmt;
	CPU_INFO_FOREACH(cii, ci) {
		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		if (ci->ci_smt_id == 0)
			continue;
		if (sched_smt)
			cpuset_add(&sched_all_cpus, ci);
		else
			cpuset_del(&sched_all_cpus, ci);
	}

	return 0;
}

#endif