/*	$OpenBSD: kern_sched.c,v 1.94 2024/01/24 19:23:38 cheloha Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/resourcevar.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus that are currently idle and of cpus that have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Times we moved a proc to another cpu */
uint64_t sched_nomigrations;	/* Times we kept a proc on its current cpu */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

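/*
 * Nonzero when SMT sibling cpus may be scheduled.  Toggled through the
 * hw.smt sysctl (sysctl_hwsmt() below) and checked before SMT siblings
 * are added to sched_all_cpus.
 */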
int sched_smt;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to load the new state of the process, including
 * the pcb and pmap, and to set curproc, the p_cpu pointer in the proc and
 * p_stat to SONPROC, atomically with respect to interrupts. Other cpus in
 * the system must not depend on this state being consistent.
 * Therefore no locking is necessary in cpu_switchto other than blocking
 * interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu; it is then the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	clockintr_bind(&spc->spc_itimer, ci, itimer_update, NULL);
	clockintr_bind(&spc->spc_profclock, ci, profclock, NULL);
	clockintr_bind(&spc->spc_roundrobin, ci, roundrobin, NULL);
	clockintr_bind(&spc->spc_statclock, ci, statclock, NULL);

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);
	SIMPLEQ_INIT(&spc->spc_deferred);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);

#ifdef __HAVE_CPU_TOPOLOGY
	if (!sched_smt && ci->ci_smt_id > 0)
		return;
#endif
	cpuset_add(&sched_all_cpus, ci);
}

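/*
 * Fork the per-cpu idle proc ("idle0", "idle1", ...); scheduled with
 * kthread_create_deferred() from sched_init_cpu().
 */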
void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_p->ps_comm,
	    sizeof(spc->spc_idleproc->p_p->ps_comm),
	    "idle%d", num);

	num++;
}

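/*
 * Per-cpu idle loop.  Reaps procs queued on spc_deadproc by sched_exit(),
 * runs deferred SMR work and otherwise waits in cpu_idle_cycle() for new
 * work or for a request to halt this cpu.
 */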
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * The first time we enter here we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have the idle loop clean up that list and move it to the
 * reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another one in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	KERNEL_ASSERT_LOCKED();
	sched_toidle();
}

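/*
 * Hand this cpu over to its idle proc: cancel the per-process clock
 * interrupts, drop the kernel lock if it is held and switch away without
 * saving any state for the current proc.  Does not return.
 */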
void
sched_toidle(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *idle;
	int s;

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	if (_kernel_lock_held())
		__mp_release_all(&kernel_lock);
#endif

	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	SCHED_LOCK(s);

	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;

	uvmexp.swtch++;
	TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
	    idle->p_p->ps_pid);
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

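/*
 * Put p on the run queue of cpu ci at priority prio; if ci is NULL a cpu
 * is picked with sched_choosecpu().  The scheduler lock must be held.
 * The target cpu is woken if it is idle and kicked with need_resched()
 * when p would preempt what it is currently running.
 */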
void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
	struct schedstate_percpu *spc;
	int queue = prio >> 2;

	if (ci == NULL)
		ci = sched_choosecpu(p);

	KASSERT(ci != NULL);
	SCHED_ASSERT_LOCKED();
	KASSERT(p->p_wchan == NULL);

	p->p_cpu = ci;
	p->p_stat = SRUN;
	p->p_runpri = prio;

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;
	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);

	if (prio < spc->spc_curpriority)
		need_resched(ci);
}

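/*
 * Remove p from the run queue of the cpu it is queued on.
 * The scheduler lock must be held.
 */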
void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_runpri >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;
	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

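/*
 * Pick the next proc for this cpu to run: the highest priority proc on
 * the local run queues, otherwise one stolen from another cpu, otherwise
 * the idle proc.  If this cpu is being halted, push any queued procs to
 * other cpus first.
 */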
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
		KASSERT(p->p_wchan == NULL);
		return (p);
	}
again:
#endif

	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

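/*
 * Pick a cpu for a newly forked proc: among the idle cpus with nothing
 * queued (or all cpus, if there are none), take the one with the fewest
 * queued procs.
 */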
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus.  Among those, pick the one
	 * with the fewest queued procs.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run) {
			choice = ci;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

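/*
 * Pick a cpu to run an existing proc on, preferring its current cpu when
 * that is cheap (procs pegged with P_CPUPEG never move).
 */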
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus.  Pick the cheapest of those.
	 * (idle + queued can happen when the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu,
	 * has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes on this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an unsigned int.  Returns 0 for 0 (yes, that is not
 * strictly correct, but it is good enough here).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * We are more willing to move it the lower the priority currently
	 * running on the destination cpu and the higher the priority of
	 * the proc itself.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is already on this cpu and has run recently, lower
	 * the cost by an estimate of its cache footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
		if (!sched_smt && ci->ci_smt_id > 0)
			continue;
#endif
		cpuset_add(&sched_all_cpus, ci);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

struct sched_barrier_state {
	struct cpu_info *ci;
	struct cond cond;
};

void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
}

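/*
 * Wait until a proc has run on cpu ci, which guarantees that ci has gone
 * through the scheduler since the call.  With ci == NULL the primary cpu
 * is used; a barrier against the current cpu is a no-op.
 */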
void
sched_barrier(struct cpu_info *ci)
{
	struct sched_barrier_state sb;
	struct task task;
	CPU_INFO_ITERATOR cii;

	if (ci == NULL) {
		CPU_INFO_FOREACH(cii, ci) {
			if (CPU_IS_PRIMARY(ci))
				break;
		}
	}
	KASSERT(ci != NULL);

	if (ci == curcpu())
		return;

	sb.ci = ci;
	cond_init(&sb.cond);
	task_set(&task, sched_barrier_task, &sb);

	task_add(systqmp, &task);
	cond_wait(&sb.cond, "sbar");
}

#else

void
sched_barrier(struct cpu_info *ci)
{
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

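/*
 * Set difference: to = b & ~a, i.e. every cpu that is in b but not in a.
 */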
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

int
cpuset_cardinality(struct cpuset *cs)
{
	int cardinality, i, n;

	cardinality = 0;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
			cardinality++;

	return (cardinality);
}

int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}

int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}

#ifdef __HAVE_CPU_TOPOLOGY

#include <sys/sysctl.h>

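/*
 * hw.smt sysctl handler: enable or disable scheduling on SMT sibling cpus
 * by adding them to or removing them from sched_all_cpus.
 */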
int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	int err, newsmt;

	newsmt = sched_smt;
	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
	if (err)
		return err;
	if (newsmt == sched_smt)
		return 0;

	sched_smt = newsmt;
	CPU_INFO_FOREACH(cii, ci) {
		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		if (ci->ci_smt_id == 0)
			continue;
		if (sched_smt)
			cpuset_add(&sched_all_cpus, ci);
		else
			cpuset_del(&sched_all_cpus, ci);
	}

	return 0;
}

#endif