xref: /openbsd-src/sys/kern/kern_sched.c (revision 2286c11fdedee6a9de522cebc12b77049c41c7be)
1 /*	$OpenBSD: kern_sched.c,v 1.95 2024/02/28 13:43:44 mpi Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/time.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 void sched_kthreads_create(void *);
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
/*
 * To help choosing which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 *
 * sched_idle_cpus:   cpus whose idle thread is in (or about to enter)
 *                    its idle loop; maintained by sched_idle().
 * sched_queued_cpus: cpus with at least one proc on a run queue;
 *                    maintained by setrunqueue()/remrunqueue().
 * sched_all_cpus:    cpus that procs may currently be scheduled on;
 *                    this is the set reported by cpu_is_online().
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 * They are plain increments with no reader in this file.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * When zero, cpus with ci_smt_id > 0 are kept out of sched_all_cpus.
 * Changed at runtime through sysctl_hwsmt().
 */
int sched_smt;
58 
59 /*
60  * A few notes about cpu_switchto that is implemented in MD code.
61  *
62  * cpu_switchto takes two arguments, the old proc and the proc
63  * it should switch to. The new proc will never be NULL, so we always have
64  * a saved state that we need to switch to. The old proc however can
65  * be NULL if the process is exiting. NULL for the old proc simply
66  * means "don't bother saving old state".
67  *
68  * cpu_switchto is supposed to atomically load the new state of the process
69  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
70  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
71  * cpus in the system must not depend on this state being consistent.
72  * Therefore no locking is necessary in cpu_switchto other than blocking
73  * interrupts during the context switch.
74  */
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu, then it's the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	clockintr_bind(&spc->spc_itimer, ci, itimer_update, NULL);
92 	clockintr_bind(&spc->spc_profclock, ci, profclock, NULL);
93 	clockintr_bind(&spc->spc_roundrobin, ci, roundrobin, NULL);
94 	clockintr_bind(&spc->spc_statclock, ci, statclock, NULL);
95 
96 	kthread_create_deferred(sched_kthreads_create, ci);
97 
98 	LIST_INIT(&spc->spc_deadproc);
99 	SIMPLEQ_INIT(&spc->spc_deferred);
100 
101 	/*
102 	 * Slight hack here until the cpuset code handles cpu_info
103 	 * structures.
104 	 */
105 	cpuset_init_cpu(ci);
106 
107 #ifdef __HAVE_CPU_TOPOLOGY
108 	if (!sched_smt && ci->ci_smt_id > 0)
109 		return;
110 #endif
111 	cpuset_add(&sched_all_cpus, ci);
112 }
113 
114 void
115 sched_kthreads_create(void *v)
116 {
117 	struct cpu_info *ci = v;
118 	struct schedstate_percpu *spc = &ci->ci_schedstate;
119 	static int num;
120 
121 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
122 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
123 	    &spc->spc_idleproc))
124 		panic("fork idle");
125 
126 	/* Name it as specified. */
127 	snprintf(spc->spc_idleproc->p_p->ps_comm,
128 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
129 	    "idle%d", num);
130 
131 	num++;
132 }
133 
134 void
135 sched_idle(void *v)
136 {
137 	struct schedstate_percpu *spc;
138 	struct proc *p = curproc;
139 	struct cpu_info *ci = v;
140 	int s;
141 
142 	KERNEL_UNLOCK();
143 
144 	spc = &ci->ci_schedstate;
145 
146 	/*
147 	 * First time we enter here, we're not supposed to idle,
148 	 * just go away for a while.
149 	 */
150 	SCHED_LOCK(s);
151 	cpuset_add(&sched_idle_cpus, ci);
152 	p->p_stat = SSLEEP;
153 	p->p_cpu = ci;
154 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
155 	mi_switch();
156 	cpuset_del(&sched_idle_cpus, ci);
157 	SCHED_UNLOCK(s);
158 
159 	KASSERT(ci == curcpu());
160 	KASSERT(curproc == spc->spc_idleproc);
161 
162 	while (1) {
163 		while (!cpu_is_idle(curcpu())) {
164 			struct proc *dead;
165 
166 			SCHED_LOCK(s);
167 			p->p_stat = SSLEEP;
168 			mi_switch();
169 			SCHED_UNLOCK(s);
170 
171 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
172 				LIST_REMOVE(dead, p_hash);
173 				exit2(dead);
174 			}
175 		}
176 
177 		splassert(IPL_NONE);
178 
179 		smr_idle();
180 
181 		cpuset_add(&sched_idle_cpus, ci);
182 		cpu_idle_enter();
183 		while (spc->spc_whichqs == 0) {
184 #ifdef MULTIPROCESSOR
185 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
186 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
187 				cpuset_del(&sched_idle_cpus, ci);
188 				SCHED_LOCK(s);
189 				atomic_setbits_int(&spc->spc_schedflags,
190 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
191 				SCHED_UNLOCK(s);
192 				wakeup(spc);
193 			}
194 #endif
195 			cpu_idle_cycle();
196 		}
197 		cpu_idle_leave();
198 		cpuset_del(&sched_idle_cpus, ci);
199 	}
200 }
201 
202 /*
203  * To free our address space we have to jump through a few hoops.
204  * The freeing is done by the reaper, but until we have one reaper
205  * per cpu, we have no way of putting this proc on the deadproc list
206  * and waking up the reaper without risking having our address space and
207  * stack torn from under us before we manage to switch to another proc.
208  * Therefore we have a per-cpu list of dead processes where we put this
209  * proc and have idle clean up that list and move it to the reaper list.
210  * All this will be unnecessary once we can bind the reaper this cpu
211  * and not risk having it switch to another in case it sleeps.
212  */
213 void
214 sched_exit(struct proc *p)
215 {
216 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
217 
218 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
219 
220 	KERNEL_ASSERT_LOCKED();
221 	sched_toidle();
222 }
223 
224 void
225 sched_toidle(void)
226 {
227 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
228 	struct proc *idle;
229 	int s;
230 
231 #ifdef MULTIPROCESSOR
232 	/* This process no longer needs to hold the kernel lock. */
233 	if (_kernel_lock_held())
234 		__mp_release_all(&kernel_lock);
235 #endif
236 
237 	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
238 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
239 		clockintr_cancel(&spc->spc_itimer);
240 	}
241 	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
242 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
243 		clockintr_cancel(&spc->spc_profclock);
244 	}
245 
246 	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
247 
248 	SCHED_LOCK(s);
249 
250 	idle = spc->spc_idleproc;
251 	idle->p_stat = SRUN;
252 
253 	uvmexp.swtch++;
254 	TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
255 	    idle->p_p->ps_pid);
256 	cpu_switchto(NULL, idle);
257 	panic("cpu_switchto returned");
258 }
259 
260 /*
261  * Run queue management.
262  */
/*
 * Intentionally empty: the per-cpu run queues are initialized in
 * sched_init_cpu(), so there is no global run queue state to set up.
 */
void
sched_init_runqueues(void)
{
}
267 
268 void
269 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
270 {
271 	struct schedstate_percpu *spc;
272 	int queue = prio >> 2;
273 
274 	if (ci == NULL)
275 		ci = sched_choosecpu(p);
276 
277 	KASSERT(ci != NULL);
278 	SCHED_ASSERT_LOCKED();
279 	KASSERT(p->p_wchan == NULL);
280 
281 	p->p_cpu = ci;
282 	p->p_stat = SRUN;
283 	p->p_runpri = prio;
284 
285 	spc = &p->p_cpu->ci_schedstate;
286 	spc->spc_nrun++;
287 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
288 	    p->p_p->ps_pid);
289 
290 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
291 	spc->spc_whichqs |= (1U << queue);
292 	cpuset_add(&sched_queued_cpus, p->p_cpu);
293 
294 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
295 		cpu_unidle(p->p_cpu);
296 	else if (prio < spc->spc_curpriority)
297 		need_resched(ci);
298 }
299 
300 void
301 remrunqueue(struct proc *p)
302 {
303 	struct schedstate_percpu *spc;
304 	int queue = p->p_runpri >> 2;
305 
306 	SCHED_ASSERT_LOCKED();
307 	spc = &p->p_cpu->ci_schedstate;
308 	spc->spc_nrun--;
309 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
310 	    p->p_p->ps_pid);
311 
312 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
313 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
314 		spc->spc_whichqs &= ~(1U << queue);
315 		if (spc->spc_whichqs == 0)
316 			cpuset_del(&sched_queued_cpus, p->p_cpu);
317 	}
318 }
319 
320 struct proc *
321 sched_chooseproc(void)
322 {
323 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
324 	struct proc *p;
325 	int queue;
326 
327 	SCHED_ASSERT_LOCKED();
328 
329 #ifdef MULTIPROCESSOR
330 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
331 		if (spc->spc_whichqs) {
332 			for (queue = 0; queue < SCHED_NQS; queue++) {
333 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
334 					remrunqueue(p);
335 					setrunqueue(NULL, p, p->p_runpri);
336 					if (p->p_cpu == curcpu()) {
337 						KASSERT(p->p_flag & P_CPUPEG);
338 						goto again;
339 					}
340 				}
341 			}
342 		}
343 		p = spc->spc_idleproc;
344 		if (p == NULL)
345 			panic("no idleproc set on CPU%d",
346 			    CPU_INFO_UNIT(curcpu()));
347 		p->p_stat = SRUN;
348 		KASSERT(p->p_wchan == NULL);
349 		return (p);
350 	}
351 again:
352 #endif
353 
354 	if (spc->spc_whichqs) {
355 		queue = ffs(spc->spc_whichqs) - 1;
356 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
357 		remrunqueue(p);
358 		sched_noidle++;
359 		if (p->p_stat != SRUN)
360 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
361 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
362 		p = spc->spc_idleproc;
363 		if (p == NULL)
364 			panic("no idleproc set on CPU%d",
365 			    CPU_INFO_UNIT(curcpu()));
366 		p->p_stat = SRUN;
367 	}
368 
369 	KASSERT(p->p_wchan == NULL);
370 	return (p);
371 }
372 
/*
 * Choose the cpu a freshly forked proc should start on.
 *
 * `parent' and `flags' are only referenced by the disabled FORK_PPWAIT
 * shortcut below.  On MULTIPROCESSOR kernels the candidate cpus are
 * those that are idle, have nothing queued and are in sched_all_cpus;
 * if there are none, every cpu in sched_all_cpus is a candidate.  Among
 * the candidates the one with the fewest runnable procs (spc_nrun)
 * wins, the first one found winning ties.  Uniprocessor kernels always
 * return the current cpu.
 */
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	struct cpu_info *ci;
	struct cpuset candidates;
	int nrun, fewest = INT_MAX;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with least queued procs first,
	 * then the one with lowest load average.
	 */
	cpuset_complement(&candidates, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&candidates, &candidates, &sched_all_cpus);
	if (cpuset_first(&candidates) == NULL)
		cpuset_copy(&candidates, &sched_all_cpus);

	/* Consume the candidate set one cpu at a time. */
	for (ci = cpuset_first(&candidates); ci != NULL;
	    ci = cpuset_first(&candidates)) {
		cpuset_del(&candidates, ci);

		nrun = ci->ci_schedstate.spc_nrun;
		if (choice != NULL && nrun >= fewest)
			continue;

		choice = ci;
		fewest = nrun;
	}

	return (choice);
#else
	return (curcpu());
#endif
}
424 
/*
 * Choose the cpu that proc `p' should be queued on.
 *
 * A pegged proc (P_CPUPEG) always stays on its cpu.  Otherwise the
 * proc's current cpu is kept when it is idle with nothing queued, or
 * when `p' is curproc on the current cpu, nothing else is runnable
 * there and the cpu has not been asked to halt.  Failing that, the
 * cheapest cpu according to sched_proc_to_cpu_cost() is picked among
 * the idle-and-unqueued cpus, or among all schedulable cpus if there
 * are none.  Uniprocessor kernels always return the current cpu.
 */
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	struct cpu_info *ci;
	struct cpuset set;
	int cheapest = INT_MAX;
	int stay;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the cheapest of those.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	stay = cpuset_isset(&set, p->p_cpu) != 0;
	if (!stay) {
		stay = p->p_cpu == curcpu() &&
		    p->p_cpu->ci_schedstate.spc_nrun == 0 &&
		    (p->p_cpu->ci_schedstate.spc_schedflags &
		    SPCF_SHOULDHALT) == 0 &&
		    curproc == p;
	}
	if (stay) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	for (ci = cpuset_first(&set); ci != NULL; ci = cpuset_first(&set)) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < cheapest) {
			choice = ci;
			cheapest = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu == choice)
		sched_nomigrations++;
	else
		sched_nmigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}
488 
/*
 * Attempt to steal a proc from some other cpu on behalf of `self'.
 *
 * For every cpu with queued procs, only its best non-empty run queue is
 * inspected; pegged procs are skipped.  The proc that is cheapest to
 * move to `self' (per sched_proc_to_cpu_cost()) is removed from its run
 * queue, reassigned to `self' and returned.
 *
 * Returns NULL when nothing can be stolen, when `self' is not in
 * sched_all_cpus, and always on uniprocessor kernels.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	struct cpu_info *victim;
	struct cpuset candidates;
	int bestcost = INT_MAX;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes in this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&candidates, &sched_queued_cpus);

	for (victim = cpuset_first(&candidates); victim != NULL;
	    victim = cpuset_first(&candidates)) {
		struct proc *p;
		int queue, cost;

		cpuset_del(&candidates, victim);

		spc = &victim->ci_schedstate;

		/* Only the victim's highest-priority populated queue. */
		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);
			if (best != NULL && cost >= bestcost)
				continue;

			best = p;
			bestcost = cost;
		}
	}

	if (best != NULL) {
		TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
		    best->p_p->ps_pid, CPU_INFO_UNIT(self));

		remrunqueue(best);
		best->p_cpu = self;

		sched_stolen++;
	}
#endif
	return (best);
}
545 
546 #ifdef MULTIPROCESSOR
/*
 * Integer base 2 logarithm, rounded down: the index of the highest set
 * bit of `i'.  Both 0 and 1 yield 0.
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Count how many times i can be halved before reaching 1 (or 0). */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
560 
561 /*
562  * Calculate the cost of moving the proc to this cpu.
563  *
564  * What we want is some guesstimate of how much "performance" it will
565  * cost us to move the proc here. Not just for caches and TLBs and NUMA
566  * memory, but also for the proc itself. A highly loaded cpu might not
567  * be the best candidate for this proc since it won't get run.
568  *
569  * Just total guesstimates for now.
570  */
571 
/* Weights used by sched_proc_to_cpu_cost(); larger means more costly. */
int sched_cost_load = 1;	/* not referenced in this file */
int sched_cost_priority = 1;	/* per unit of priority difference */
int sched_cost_runnable = 3;	/* per runnable proc / busy or primary cpu */
int sched_cost_resident = 1;	/* per log2(resident pages) credit */
576 #endif
577 
/*
 * Estimate the cost of running proc `p' on cpu `ci'; lower is better and
 * the result may be negative.  Always 0 on uniprocessor kernels.
 *
 * The estimate is the sum of:
 *  - for a non-idle cpu, the priority difference between `p' and what
 *    the cpu is running, plus one sched_cost_runnable;
 *  - for a cpu with queued procs, sched_cost_runnable per runnable proc;
 *  - one extra sched_cost_runnable for the primary cpu;
 *  - a discount based on log2 of the resident page count when `p' is
 *    already on `ci' and has p_slptime == 0.
 */
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * More willing to move, the lower the priority of the destination
	 * and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		int l2resident;

		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}
622 
623 /*
624  * Peg a proc to a cpu.
625  */
626 void
627 sched_peg_curproc(struct cpu_info *ci)
628 {
629 	struct proc *p = curproc;
630 	int s;
631 
632 	SCHED_LOCK(s);
633 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
634 	setrunqueue(ci, p, p->p_usrpri);
635 	p->p_ru.ru_nvcsw++;
636 	mi_switch();
637 	SCHED_UNLOCK(s);
638 }
639 
640 #ifdef MULTIPROCESSOR
641 
642 void
643 sched_start_secondary_cpus(void)
644 {
645 	CPU_INFO_ITERATOR cii;
646 	struct cpu_info *ci;
647 
648 	CPU_INFO_FOREACH(cii, ci) {
649 		struct schedstate_percpu *spc = &ci->ci_schedstate;
650 
651 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
652 			continue;
653 		atomic_clearbits_int(&spc->spc_schedflags,
654 		    SPCF_SHOULDHALT | SPCF_HALTED);
655 #ifdef __HAVE_CPU_TOPOLOGY
656 		if (!sched_smt && ci->ci_smt_id > 0)
657 			continue;
658 #endif
659 		cpuset_add(&sched_all_cpus, ci);
660 	}
661 }
662 
663 void
664 sched_stop_secondary_cpus(void)
665 {
666 	CPU_INFO_ITERATOR cii;
667 	struct cpu_info *ci;
668 
669 	/*
670 	 * Make sure we stop the secondary CPUs.
671 	 */
672 	CPU_INFO_FOREACH(cii, ci) {
673 		struct schedstate_percpu *spc = &ci->ci_schedstate;
674 
675 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
676 			continue;
677 		cpuset_del(&sched_all_cpus, ci);
678 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
679 	}
680 	CPU_INFO_FOREACH(cii, ci) {
681 		struct schedstate_percpu *spc = &ci->ci_schedstate;
682 
683 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
684 			continue;
685 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
686 			sleep_setup(spc, PZERO, "schedstate");
687 			sleep_finish(0,
688 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
689 		}
690 	}
691 }
692 
/*
 * Argument block shared between sched_barrier() and the task it queues,
 * sched_barrier_task().
 */
struct sched_barrier_state {
	struct cpu_info *ci;	/* cpu the task must run on */
	struct cond cond;	/* signalled once the task ran on that cpu */
};
697 
698 void
699 sched_barrier_task(void *arg)
700 {
701 	struct sched_barrier_state *sb = arg;
702 	struct cpu_info *ci = sb->ci;
703 
704 	sched_peg_curproc(ci);
705 	cond_signal(&sb->cond);
706 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
707 }
708 
709 void
710 sched_barrier(struct cpu_info *ci)
711 {
712 	struct sched_barrier_state sb;
713 	struct task task;
714 	CPU_INFO_ITERATOR cii;
715 
716 	if (ci == NULL) {
717 		CPU_INFO_FOREACH(cii, ci) {
718 			if (CPU_IS_PRIMARY(ci))
719 				break;
720 		}
721 	}
722 	KASSERT(ci != NULL);
723 
724 	if (ci == curcpu())
725 		return;
726 
727 	sb.ci = ci;
728 	cond_init(&sb.cond);
729 	task_set(&task, sched_barrier_task, &sb);
730 
731 	task_add(systqmp, &task);
732 	cond_wait(&sb.cond, "sbar");
733 }
734 
735 #else
736 
/*
 * Uniprocessor variant of sched_barrier(): a no-op, `ci' is ignored.
 */
void
sched_barrier(struct cpu_info *ci)
{
}
741 
742 #endif
743 
744 /*
745  * Functions to manipulate cpu sets.
746  */
/* cpu_info of each registered cpu, indexed by CPU_INFO_UNIT(). */
struct cpu_info *cpuset_infos[MAXCPUS];
/* Every cpu registered through cpuset_init_cpu(). */
static struct cpuset cpuset_all;
749 
750 void
751 cpuset_init_cpu(struct cpu_info *ci)
752 {
753 	cpuset_add(&cpuset_all, ci);
754 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
755 }
756 
757 void
758 cpuset_clear(struct cpuset *cs)
759 {
760 	memset(cs, 0, sizeof(*cs));
761 }
762 
763 void
764 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
765 {
766 	unsigned int num = CPU_INFO_UNIT(ci);
767 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
768 }
769 
770 void
771 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
772 {
773 	unsigned int num = CPU_INFO_UNIT(ci);
774 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
775 }
776 
777 int
778 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
779 {
780 	unsigned int num = CPU_INFO_UNIT(ci);
781 	return (cs->cs_set[num/32] & (1U << (num % 32)));
782 }
783 
784 void
785 cpuset_add_all(struct cpuset *cs)
786 {
787 	cpuset_copy(cs, &cpuset_all);
788 }
789 
790 void
791 cpuset_copy(struct cpuset *to, struct cpuset *from)
792 {
793 	memcpy(to, from, sizeof(*to));
794 }
795 
796 struct cpu_info *
797 cpuset_first(struct cpuset *cs)
798 {
799 	int i;
800 
801 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
802 		if (cs->cs_set[i])
803 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
804 
805 	return (NULL);
806 }
807 
808 void
809 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
810 {
811 	int i;
812 
813 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
814 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
815 }
816 
817 void
818 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
819 {
820 	int i;
821 
822 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
823 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
824 }
825 
826 void
827 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
828 {
829 	int i;
830 
831 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
832 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
833 }
834 
835 int
836 cpuset_cardinality(struct cpuset *cs)
837 {
838 	int cardinality, i, n;
839 
840 	cardinality = 0;
841 
842 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
843 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
844 			cardinality++;
845 
846 	return (cardinality);
847 }
848 
849 int
850 sysctl_hwncpuonline(void)
851 {
852 	return cpuset_cardinality(&sched_all_cpus);
853 }
854 
855 int
856 cpu_is_online(struct cpu_info *ci)
857 {
858 	return cpuset_isset(&sched_all_cpus, ci);
859 }
860 
861 #ifdef __HAVE_CPU_TOPOLOGY
862 
863 #include <sys/sysctl.h>
864 
865 int
866 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
867 {
868 	CPU_INFO_ITERATOR cii;
869 	struct cpu_info *ci;
870 	int err, newsmt;
871 
872 	newsmt = sched_smt;
873 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
874 	if (err)
875 		return err;
876 	if (newsmt == sched_smt)
877 		return 0;
878 
879 	sched_smt = newsmt;
880 	CPU_INFO_FOREACH(cii, ci) {
881 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
882 			continue;
883 		if (ci->ci_smt_id == 0)
884 			continue;
885 		if (sched_smt)
886 			cpuset_add(&sched_all_cpus, ci);
887 		else
888 			cpuset_del(&sched_all_cpus, ci);
889 	}
890 
891 	return 0;
892 }
893 
894 #endif
895