xref: /openbsd-src/sys/kern/kern_sched.c (revision cf31dfdee0cd9bc598be108ae94f2bdefce0a488)
1 /*	$OpenBSD: kern_sched.c,v 1.98 2024/07/08 14:46:47 mpi Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/time.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
/* Deferred callback that forks the idle proc of the cpu passed as argument. */
void sched_kthreads_create(void *);

/* Estimated cost of running proc p on cpu ci; lower is better. */
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
/* Try to take a queued proc away from another cpu for the given cpu. */
struct proc *sched_steal_proc(struct cpu_info *);
37 
/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;		/* cpus advertised as idle by sched_idle() */
struct cpuset sched_queued_cpus;	/* cpus with at least one proc on a run queue */
struct cpuset sched_all_cpus;		/* cpus we are willing to schedule procs on */
46 
/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * When zero, cpus with ci_smt_id > 0 are kept out of sched_all_cpus.
 * Changed at runtime through sysctl_hwsmt().
 */
int sched_smt;
58 
/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. "Atomically" here means with respect to
 * interrupts only; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto
 * other than blocking interrupts during the context switch.
 */
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu, then it's the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	clockintr_bind(&spc->spc_itimer, ci, itimer_update, NULL);
92 	clockintr_bind(&spc->spc_profclock, ci, profclock, NULL);
93 	clockintr_bind(&spc->spc_roundrobin, ci, roundrobin, NULL);
94 	clockintr_bind(&spc->spc_statclock, ci, statclock, NULL);
95 
96 	kthread_create_deferred(sched_kthreads_create, ci);
97 
98 	LIST_INIT(&spc->spc_deadproc);
99 	SIMPLEQ_INIT(&spc->spc_deferred);
100 
101 	/*
102 	 * Slight hack here until the cpuset code handles cpu_info
103 	 * structures.
104 	 */
105 	cpuset_init_cpu(ci);
106 
107 #ifdef __HAVE_CPU_TOPOLOGY
108 	if (!sched_smt && ci->ci_smt_id > 0)
109 		return;
110 #endif
111 	cpuset_add(&sched_all_cpus, ci);
112 }
113 
114 void
115 sched_kthreads_create(void *v)
116 {
117 	struct cpu_info *ci = v;
118 	struct schedstate_percpu *spc = &ci->ci_schedstate;
119 	static int num;
120 
121 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
122 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
123 	    &spc->spc_idleproc))
124 		panic("fork idle");
125 
126 	/* Name it as specified. */
127 	snprintf(spc->spc_idleproc->p_p->ps_comm,
128 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
129 	    "idle%d", num);
130 
131 	num++;
132 }
133 
/*
 * Body of the per-cpu idle proc; `v' is the cpu_info of the cpu it serves.
 *
 * After a one-time setup the function loops forever.  As long as
 * cpu_is_idle() reports the cpu as not idle it switches away, cleaning up
 * this cpu's dead procs each time it gets back.  Otherwise it advertises
 * the cpu in sched_idle_cpus and cycles in cpu_idle_cycle() until a bit
 * shows up in spc_whichqs.
 */
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK();
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	/* Pegged procs are never moved by sched_choosecpu(). */
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK();

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		/*
		 * NOTE(review): cpu_is_idle() is defined elsewhere; presumably
		 * it means "nothing queued on this cpu" -- confirm.
		 */
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK();
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK();

			/*
			 * Back on the idle proc: hand the procs that
			 * sched_exit() parked on this cpu over to exit2().
			 */
			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			/*
			 * Asked to halt and not yet halted: leave the idle
			 * set, mark the cpu SPCF_HALTED unless something got
			 * queued in the meantime, and wake up whoever sleeps
			 * on spc (see sched_stop_secondary_cpus()).
			 */
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK();
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK();
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}
200 
201 /*
202  * To free our address space we have to jump through a few hoops.
203  * The freeing is done by the reaper, but until we have one reaper
204  * per cpu, we have no way of putting this proc on the deadproc list
205  * and waking up the reaper without risking having our address space and
206  * stack torn from under us before we manage to switch to another proc.
207  * Therefore we have a per-cpu list of dead processes where we put this
208  * proc and have idle clean up that list and move it to the reaper list.
209  * All this will be unnecessary once we can bind the reaper this cpu
210  * and not risk having it switch to another in case it sleeps.
211  */
212 void
213 sched_exit(struct proc *p)
214 {
215 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
216 	struct timespec ts;
217 
218 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
219 
220 	/* update the tu_runtime one last time */
221 	nanouptime(&ts);
222 	if (timespeccmp(&ts, &spc->spc_runtime, <))
223 		timespecclear(&ts);
224 	else
225 		timespecsub(&ts, &spc->spc_runtime, &ts);
226 
227 	/* add the time counts for this thread */
228 	tu_enter(&p->p_tu);
229 	timespecadd(&p->p_tu.tu_runtime, &ts, &p->p_tu.tu_runtime);
230 	tu_leave(&p->p_tu);
231 
232 	KERNEL_ASSERT_LOCKED();
233 	sched_toidle();
234 }
235 
/*
 * Abandon the current context and switch this cpu to its idle proc.
 *
 * cpu_switchto() is called with NULL as the old proc, so no state is
 * saved for the caller; if it ever returns, we panic.
 */
void
sched_toidle(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *idle;

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	if (_kernel_lock_held())
		__mp_release_all(&kernel_lock);
#endif

	/* If SPCF_ITIMER is set, clear it and cancel the spc_itimer clockintr. */
	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	/* Same for SPCF_PROFCLOCK and the spc_profclock clockintr. */
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	/*
	 * The scheduler lock is taken here and not released in this function.
	 * NOTE(review): presumably it is dropped by the idle proc once it
	 * resumes after its own mi_switch() -- confirm.
	 */
	SCHED_LOCK();
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;

	uvmexp.swtch++;
	TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
	    idle->p_p->ps_pid);
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}
269 
270 /*
271  * Run queue management.
272  */
/*
 * Intentionally empty: the per-cpu run queues are initialized by
 * sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
}
277 
278 void
279 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
280 {
281 	struct schedstate_percpu *spc;
282 	int queue = prio >> 2;
283 
284 	if (ci == NULL)
285 		ci = sched_choosecpu(p);
286 
287 	KASSERT(ci != NULL);
288 	SCHED_ASSERT_LOCKED();
289 	KASSERT(p->p_wchan == NULL);
290 
291 	p->p_cpu = ci;
292 	p->p_stat = SRUN;
293 	p->p_runpri = prio;
294 
295 	spc = &p->p_cpu->ci_schedstate;
296 	spc->spc_nrun++;
297 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
298 	    p->p_p->ps_pid);
299 
300 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
301 	spc->spc_whichqs |= (1U << queue);
302 	cpuset_add(&sched_queued_cpus, p->p_cpu);
303 
304 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
305 		cpu_unidle(p->p_cpu);
306 	else if (prio < spc->spc_curpriority)
307 		need_resched(ci);
308 }
309 
310 void
311 remrunqueue(struct proc *p)
312 {
313 	struct schedstate_percpu *spc;
314 	int queue = p->p_runpri >> 2;
315 
316 	SCHED_ASSERT_LOCKED();
317 	spc = &p->p_cpu->ci_schedstate;
318 	spc->spc_nrun--;
319 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
320 	    p->p_p->ps_pid);
321 
322 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
323 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
324 		spc->spc_whichqs &= ~(1U << queue);
325 		if (spc->spc_whichqs == 0)
326 			cpuset_del(&sched_queued_cpus, p->p_cpu);
327 	}
328 }
329 
/*
 * Pick the proc the current cpu should run next.
 *
 * In order of preference: the first proc of the lowest-numbered non-empty
 * run queue of this cpu, a proc stolen from another cpu, or this cpu's
 * idle proc.  The chosen proc has already been removed from its run
 * queue.  The scheduler lock must be held.
 *
 * On MULTIPROCESSOR kernels a cpu flagged SPCF_SHOULDHALT first pushes
 * its queued procs through setrunqueue(NULL, ...) so they can be placed
 * on another cpu, and then returns its idle proc -- unless one of them
 * lands back on this cpu, in which case the normal selection runs.
 */
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					/*
					 * Requeued right back here: only a
					 * pegged proc is expected to do that.
					 */
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
		KASSERT(p->p_wchan == NULL);
		return (p);
	}
again:
#endif

	if (spc->spc_whichqs) {
		/* Lowest set bit == lowest-numbered non-empty queue. */
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		/* Nothing queued here and nothing to steal: go idle. */
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}
382 
/*
 * Choose the cpu a freshly forked proc should start on.
 *
 * On MULTIPROCESSOR kernels this is the schedulable cpu with the fewest
 * procs queued (spc_nrun), preferring cpus that are idle with nothing
 * queued.  Otherwise it is simply the current cpu.  `parent' and `flags'
 * are only used by the disabled FORK_PPWAIT shortcut.
 */
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpuset candidates;
	struct cpu_info *ci, *choice = NULL;
	int nrun, best_nrun = INT_MAX;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Start from the cpus that are currently idle and have nothing
	 * queued.  If there are none, fall back to every schedulable cpu.
	 * Either way the cpu with the fewest queued procs wins.
	 */
	cpuset_complement(&candidates, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&candidates, &candidates, &sched_all_cpus);
	if (cpuset_first(&candidates) == NULL)
		cpuset_copy(&candidates, &sched_all_cpus);

	for (ci = cpuset_first(&candidates); ci != NULL;
	    ci = cpuset_first(&candidates)) {
		cpuset_del(&candidates, ci);

		nrun = ci->ci_schedstate.spc_nrun;
		if (choice == NULL || nrun < best_nrun) {
			choice = ci;
			best_nrun = nrun;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}
434 
/*
 * Choose the cpu on which `p' should be queued.
 *
 * A pegged proc stays where it is.  Otherwise `p' keeps its cpu when that
 * cpu is idle with nothing queued, or when `p' is curproc on a cpu with
 * nothing queued that has not been asked to halt.  Failing that, the
 * cheapest candidate according to sched_proc_to_cpu_cost() is returned.
 * Kernels without MULTIPROCESSOR always return the current cpu.
 */
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpuset candidates;
	struct schedstate_percpu *spc;
	struct cpu_info *ci, *here, *choice = NULL;
	int cost, best_cost = INT_MAX;
	int stay;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Candidates are the schedulable cpus that are idle and have
	 * nothing queued.  (idle + queued could mean that the cpu is
	 * handling an interrupt at this moment and hasn't had time to
	 * leave idle yet.)
	 */
	cpuset_complement(&candidates, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&candidates, &candidates, &sched_all_cpus);

	/*
	 * The easy case: the proc's own cpu is a candidate.  It also
	 * qualifies, without being idle, when it is the current cpu, we
	 * are curproc, nothing else is queued and no halt is pending.
	 */
	here = p->p_cpu;
	stay = cpuset_isset(&candidates, here);
	if (!stay && here == curcpu() && curproc == p) {
		spc = &here->ci_schedstate;
		stay = spc->spc_nrun == 0 &&
		    (spc->spc_schedflags & SPCF_SHOULDHALT) == 0;
	}
	if (stay) {
		sched_wasidle++;
		return (here);
	}

	/* No idle candidate at all: consider every schedulable cpu. */
	if (cpuset_first(&candidates) == NULL)
		cpuset_copy(&candidates, &sched_all_cpus);

	for (ci = cpuset_first(&candidates); ci != NULL;
	    ci = cpuset_first(&candidates)) {
		cost = sched_proc_to_cpu_cost(ci, p);
		if (choice == NULL || cost < best_cost) {
			choice = ci;
			best_cost = cost;
		}
		cpuset_del(&candidates, ci);
	}

	if (choice == p->p_cpu)
		sched_nomigrations++;
	else
		sched_nmigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}
498 
/*
 * Attempt to steal a proc from some cpu.
 *
 * Looks at every cpu in sched_queued_cpus and, within each one's
 * lowest-numbered non-empty run queue, at every proc that is not pegged
 * (P_CPUPEG).  The candidate that sched_proc_to_cpu_cost() rates cheapest
 * to run on `self' is removed from its run queue and assigned to `self'.
 *
 * Returns the stolen proc, or NULL if nothing could be stolen, if `self'
 * is not in sched_all_cpus, or on kernels built without MULTIPROCESSOR.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes in this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	/* Work on a private copy that is consumed one cpu at a time. */
	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		/* Only the lowest-numbered non-empty queue is examined. */
		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	/* Dequeue from the victim cpu before re-homing the proc. */
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}
555 
556 #ifdef MULTIPROCESSOR
/*
 * Integer base 2 logarithm, rounded down: the number of times `i' can
 * be halved before it reaches zero, minus one.  Both 0 and 1 yield 0.
 */
static int
log2(unsigned int i)
{
	int bits;

	for (bits = 0; (i >>= 1) != 0; bits++)
		continue;

	return (bits);
}
570 
/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

/* Weights applied by sched_proc_to_cpu_cost(). */
int sched_cost_load = 1;	/* not referenced in the code visible here */
int sched_cost_priority = 1;	/* per unit of priority difference */
int sched_cost_runnable = 3;	/* per queued proc; also non-idle/primary penalty */
int sched_cost_resident = 1;	/* per log2 step of pmap_resident_count() */
586 #endif
587 
/*
 * Estimate what it costs to run proc `p' on cpu `ci'; callers pick the
 * lowest value.  The result may be negative.  Kernels built without
 * MULTIPROCESSOR always return 0.
 */
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	/*
	 * A cpu that is not idle is running something: charge the
	 * difference between the proc's priority and the priority
	 * currently running there, plus one runnable's worth.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}

	/* Every proc already queued on the destination adds to the bill. */
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * A proc that is already on this cpu and has not been sleeping
	 * gets a discount that grows with the log2 of its resident count.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		int l2res;

		l2res = log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2res * sched_cost_resident;
	}
#endif
	return (cost);
}
632 
/*
 * Peg a proc to a cpu.
 *
 * Sets P_CPUPEG on curproc, puts it on the run queue of `ci' at its
 * user priority and switches away.  The switch is accounted in ru_nvcsw.
 * NOTE(review): presumably curproc is running on `ci' by the time
 * mi_switch() returns -- confirm against mi_switch().
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	/* Set the flag first so the proc cannot be moved off `ci' again. */
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK();
}
648 
/*
 * Undo sched_peg_curproc(): clear P_CPUPEG on curproc so the scheduler
 * may move it between cpus again.  curproc must currently be pegged.
 */
void
sched_unpeg_curproc(void)
{
	struct proc *p = curproc;

	KASSERT(ISSET(p->p_flag, P_CPUPEG));

	atomic_clearbits_int(&p->p_flag, P_CPUPEG);
}
658 
659 #ifdef MULTIPROCESSOR
660 
661 void
662 sched_start_secondary_cpus(void)
663 {
664 	CPU_INFO_ITERATOR cii;
665 	struct cpu_info *ci;
666 
667 	CPU_INFO_FOREACH(cii, ci) {
668 		struct schedstate_percpu *spc = &ci->ci_schedstate;
669 
670 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
671 			continue;
672 		atomic_clearbits_int(&spc->spc_schedflags,
673 		    SPCF_SHOULDHALT | SPCF_HALTED);
674 #ifdef __HAVE_CPU_TOPOLOGY
675 		if (!sched_smt && ci->ci_smt_id > 0)
676 			continue;
677 #endif
678 		cpuset_add(&sched_all_cpus, ci);
679 	}
680 }
681 
/*
 * Stop scheduling on the running secondary cpus and wait for them.
 *
 * The first pass removes each such cpu from sched_all_cpus and flags it
 * SPCF_SHOULDHALT.  The second pass sleeps on each cpu's schedstate until
 * SPCF_HALTED is set; the matching wakeup(spc) is issued by sched_idle().
 */
void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		/*
		 * NOTE(review): the second argument of sleep_finish() looks
		 * like a "still need to sleep" re-check -- confirm.
		 */
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}
711 
/* Argument handed from sched_barrier() to sched_barrier_task(). */
struct sched_barrier_state {
	struct cpu_info *ci;	/* cpu the barrier task pegs itself to */
	struct cond cond;	/* signalled by the task, waited on by the caller */
};
716 
/*
 * Task body queued by sched_barrier(); `arg' is a sched_barrier_state.
 * Pegs the task's thread to the requested cpu, signals the waiter and
 * unpegs again.
 */
void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	/* sb is not touched after the signal. */
	sched_unpeg_curproc();
}
727 
728 void
729 sched_barrier(struct cpu_info *ci)
730 {
731 	struct sched_barrier_state sb;
732 	struct task task;
733 	CPU_INFO_ITERATOR cii;
734 
735 	if (ci == NULL) {
736 		CPU_INFO_FOREACH(cii, ci) {
737 			if (CPU_IS_PRIMARY(ci))
738 				break;
739 		}
740 	}
741 	KASSERT(ci != NULL);
742 
743 	if (ci == curcpu())
744 		return;
745 
746 	sb.ci = ci;
747 	cond_init(&sb.cond);
748 	task_set(&task, sched_barrier_task, &sb);
749 
750 	task_add(systqmp, &task);
751 	cond_wait(&sb.cond, "sbar");
752 }
753 
754 #else
755 
/*
 * Kernels built without MULTIPROCESSOR get this variant, which does
 * nothing.
 */
void
sched_barrier(struct cpu_info *ci)
{
}
760 
761 #endif
762 
/*
 * Functions to manipulate cpu sets.
 *
 * A cpuset is an array of 32-bit words; the cpu with unit number n is
 * represented by bit (n % 32) of word (n / 32).
 */
struct cpu_info *cpuset_infos[MAXCPUS];	/* unit number -> cpu_info */
static struct cpuset cpuset_all;	/* every cpu seen by cpuset_init_cpu() */
768 
/*
 * Register `ci' with the cpuset code: add it to cpuset_all and record it
 * in cpuset_infos so that a bit position can be mapped back to the cpu.
 */
void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}
775 
776 void
777 cpuset_clear(struct cpuset *cs)
778 {
779 	memset(cs, 0, sizeof(*cs));
780 }
781 
782 void
783 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
784 {
785 	unsigned int num = CPU_INFO_UNIT(ci);
786 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
787 }
788 
789 void
790 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
791 {
792 	unsigned int num = CPU_INFO_UNIT(ci);
793 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
794 }
795 
796 int
797 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
798 {
799 	unsigned int num = CPU_INFO_UNIT(ci);
800 	return (cs->cs_set[num/32] & (1U << (num % 32)));
801 }
802 
803 void
804 cpuset_add_all(struct cpuset *cs)
805 {
806 	cpuset_copy(cs, &cpuset_all);
807 }
808 
809 void
810 cpuset_copy(struct cpuset *to, struct cpuset *from)
811 {
812 	memcpy(to, from, sizeof(*to));
813 }
814 
815 struct cpu_info *
816 cpuset_first(struct cpuset *cs)
817 {
818 	int i;
819 
820 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
821 		if (cs->cs_set[i])
822 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
823 
824 	return (NULL);
825 }
826 
827 void
828 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
829 {
830 	int i;
831 
832 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
833 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
834 }
835 
836 void
837 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
838 {
839 	int i;
840 
841 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
842 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
843 }
844 
845 void
846 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
847 {
848 	int i;
849 
850 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
851 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
852 }
853 
854 int
855 cpuset_cardinality(struct cpuset *cs)
856 {
857 	int cardinality, i, n;
858 
859 	cardinality = 0;
860 
861 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
862 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
863 			cardinality++;
864 
865 	return (cardinality);
866 }
867 
/* Number of cpus currently in sched_all_cpus. */
int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}
873 
/* Non-zero if `ci' is in sched_all_cpus, 0 otherwise. */
int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}
879 
880 #ifdef __HAVE_CPU_TOPOLOGY
881 
882 #include <sys/sysctl.h>
883 
884 int
885 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
886 {
887 	CPU_INFO_ITERATOR cii;
888 	struct cpu_info *ci;
889 	int err, newsmt;
890 
891 	newsmt = sched_smt;
892 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
893 	if (err)
894 		return err;
895 	if (newsmt == sched_smt)
896 		return 0;
897 
898 	sched_smt = newsmt;
899 	CPU_INFO_FOREACH(cii, ci) {
900 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
901 			continue;
902 		if (ci->ci_smt_id == 0)
903 			continue;
904 		if (sched_smt)
905 			cpuset_add(&sched_all_cpus, ci);
906 		else
907 			cpuset_del(&sched_all_cpus, ci);
908 	}
909 
910 	return 0;
911 }
912 
913 #endif
914