/*	$OpenBSD: kern_sched.c,v 1.86 2023/08/14 08:33:24 mpi Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/resourcevar.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

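/*
 * A minimal sketch (not compiled) of how these sets combine: the cpu
 * placement code below builds "idle cpus with nothing queued" by
 * taking the cpus in sched_idle_cpus that are not in
 * sched_queued_cpus, then intersecting with the online set.
 */
#if 0
	struct cpuset set;

	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
#endif
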
/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

int sched_smt;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments: the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the
 * process, including the pcb and pmap, and to set curproc, the p_cpu
 * pointer in the proc and p_stat to SONPROC. This is atomic only with
 * respect to interrupts; other cpus in the system must not depend on
 * this state being consistent while the switch is in progress.
 * Therefore no locking is necessary in cpu_switchto other than blocking
 * interrupts during the context switch.
 */
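
/*
 * An MD-neutral sketch of the contract above (illustration only, not
 * real code for any architecture; save_context() and load_context()
 * are hypothetical helpers):
 */
#if 0
void
cpu_switchto(struct proc *old, struct proc *new)
{
	/* Interrupts stay blocked for the duration of the switch. */
	if (old != NULL)
		save_context(old);	/* skipped when old is NULL */
	new->p_cpu = curcpu();
	new->p_stat = SONPROC;
	curcpu()->ci_curproc = new;	/* this is what curproc reads */
	load_context(new);		/* pcb and pmap of the new proc */
}
#endif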

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	if (spc->spc_itimer == NULL) {
		spc->spc_itimer = clockintr_establish(&ci->ci_queue,
		    itimer_update);
		if (spc->spc_itimer == NULL) {
			panic("%s: clockintr_establish itimer_update",
			    __func__);
		}
	}
	if (spc->spc_profclock == NULL) {
		spc->spc_profclock = clockintr_establish(&ci->ci_queue,
		    profclock);
		if (spc->spc_profclock == NULL)
			panic("%s: clockintr_establish profclock", __func__);
	}
	if (spc->spc_roundrobin == NULL) {
		spc->spc_roundrobin = clockintr_establish(&ci->ci_queue,
		    roundrobin);
		if (spc->spc_roundrobin == NULL)
			panic("%s: clockintr_establish roundrobin", __func__);
	}

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);
	SIMPLEQ_INIT(&spc->spc_deferred);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);

#ifdef __HAVE_CPU_TOPOLOGY
	if (!sched_smt && ci->ci_smt_id > 0)
		return;
#endif
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name the idle threads idle0, idle1, etc. */
	snprintf(spc->spc_idleproc->p_p->ps_comm,
	    sizeof(spc->spc_idleproc->p_p->ps_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another cpu in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(spc->spc_profclock);
	}

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	KERNEL_ASSERT_LOCKED();
	__mp_release_all(&kernel_lock);
#endif

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}
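
/*
 * A condensed sketch (not compiled) of the handoff described above:
 * the dying proc parks itself on its cpu's spc_deadproc list, and the
 * idle thread on that cpu later feeds each entry to exit2().
 */
#if 0
	/* producer, in sched_exit() above */
	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* consumer, in sched_idle() above */
	while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
		LIST_REMOVE(dead, p_hash);
		exit2(dead);
	}
#endif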

/*
 * Run queue management.
 */
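/*
 * A note derived from the code below: each cpu has SCHED_NQS run
 * queues, four adjacent priority values share one queue (hence the
 * prio >> 2 in setrunqueue()), and spc_whichqs carries one bit per
 * non-empty queue so the best-priority queue can be found with ffs().
 */
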
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
	struct schedstate_percpu *spc;
	int queue = prio >> 2;

	if (ci == NULL)
		ci = sched_choosecpu(p);

	KASSERT(ci != NULL);
	SCHED_ASSERT_LOCKED();

	p->p_cpu = ci;
	p->p_stat = SRUN;
	p->p_runpri = prio;

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;
	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);

	if (prio < spc->spc_curpriority)
		need_resched(ci);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_runpri >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;
	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}
#endif

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, consider all cpus instead. Pick the candidate
	 * with the fewest procs queued.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run) {
			choice = ci;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the cheapest one.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes on this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an unsigned int. Returns 0 for 0 (yeah yeah,
 * I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif
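
/*
 * A worked example under the default weights above (numbers assumed,
 * illustration only): moving a proc with p_usrpri 50 to a non-idle,
 * non-primary cpu whose spc_curpriority is 40 and which has 2 procs
 * queued costs (50 - 40) * 1 + 3 + 2 * 3 = 19. If the proc last ran
 * on that cpu and has not slept since, with about 2^10 resident
 * pages, log2(1024) * 1 = 10 is subtracted, for a total of 9.
 */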

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority running on the destination and the
	 * higher the priority of the proc, the more willing we are to
	 * move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}
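
/*
 * A usage sketch (not compiled), mirroring sched_barrier_task() below:
 * peg to the target cpu, do the cpu-local work, then clear P_CPUPEG so
 * the proc may float again.
 */
#if 0
	sched_peg_curproc(ci);
	/* ... we are now running on ci and will not migrate ... */
	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
#endif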

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
		if (!sched_smt && ci->ci_smt_id > 0)
			continue;
#endif
		cpuset_add(&sched_all_cpus, ci);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

struct sched_barrier_state {
	struct cpu_info *ci;
	struct cond cond;
};

void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
}

void
sched_barrier(struct cpu_info *ci)
{
	struct sched_barrier_state sb;
	struct task task;
	CPU_INFO_ITERATOR cii;

	if (ci == NULL) {
		CPU_INFO_FOREACH(cii, ci) {
			if (CPU_IS_PRIMARY(ci))
				break;
		}
	}
	KASSERT(ci != NULL);

	if (ci == curcpu())
		return;

	sb.ci = ci;
	cond_init(&sb.cond);
	task_set(&task, sched_barrier_task, &sb);

	task_add(systqmp, &task);
	cond_wait(&sb.cond, "sbar");
}

#else

void
sched_barrier(struct cpu_info *ci)
{
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;
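
/*
 * Layout note (derived from the code below): cpu "num", i.e. its
 * CPU_INFO_UNIT, occupies bit num % 32 of word num / 32 in cs_set[];
 * for example, unit 35 is bit 3 of cs_set[1]. cpuset_infos[] maps
 * unit numbers back to cpu_info pointers for cpuset_first().
 */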

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

int
cpuset_cardinality(struct cpuset *cs)
{
	int cardinality, i, n;

	cardinality = 0;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
			cardinality++;

	return (cardinality);
}

int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}

int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}

#ifdef __HAVE_CPU_TOPOLOGY

#include <sys/sysctl.h>

int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	int err, newsmt;

	newsmt = sched_smt;
	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
	if (err)
		return err;
	if (newsmt == sched_smt)
		return 0;

	sched_smt = newsmt;
	CPU_INFO_FOREACH(cii, ci) {
		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		if (ci->ci_smt_id == 0)
			continue;
		if (sched_smt)
			cpuset_add(&sched_all_cpus, ci);
		else
			cpuset_del(&sched_all_cpus, ci);
	}

	return 0;
}

#endif