xref: /openbsd-src/sys/kern/kern_sched.c (revision 4e1ee0786f11cc571bd0be17d38e46f635c719fc)
1 /*	$OpenBSD: kern_sched.c,v 1.73 2021/09/09 18:41:39 mpi Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/resourcevar.h>
25 #include <sys/signalvar.h>
26 #include <sys/mutex.h>
27 #include <sys/task.h>
28 #include <sys/smr.h>
29 #include <sys/tracepoint.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 void sched_kthreads_create(void *);	/* deferred callback that forks a cpu's idle proc */
34 
35 int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 struct proc *sched_steal_proc(struct cpu_info *);
37 
38 /*
39  * To help choose which cpu should run which process we keep track
40  * of cpus which are currently idle and which cpus have processes
41  * queued.
42  */
43 struct cpuset sched_idle_cpus;	/* added/removed by sched_idle() around idling */
44 struct cpuset sched_queued_cpus;	/* cpus with a non-empty run queue (setrunqueue/remrunqueue) */
45 struct cpuset sched_all_cpus;	/* cpus that procs may be scheduled on */
46 
47 /*
48  * Some general scheduler counters.
49  */
50 uint64_t sched_nmigrations;	/* Cpu migration counter */
51 uint64_t sched_nomigrations;	/* Cpu no migration counter */
52 uint64_t sched_noidle;		/* Times we didn't pick the idle task */
53 uint64_t sched_stolen;		/* Times we stole proc from other cpus */
54 uint64_t sched_choose;		/* Times we chose a cpu */
55 uint64_t sched_wasidle;		/* Times we came out of idle */
56 
57 int sched_smt;	/* non-zero: cpus with ci_smt_id > 0 may join sched_all_cpus */
58 
59 /*
60  * A few notes about cpu_switchto that is implemented in MD code.
61  *
62  * cpu_switchto takes two arguments, the old proc and the proc
63  * it should switch to. The new proc will never be NULL, so we always have
64  * a saved state that we need to switch to. The old proc however can
65  * be NULL if the process is exiting. NULL for the old proc simply
66  * means "don't bother saving old state".
67  *
68  * cpu_switchto is supposed to atomically load the new state of the process
69  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
70  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
71  * cpus in the system must not depend on this state being consistent.
72  * Therefore no locking is necessary in cpu_switchto other than blocking
73  * interrupts during the context switch.
74  */
75 
76 /*
77  * sched_init_cpu is called from main() for the boot cpu, then it's the
78  * responsibility of the MD code to call it for all other cpus.
79  */
80 void
81 sched_init_cpu(struct cpu_info *ci)
82 {
83 	struct schedstate_percpu *spc = &ci->ci_schedstate;
84 	int i;
85 
86 	for (i = 0; i < SCHED_NQS; i++)
87 		TAILQ_INIT(&spc->spc_qs[i]);
88 
89 	spc->spc_idleproc = NULL;
90 
91 	kthread_create_deferred(sched_kthreads_create, ci);
92 
93 	LIST_INIT(&spc->spc_deadproc);
94 	SIMPLEQ_INIT(&spc->spc_deferred);
95 
96 	/*
97 	 * Slight hack here until the cpuset code handles cpu_info
98 	 * structures.
99 	 */
100 	cpuset_init_cpu(ci);
101 
102 #ifdef __HAVE_CPU_TOPOLOGY
103 	if (!sched_smt && ci->ci_smt_id > 0)
104 		return;
105 #endif
106 	cpuset_add(&sched_all_cpus, ci);
107 }
108 
109 void
110 sched_kthreads_create(void *v)
111 {
112 	struct cpu_info *ci = v;
113 	struct schedstate_percpu *spc = &ci->ci_schedstate;
114 	static int num;
115 
116 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
117 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
118 	    &spc->spc_idleproc))
119 		panic("fork idle");
120 
121 	/* Name it as specified. */
122 	snprintf(spc->spc_idleproc->p_p->ps_comm,
123 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
124 	    "idle%d", num);
125 
126 	num++;
127 }
128 
129 void
130 sched_idle(void *v)
131 {
132 	struct schedstate_percpu *spc;
133 	struct proc *p = curproc;
134 	struct cpu_info *ci = v;
135 	int s;
136 
137 	KERNEL_UNLOCK();
138 
139 	spc = &ci->ci_schedstate;
140 
141 	/*
142 	 * First time we enter here, we're not supposed to idle,
143 	 * just go away for a while.
144 	 */
145 	SCHED_LOCK(s);
146 	cpuset_add(&sched_idle_cpus, ci);
147 	p->p_stat = SSLEEP;
148 	p->p_cpu = ci;
149 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
150 	mi_switch();
151 	cpuset_del(&sched_idle_cpus, ci);
152 	SCHED_UNLOCK(s);
153 
154 	KASSERT(ci == curcpu());
155 	KASSERT(curproc == spc->spc_idleproc);
156 
157 	while (1) {
158 		while (!cpu_is_idle(curcpu())) {
159 			struct proc *dead;
160 
161 			SCHED_LOCK(s);
162 			p->p_stat = SSLEEP;
163 			mi_switch();
164 			SCHED_UNLOCK(s);
165 
166 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
167 				LIST_REMOVE(dead, p_hash);
168 				exit2(dead);
169 			}
170 		}
171 
172 		splassert(IPL_NONE);
173 
174 		smr_idle();
175 
176 		cpuset_add(&sched_idle_cpus, ci);
177 		cpu_idle_enter();
178 		while (spc->spc_whichqs == 0) {
179 #ifdef MULTIPROCESSOR
180 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
181 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
182 				cpuset_del(&sched_idle_cpus, ci);
183 				SCHED_LOCK(s);
184 				atomic_setbits_int(&spc->spc_schedflags,
185 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
186 				SCHED_UNLOCK(s);
187 				wakeup(spc);
188 			}
189 #endif
190 			cpu_idle_cycle();
191 		}
192 		cpu_idle_leave();
193 		cpuset_del(&sched_idle_cpus, ci);
194 	}
195 }
196 
197 /*
198  * To free our address space we have to jump through a few hoops.
199  * The freeing is done by the reaper, but until we have one reaper
200  * per cpu, we have no way of putting this proc on the deadproc list
201  * and waking up the reaper without risking having our address space and
202  * stack torn from under us before we manage to switch to another proc.
203  * Therefore we have a per-cpu list of dead processes where we put this
204  * proc and have idle clean up that list and move it to the reaper list.
205  * All this will be unnecessary once we can bind the reaper this cpu
206  * and not risk having it switch to another in case it sleeps.
207  */
208 void
209 sched_exit(struct proc *p)
210 {
211 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
212 	struct timespec ts;
213 	struct proc *idle;
214 	int s;
215 
216 	nanouptime(&ts);
217 	timespecsub(&ts, &spc->spc_runtime, &ts);
218 	timespecadd(&p->p_rtime, &ts, &p->p_rtime);
219 
220 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
221 
222 #ifdef MULTIPROCESSOR
223 	/* This process no longer needs to hold the kernel lock. */
224 	KERNEL_ASSERT_LOCKED();
225 	__mp_release_all(&kernel_lock);
226 #endif
227 
228 	SCHED_LOCK(s);
229 	idle = spc->spc_idleproc;
230 	idle->p_stat = SRUN;
231 	cpu_switchto(NULL, idle);
232 	panic("cpu_switchto returned");
233 }
234 
/*
 * Run queue management.
 */

/*
 * Intentionally empty: the per-cpu run queues are set up by
 * sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
	return;
}
242 
243 void
244 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
245 {
246 	struct schedstate_percpu *spc;
247 	int queue = prio >> 2;
248 
249 	if (ci == NULL)
250 		ci = sched_choosecpu(p);
251 
252 	KASSERT(ci != NULL);
253 	SCHED_ASSERT_LOCKED();
254 
255 	p->p_cpu = ci;
256 	p->p_stat = SRUN;
257 	p->p_runpri = prio;
258 
259 	spc = &p->p_cpu->ci_schedstate;
260 	spc->spc_nrun++;
261 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
262 	    p->p_p->ps_pid);
263 
264 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
265 	spc->spc_whichqs |= (1 << queue);
266 	cpuset_add(&sched_queued_cpus, p->p_cpu);
267 
268 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
269 		cpu_unidle(p->p_cpu);
270 
271 	if (prio < spc->spc_curpriority)
272 		need_resched(ci);
273 }
274 
275 void
276 remrunqueue(struct proc *p)
277 {
278 	struct schedstate_percpu *spc;
279 	int queue = p->p_runpri >> 2;
280 
281 	SCHED_ASSERT_LOCKED();
282 	spc = &p->p_cpu->ci_schedstate;
283 	spc->spc_nrun--;
284 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
285 	    p->p_p->ps_pid);
286 
287 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
288 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
289 		spc->spc_whichqs &= ~(1 << queue);
290 		if (spc->spc_whichqs == 0)
291 			cpuset_del(&sched_queued_cpus, p->p_cpu);
292 	}
293 }
294 
295 struct proc *
296 sched_chooseproc(void)
297 {
298 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
299 	struct proc *p;
300 	int queue;
301 
302 	SCHED_ASSERT_LOCKED();
303 
304 #ifdef MULTIPROCESSOR
305 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
306 		if (spc->spc_whichqs) {
307 			for (queue = 0; queue < SCHED_NQS; queue++) {
308 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
309 					remrunqueue(p);
310 					setrunqueue(NULL, p, p->p_runpri);
311 					if (p->p_cpu == curcpu()) {
312 						KASSERT(p->p_flag & P_CPUPEG);
313 						goto again;
314 					}
315 				}
316 			}
317 		}
318 		p = spc->spc_idleproc;
319 		KASSERT(p);
320 		KASSERT(p->p_wchan == NULL);
321 		p->p_stat = SRUN;
322 		return (p);
323 	}
324 #endif
325 
326 again:
327 	if (spc->spc_whichqs) {
328 		queue = ffs(spc->spc_whichqs) - 1;
329 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
330 		remrunqueue(p);
331 		sched_noidle++;
332 		if (p->p_stat != SRUN)
333 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
334 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
335 		p = spc->spc_idleproc;
336 		if (p == NULL) {
337                         int s;
338 			/*
339 			 * We get here if someone decides to switch during
340 			 * boot before forking kthreads, bleh.
341 			 * This is kind of like a stupid idle loop.
342 			 */
343 #ifdef MULTIPROCESSOR
344 			__mp_unlock(&sched_lock);
345 #endif
346 			spl0();
347 			delay(10);
348 			SCHED_LOCK(s);
349 			goto again;
350                 }
351 		KASSERT(p);
352 		p->p_stat = SRUN;
353 	}
354 
355 	KASSERT(p->p_wchan == NULL);
356 	return (p);
357 }
358 
/*
 * Choose the cpu a newly forked proc should start on.
 *
 * `parent' and `flags' are only referenced by the disabled FORK_PPWAIT
 * shortcut below.  Without MULTIPROCESSOR the current cpu is returned.
 */
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t best_load = ~0;
	int best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the one with least queued procs first,
	 * then the one with lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	/* Drain the candidate set, remembering the best cpu seen. */
	for (ci = cpuset_first(&set); ci != NULL; ci = cpuset_first(&set)) {
		fixpt_t load;
		int run;

		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		/* Not better than what we already have: keep looking. */
		if (choice != NULL && run >= best_run &&
		    (run != best_run || load >= best_load))
			continue;

		choice = ci;
		best_load = load;
		best_run = run;
	}

	return (choice);
#else
	return (curcpu());
#endif
}
414 
/*
 * Choose the cpu on which `p' should be queued.
 *
 * A proc flagged P_CPUPEG always stays on p->p_cpu.  Otherwise the
 * proc's current cpu is kept when it is cheap to do so, and failing
 * that the candidate with the lowest sched_proc_to_cpu_cost() wins.
 * Without MULTIPROCESSOR the current cpu is returned.
 */
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if ((p->p_flag & P_CPUPEG) != 0)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the cheapest of those.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() &&
	    p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    !(p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	/* No idle, unqueued cpu at all: consider every schedulable cpu. */
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	for (ci = cpuset_first(&set); ci != NULL; ci = cpuset_first(&set)) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		cpuset_del(&set, ci);

		if (choice != NULL && cost >= last_cost)
			continue;
		choice = ci;
		last_cost = cost;
	}

	if (p->p_cpu == choice)
		sched_nomigrations++;
	else
		sched_nmigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}
478 
479 /*
480  * Attempt to steal a proc from some cpu.
481  */
482 struct proc *
483 sched_steal_proc(struct cpu_info *self)
484 {
485 	struct proc *best = NULL;
486 #ifdef MULTIPROCESSOR
487 	struct schedstate_percpu *spc;
488 	int bestcost = INT_MAX;
489 	struct cpu_info *ci;
490 	struct cpuset set;
491 
492 	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
493 
494 	/* Don't steal if we don't want to schedule processes in this CPU. */
495 	if (!cpuset_isset(&sched_all_cpus, self))
496 		return (NULL);
497 
498 	cpuset_copy(&set, &sched_queued_cpus);
499 
500 	while ((ci = cpuset_first(&set)) != NULL) {
501 		struct proc *p;
502 		int queue;
503 		int cost;
504 
505 		cpuset_del(&set, ci);
506 
507 		spc = &ci->ci_schedstate;
508 
509 		queue = ffs(spc->spc_whichqs) - 1;
510 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
511 			if (p->p_flag & P_CPUPEG)
512 				continue;
513 
514 			cost = sched_proc_to_cpu_cost(self, p);
515 
516 			if (best == NULL || cost < bestcost) {
517 				best = p;
518 				bestcost = cost;
519 			}
520 		}
521 	}
522 	if (best == NULL)
523 		return (NULL);
524 
525 	remrunqueue(best);
526 	best->p_cpu = self;
527 
528 	sched_stolen++;
529 #endif
530 	return (best);
531 }
532 
533 #ifdef MULTIPROCESSOR
/*
 * Integer base 2 logarithm, rounded down: the index of the highest
 * set bit of `i'.  By convention 0 yields 0, the same as 1.
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Each halving that still leaves a non-zero value is one bit. */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
547 
548 /*
549  * Calculate the cost of moving the proc to this cpu.
550  *
551  * What we want is some guesstimate of how much "performance" it will
552  * cost us to move the proc here. Not just for caches and TLBs and NUMA
553  * memory, but also for the proc itself. A highly loaded cpu might not
554  * be the best candidate for this proc since it won't get run.
555  *
556  * Just total guesstimates for now.
557  */
558 
559 int sched_cost_load = 1;	/* weight of the destination's spc_ldavg */
560 int sched_cost_priority = 1;	/* weight of (p_usrpri - spc_curpriority) on a non-idle cpu */
561 int sched_cost_runnable = 3;	/* per queued proc; also added for a non-idle or primary cpu */
562 int sched_cost_resident = 1;	/* weight of log2(resident count) when already on that cpu */
563 #endif
564 
/*
 * Estimate how expensive it is to run `p' on `ci'; lower is better and
 * the result may be negative.  The terms are weighted by the
 * sched_cost_* tunables.  Without MULTIPROCESSOR the cost is always 0.
 */
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int footprint;

	/*
	 * First, account for the priority of the proc we want to move.
	 * More willing to move, the lower the priority of the destination
	 * and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}

	/* Every proc already queued on the destination adds to the cost. */
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		footprint =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= footprint * sched_cost_resident;
	}
#endif
	return (cost);
}
614 
615 /*
616  * Peg a proc to a cpu.
617  */
618 void
619 sched_peg_curproc(struct cpu_info *ci)
620 {
621 	struct proc *p = curproc;
622 	int s;
623 
624 	SCHED_LOCK(s);
625 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
626 	setrunqueue(ci, p, p->p_usrpri);
627 	p->p_ru.ru_nvcsw++;
628 	mi_switch();
629 	SCHED_UNLOCK(s);
630 }
631 
632 #ifdef MULTIPROCESSOR
633 
634 void
635 sched_start_secondary_cpus(void)
636 {
637 	CPU_INFO_ITERATOR cii;
638 	struct cpu_info *ci;
639 
640 	CPU_INFO_FOREACH(cii, ci) {
641 		struct schedstate_percpu *spc = &ci->ci_schedstate;
642 
643 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
644 			continue;
645 		atomic_clearbits_int(&spc->spc_schedflags,
646 		    SPCF_SHOULDHALT | SPCF_HALTED);
647 #ifdef __HAVE_CPU_TOPOLOGY
648 		if (!sched_smt && ci->ci_smt_id > 0)
649 			continue;
650 #endif
651 		cpuset_add(&sched_all_cpus, ci);
652 	}
653 }
654 
655 void
656 sched_stop_secondary_cpus(void)
657 {
658 	CPU_INFO_ITERATOR cii;
659 	struct cpu_info *ci;
660 
661 	/*
662 	 * Make sure we stop the secondary CPUs.
663 	 */
664 	CPU_INFO_FOREACH(cii, ci) {
665 		struct schedstate_percpu *spc = &ci->ci_schedstate;
666 
667 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
668 			continue;
669 		cpuset_del(&sched_all_cpus, ci);
670 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
671 	}
672 	CPU_INFO_FOREACH(cii, ci) {
673 		struct schedstate_percpu *spc = &ci->ci_schedstate;
674 		struct sleep_state sls;
675 
676 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
677 			continue;
678 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
679 			sleep_setup(&sls, spc, PZERO, "schedstate", 0);
680 			sleep_finish(&sls,
681 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
682 		}
683 	}
684 }
685 
686 struct sched_barrier_state {	/* argument passed from sched_barrier() to sched_barrier_task() */
687 	struct cpu_info *ci;	/* cpu the task pegs itself to */
688 	struct cond cond;	/* signalled by the task after sched_peg_curproc(ci) returns */
689 };
690 
691 void
692 sched_barrier_task(void *arg)
693 {
694 	struct sched_barrier_state *sb = arg;
695 	struct cpu_info *ci = sb->ci;
696 
697 	sched_peg_curproc(ci);
698 	cond_signal(&sb->cond);
699 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
700 }
701 
702 void
703 sched_barrier(struct cpu_info *ci)
704 {
705 	struct sched_barrier_state sb;
706 	struct task task;
707 	CPU_INFO_ITERATOR cii;
708 
709 	if (ci == NULL) {
710 		CPU_INFO_FOREACH(cii, ci) {
711 			if (CPU_IS_PRIMARY(ci))
712 				break;
713 		}
714 	}
715 	KASSERT(ci != NULL);
716 
717 	if (ci == curcpu())
718 		return;
719 
720 	sb.ci = ci;
721 	cond_init(&sb.cond);
722 	task_set(&task, sched_barrier_task, &sb);
723 
724 	task_add(systqmp, &task);
725 	cond_wait(&sb.cond, "sbar");
726 }
727 
728 #else
729 
/*
 * Non-MULTIPROCESSOR variant: nothing to do, `ci' is ignored.
 */
void
sched_barrier(struct cpu_info *ci)
{
	return;
}
734 
735 #endif
736 
737 /*
738  * Functions to manipulate cpu sets.
739  */
740 struct cpu_info *cpuset_infos[MAXCPUS];	/* cpu_info pointers indexed by CPU_INFO_UNIT() */
741 static struct cpuset cpuset_all;	/* every cpu passed to cpuset_init_cpu() */
742 
743 void
744 cpuset_init_cpu(struct cpu_info *ci)
745 {
746 	cpuset_add(&cpuset_all, ci);
747 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
748 }
749 
750 void
751 cpuset_clear(struct cpuset *cs)
752 {
753 	memset(cs, 0, sizeof(*cs));
754 }
755 
756 void
757 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
758 {
759 	unsigned int num = CPU_INFO_UNIT(ci);
760 	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
761 }
762 
763 void
764 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
765 {
766 	unsigned int num = CPU_INFO_UNIT(ci);
767 	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
768 }
769 
770 int
771 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
772 {
773 	unsigned int num = CPU_INFO_UNIT(ci);
774 	return (cs->cs_set[num/32] & (1 << (num % 32)));
775 }
776 
777 void
778 cpuset_add_all(struct cpuset *cs)
779 {
780 	cpuset_copy(cs, &cpuset_all);
781 }
782 
783 void
784 cpuset_copy(struct cpuset *to, struct cpuset *from)
785 {
786 	memcpy(to, from, sizeof(*to));
787 }
788 
789 struct cpu_info *
790 cpuset_first(struct cpuset *cs)
791 {
792 	int i;
793 
794 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
795 		if (cs->cs_set[i])
796 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
797 
798 	return (NULL);
799 }
800 
801 void
802 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
803 {
804 	int i;
805 
806 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
807 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
808 }
809 
810 void
811 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
812 {
813 	int i;
814 
815 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
816 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
817 }
818 
819 void
820 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
821 {
822 	int i;
823 
824 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
825 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
826 }
827 
828 int
829 cpuset_cardinality(struct cpuset *cs)
830 {
831 	int cardinality, i, n;
832 
833 	cardinality = 0;
834 
835 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
836 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
837 			cardinality++;
838 
839 	return (cardinality);
840 }
841 
842 int
843 sysctl_hwncpuonline(void)
844 {
845 	return cpuset_cardinality(&sched_all_cpus);
846 }
847 
848 int
849 cpu_is_online(struct cpu_info *ci)
850 {
851 	return cpuset_isset(&sched_all_cpus, ci);
852 }
853 
854 #ifdef __HAVE_CPU_TOPOLOGY
855 
856 #include <sys/sysctl.h>
857 
858 int
859 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
860 {
861 	CPU_INFO_ITERATOR cii;
862 	struct cpu_info *ci;
863 	int err, newsmt;
864 
865 	newsmt = sched_smt;
866 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
867 	if (err)
868 		return err;
869 	if (newsmt == sched_smt)
870 		return 0;
871 
872 	sched_smt = newsmt;
873 	CPU_INFO_FOREACH(cii, ci) {
874 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
875 			continue;
876 		if (ci->ci_smt_id == 0)
877 			continue;
878 		if (sched_smt)
879 			cpuset_add(&sched_all_cpus, ci);
880 		else
881 			cpuset_del(&sched_all_cpus, ci);
882 	}
883 
884 	return 0;
885 }
886 
887 #endif
888