xref: /openbsd-src/sys/kern/kern_sched.c (revision 1588c8424bef641c645c007c47349a63cc556a2d)
1 /*	$OpenBSD: kern_sched.c,v 1.83 2023/08/05 12:41:04 claudio Exp $	*/
2 /*
3  * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 
20 #include <sys/sched.h>
21 #include <sys/proc.h>
22 #include <sys/kthread.h>
23 #include <sys/systm.h>
24 #include <sys/clockintr.h>
25 #include <sys/resourcevar.h>
26 #include <sys/task.h>
27 #include <sys/smr.h>
28 #include <sys/tracepoint.h>
29 
30 #include <uvm/uvm_extern.h>
31 
/* Deferred-kthread callback that forks the idle thread of one cpu. */
void sched_kthreads_create(void *v);

/* Helpers used by the cpu and proc selection code below. */
struct proc *sched_steal_proc(struct cpu_info *self);
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
36 
37 /*
38  * To help choosing which cpu should run which process we keep track
39  * of cpus which are currently idle and which cpus have processes
40  * queued.
41  */
42 struct cpuset sched_idle_cpus;
43 struct cpuset sched_queued_cpus;
44 struct cpuset sched_all_cpus;
45 
/*
 * Scheduler statistics; each comment names the place that bumps it.
 */
uint64_t sched_nmigrations;	/* sched_choosecpu() picked a different cpu */
uint64_t sched_nomigrations;	/* sched_choosecpu() search kept the cpu */
uint64_t sched_noidle;		/* sched_chooseproc() took a queued proc */
uint64_t sched_stolen;		/* sched_steal_proc() took a proc elsewhere */
uint64_t sched_choose;		/* sched_choosecpu() ran for an unpegged proc */
uint64_t sched_wasidle;		/* sched_choosecpu() fast path: proc stays */

/* Nonzero lets cpus with ci_smt_id > 0 join the scheduling set. */
int sched_smt;
57 
58 /*
59  * A few notes about cpu_switchto that is implemented in MD code.
60  *
61  * cpu_switchto takes two arguments, the old proc and the proc
62  * it should switch to. The new proc will never be NULL, so we always have
63  * a saved state that we need to switch to. The old proc however can
64  * be NULL if the process is exiting. NULL for the old proc simply
65  * means "don't bother saving old state".
66  *
67  * cpu_switchto is supposed to atomically load the new state of the process
68  * including the pcb, pmap and setting curproc, the p_cpu pointer in the
69  * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
70  * cpus in the system must not depend on this state being consistent.
71  * Therefore no locking is necessary in cpu_switchto other than blocking
72  * interrupts during the context switch.
73  */
74 
75 /*
76  * sched_init_cpu is called from main() for the boot cpu, then it's the
77  * responsibility of the MD code to call it for all other cpus.
78  */
79 void
80 sched_init_cpu(struct cpu_info *ci)
81 {
82 	struct schedstate_percpu *spc = &ci->ci_schedstate;
83 	int i;
84 
85 	for (i = 0; i < SCHED_NQS; i++)
86 		TAILQ_INIT(&spc->spc_qs[i]);
87 
88 	spc->spc_idleproc = NULL;
89 
90 	if (spc->spc_profclock == NULL) {
91 		spc->spc_profclock = clockintr_establish(&ci->ci_queue,
92 		    profclock);
93 		if (spc->spc_profclock == NULL)
94 			panic("%s: clockintr_establish profclock", __func__);
95 	}
96 
97 	kthread_create_deferred(sched_kthreads_create, ci);
98 
99 	LIST_INIT(&spc->spc_deadproc);
100 	SIMPLEQ_INIT(&spc->spc_deferred);
101 
102 	/*
103 	 * Slight hack here until the cpuset code handles cpu_info
104 	 * structures.
105 	 */
106 	cpuset_init_cpu(ci);
107 
108 #ifdef __HAVE_CPU_TOPOLOGY
109 	if (!sched_smt && ci->ci_smt_id > 0)
110 		return;
111 #endif
112 	cpuset_add(&sched_all_cpus, ci);
113 }
114 
115 void
116 sched_kthreads_create(void *v)
117 {
118 	struct cpu_info *ci = v;
119 	struct schedstate_percpu *spc = &ci->ci_schedstate;
120 	static int num;
121 
122 	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
123 	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
124 	    &spc->spc_idleproc))
125 		panic("fork idle");
126 
127 	/* Name it as specified. */
128 	snprintf(spc->spc_idleproc->p_p->ps_comm,
129 	    sizeof(spc->spc_idleproc->p_p->ps_comm),
130 	    "idle%d", num);
131 
132 	num++;
133 }
134 
135 void
136 sched_idle(void *v)
137 {
138 	struct schedstate_percpu *spc;
139 	struct proc *p = curproc;
140 	struct cpu_info *ci = v;
141 	int s;
142 
143 	KERNEL_UNLOCK();
144 
145 	spc = &ci->ci_schedstate;
146 
147 	/*
148 	 * First time we enter here, we're not supposed to idle,
149 	 * just go away for a while.
150 	 */
151 	SCHED_LOCK(s);
152 	cpuset_add(&sched_idle_cpus, ci);
153 	p->p_stat = SSLEEP;
154 	p->p_cpu = ci;
155 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
156 	mi_switch();
157 	cpuset_del(&sched_idle_cpus, ci);
158 	SCHED_UNLOCK(s);
159 
160 	KASSERT(ci == curcpu());
161 	KASSERT(curproc == spc->spc_idleproc);
162 
163 	while (1) {
164 		while (!cpu_is_idle(curcpu())) {
165 			struct proc *dead;
166 
167 			SCHED_LOCK(s);
168 			p->p_stat = SSLEEP;
169 			mi_switch();
170 			SCHED_UNLOCK(s);
171 
172 			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
173 				LIST_REMOVE(dead, p_hash);
174 				exit2(dead);
175 			}
176 		}
177 
178 		splassert(IPL_NONE);
179 
180 		smr_idle();
181 
182 		cpuset_add(&sched_idle_cpus, ci);
183 		cpu_idle_enter();
184 		while (spc->spc_whichqs == 0) {
185 #ifdef MULTIPROCESSOR
186 			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
187 			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
188 				cpuset_del(&sched_idle_cpus, ci);
189 				SCHED_LOCK(s);
190 				atomic_setbits_int(&spc->spc_schedflags,
191 				    spc->spc_whichqs ? 0 : SPCF_HALTED);
192 				SCHED_UNLOCK(s);
193 				wakeup(spc);
194 			}
195 #endif
196 			cpu_idle_cycle();
197 		}
198 		cpu_idle_leave();
199 		cpuset_del(&sched_idle_cpus, ci);
200 	}
201 }
202 
203 /*
204  * To free our address space we have to jump through a few hoops.
205  * The freeing is done by the reaper, but until we have one reaper
206  * per cpu, we have no way of putting this proc on the deadproc list
207  * and waking up the reaper without risking having our address space and
208  * stack torn from under us before we manage to switch to another proc.
209  * Therefore we have a per-cpu list of dead processes where we put this
210  * proc and have idle clean up that list and move it to the reaper list.
211  * All this will be unnecessary once we can bind the reaper this cpu
212  * and not risk having it switch to another in case it sleeps.
213  */
214 void
215 sched_exit(struct proc *p)
216 {
217 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
218 	struct timespec ts;
219 	struct proc *idle;
220 	int s;
221 
222 	nanouptime(&ts);
223 	timespecsub(&ts, &spc->spc_runtime, &ts);
224 	timespecadd(&p->p_rtime, &ts, &p->p_rtime);
225 
226 	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
227 		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
228 		clockintr_cancel(spc->spc_profclock);
229 	}
230 
231 	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
232 
233 #ifdef MULTIPROCESSOR
234 	/* This process no longer needs to hold the kernel lock. */
235 	KERNEL_ASSERT_LOCKED();
236 	__mp_release_all(&kernel_lock);
237 #endif
238 
239 	SCHED_LOCK(s);
240 	idle = spc->spc_idleproc;
241 	idle->p_stat = SRUN;
242 	cpu_switchto(NULL, idle);
243 	panic("cpu_switchto returned");
244 }
245 
246 /*
247  * Run queue management.
248  */
void
sched_init_runqueues(void)
{
	/* Nothing to do; sched_init_cpu() sets up the per-cpu queues. */
}
253 
254 void
255 setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
256 {
257 	struct schedstate_percpu *spc;
258 	int queue = prio >> 2;
259 
260 	if (ci == NULL)
261 		ci = sched_choosecpu(p);
262 
263 	KASSERT(ci != NULL);
264 	SCHED_ASSERT_LOCKED();
265 
266 	p->p_cpu = ci;
267 	p->p_stat = SRUN;
268 	p->p_runpri = prio;
269 
270 	spc = &p->p_cpu->ci_schedstate;
271 	spc->spc_nrun++;
272 	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
273 	    p->p_p->ps_pid);
274 
275 	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
276 	spc->spc_whichqs |= (1U << queue);
277 	cpuset_add(&sched_queued_cpus, p->p_cpu);
278 
279 	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
280 		cpu_unidle(p->p_cpu);
281 
282 	if (prio < spc->spc_curpriority)
283 		need_resched(ci);
284 }
285 
286 void
287 remrunqueue(struct proc *p)
288 {
289 	struct schedstate_percpu *spc;
290 	int queue = p->p_runpri >> 2;
291 
292 	SCHED_ASSERT_LOCKED();
293 	spc = &p->p_cpu->ci_schedstate;
294 	spc->spc_nrun--;
295 	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
296 	    p->p_p->ps_pid);
297 
298 	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
299 	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
300 		spc->spc_whichqs &= ~(1U << queue);
301 		if (spc->spc_whichqs == 0)
302 			cpuset_del(&sched_queued_cpus, p->p_cpu);
303 	}
304 }
305 
306 struct proc *
307 sched_chooseproc(void)
308 {
309 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
310 	struct proc *p;
311 	int queue;
312 
313 	SCHED_ASSERT_LOCKED();
314 
315 #ifdef MULTIPROCESSOR
316 	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
317 		if (spc->spc_whichqs) {
318 			for (queue = 0; queue < SCHED_NQS; queue++) {
319 				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
320 					remrunqueue(p);
321 					setrunqueue(NULL, p, p->p_runpri);
322 					if (p->p_cpu == curcpu()) {
323 						KASSERT(p->p_flag & P_CPUPEG);
324 						goto again;
325 					}
326 				}
327 			}
328 		}
329 		p = spc->spc_idleproc;
330 		KASSERT(p);
331 		KASSERT(p->p_wchan == NULL);
332 		p->p_stat = SRUN;
333 		return (p);
334 	}
335 #endif
336 
337 again:
338 	if (spc->spc_whichqs) {
339 		queue = ffs(spc->spc_whichqs) - 1;
340 		p = TAILQ_FIRST(&spc->spc_qs[queue]);
341 		remrunqueue(p);
342 		sched_noidle++;
343 		if (p->p_stat != SRUN)
344 			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
345 	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
346 		p = spc->spc_idleproc;
347 		if (p == NULL) {
348                         int s;
349 			/*
350 			 * We get here if someone decides to switch during
351 			 * boot before forking kthreads, bleh.
352 			 * This is kind of like a stupid idle loop.
353 			 */
354 #ifdef MULTIPROCESSOR
355 			__mp_unlock(&sched_lock);
356 #endif
357 			spl0();
358 			delay(10);
359 			SCHED_LOCK(s);
360 			goto again;
361                 }
362 		KASSERT(p);
363 		p->p_stat = SRUN;
364 	}
365 
366 	KASSERT(p->p_wchan == NULL);
367 	return (p);
368 }
369 
/*
 * Choose the cpu a freshly forked proc should start on.
 *
 * Candidates are the schedulable cpus that are idle with nothing queued,
 * or every schedulable cpu if there is no such cpu; among the candidates
 * the one with the smallest spc_nrun wins.  parent and flags are only
 * looked at by the disabled FORK_PPWAIT shortcut.  Kernels without
 * MULTIPROCESSOR always return curcpu().
 */
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *cand, *best = NULL;
	struct cpuset set;
	int nrun, fewest = INT_MAX;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/* set = idle cpus that have nothing queued and are schedulable. */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	/* Drain the set, remembering the cpu with the fewest queued procs. */
	for (cand = cpuset_first(&set); cand != NULL;
	    cand = cpuset_first(&set)) {
		cpuset_del(&set, cand);

		nrun = cand->ci_schedstate.spc_nrun;
		if (best == NULL || nrun < fewest) {
			fewest = nrun;
			best = cand;
		}
	}

	return (best);
#else
	return (curcpu());
#endif
}
421 
/*
 * Choose the cpu that p should be queued on.
 *
 * A proc with P_CPUPEG stays on p->p_cpu.  Otherwise p keeps its cpu if
 * that cpu is idle with nothing queued, or if p is curproc on the
 * current cpu with nothing queued and no halt request; failing that the
 * candidate with the lowest sched_proc_to_cpu_cost() is returned.
 * Kernels without MULTIPROCESSOR always return curcpu().
 */
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *cand, *cheapest = NULL;
	struct cpuset set;
	int cheapest_cost = INT_MAX;

	/* Pegged procs never move. */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * set = schedulable cpus that are idle and have nothing queued.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * Cheap case first: the proc's own cpu is in that set.  The cpu
	 * also qualifies without being idle when it is the current cpu,
	 * has nothing else queued, is not asked to halt and we are
	 * curproc.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() &&
	    p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    !ISSET(p->p_cpu->ci_schedstate.spc_schedflags,
	    SPCF_SHOULDHALT) &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	/* No idle candidate at all: consider every schedulable cpu. */
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	/* Drain the set, remembering the cheapest cpu seen. */
	for (cand = cpuset_first(&set); cand != NULL;
	    cand = cpuset_first(&set)) {
		int cost = sched_proc_to_cpu_cost(cand, p);

		if (cheapest == NULL || cost < cheapest_cost) {
			cheapest_cost = cost;
			cheapest = cand;
		}
		cpuset_del(&set, cand);
	}

	if (cheapest == p->p_cpu)
		sched_nomigrations++;
	else
		sched_nmigrations++;

	return (cheapest);
#else
	return (curcpu());
#endif
}
485 
486 /*
487  * Attempt to steal a proc from some cpu.
488  */
489 struct proc *
490 sched_steal_proc(struct cpu_info *self)
491 {
492 	struct proc *best = NULL;
493 #ifdef MULTIPROCESSOR
494 	struct schedstate_percpu *spc;
495 	int bestcost = INT_MAX;
496 	struct cpu_info *ci;
497 	struct cpuset set;
498 
499 	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
500 
501 	/* Don't steal if we don't want to schedule processes in this CPU. */
502 	if (!cpuset_isset(&sched_all_cpus, self))
503 		return (NULL);
504 
505 	cpuset_copy(&set, &sched_queued_cpus);
506 
507 	while ((ci = cpuset_first(&set)) != NULL) {
508 		struct proc *p;
509 		int queue;
510 		int cost;
511 
512 		cpuset_del(&set, ci);
513 
514 		spc = &ci->ci_schedstate;
515 
516 		queue = ffs(spc->spc_whichqs) - 1;
517 		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
518 			if (p->p_flag & P_CPUPEG)
519 				continue;
520 
521 			cost = sched_proc_to_cpu_cost(self, p);
522 
523 			if (best == NULL || cost < bestcost) {
524 				best = p;
525 				bestcost = cost;
526 			}
527 		}
528 	}
529 	if (best == NULL)
530 		return (NULL);
531 
532 	remrunqueue(best);
533 	best->p_cpu = self;
534 
535 	sched_stolen++;
536 #endif
537 	return (best);
538 }
539 
540 #ifdef MULTIPROCESSOR
/*
 * Integer base 2 logarithm, rounded down: the index of the highest set
 * bit of i.  Returns 0 for both 0 and 1 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int bits;

	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
554 
555 /*
556  * Calculate the cost of moving the proc to this cpu.
557  *
558  * What we want is some guesstimate of how much "performance" it will
559  * cost us to move the proc here. Not just for caches and TLBs and NUMA
560  * memory, but also for the proc itself. A highly loaded cpu might not
561  * be the best candidate for this proc since it won't get run.
562  *
563  * Just total guesstimates for now.
564  */
565 
/* Weights for sched_proc_to_cpu_cost(). */
int sched_cost_load = 1;	/* not referenced in the code visible here */
int sched_cost_priority = 1;	/* per unit of priority difference */
int sched_cost_runnable = 3;	/* per queued proc; also busy/primary cpu */
int sched_cost_resident = 1;	/* per log2() of the pmap resident count */
570 #endif
571 
/*
 * Estimate the cost of running p on ci; lower is better and the value
 * can go negative.  The terms are weighted by the sched_cost_* tunables.
 * Always 0 without MULTIPROCESSOR.
 */
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	/*
	 * A cpu that is not idle costs the priority difference between
	 * the proc and whatever the cpu runs now, plus one "runnable"
	 * unit.  We are more willing to move, the lower the priority of
	 * the destination and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}

	/* Every proc already queued on the cpu adds to the cost. */
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * A proc that is already on this cpu and has not been sleeping
	 * gets a discount based on an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		int l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));

		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}
616 
617 /*
618  * Peg a proc to a cpu.
619  */
620 void
621 sched_peg_curproc(struct cpu_info *ci)
622 {
623 	struct proc *p = curproc;
624 	int s;
625 
626 	SCHED_LOCK(s);
627 	atomic_setbits_int(&p->p_flag, P_CPUPEG);
628 	setrunqueue(ci, p, p->p_usrpri);
629 	p->p_ru.ru_nvcsw++;
630 	mi_switch();
631 	SCHED_UNLOCK(s);
632 }
633 
634 #ifdef MULTIPROCESSOR
635 
636 void
637 sched_start_secondary_cpus(void)
638 {
639 	CPU_INFO_ITERATOR cii;
640 	struct cpu_info *ci;
641 
642 	CPU_INFO_FOREACH(cii, ci) {
643 		struct schedstate_percpu *spc = &ci->ci_schedstate;
644 
645 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
646 			continue;
647 		atomic_clearbits_int(&spc->spc_schedflags,
648 		    SPCF_SHOULDHALT | SPCF_HALTED);
649 #ifdef __HAVE_CPU_TOPOLOGY
650 		if (!sched_smt && ci->ci_smt_id > 0)
651 			continue;
652 #endif
653 		cpuset_add(&sched_all_cpus, ci);
654 	}
655 }
656 
657 void
658 sched_stop_secondary_cpus(void)
659 {
660 	CPU_INFO_ITERATOR cii;
661 	struct cpu_info *ci;
662 
663 	/*
664 	 * Make sure we stop the secondary CPUs.
665 	 */
666 	CPU_INFO_FOREACH(cii, ci) {
667 		struct schedstate_percpu *spc = &ci->ci_schedstate;
668 
669 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
670 			continue;
671 		cpuset_del(&sched_all_cpus, ci);
672 		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
673 	}
674 	CPU_INFO_FOREACH(cii, ci) {
675 		struct schedstate_percpu *spc = &ci->ci_schedstate;
676 
677 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
678 			continue;
679 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
680 			sleep_setup(spc, PZERO, "schedstate");
681 			sleep_finish(0,
682 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
683 		}
684 	}
685 }
686 
687 struct sched_barrier_state {
688 	struct cpu_info *ci;
689 	struct cond cond;
690 };
691 
692 void
693 sched_barrier_task(void *arg)
694 {
695 	struct sched_barrier_state *sb = arg;
696 	struct cpu_info *ci = sb->ci;
697 
698 	sched_peg_curproc(ci);
699 	cond_signal(&sb->cond);
700 	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
701 }
702 
703 void
704 sched_barrier(struct cpu_info *ci)
705 {
706 	struct sched_barrier_state sb;
707 	struct task task;
708 	CPU_INFO_ITERATOR cii;
709 
710 	if (ci == NULL) {
711 		CPU_INFO_FOREACH(cii, ci) {
712 			if (CPU_IS_PRIMARY(ci))
713 				break;
714 		}
715 	}
716 	KASSERT(ci != NULL);
717 
718 	if (ci == curcpu())
719 		return;
720 
721 	sb.ci = ci;
722 	cond_init(&sb.cond);
723 	task_set(&task, sched_barrier_task, &sb);
724 
725 	task_add(systqmp, &task);
726 	cond_wait(&sb.cond, "sbar");
727 }
728 
729 #else
730 
void
sched_barrier(struct cpu_info *ci)
{
	/* Kernel built without MULTIPROCESSOR: nothing to do. */
}
735 
736 #endif
737 
738 /*
739  * Functions to manipulate cpu sets.
740  */
741 struct cpu_info *cpuset_infos[MAXCPUS];
742 static struct cpuset cpuset_all;
743 
744 void
745 cpuset_init_cpu(struct cpu_info *ci)
746 {
747 	cpuset_add(&cpuset_all, ci);
748 	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
749 }
750 
751 void
752 cpuset_clear(struct cpuset *cs)
753 {
754 	memset(cs, 0, sizeof(*cs));
755 }
756 
757 void
758 cpuset_add(struct cpuset *cs, struct cpu_info *ci)
759 {
760 	unsigned int num = CPU_INFO_UNIT(ci);
761 	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
762 }
763 
764 void
765 cpuset_del(struct cpuset *cs, struct cpu_info *ci)
766 {
767 	unsigned int num = CPU_INFO_UNIT(ci);
768 	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
769 }
770 
771 int
772 cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
773 {
774 	unsigned int num = CPU_INFO_UNIT(ci);
775 	return (cs->cs_set[num/32] & (1U << (num % 32)));
776 }
777 
778 void
779 cpuset_add_all(struct cpuset *cs)
780 {
781 	cpuset_copy(cs, &cpuset_all);
782 }
783 
784 void
785 cpuset_copy(struct cpuset *to, struct cpuset *from)
786 {
787 	memcpy(to, from, sizeof(*to));
788 }
789 
790 struct cpu_info *
791 cpuset_first(struct cpuset *cs)
792 {
793 	int i;
794 
795 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
796 		if (cs->cs_set[i])
797 			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
798 
799 	return (NULL);
800 }
801 
802 void
803 cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
804 {
805 	int i;
806 
807 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
808 		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
809 }
810 
811 void
812 cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
813 {
814 	int i;
815 
816 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
817 		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
818 }
819 
820 void
821 cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
822 {
823 	int i;
824 
825 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
826 		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
827 }
828 
829 int
830 cpuset_cardinality(struct cpuset *cs)
831 {
832 	int cardinality, i, n;
833 
834 	cardinality = 0;
835 
836 	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
837 		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
838 			cardinality++;
839 
840 	return (cardinality);
841 }
842 
843 int
844 sysctl_hwncpuonline(void)
845 {
846 	return cpuset_cardinality(&sched_all_cpus);
847 }
848 
849 int
850 cpu_is_online(struct cpu_info *ci)
851 {
852 	return cpuset_isset(&sched_all_cpus, ci);
853 }
854 
855 #ifdef __HAVE_CPU_TOPOLOGY
856 
857 #include <sys/sysctl.h>
858 
859 int
860 sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
861 {
862 	CPU_INFO_ITERATOR cii;
863 	struct cpu_info *ci;
864 	int err, newsmt;
865 
866 	newsmt = sched_smt;
867 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
868 	if (err)
869 		return err;
870 	if (newsmt == sched_smt)
871 		return 0;
872 
873 	sched_smt = newsmt;
874 	CPU_INFO_FOREACH(cii, ci) {
875 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
876 			continue;
877 		if (ci->ci_smt_id == 0)
878 			continue;
879 		if (sched_smt)
880 			cpuset_add(&sched_all_cpus, ci);
881 		else
882 			cpuset_del(&sched_all_cpus, ci);
883 	}
884 
885 	return 0;
886 }
887 
888 #endif
889