xref: /netbsd-src/sys/kern/kern_runq.c (revision 5dd36a3bc8bf2a9dec29ceb6349550414570c447)
1 /*	$NetBSD: kern_runq.c,v 1.62 2020/01/25 15:09:54 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55  * SUCH DAMAGE.
56  */
57 
58 #include <sys/cdefs.h>
59 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.62 2020/01/25 15:09:54 ad Exp $");
60 
61 #include "opt_dtrace.h"
62 
63 #include <sys/param.h>
64 #include <sys/kernel.h>
65 #include <sys/bitops.h>
66 #include <sys/cpu.h>
67 #include <sys/idle.h>
68 #include <sys/intr.h>
69 #include <sys/kmem.h>
70 #include <sys/lwp.h>
71 #include <sys/mutex.h>
72 #include <sys/proc.h>
73 #include <sys/pset.h>
74 #include <sys/sched.h>
75 #include <sys/syscallargs.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/types.h>
79 #include <sys/evcnt.h>
80 #include <sys/atomic.h>
81 
82 /*
83  * Bits per map.
84  */
85 #define	BITMAP_BITS	(32)
86 #define	BITMAP_SHIFT	(5)
87 #define	BITMAP_MSB	(0x80000000U)
88 #define	BITMAP_MASK	(BITMAP_BITS - 1)
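/*
 * A priority "prio" maps to word (prio >> BITMAP_SHIFT) of spc_bitmap and
 * to bit (BITMAP_MSB >> (prio & BITMAP_MASK)) within that word, so higher
 * priorities occupy less significant bits and ffs() on a word finds the
 * highest priority present in it.
 */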
89 
90 const int	schedppq = 1;
91 
92 static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
93 #ifdef MULTIPROCESSOR
94 static lwp_t *	sched_catchlwp(struct cpu_info *);
95 #endif
96 
97 /*
98  * Preemption control.
99  */
100 #ifdef __HAVE_PREEMPTION
101 # ifdef DEBUG
102 int		sched_kpreempt_pri = 0;
103 # else
104 int		sched_kpreempt_pri = PRI_USER_RT;
105 # endif
106 #else
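/* Set above any valid priority, so kernel preemption never triggers. */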
107 int		sched_kpreempt_pri = 1000;
108 #endif
109 
110 /*
111  * Migration and balancing.
112  */
113 static u_int	cacheht_time;	/* Cache hotness time */
114 static u_int	min_catch;	/* Minimal LWP count for catching */
115 static u_int	skim_interval;	/* Rate limit for stealing LWPs */
116 
117 #ifdef KDTRACE_HOOKS
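/* Tracked for DTrace's benefit; updated in sched_lwp_stats() below. */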
118 struct lwp *curthread;
119 #endif
120 
121 void
122 runq_init(void)
123 {
124 
125 	/* When pulling from remote packages, the LWP must not have run for 10ms. */
126 	cacheht_time = 10;
127 
128 	/* Minimal count of LWPs for catching */
129 	min_catch = 1;
130 
131 	/* Steal from other CPUs at most every 10ms. */
132 	skim_interval = 10;
133 }
134 
135 void
136 sched_cpuattach(struct cpu_info *ci)
137 {
138 	struct schedstate_percpu *spc;
139 	size_t size;
140 	void *p;
141 	u_int i;
142 
143 	spc = &ci->ci_schedstate;
144 	spc->spc_nextpkg = ci;
145 
146 	if (spc->spc_lwplock == NULL) {
147 		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
148 	}
149 	if (ci == lwp0.l_cpu) {
150 		/* Initialize the scheduler structure of the primary LWP */
151 		lwp0.l_mutex = spc->spc_lwplock;
152 	}
153 	if (spc->spc_mutex != NULL) {
154 		/* Already initialized. */
155 		return;
156 	}
157 
158 	/* Allocate the run queue; pad it so it can be aligned to a coherency unit */
159 	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
160 	    coherency_unit;
161 	p = kmem_alloc(size, KM_SLEEP);
162 	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
163 
164 	/* Initialize run queues */
165 	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
166 	for (i = 0; i < PRI_COUNT; i++)
167 		TAILQ_INIT(&spc->spc_queue[i]);
168 }
169 
170 /*
171  * Control of the runqueue.
172  */
173 static inline void *
174 sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
175 {
176 
177 	KASSERT(prio < PRI_COUNT);
178 	return &spc->spc_queue[prio];
179 }
180 
181 /*
182  * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
183  * l_cpu.
184  */
185 void
186 sched_enqueue(struct lwp *l)
187 {
188 	struct schedstate_percpu *spc;
189 	TAILQ_HEAD(, lwp) *q_head;
190 	const pri_t eprio = lwp_eprio(l);
191 	struct cpu_info *ci;
192 
193 	ci = l->l_cpu;
194 	spc = &ci->ci_schedstate;
195 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
196 
197 	/* Enqueue the thread */
198 	q_head = sched_getrq(spc, eprio);
199 	if (TAILQ_EMPTY(q_head)) {
200 		u_int i;
201 		uint32_t q;
202 
203 		/* Mark bit */
204 		i = eprio >> BITMAP_SHIFT;
205 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
206 		KASSERT((spc->spc_bitmap[i] & q) == 0);
207 		spc->spc_bitmap[i] |= q;
208 	}
209 	/* Preempted SCHED_RR and SCHED_FIFO LWPs go to the queue head. */
210 	if (l->l_class != SCHED_OTHER && (l->l_pflag & LP_PREEMPTING) != 0) {
211 		TAILQ_INSERT_HEAD(q_head, l, l_runq);
212 	} else {
213 		TAILQ_INSERT_TAIL(q_head, l, l_runq);
214 	}
215 	spc->spc_flags &= ~SPCF_IDLE;
216 	spc->spc_count++;
217 	if ((l->l_pflag & LP_BOUND) == 0)
218 		spc->spc_mcount++;
219 
220 	/*
221 	 * Update the value of the highest priority in the runqueue,
222 	 * if this thread's priority is higher.
223 	 */
224 	if (eprio > spc->spc_maxpriority)
225 		spc->spc_maxpriority = eprio;
226 
227 	sched_newts(l);
228 }
229 
230 /*
231  * Remove an LWP from the run queue it's on.  The LWP must be in state
232  * LSRUN.
233  */
234 void
235 sched_dequeue(struct lwp *l)
236 {
237 	TAILQ_HEAD(, lwp) *q_head;
238 	struct schedstate_percpu *spc;
239 	const pri_t eprio = lwp_eprio(l);
240 
241 	spc = &l->l_cpu->ci_schedstate;
242 
243 	KASSERT(lwp_locked(l, spc->spc_mutex));
244 	KASSERT(eprio <= spc->spc_maxpriority);
245 	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
246 	KASSERT(spc->spc_count > 0);
247 
248 	if (spc->spc_migrating == l)
249 		spc->spc_migrating = NULL;
250 
251 	spc->spc_count--;
252 	if ((l->l_pflag & LP_BOUND) == 0)
253 		spc->spc_mcount--;
254 
255 	q_head = sched_getrq(spc, eprio);
256 	TAILQ_REMOVE(q_head, l, l_runq);
257 	if (TAILQ_EMPTY(q_head)) {
258 		u_int i;
259 		uint32_t q;
260 
261 		/* Unmark bit */
262 		i = eprio >> BITMAP_SHIFT;
263 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
264 		KASSERT((spc->spc_bitmap[i] & q) != 0);
265 		spc->spc_bitmap[i] &= ~q;
266 
267 		/*
268 		 * Update the value of the highest priority in the runqueue, in
269 		 * case this was the last thread in the queue of the highest priority.
270 		 */
271 		if (eprio != spc->spc_maxpriority)
272 			return;
273 
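		/*
		 * Scan the bitmap downwards, starting from the word that held
		 * eprio; ffs() returns the least significant set bit, which
		 * corresponds to the highest remaining priority in that word.
		 */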
274 		do {
275 			if (spc->spc_bitmap[i] != 0) {
276 				q = ffs(spc->spc_bitmap[i]);
277 				spc->spc_maxpriority =
278 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
279 				return;
280 			}
281 		} while (i--);
282 
283 		/* If not found - set the lowest value */
284 		spc->spc_maxpriority = 0;
285 	}
286 }
287 
288 /*
289  * Cause a preemption on the given CPU, if the priority "pri" is higher
290  * than that of the running LWP.  If "unlock" is specified, and ideally it
291  * will be for concurrency reasons, spc_mutex will be dropped before return.
292  */
293 void
294 sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
295 {
296 	struct schedstate_percpu *spc;
297 	u_int o, n, f;
298 	lwp_t *l;
299 
300 	spc = &ci->ci_schedstate;
301 
302 	KASSERT(mutex_owned(spc->spc_mutex));
303 
304 	/*
305 	 * If the priority level we're evaluating wouldn't cause a new LWP
306 	 * to be run on the CPU, then we have nothing to do.
307 	 */
308 	if (pri <= spc->spc_curpriority || !mp_online) {
309 		if (__predict_true(unlock)) {
310 			spc_unlock(ci);
311 		}
312 		return;
313 	}
314 
315 	/*
316 	 * Figure out what kind of preemption we should do.
317 	 */
318 	l = ci->ci_onproc;
319 	if ((l->l_flag & LW_IDLE) != 0) {
320 		f = RESCHED_IDLE | RESCHED_UPREEMPT;
321 	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
322 		/* We can't currently preempt softints - should be able to. */
323 #ifdef __HAVE_PREEMPTION
324 		f = RESCHED_KPREEMPT;
325 #else
326 		/* Leave door open for test: set kpreempt_pri with sysctl. */
327 		f = RESCHED_UPREEMPT;
328 #endif
329 		/*
330 		 * l_dopreempt must be set with the CPU locked to sync with
331 		 * mi_switch().  It must also be set with an atomic to sync
332 		 * with kpreempt().
333 		 */
334 		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
335 	} else {
336 		f = RESCHED_UPREEMPT;
337 	}
338 	if (ci != curcpu()) {
339 		f |= RESCHED_REMOTE;
340 	}
341 
342 	/*
343 	 * Things start as soon as we touch ci_want_resched: x86 for example
344 	 * has an instruction that monitors the memory cell it's in.  We
345 	 * want to drop the schedstate lock in advance, otherwise the remote
346 	 * CPU can awaken and immediately block on the lock.
347 	 */
348 	if (__predict_true(unlock)) {
349 		spc_unlock(ci);
350 	}
351 
352 	/*
353 	 * The caller will always have a second scheduler lock held: either
354 	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
355 	 * keeps preemption disabled, which among other things ensures all
356 	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
357 	 */
358 	KASSERT(kpreempt_disabled());
359 
360 	for (o = 0;; o = n) {
361 		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
362 		if (__predict_true(o == n)) {
363 			/*
364 			 * We're the first.  If we're in process context on
365 			 * the same CPU, we can avoid the visit to trap().
366 			 */
367 			if (l != curlwp || cpu_intr_p()) {
368 				cpu_need_resched(ci, l, f);
369 			}
370 			break;
371 		}
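		/*
		 * Somebody else updated the word first.  If the request
		 * already recorded is at least as strong as ours (this relies
		 * on RESCHED_KPREEMPT comparing greater than RESCHED_UPREEMPT),
		 * it covers ours; otherwise retry and merge our flags in.
		 */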
372 		if (__predict_true(
373 		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
374 		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
375 			/* Already in progress, nothing to do. */
376 			break;
377 		}
378 	}
379 }
380 
381 /*
382  * Cause a preemption on the given CPU, if the priority of LWP "l" (in state
383  * LSRUN) is higher than that of the running LWP.  If "unlock" is
384  * specified, and ideally it will be for concurrency reasons, spc_mutex will
385  * be dropped before return.
386  */
387 void
388 sched_resched_lwp(struct lwp *l, bool unlock)
389 {
390 	struct cpu_info *ci = l->l_cpu;
391 
392 	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
393 	KASSERT(l->l_stat == LSRUN);
394 
395 	sched_resched_cpu(ci, lwp_eprio(l), unlock);
396 }
397 
398 /*
399  * Migration and balancing.
400  */
401 
402 #ifdef MULTIPROCESSOR
403 
404 /*
405  * Estimate if LWP is cache-hot.
406  */
407 static inline bool
408 lwp_cache_hot(const struct lwp *l)
409 {
410 
411 	/* Leave new LWPs in peace, determination has already been made. */
412 	if (l->l_stat == LSIDL)
413 		return true;
414 
415 	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
416 		return false;
417 
418 	return (hardclock_ticks - l->l_rticks < mstohz(cacheht_time));
419 }
420 
421 /*
422  * Check if LWP can migrate to the chosen CPU.
423  */
424 static inline bool
425 sched_migratable(const struct lwp *l, struct cpu_info *ci)
426 {
427 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
428 	KASSERT(lwp_locked(__UNCONST(l), NULL));
429 
430 	/* Is CPU offline? */
431 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
432 		return false;
433 
434 	/* Is affinity set? */
435 	if (__predict_false(l->l_affinity))
436 		return kcpuset_isset(l->l_affinity, cpu_index(ci));
437 
438 	/* Is there a processor-set? */
439 	return (spc->spc_psid == l->l_psid);
440 }
441 
442 /*
443  * A small helper to do round robin through CPU packages.
444  */
445 static struct cpu_info *
446 sched_nextpkg(void)
447 {
448 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
449 
450 	spc->spc_nextpkg =
451 	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
452 
453 	return spc->spc_nextpkg;
454 }
455 
456 /*
457  * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
458  * thread.  In case of equal priority, prefer first class CPUs, and amongst
459  * the remainder choose the CPU with the fewest runqueue entries.
460  *
461  * Begin the search in the CPU package which "pivot" is a member of.
462  */
463 static struct cpu_info * __noinline
464 sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
465 {
466 	struct cpu_info *bestci, *curci, *outer;
467 	struct schedstate_percpu *bestspc, *curspc;
468 	pri_t bestpri, curpri;
469 
470 	/*
471 	 * If this fails (it shouldn't), run on the given CPU.  This also
472 	 * gives us a weak preference for "pivot" to begin with.
473 	 */
474 	bestci = pivot;
475 	bestspc = &bestci->ci_schedstate;
476 	bestpri = MAX(bestspc->spc_curpriority, bestspc->spc_maxpriority);
477 
478 	/* In the outer loop scroll through all CPU packages. */
479 	pivot = pivot->ci_package1st;
480 	outer = pivot;
481 	do {
482 		/* In the inner loop scroll through all CPUs in package. */
483 		curci = outer;
484 		do {
485 			if (!sched_migratable(l, curci)) {
486 				continue;
487 			}
488 
489 			curspc = &curci->ci_schedstate;
490 
491 			/* If this CPU is idle and 1st class, we're done. */
492 			if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
493 			    (SPCF_IDLE | SPCF_1STCLASS)) {
494 				return curci;
495 			}
496 
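			/*
			 * The priority to beat on this CPU is the higher of
			 * the running LWP's priority and the top of its run
			 * queue.
			 */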
497 			curpri = MAX(curspc->spc_curpriority,
498 			    curspc->spc_maxpriority);
499 
500 			if (curpri > bestpri) {
501 				continue;
502 			}
503 			if (curpri == bestpri) {
504 				/* Prefer first class CPUs over others. */
505 				if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
506 				    (bestspc->spc_flags & SPCF_1STCLASS) != 0) {
507 				    	continue;
508 				}
509 				/*
510 				 * Pick the least busy CPU.  Make sure this is not
511 				 * <=, otherwise it defeats the above preference.
512 				 */
513 				if (bestspc->spc_count < curspc->spc_count) {
514 					continue;
515 				}
516 			}
517 
518 			bestpri = curpri;
519 			bestci = curci;
520 			bestspc = curspc;
521 
522 		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
523 		    curci != outer);
524 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
525 	    outer != pivot);
526 
527 	return bestci;
528 }
529 
530 /*
531  * Estimate whether the LWP should migrate to another CPU.
532  * Pick and return the CPU it should run on (possibly the current one).
533  */
534 struct cpu_info *
535 sched_takecpu(struct lwp *l)
536 {
537 	struct schedstate_percpu *spc, *tspc;
538 	struct cpu_info *ci, *curci, *tci;
539 	pri_t eprio;
540 	int flags;
541 
542 	KASSERT(lwp_locked(l, NULL));
543 
544 	/* If the thread is strictly bound, do not consider other CPUs */
545 	ci = l->l_cpu;
546 	if (l->l_pflag & LP_BOUND)
547 		return ci;
548 
549 	spc = &ci->ci_schedstate;
550 	eprio = lwp_eprio(l);
551 
552 	/*
553 	 * Handle new LWPs.  For vfork() with a timeshared child, make it
554 	 * run on the same CPU as the parent if no other LWPs are queued there.
555 	 * Otherwise scatter far and wide - try for an even distribution
556 	 * across all CPU packages and CPUs.
557 	 */
558 	if (l->l_stat == LSIDL) {
559 		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
560 			if (sched_migratable(l, curlwp->l_cpu) && eprio >
561 			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
562 				return curlwp->l_cpu;
563 			}
564 		} else {
565 			return sched_bestcpu(l, sched_nextpkg());
566 		}
567 		flags = SPCF_IDLE;
568 	} else {
569 		flags = SPCF_IDLE | SPCF_1STCLASS;
570 	}
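	/*
	 * "flags" selects which idle CPUs are accepted below: a new LWP (the
	 * vfork() case that fell through above) may take any idle sibling,
	 * while an existing LWP is only sent to an idle first class CPU.
	 */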
571 
572 	/*
573 	 * Try to send the LWP back to the first CPU in the same core if
574 	 * idle.  This keeps LWPs clustered in the run queues of 1st class
575 	 * CPUs.  This implies stickiness.  If we didn't find a home for
576 	 * a vfork() child above, try to use any SMT sibling to help out.
577 	 */
578 	tci = ci;
579 	do {
580 		tspc = &tci->ci_schedstate;
581 		if ((tspc->spc_flags & flags) == flags &&
582 		    sched_migratable(l, tci)) {
583 			return tci;
584 		}
585 		tci = tci->ci_sibling[CPUREL_CORE];
586 	} while (tci != ci);
587 
588 	/*
589 	 * Otherwise the LWP is "sticky", i.e. it generally prefers to stay
590 	 * on the same CPU.
591 	 */
592 	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
593 	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
594 		return ci;
595 	}
596 
597 	/*
598 	 * If the current CPU core is idle, run there and avoid the
599 	 * expensive scan of CPUs below.
600 	 */
601 	curci = curcpu();
602 	tci = curci;
603 	do {
604 		tspc = &tci->ci_schedstate;
605 		if ((tspc->spc_flags & flags) == flags &&
606 		    sched_migratable(l, tci)) {
607 			return tci;
608 		}
609 		tci = tci->ci_sibling[CPUREL_CORE];
610 	} while (tci != curci);
611 
612 	/*
613 	 * Didn't find a new home above - happens infrequently.  Start the
614 	 * search in the last CPU package that the LWP ran in, but expand to
615 	 * include the whole system if needed.
616 	 */
617 	return sched_bestcpu(l, l->l_cpu);
618 }
619 
620 /*
621  * Try to catch an LWP from the run queue of another CPU.
622  */
623 static struct lwp *
624 sched_catchlwp(struct cpu_info *ci)
625 {
626 	struct cpu_info *curci = curcpu();
627 	struct schedstate_percpu *spc, *curspc;
628 	TAILQ_HEAD(, lwp) *q_head;
629 	struct lwp *l;
630 	bool gentle;
631 
632 	curspc = &curci->ci_schedstate;
633 	spc = &ci->ci_schedstate;
634 
635 	/*
636 	 * Be more aggressive if this CPU is first class, and the other
637 	 * is not.
638 	 */
639 	gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
640 	    (spc->spc_flags & SPCF_1STCLASS) != 0);
641 
642 	if (spc->spc_mcount < (gentle ? min_catch : 1) ||
643 	    curspc->spc_psid != spc->spc_psid) {
644 		spc_unlock(ci);
645 		return NULL;
646 	}
647 
648 	/* Take the highest priority thread */
649 	q_head = sched_getrq(spc, spc->spc_maxpriority);
650 	l = TAILQ_FIRST(q_head);
651 
652 	for (;;) {
653 		/* Check the first and next result from the queue */
654 		if (l == NULL) {
655 			break;
656 		}
657 		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
658 		    ci->ci_data.cpu_name,
659 		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
660 
661 		/* Look for threads that are allowed to migrate */
662 		if ((l->l_pflag & LP_BOUND) ||
663 		    (gentle && lwp_cache_hot(l)) ||
664 		    !sched_migratable(l, curci)) {
665 			l = TAILQ_NEXT(l, l_runq);
666 			/* XXX Gap: could walk down priority list. */
667 			continue;
668 		}
669 
670 		/* Grab the thread, and move to the local run queue */
671 		sched_dequeue(l);
672 		l->l_cpu = curci;
673 		lwp_unlock_to(l, curspc->spc_mutex);
674 		sched_enqueue(l);
675 		return l;
676 	}
677 	spc_unlock(ci);
678 
679 	return l;
680 }
681 
682 /*
683  * Called from sched_idle() to handle migration.
684  */
685 static void
686 sched_idle_migrate(void)
687 {
688 	struct cpu_info *ci = curcpu(), *tci = NULL;
689 	struct schedstate_percpu *spc, *tspc;
690 	bool dlock = false;
691 
692 	spc = &ci->ci_schedstate;
693 	spc_lock(ci);
694 	for (;;) {
695 		struct lwp *l;
696 
697 		l = spc->spc_migrating;
698 		if (l == NULL)
699 			break;
700 
701 		/*
702 		 * If second attempt, and target CPU has changed,
703 		 * drop the old lock.
704 		 */
705 		if (dlock == true && tci != l->l_target_cpu) {
706 			KASSERT(tci != NULL);
707 			spc_unlock(tci);
708 			dlock = false;
709 		}
710 
711 		/*
712 		 * Nothing to do if destination has changed to the
713 		 * local CPU, or migration was done by another CPU.
714 		 */
715 		tci = l->l_target_cpu;
716 		if (tci == NULL || tci == ci) {
717 			spc->spc_migrating = NULL;
718 			l->l_target_cpu = NULL;
719 			break;
720 		}
721 		tspc = &tci->ci_schedstate;
722 
723 		/*
724 		 * Double-lock the runqueues, taking the locks in address
725 		 * order to avoid deadlock.  We do that only once.
726 		 */
727 		if (dlock == false) {
728 			dlock = true;
729 			if (ci < tci) {
730 				spc_lock(tci);
731 			} else if (!mutex_tryenter(tspc->spc_mutex)) {
732 				spc_unlock(ci);
733 				spc_lock(tci);
734 				spc_lock(ci);
735 				/* Check the situation again. */
736 				continue;
737 			}
738 		}
739 
740 		/* Migrate the thread */
741 		KASSERT(l->l_stat == LSRUN);
742 		spc->spc_migrating = NULL;
743 		l->l_target_cpu = NULL;
744 		sched_dequeue(l);
745 		l->l_cpu = tci;
746 		lwp_setlock(l, tspc->spc_mutex);
747 		sched_enqueue(l);
748 		sched_resched_lwp(l, true);
749 		/* tci now unlocked */
750 		spc_unlock(ci);
751 		return;
752 	}
753 	if (dlock == true) {
754 		KASSERT(tci != NULL);
755 		spc_unlock(tci);
756 	}
757 	spc_unlock(ci);
758 }
759 
760 /*
761  * Try to steal an LWP from "tci".
762  */
763 static bool
764 sched_steal(struct cpu_info *ci, struct cpu_info *tci)
765 {
766 	struct schedstate_percpu *spc, *tspc;
767 	lwp_t *l;
768 
769 	spc = &ci->ci_schedstate;
770 	tspc = &tci->ci_schedstate;
771 	if (tspc->spc_mcount != 0 && spc->spc_psid == tspc->spc_psid) {
772 		spc_dlock(ci, tci);
773 		l = sched_catchlwp(tci);
774 		spc_unlock(ci);
775 		if (l != NULL) {
776 			return true;
777 		}
778 	}
779 	return false;
780 }
781 
782 /*
783  * Called from each CPU's idle loop.
784  */
785 void
786 sched_idle(void)
787 {
788 	struct cpu_info *ci = curcpu(), *inner, *outer, *first, *tci = NULL;
789 	struct schedstate_percpu *spc, *tspc;
790 	struct lwp *l;
791 
792 	spc = &ci->ci_schedstate;
793 
794 	/*
795 	 * Handle LWP migrations off this CPU to another.  If there is a
796 	 * migration to do then go idle afterwards (we'll wake again soon),
797 	 * as we don't want to instantly steal back the LWP we just moved
798 	 * out.
799 	 */
800 	if (spc->spc_migrating != NULL) {
801 		sched_idle_migrate();
802 		return;
803 	}
804 
805 	/* If this CPU is offline, or we have an LWP to run, we're done. */
806 	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
807 		return;
808 	}
809 
810 	/* Deal with SMT. */
811 	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
812 		/* Try to help our siblings out. */
813 		tci = ci->ci_sibling[CPUREL_CORE];
814 		while (tci != ci) {
815 			if (sched_steal(ci, tci)) {
816 				return;
817 			}
818 			tci = tci->ci_sibling[CPUREL_CORE];
819 		}
820 		/*
821 		 * If not the first SMT in the core, and in the default
822 		 * processor set, the search ends here.
823 		 */
824 		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
825 		    spc->spc_psid == PS_NONE) {
826 			return;
827 		}
828 	}
829 
830 	/*
831 	 * Find something to run, unless this CPU exceeded the rate limit.
832 	 * Start looking on the current package to maximise L2/L3 cache
833 	 * locality.  Then expand to looking at the rest of the system.
834 	 *
835 	 * XXX Should probably look at 2nd class CPUs first, but they will
836 	 * shed jobs via preempt() anyway.
837 	 */
838 	if (spc->spc_nextskim > hardclock_ticks) {
839 		return;
840 	}
841 	spc->spc_nextskim = hardclock_ticks + mstohz(skim_interval);
842 
843 	/* In the outer loop scroll through all CPU packages, starting here. */
844 	first = ci->ci_package1st;
845 	outer = first;
846 	do {
847 		/* In the inner loop scroll through all CPUs in package. */
848 		inner = outer;
849 		do {
850 			/* Don't hit the locks unless needed. */
851 			tspc = &inner->ci_schedstate;
852 			if (ci == inner || spc->spc_psid != tspc->spc_psid ||
853 			    tspc->spc_mcount < min_catch) {
854 				continue;
855 			}
856 			spc_dlock(ci, inner);
857 			l = sched_catchlwp(inner);
858 			spc_unlock(ci);
859 			if (l != NULL) {
860 				/* Got it! */
861 				return;
862 			}
863 		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
864 		    inner != outer);
865 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
866 	    outer != first);
867 }
868 
869 /*
870  * Called from mi_switch() when an LWP has been preempted / has yielded.
871  * The LWP is presently in the CPU's run queue.  Here we look for a better
872  * CPU to teleport the LWP to; there may not be one.
873  */
874 void
875 sched_preempted(struct lwp *l)
876 {
877 	struct schedstate_percpu *tspc;
878 	struct cpu_info *ci, *tci;
879 
880 	ci = l->l_cpu;
881 	tspc = &ci->ci_schedstate;
882 
883 	KASSERT(tspc->spc_count >= 1);
884 
885 	/*
886 	 * Try to select another CPU if:
887 	 *
888 	 * - there is no migration pending already, and either
889 	 * - this LWP is running on a 2nd class CPU, or
890 	 * - this LWP is a child of vfork() that has just done execve()
891 	 */
892 	if (l->l_target_cpu != NULL ||
893 	    ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
894 	    (l->l_pflag & LP_TELEPORT) == 0)) {
895 		return;
896 	}
897 
898 	/*
899 	 * Fast path: if the first SMT in the core is idle, send it back
900 	 * there, because the cache is shared (cheap) and we want all LWPs
901 	 * to be clustered on 1st class CPUs (either running there or on
902 	 * their runqueues).
903 	 */
904 	tci = ci->ci_sibling[CPUREL_CORE];
905 	while (tci != ci) {
906 		const int flags = SPCF_IDLE | SPCF_1STCLASS;
907 		tspc = &tci->ci_schedstate;
908 		if ((tspc->spc_flags & flags) == flags &&
909 		    sched_migratable(l, tci)) {
910 		    	l->l_target_cpu = tci;
911 		    	return;
912 		}
913 		tci = tci->ci_sibling[CPUREL_CORE];
914 	}
915 
916 	if ((l->l_pflag & LP_TELEPORT) != 0) {
917 		/*
918 		 * A child of vfork(): now that the parent is released,
919 		 * scatter far and wide, to match the LSIDL distribution
920 		 * done in sched_takecpu().
921 		 */
922 		l->l_pflag &= ~LP_TELEPORT;
923 		tci = sched_bestcpu(l, sched_nextpkg());
924 		if (tci != ci) {
925 			l->l_target_cpu = tci;
926 		}
927 	} else {
928 		/*
929 		 * Try to find a better CPU to take it, but don't move to
930 		 * another 2nd class CPU; there's not much point.
931 		 *
932 		 * Search in the current CPU package in order to try and
933 		 * keep L2/L3 cache locality, but expand to include the
934 		 * whole system if needed.
935 		 */
936 		tci = sched_bestcpu(l, l->l_cpu);
937 		if (tci != ci &&
938 		    (tci->ci_schedstate.spc_flags & SPCF_1STCLASS) != 0) {
939 			l->l_target_cpu = tci;
940 		}
941 	}
942 }
943 
944 /*
945  * Called during execve() by a child of vfork().  Does two things:
946  *
947  * - If the parent has been awoken and put back on curcpu then give the
948  *   CPU back to the parent.
949  *
950  * - If curlwp is not on a 1st class CPU then find somewhere else to run,
951  *   since it dodged the distribution in sched_takecpu() when first set
952  *   runnable.
953  */
954 void
955 sched_vforkexec(struct lwp *l, bool samecpu)
956 {
957 
958 	KASSERT(l == curlwp);
959 	if ((samecpu && ncpu > 1) ||
960 	    (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
961 		l->l_pflag |= LP_TELEPORT;
962 		preempt();
963 	}
964 }
965 
966 #else
967 
968 /*
969  * stubs for !MULTIPROCESSOR
970  */
971 
972 struct cpu_info *
973 sched_takecpu(struct lwp *l)
974 {
975 
976 	return l->l_cpu;
977 }
978 
979 void
980 sched_idle(void)
981 {
982 
983 }
984 
985 void
986 sched_preempted(struct lwp *l)
987 {
988 
989 }
990 
991 void
992 sched_vforkexec(struct lwp *l, bool samecpu)
993 {
994 
995 	KASSERT(l == curlwp);
996 }
997 
998 #endif	/* MULTIPROCESSOR */
999 
1000 /*
1001  * Scheduling statistics and balancing.
1002  */
1003 void
1004 sched_lwp_stats(struct lwp *l)
1005 {
1006 	int batch;
1007 
1008 	KASSERT(lwp_locked(l, NULL));
1009 
1010 	/* Update sleep time */
1011 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
1012 	    l->l_stat == LSSUSPENDED)
1013 		l->l_slptime++;
1014 
1015 	/*
1016 	 * Mark the thread as more CPU-bound if the sum of its run time exceeds
1017 	 * the sum of its sleep time.  Check whether it became CPU-bound for the first time.
1018 	 */
1019 	batch = (l->l_rticksum > l->l_slpticksum);
1020 	if (batch != 0) {
1021 		if ((l->l_flag & LW_BATCH) == 0)
1022 			batch = 0;
1023 		l->l_flag |= LW_BATCH;
1024 	} else
1025 		l->l_flag &= ~LW_BATCH;
1026 
1027 	/* Reset the time sums */
1028 	l->l_slpticksum = 0;
1029 	l->l_rticksum = 0;
1030 
1031 	/* Scheduler-specific hook */
1032 	sched_pstats_hook(l, batch);
1033 #ifdef KDTRACE_HOOKS
1034 	curthread = l;
1035 #endif
1036 }
1037 
1038 /*
1039  * Scheduler mill.
1040  */
1041 struct lwp *
1042 sched_nextlwp(void)
1043 {
1044 	struct cpu_info *ci = curcpu();
1045 	struct schedstate_percpu *spc;
1046 	TAILQ_HEAD(, lwp) *q_head;
1047 	struct lwp *l;
1048 
1049 	/* Update the last run time on switch */
1050 	l = curlwp;
1051 	l->l_rticksum += (hardclock_ticks - l->l_rticks);
1052 
1053 	/* Return to idle LWP if there is a migrating thread */
1054 	spc = &ci->ci_schedstate;
1055 	if (__predict_false(spc->spc_migrating != NULL))
1056 		return NULL;
1057 
1058 	/* Return to idle LWP if there is no runnable job */
1059 	if (__predict_false(spc->spc_count == 0))
1060 		return NULL;
1061 
1062 	/* Take the highest priority thread */
1063 	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
1064 	q_head = sched_getrq(spc, spc->spc_maxpriority);
1065 	l = TAILQ_FIRST(q_head);
1066 	KASSERT(l != NULL);
1067 
1068 	sched_oncpu(l);
1069 	l->l_rticks = hardclock_ticks;
1070 
1071 	return l;
1072 }
1073 
1074 /*
1075  * sched_curcpu_runnable_p: return true if curcpu() should exit the idle loop.
1076  */
1077 
1078 bool
1079 sched_curcpu_runnable_p(void)
1080 {
1081 	const struct cpu_info *ci;
1082 	const struct schedstate_percpu *spc;
1083 	bool rv;
1084 
1085 	kpreempt_disable();
1086 	ci = curcpu();
1087 	spc = &ci->ci_schedstate;
1088 
1089 #ifndef __HAVE_FAST_SOFTINTS
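	/* Pending soft interrupts count as work to run. */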
1090 	if (ci->ci_data.cpu_softints) {
1091 		kpreempt_enable();
1092 		return true;
1093 	}
1094 #endif
1095 
1096 	rv = (spc->spc_count != 0) ? true : false;
1097 	kpreempt_enable();
1098 
1099 	return rv;
1100 }
1101 
1102 /*
1103  * Sysctl nodes and initialization.
1104  */
1105 
1106 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
1107 {
1108 	const struct sysctlnode *node = NULL;
1109 
1110 	sysctl_createv(clog, 0, NULL, &node,
1111 		CTLFLAG_PERMANENT,
1112 		CTLTYPE_NODE, "sched",
1113 		SYSCTL_DESCR("Scheduler options"),
1114 		NULL, 0, NULL, 0,
1115 		CTL_KERN, CTL_CREATE, CTL_EOL);
1116 
1117 	if (node == NULL)
1118 		return;
1119 
1120 	sysctl_createv(clog, 0, &node, NULL,
1121 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1122 		CTLTYPE_INT, "cacheht_time",
1123 		SYSCTL_DESCR("Cache hotness time (in ms)"),
1124 		NULL, 0, &cacheht_time, 0,
1125 		CTL_CREATE, CTL_EOL);
1126 	sysctl_createv(clog, 0, &node, NULL,
1127 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1128 		CTLTYPE_INT, "skim_interval",
1129 		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
1130 		NULL, 0, &skim_interval, 0,
1131 		CTL_CREATE, CTL_EOL);
1132 	sysctl_createv(clog, 0, &node, NULL,
1133 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1134 		CTLTYPE_INT, "min_catch",
1135 		SYSCTL_DESCR("Minimal count of threads for catching"),
1136 		NULL, 0, &min_catch, 0,
1137 		CTL_CREATE, CTL_EOL);
1138 	sysctl_createv(clog, 0, &node, NULL,
1139 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1140 		CTLTYPE_INT, "timesoftints",
1141 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
1142 		NULL, 0, &softint_timing, 0,
1143 		CTL_CREATE, CTL_EOL);
1144 	sysctl_createv(clog, 0, &node, NULL,
1145 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1146 		CTLTYPE_INT, "kpreempt_pri",
1147 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
1148 		NULL, 0, &sched_kpreempt_pri, 0,
1149 		CTL_CREATE, CTL_EOL);
1150 }
1151 
1152 /*
1153  * Debugging.
1154  */
1155 
1156 #ifdef DDB
1157 
1158 void
1159 sched_print_runqueue(void (*pr)(const char *, ...))
1160 {
1161 	struct cpu_info *ci, *tci;
1162 	struct schedstate_percpu *spc;
1163 	struct lwp *l;
1164 	struct proc *p;
1165 	CPU_INFO_ITERATOR cii;
1166 
1167 	for (CPU_INFO_FOREACH(cii, ci)) {
1168 		int i;
1169 
1170 		spc = &ci->ci_schedstate;
1171 
1172 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
1173 		(*pr)(" pid.lid = %d.%d, r_count = %u, "
1174 		    "maxpri = %d, mlwp = %p\n",
1175 #ifdef MULTIPROCESSOR
1176 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
1177 #else
1178 		    curlwp->l_proc->p_pid, curlwp->l_lid,
1179 #endif
1180 		    spc->spc_count, spc->spc_maxpriority,
1181 		    spc->spc_migrating);
1182 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
1183 		do {
1184 			uint32_t q;
1185 			q = spc->spc_bitmap[i];
1186 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
1187 		} while (i--);
1188 	}
1189 
1190 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
1191 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
1192 
1193 	PROCLIST_FOREACH(p, &allproc) {
1194 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
1195 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1196 			ci = l->l_cpu;
1197 			tci = l->l_target_cpu;
1198 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
1199 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
1200 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
1201 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
1202 			    l, ci->ci_index, (tci ? tci->ci_index : -1),
1203 			    (u_int)(hardclock_ticks - l->l_rticks));
1204 		}
1205 	}
1206 }
1207 
1208 #endif
1209