1 /*	$NetBSD: kern_runq.c,v 1.46 2016/12/22 14:11:58 mlelstv Exp $	*/
2 
3 /*
4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.46 2016/12/22 14:11:58 mlelstv Exp $");
31 
32 #include "opt_dtrace.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/bitops.h>
37 #include <sys/cpu.h>
38 #include <sys/idle.h>
39 #include <sys/intr.h>
40 #include <sys/kmem.h>
41 #include <sys/lwp.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/sched.h>
45 #include <sys/syscallargs.h>
46 #include <sys/sysctl.h>
47 #include <sys/systm.h>
48 #include <sys/types.h>
49 #include <sys/evcnt.h>
50 
51 /*
52  * Priority related definitions.
53  */
54 #define	PRI_TS_COUNT	(NPRI_USER)
55 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
56 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
57 
58 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
59 
60 /*
61  * Bits per map.
62  */
63 #define	BITMAP_BITS	(32)
64 #define	BITMAP_SHIFT	(5)
65 #define	BITMAP_MSB	(0x80000000U)
66 #define	BITMAP_MASK	(BITMAP_BITS - 1)
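/*
 * Each run queue keeps a bitmap with one bit per priority level, 32
 * levels per uint32_t word.  Priority p lives in word (p >> BITMAP_SHIFT)
 * at bit (BITMAP_MSB >> (p & BITMAP_MASK)); with this MSB-first encoding
 * the highest priority present in a word can be found with ffs().
 */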
67 
68 /*
69  * Structures, runqueue.
70  */
71 
72 const int	schedppq = 1;
73 
74 typedef struct {
75 	TAILQ_HEAD(, lwp) q_head;
76 } queue_t;
77 
78 typedef struct {
79 	/* Bitmap */
80 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
81 	/* Counters */
82 	u_int		r_count;	/* Count of the threads */
83 	u_int		r_avgcount;	/* Average count of threads (* 256) */
84 	u_int		r_mcount;	/* Count of migratable threads */
85 	/* Runqueues */
86 	queue_t		r_rt_queue[PRI_RT_COUNT];
87 	queue_t		r_ts_queue[PRI_TS_COUNT];
88 	/* Event counters */
89 	struct evcnt	r_ev_pull;
90 	struct evcnt	r_ev_push;
91 	struct evcnt	r_ev_stay;
92 	struct evcnt	r_ev_localize;
93 } runqueue_t;
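/* One runqueue_t is allocated per CPU and hung off spc_sched_info. */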
94 
95 static void *	sched_getrq(runqueue_t *, const pri_t);
96 #ifdef MULTIPROCESSOR
97 static lwp_t *	sched_catchlwp(struct cpu_info *);
98 static void	sched_balance(void *);
99 #endif
100 
101 /*
102  * Preemption control.
103  */
104 int		sched_upreempt_pri = 0;
105 #ifdef __HAVE_PREEMPTION
106 # ifdef DEBUG
107 int		sched_kpreempt_pri = 0;
108 # else
109 int		sched_kpreempt_pri = PRI_USER_RT;
110 # endif
111 #else
112 int		sched_kpreempt_pri = 1000;
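/* Above any valid priority: kernel preemption is effectively disabled by default. */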
113 #endif
114 
115 /*
116  * Migration and balancing.
117  */
118 static u_int	cacheht_time;		/* Cache hotness time */
119 static u_int	min_catch;		/* Minimal LWP count for catching */
120 static u_int	balance_period;		/* Balance period */
121 static u_int	average_weight;		/* Weight of the old average (percent) */
122 static struct cpu_info *worker_ci;	/* Victim CPU */
123 #ifdef MULTIPROCESSOR
124 static struct callout balance_ch;	/* Callout of balancer */
125 #endif
126 
127 #ifdef KDTRACE_HOOKS
128 struct lwp *curthread;
129 #endif
130 
131 void
132 runq_init(void)
133 {
134 
135 	/* Balancing */
136 	worker_ci = curcpu();
137 	cacheht_time = mstohz(3);		/*   ~3 ms */
138 	balance_period = mstohz(300);		/* ~300 ms */
139 
140 	/* Minimum count of LWPs required for catching from another CPU */
141 	min_catch = 1;
142 	/* Weight of historical average */
143 	average_weight = 50;			/*   0.5   */
144 
145 	/* Initialize balancing callout and run it */
146 #ifdef MULTIPROCESSOR
147 	callout_init(&balance_ch, CALLOUT_MPSAFE);
148 	callout_setfunc(&balance_ch, sched_balance, NULL);
149 	callout_schedule(&balance_ch, balance_period);
150 #endif
151 }
152 
153 void
154 sched_cpuattach(struct cpu_info *ci)
155 {
156 	runqueue_t *ci_rq;
157 	void *rq_ptr;
158 	u_int i, size;
159 
160 	if (ci->ci_schedstate.spc_lwplock == NULL) {
161 		ci->ci_schedstate.spc_lwplock =
162 		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
163 	}
164 	if (ci == lwp0.l_cpu) {
165 		/* Initialize the scheduler structure of the primary LWP */
166 		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
167 	}
168 	if (ci->ci_schedstate.spc_mutex != NULL) {
169 		/* Already initialized. */
170 		return;
171 	}
172 
173 	/* Allocate the run queue */
174 	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
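	/*
	 * The extra coherency_unit leaves room to align the structure to a
	 * cache-line boundary below.
	 */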
175 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
176 	if (rq_ptr == NULL) {
177 		panic("sched_cpuattach: could not allocate the runqueue");
178 	}
179 	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
180 
181 	/* Initialize run queues */
182 	ci->ci_schedstate.spc_mutex =
183 	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
184 	for (i = 0; i < PRI_RT_COUNT; i++)
185 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
186 	for (i = 0; i < PRI_TS_COUNT; i++)
187 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
188 
189 	ci->ci_schedstate.spc_sched_info = ci_rq;
190 
191 	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
192 	   cpu_name(ci), "runqueue pull");
193 	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
194 	   cpu_name(ci), "runqueue push");
195 	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
196 	   cpu_name(ci), "runqueue stay");
197 	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
198 	   cpu_name(ci), "runqueue localize");
199 }
200 
201 /*
202  * Control of the runqueue.
203  */
204 
205 static inline void *
206 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
207 {
208 
209 	KASSERT(prio < PRI_COUNT);
210 	return (prio <= PRI_HIGHEST_TS) ?
211 	    &ci_rq->r_ts_queue[prio].q_head :
212 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
213 }
214 
215 void
216 sched_enqueue(struct lwp *l, bool swtch)
217 {
218 	runqueue_t *ci_rq;
219 	struct schedstate_percpu *spc;
220 	TAILQ_HEAD(, lwp) *q_head;
221 	const pri_t eprio = lwp_eprio(l);
222 	struct cpu_info *ci;
223 	int type;
224 
225 	ci = l->l_cpu;
226 	spc = &ci->ci_schedstate;
227 	ci_rq = spc->spc_sched_info;
228 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
229 
230 	/* Update the last run time on switch */
231 	if (__predict_true(swtch == true))
232 		l->l_rticksum += (hardclock_ticks - l->l_rticks);
233 	else if (l->l_rticks == 0)
234 		l->l_rticks = hardclock_ticks;
235 
236 	/* Enqueue the thread */
237 	q_head = sched_getrq(ci_rq, eprio);
238 	if (TAILQ_EMPTY(q_head)) {
239 		u_int i;
240 		uint32_t q;
241 
242 		/* Mark bit */
243 		i = eprio >> BITMAP_SHIFT;
244 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
245 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
246 		ci_rq->r_bitmap[i] |= q;
247 	}
248 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
249 	ci_rq->r_count++;
250 	if ((l->l_pflag & LP_BOUND) == 0)
251 		ci_rq->r_mcount++;
252 
253 	/*
254 	 * Update the runqueue's highest priority if this thread's
255 	 * priority is higher.
256 	 */
257 	if (eprio > spc->spc_maxpriority)
258 		spc->spc_maxpriority = eprio;
259 
260 	sched_newts(l);
261 
262 	/*
263 	 * Wake the chosen CPU or cause a preemption if the newly
264 	 * enqueued thread has higher priority.  Don't cause a
265 	 * preemption if the thread is yielding (swtch).
266 	 */
267 	if (!swtch && eprio > spc->spc_curpriority) {
268 		if (eprio >= sched_kpreempt_pri)
269 			type = RESCHED_KPREEMPT;
270 		else if (eprio >= sched_upreempt_pri)
271 			type = RESCHED_IMMED;
272 		else
273 			type = RESCHED_LAZY;
274 		cpu_need_resched(ci, type);
275 	}
276 }
277 
278 void
279 sched_dequeue(struct lwp *l)
280 {
281 	runqueue_t *ci_rq;
282 	TAILQ_HEAD(, lwp) *q_head;
283 	struct schedstate_percpu *spc;
284 	const pri_t eprio = lwp_eprio(l);
285 
286 	spc = &l->l_cpu->ci_schedstate;
287 	ci_rq = spc->spc_sched_info;
288 	KASSERT(lwp_locked(l, spc->spc_mutex));
289 
290 	KASSERT(eprio <= spc->spc_maxpriority);
291 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
292 	KASSERT(ci_rq->r_count > 0);
293 
294 	if (spc->spc_migrating == l)
295 		spc->spc_migrating = NULL;
296 
297 	ci_rq->r_count--;
298 	if ((l->l_pflag & LP_BOUND) == 0)
299 		ci_rq->r_mcount--;
300 
301 	q_head = sched_getrq(ci_rq, eprio);
302 	TAILQ_REMOVE(q_head, l, l_runq);
303 	if (TAILQ_EMPTY(q_head)) {
304 		u_int i;
305 		uint32_t q;
306 
307 		/* Unmark bit */
308 		i = eprio >> BITMAP_SHIFT;
309 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
310 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
311 		ci_rq->r_bitmap[i] &= ~q;
312 
313 		/*
314 		 * Update the runqueue's highest priority, in case this was the
315 		 * last thread in the highest-priority queue.
316 		 */
317 		if (eprio != spc->spc_maxpriority)
318 			return;
319 
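		/*
		 * Scan the bitmap words downward for the new maximum.  With the
		 * MSB-first encoding, ffs() (least significant set bit) yields
		 * the highest priority present in a word.
		 */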
320 		do {
321 			if (ci_rq->r_bitmap[i] != 0) {
322 				q = ffs(ci_rq->r_bitmap[i]);
323 				spc->spc_maxpriority =
324 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
325 				return;
326 			}
327 		} while (i--);
328 
329 		/* If no bit is set, fall back to the lowest priority */
330 		spc->spc_maxpriority = 0;
331 	}
332 }
333 
334 /*
335  * Migration and balancing.
336  */
337 
338 #ifdef MULTIPROCESSOR
339 
340 /* Estimate if LWP is cache-hot */
341 static inline bool
342 lwp_cache_hot(const struct lwp *l)
343 {
344 
345 	if (__predict_false(l->l_slptime || l->l_rticks == 0))
346 		return false;
347 
348 	return (hardclock_ticks - l->l_rticks <= cacheht_time);
349 }
350 
351 /* Check if LWP can migrate to the chosen CPU */
352 static inline bool
353 sched_migratable(const struct lwp *l, struct cpu_info *ci)
354 {
355 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
356 	KASSERT(lwp_locked(__UNCONST(l), NULL));
357 
358 	/* Is CPU offline? */
359 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
360 		return false;
361 
362 	/* Is affinity set? */
363 	if (__predict_false(l->l_affinity))
364 		return kcpuset_isset(l->l_affinity, cpu_index(ci));
365 
366 	/* Is there a processor-set? */
367 	return (spc->spc_psid == l->l_psid);
368 }
369 
370 /*
371  * Estimate whether the LWP should migrate to another CPU.
372  * Choose and return the CPU to run on (possibly the current one).
373  */
374 struct cpu_info *
375 sched_takecpu(struct lwp *l)
376 {
377 	struct cpu_info *ci, *tci, *pivot, *next;
378 	struct schedstate_percpu *spc;
379 	runqueue_t *ci_rq, *ici_rq;
380 	pri_t eprio, lpri, pri;
381 
382 	KASSERT(lwp_locked(l, NULL));
383 
384 	/* If the thread is strictly bound, do not consider other CPUs */
385 	ci = l->l_cpu;
386 	if (l->l_pflag & LP_BOUND)
387 		return ci;
388 
389 	spc = &ci->ci_schedstate;
390 	ci_rq = spc->spc_sched_info;
391 
392 	/* Make sure the thread is in the appropriate processor-set */
393 	if (__predict_true(spc->spc_psid == l->l_psid)) {
394 		/* If this thread's CPU has an empty run queue, stay there */
395 		if (ci_rq->r_count == 0) {
396 			ci_rq->r_ev_stay.ev_count++;
397 			return ci;
398 		}
399 		/* Stay if thread is cache-hot */
400 		eprio = lwp_eprio(l);
401 		if (__predict_true(l->l_stat != LSIDL) &&
402 		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
403 			ci_rq->r_ev_stay.ev_count++;
404 			return ci;
405 		}
406 	} else {
407 		eprio = lwp_eprio(l);
408 	}
409 
410 	/* Run on the current CPU if the thread's priority is higher */
411 	ci = curcpu();
412 	spc = &ci->ci_schedstate;
413 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
414 		ci_rq = spc->spc_sched_info;
415 		ci_rq->r_ev_localize.ev_count++;
416 		return ci;
417 	}
418 
419 	/*
420 	 * Look for the CPU whose highest-priority (running or queued) thread
421 	 * has the lowest priority; on a tie, prefer the CPU with fewer threads.
422 	 */
423 	pivot = l->l_cpu;
424 	ci = pivot;
425 	tci = pivot;
426 	lpri = PRI_COUNT;
427 	do {
428 		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
429 			/* Reached the end, start from the beginning. */
430 			next = cpu_lookup(0);
431 		}
432 		spc = &ci->ci_schedstate;
433 		ici_rq = spc->spc_sched_info;
434 		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
435 		if (pri > lpri)
436 			continue;
437 
438 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
439 			continue;
440 
441 		if (!sched_migratable(l, ci))
442 			continue;
443 
444 		lpri = pri;
445 		tci = ci;
446 		ci_rq = ici_rq;
447 	} while (ci = next, ci != pivot);
448 
449 	ci_rq = tci->ci_schedstate.spc_sched_info;
450 	ci_rq->r_ev_push.ev_count++;
451 
452 	return tci;
453 }
454 
455 /*
456  * Try to catch (steal) an LWP from the run queue of another CPU.
457  */
458 static struct lwp *
459 sched_catchlwp(struct cpu_info *ci)
460 {
461 	struct cpu_info *curci = curcpu();
462 	struct schedstate_percpu *spc, *curspc;
463 	TAILQ_HEAD(, lwp) *q_head;
464 	runqueue_t *ci_rq;
465 	struct lwp *l;
466 
467 	curspc = &curci->ci_schedstate;
468 	spc = &ci->ci_schedstate;
469 	KASSERT(curspc->spc_psid == spc->spc_psid);
470 
471 	ci_rq = spc->spc_sched_info;
472 	if (ci_rq->r_mcount < min_catch) {
473 		spc_unlock(ci);
474 		return NULL;
475 	}
476 
477 	/* Take the highest priority thread */
478 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
479 	l = TAILQ_FIRST(q_head);
480 
481 	for (;;) {
482 		/* Walk the queue, starting from the first entry */
483 		if (l == NULL) {
484 			break;
485 		}
486 		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
487 		    ci->ci_data.cpu_name,
488 		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
489 
490 		/* Look for threads that are allowed to migrate */
491 		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
492 		    !sched_migratable(l, curci)) {
493 			l = TAILQ_NEXT(l, l_runq);
494 			continue;
495 		}
496 
497 		/* Grab the thread, and move to the local run queue */
498 		sched_dequeue(l);
499 
500 		/*
501 		 * If LWP is still context switching, we may need to
502 		 * spin-wait before changing its CPU.
503 		 */
504 		if (__predict_false(l->l_ctxswtch != 0)) {
505 			u_int count;
506 			count = SPINLOCK_BACKOFF_MIN;
507 			while (l->l_ctxswtch)
508 				SPINLOCK_BACKOFF(count);
509 		}
510 		l->l_cpu = curci;
511 		ci_rq->r_ev_pull.ev_count++;
512 		lwp_unlock_to(l, curspc->spc_mutex);
513 		sched_enqueue(l, false);
514 		return l;
515 	}
516 	spc_unlock(ci);
517 
518 	return l;
519 }
520 
521 /*
522  * Periodic calculations for balancing.
523  */
524 static void
525 sched_balance(void *nocallout)
526 {
527 	struct cpu_info *ci, *hci;
528 	runqueue_t *ci_rq;
529 	CPU_INFO_ITERATOR cii;
530 	u_int highest;
531 	u_int weight;
532 
533 	/* sanitize sysctl value */
534 	weight = MIN(average_weight, 100);
535 
536 	hci = curcpu();
537 	highest = 0;
538 
539 	/* Gather the per-CPU counts without taking locks */
540 	for (CPU_INFO_FOREACH(cii, ci)) {
541 		ci_rq = ci->ci_schedstate.spc_sched_info;
542 
543 		/*
544 		 * Average count of threads.
545 		 *
546 		 * The average is computed as a fixed-point number with
547 		 * 8 fractional bits.
548 		 */
549 		ci_rq->r_avgcount = (
550 			weight * ci_rq->r_avgcount + (100 - weight) * 256 * ci_rq->r_mcount
551 			) / 100;
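		/*
		 * With the default weight of 50 this is an exponential moving
		 * average: new = (old + 256 * r_mcount) / 2.  A weight of 0
		 * would use only the instantaneous count, scaled by 256.
		 */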
552 
553 		/* Look for CPU with the highest average */
554 		if (ci_rq->r_avgcount > highest) {
555 			hci = ci;
556 			highest = ci_rq->r_avgcount;
557 		}
558 	}
559 
560 	/* Update the worker */
561 	worker_ci = hci;
562 
563 	if (nocallout == NULL)
564 		callout_schedule(&balance_ch, balance_period);
565 }
566 
567 /*
568  * Called from each CPU's idle loop.
569  */
570 void
571 sched_idle(void)
572 {
573 	struct cpu_info *ci = curcpu(), *tci = NULL;
574 	struct schedstate_percpu *spc, *tspc;
575 	runqueue_t *ci_rq;
576 	bool dlock = false;
577 
578 	/* Check if there is a migrating LWP */
579 	spc = &ci->ci_schedstate;
580 	if (spc->spc_migrating == NULL)
581 		goto no_migration;
582 
583 	spc_lock(ci);
584 	for (;;) {
585 		struct lwp *l;
586 
587 		l = spc->spc_migrating;
588 		if (l == NULL)
589 			break;
590 
591 		/*
592 		 * If this is a second attempt and the target CPU has changed,
593 		 * drop the old lock.
594 		 */
595 		if (dlock == true && tci != l->l_target_cpu) {
596 			KASSERT(tci != NULL);
597 			spc_unlock(tci);
598 			dlock = false;
599 		}
600 
601 		/*
602 		 * Nothing to do if the destination has changed to the
603 		 * local CPU, or the migration was done by another CPU.
604 		 */
605 		tci = l->l_target_cpu;
606 		if (tci == NULL || tci == ci) {
607 			spc->spc_migrating = NULL;
608 			l->l_target_cpu = NULL;
609 			break;
610 		}
611 		tspc = &tci->ci_schedstate;
612 
613 		/*
614 		 * Double-lock the runqueues.
615 		 * We do that only once.
616 		 */
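		/*
		 * Lock order is by cpu_info address: if the remote lock ranks
		 * after ours it is taken directly; otherwise try-lock it and,
		 * on failure, drop our lock, take both in order and re-check
		 * the migration target.
		 */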
617 		if (dlock == false) {
618 			dlock = true;
619 			if (ci < tci) {
620 				spc_lock(tci);
621 			} else if (!mutex_tryenter(tspc->spc_mutex)) {
622 				spc_unlock(ci);
623 				spc_lock(tci);
624 				spc_lock(ci);
625 				/* Check the situation again.. */
626 				continue;
627 			}
628 		}
629 
630 		/* Migrate the thread */
631 		KASSERT(l->l_stat == LSRUN);
632 		spc->spc_migrating = NULL;
633 		l->l_target_cpu = NULL;
634 		sched_dequeue(l);
635 		l->l_cpu = tci;
636 		lwp_setlock(l, tspc->spc_mutex);
637 		sched_enqueue(l, false);
638 		break;
639 	}
640 	if (dlock == true) {
641 		KASSERT(tci != NULL);
642 		spc_unlock(tci);
643 	}
644 	spc_unlock(ci);
645 
646 no_migration:
647 	ci_rq = spc->spc_sched_info;
648 	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
649 		return;
650 	}
651 
652 	/* Reset the counter, and call the balancer */
653 	ci_rq->r_avgcount = 0;
654 	sched_balance(ci);
655 	tci = worker_ci;
656 	tspc = &tci->ci_schedstate;
657 	if (ci == tci || spc->spc_psid != tspc->spc_psid)
658 		return;
659 	spc_dlock(ci, tci);
660 	(void)sched_catchlwp(tci);
661 	spc_unlock(ci);
662 }
663 
664 #else
665 
666 /*
667  * stubs for !MULTIPROCESSOR
668  */
669 
670 struct cpu_info *
671 sched_takecpu(struct lwp *l)
672 {
673 
674 	return l->l_cpu;
675 }
676 
677 void
678 sched_idle(void)
679 {
680 
681 }
682 #endif	/* MULTIPROCESSOR */
683 
684 /*
685  * Scheduling statistics and balancing.
686  */
687 void
688 sched_lwp_stats(struct lwp *l)
689 {
690 	int batch;
691 
692 	KASSERT(lwp_locked(l, NULL));
693 
694 	/* Update sleep time */
695 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
696 	    l->l_stat == LSSUSPENDED)
697 		l->l_slptime++;
698 
699 	/*
700 	 * Mark the thread as CPU-bound ("batch") if its run time exceeds its
701 	 * sleep time; if it only just became CPU-bound, treat it as non-batch below.
702 	 */
703 	batch = (l->l_rticksum > l->l_slpticksum);
704 	if (batch != 0) {
705 		if ((l->l_flag & LW_BATCH) == 0)
706 			batch = 0;
707 		l->l_flag |= LW_BATCH;
708 	} else
709 		l->l_flag &= ~LW_BATCH;
710 
711 	/*
712 	 * If the thread is CPU-bound and never sleeps, it would monopolize the
713 	 * CPU.  In that case reset the last-sleep counter and check it again
714 	 * later; if it is still zero, pick a new CPU and clear the batch flag.
715 	 */
716 	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
717 		if (l->l_slpticks == 0) {
718 			if (l->l_target_cpu == NULL &&
719 			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
720 				struct cpu_info *ci = sched_takecpu(l);
721 				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
722 			}
723 			l->l_flag &= ~LW_BATCH;
724 		} else {
725 			l->l_slpticks = 0;
726 		}
727 	}
728 
729 	/* Reset the time sums */
730 	l->l_slpticksum = 0;
731 	l->l_rticksum = 0;
732 
733 	/* Scheduler-specific hook */
734 	sched_pstats_hook(l, batch);
735 #ifdef KDTRACE_HOOKS
736 	curthread = l;
737 #endif
738 }
739 
740 /*
741  * Scheduler mill.
742  */
743 struct lwp *
744 sched_nextlwp(void)
745 {
746 	struct cpu_info *ci = curcpu();
747 	struct schedstate_percpu *spc;
748 	TAILQ_HEAD(, lwp) *q_head;
749 	runqueue_t *ci_rq;
750 	struct lwp *l;
751 
752 	/* Return NULL (run the idle LWP) if there is a migrating thread */
753 	spc = &ci->ci_schedstate;
754 	if (__predict_false(spc->spc_migrating != NULL))
755 		return NULL;
756 	ci_rq = spc->spc_sched_info;
757 
758 #ifdef MULTIPROCESSOR
759 	/* If the run queue is empty, try to catch a thread from another CPU */
760 	if (__predict_false(ci_rq->r_count == 0)) {
761 		struct schedstate_percpu *cspc;
762 		struct cpu_info *cci;
763 
764 		/* Offline CPUs should not perform this, however */
765 		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
766 			return NULL;
767 
768 		/* Reset the counter, and call the balancer */
769 		ci_rq->r_avgcount = 0;
770 		sched_balance(ci);
771 		cci = worker_ci;
772 		cspc = &cci->ci_schedstate;
773 		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
774 		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
775 			return NULL;
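		/* sched_catchlwp() releases the remote CPU's run queue lock. */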
776 		return sched_catchlwp(cci);
777 	}
778 #else
779 	if (__predict_false(ci_rq->r_count == 0))
780 		return NULL;
781 #endif
782 
783 	/* Take the highest priority thread */
784 	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
785 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
786 	l = TAILQ_FIRST(q_head);
787 	KASSERT(l != NULL);
788 
789 	sched_oncpu(l);
790 	l->l_rticks = hardclock_ticks;
791 
792 	return l;
793 }
794 
795 /*
796  * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
797  */
798 
799 bool
800 sched_curcpu_runnable_p(void)
801 {
802 	const struct cpu_info *ci;
803 	const struct schedstate_percpu *spc;
804 	const runqueue_t *ci_rq;
805 	bool rv;
806 
807 	kpreempt_disable();
808 	ci = curcpu();
809 	spc = &ci->ci_schedstate;
810 	ci_rq = spc->spc_sched_info;
811 
812 #ifndef __HAVE_FAST_SOFTINTS
813 	if (ci->ci_data.cpu_softints) {
814 		kpreempt_enable();
815 		return true;
816 	}
817 #endif
818 
819 	rv = (ci_rq->r_count != 0);
820 	kpreempt_enable();
821 
822 	return rv;
823 }
824 
825 /*
826  * Sysctl nodes and initialization.
827  */
828 
829 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
830 {
831 	const struct sysctlnode *node = NULL;
832 
833 	sysctl_createv(clog, 0, NULL, &node,
834 		CTLFLAG_PERMANENT,
835 		CTLTYPE_NODE, "sched",
836 		SYSCTL_DESCR("Scheduler options"),
837 		NULL, 0, NULL, 0,
838 		CTL_KERN, CTL_CREATE, CTL_EOL);
839 
840 	if (node == NULL)
841 		return;
842 
843 	sysctl_createv(clog, 0, &node, NULL,
844 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
845 		CTLTYPE_INT, "cacheht_time",
846 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
847 		NULL, 0, &cacheht_time, 0,
848 		CTL_CREATE, CTL_EOL);
849 	sysctl_createv(clog, 0, &node, NULL,
850 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
851 		CTLTYPE_INT, "balance_period",
852 		SYSCTL_DESCR("Balance period (in ticks)"),
853 		NULL, 0, &balance_period, 0,
854 		CTL_CREATE, CTL_EOL);
855 	sysctl_createv(clog, 0, &node, NULL,
856 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
857 		CTLTYPE_INT, "average_weight",
858 		SYSCTL_DESCR("Thread count averaging weight (in percent)"),
859 		NULL, 0, &average_weight, 0,
860 		CTL_CREATE, CTL_EOL);
861 	sysctl_createv(clog, 0, &node, NULL,
862 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
863 		CTLTYPE_INT, "min_catch",
864 		SYSCTL_DESCR("Minimal count of threads for catching"),
865 		NULL, 0, &min_catch, 0,
866 		CTL_CREATE, CTL_EOL);
867 	sysctl_createv(clog, 0, &node, NULL,
868 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
869 		CTLTYPE_INT, "timesoftints",
870 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
871 		NULL, 0, &softint_timing, 0,
872 		CTL_CREATE, CTL_EOL);
873 	sysctl_createv(clog, 0, &node, NULL,
874 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
875 		CTLTYPE_INT, "kpreempt_pri",
876 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
877 		NULL, 0, &sched_kpreempt_pri, 0,
878 		CTL_CREATE, CTL_EOL);
879 	sysctl_createv(clog, 0, &node, NULL,
880 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
881 		CTLTYPE_INT, "upreempt_pri",
882 		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
883 		NULL, 0, &sched_upreempt_pri, 0,
884 		CTL_CREATE, CTL_EOL);
885 }
886 
887 /*
888  * Debugging.
889  */
890 
891 #ifdef DDB
892 
893 void
894 sched_print_runqueue(void (*pr)(const char *, ...))
895 {
896 	runqueue_t *ci_rq;
897 	struct cpu_info *ci, *tci;
898 	struct schedstate_percpu *spc;
899 	struct lwp *l;
900 	struct proc *p;
901 	CPU_INFO_ITERATOR cii;
902 
903 	for (CPU_INFO_FOREACH(cii, ci)) {
904 		int i;
905 
906 		spc = &ci->ci_schedstate;
907 		ci_rq = spc->spc_sched_info;
908 
909 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
910 		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
911 		    "maxpri = %d, mlwp = %p\n",
912 #ifdef MULTIPROCESSOR
913 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
914 #else
915 		    curlwp->l_proc->p_pid, curlwp->l_lid,
916 #endif
917 		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
918 		    spc->spc_migrating);
919 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
920 		do {
921 			uint32_t q;
922 			q = ci_rq->r_bitmap[i];
923 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
924 		} while (i--);
925 	}
926 
927 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
928 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
929 
930 	PROCLIST_FOREACH(p, &allproc) {
931 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
932 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
933 			ci = l->l_cpu;
934 			tci = l->l_target_cpu;
935 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
936 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
937 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
938 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
939 			    l, ci->ci_index, (tci ? tci->ci_index : -1),
940 			    (u_int)(hardclock_ticks - l->l_rticks));
941 		}
942 	}
943 }
944 
945 #endif
946