xref: /netbsd-src/sys/kern/kern_runq.c (revision 2de962bd804263c16657f586aa00f1704045df8e)
1 /*	$NetBSD: kern_runq.c,v 1.8 2008/05/21 15:41:03 ad Exp $	*/
2 
3 /*
4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.8 2008/05/21 15:41:03 ad Exp $");
31 
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/bitops.h>
35 #include <sys/cpu.h>
36 #include <sys/idle.h>
37 #include <sys/intr.h>
38 #include <sys/kmem.h>
39 #include <sys/lwp.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/syscallargs.h>
44 #include <sys/sysctl.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 
48 /*
49  * Priority related defintions.
50  */
51 #define	PRI_TS_COUNT	(NPRI_USER)
52 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
53 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
54 
55 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
56 
57 /*
58  * Bits per map.
59  */
60 #define	BITMAP_BITS	(32)
61 #define	BITMAP_SHIFT	(5)
62 #define	BITMAP_MSB	(0x80000000U)
63 #define	BITMAP_MASK	(BITMAP_BITS - 1)
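/*
 * Each 32-bit word of the runqueue bitmap covers 32 priority levels:
 * a priority maps to word (prio >> BITMAP_SHIFT) and within it to bit
 * (BITMAP_MSB >> (prio & BITMAP_MASK)).  Priorities are stored MSB-first,
 * so the least significant set bit found by ffs() corresponds to the
 * highest priority present in a word.  For example, priority 37 maps
 * to word 1, bit 0x80000000 >> 5.
 */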
64 
65 /*
66  * Structures, runqueue.
67  */
68 
69 const int	schedppq = 1;
70 
71 typedef struct {
72 	TAILQ_HEAD(, lwp) q_head;
73 } queue_t;
74 
75 typedef struct {
76 	/* Bitmap of non-empty per-priority queues */
77 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
78 	/* Counters */
79 	u_int		r_count;	/* Count of the threads */
80 	u_int		r_avgcount;	/* Average count of threads */
81 	u_int		r_mcount;	/* Count of migratable threads */
82 	/* Runqueues */
83 	queue_t		r_rt_queue[PRI_RT_COUNT];
84 	queue_t		r_ts_queue[PRI_TS_COUNT];
85 } runqueue_t;
86 
87 static void *	sched_getrq(runqueue_t *, const pri_t);
88 #ifdef MULTIPROCESSOR
89 static lwp_t *	sched_catchlwp(void);
90 static void	sched_balance(void *);
91 #endif
92 
93 /*
94  * Preemption control.
95  */
96 int		sched_upreempt_pri = PRI_KERNEL;
97 #if defined(__HAVE_PREEMPTION) && defined(i386)
98 int		sched_kpreempt_pri = PRI_USER_RT;
99 #else
100 /* XXX disabled for now until remaining bugs are worked out. */
101 int		sched_kpreempt_pri = 1000;
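/* A value above any valid priority, so the preemption test never fires. */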
102 #endif
103 
104 /*
105  * Migration and balancing.
106  */
107 static u_int	cacheht_time;		/* Cache hotness time */
108 static u_int	min_catch;		/* Minimum LWP count for catching */
109 static u_int	balance_period;		/* Balance period */
110 static struct cpu_info *worker_ci;	/* Victim CPU */
111 #ifdef MULTIPROCESSOR
112 static struct callout balance_ch;	/* Callout of balancer */
113 #endif
114 
115 void
116 runq_init(void)
117 {
118 
119 	/* Balancing */
120 	worker_ci = curcpu();
121 	cacheht_time = mstohz(3);		/* ~3 ms  */
122 	balance_period = mstohz(300);		/* ~300ms */
123 
124 	/* Minimum count of LWPs for catching */
125 	min_catch = 1;
126 
127 	/* Initialize balancing callout and run it */
128 #ifdef MULTIPROCESSOR
129 	callout_init(&balance_ch, CALLOUT_MPSAFE);
130 	callout_setfunc(&balance_ch, sched_balance, NULL);
131 	callout_schedule(&balance_ch, balance_period);
132 #endif
133 }
134 
135 void
136 sched_cpuattach(struct cpu_info *ci)
137 {
138 	runqueue_t *ci_rq;
139 	void *rq_ptr;
140 	u_int i, size;
141 
142 	if (ci->ci_schedstate.spc_lwplock == NULL) {
143 		ci->ci_schedstate.spc_lwplock =
144 		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
145 	}
146 	if (ci == lwp0.l_cpu) {
147 		/* Initialize the scheduler structure of the primary LWP */
148 		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
149 	}
150 	if (ci->ci_schedstate.spc_mutex != NULL) {
151 		/* Already initialized. */
152 		return;
153 	}
154 
155 	/* Allocate the run queue */
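	/*
	 * Pad and align the structure to a coherency_unit (cache line)
	 * boundary so per-CPU runqueues do not share cache lines.
	 */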
156 	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
157 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
158 	if (rq_ptr == NULL) {
159 		panic("sched_cpuattach: could not allocate the runqueue");
160 	}
161 	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
162 
163 	/* Initialize run queues */
164 	ci->ci_schedstate.spc_mutex =
165 	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
166 	for (i = 0; i < PRI_RT_COUNT; i++)
167 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
168 	for (i = 0; i < PRI_TS_COUNT; i++)
169 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
170 
171 	ci->ci_schedstate.spc_sched_info = ci_rq;
172 }
173 
174 /*
175  * Control of the runqueue.
176  */
177 
178 static void *
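/*
 * Return the queue head for the given priority: time-sharing priorities
 * (up to PRI_HIGHEST_TS) index r_ts_queue, real-time priorities above
 * that index r_rt_queue.
 */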
179 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
180 {
181 
182 	KASSERT(prio < PRI_COUNT);
183 	return (prio <= PRI_HIGHEST_TS) ?
184 	    &ci_rq->r_ts_queue[prio].q_head :
185 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
186 }
187 
188 void
189 sched_enqueue(struct lwp *l, bool swtch)
190 {
191 	runqueue_t *ci_rq;
192 	struct schedstate_percpu *spc;
193 	TAILQ_HEAD(, lwp) *q_head;
194 	const pri_t eprio = lwp_eprio(l);
195 	struct cpu_info *ci;
196 	int type;
197 
198 	ci = l->l_cpu;
199 	spc = &ci->ci_schedstate;
200 	ci_rq = spc->spc_sched_info;
201 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
202 
203 	/* Update the last run time on switch */
204 	if (__predict_true(swtch == true))
205 		l->l_rticksum += (hardclock_ticks - l->l_rticks);
206 	else if (l->l_rticks == 0)
207 		l->l_rticks = hardclock_ticks;
208 
209 	/* Enqueue the thread */
210 	q_head = sched_getrq(ci_rq, eprio);
211 	if (TAILQ_EMPTY(q_head)) {
212 		u_int i;
213 		uint32_t q;
214 
215 		/* Mark bit */
216 		i = eprio >> BITMAP_SHIFT;
217 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
218 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
219 		ci_rq->r_bitmap[i] |= q;
220 	}
221 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
222 	ci_rq->r_count++;
223 	if ((l->l_pflag & LP_BOUND) == 0)
224 		ci_rq->r_mcount++;
225 
226 	/*
227 	 * Update the cached highest priority in the runqueue
228 	 * if the priority of this thread is higher.
229 	 */
230 	if (eprio > spc->spc_maxpriority)
231 		spc->spc_maxpriority = eprio;
232 
233 	sched_newts(l);
234 
235 	/*
236 	 * Wake the chosen CPU or cause a preemption if the newly
237 	 * enqueued thread has higher priority.  Don't cause a
238 	 * preemption if the thread is yielding (swtch).
239 	 */
240 	if (!swtch && eprio > spc->spc_curpriority) {
241 		if (eprio >= sched_kpreempt_pri)
242 			type = RESCHED_KPREEMPT;
243 		else if (eprio >= sched_upreempt_pri)
244 			type = RESCHED_IMMED;
245 		else
246 			type = 0;
247 		cpu_need_resched(ci, type);
248 	}
249 }
250 
251 void
252 sched_dequeue(struct lwp *l)
253 {
254 	runqueue_t *ci_rq;
255 	TAILQ_HEAD(, lwp) *q_head;
256 	struct schedstate_percpu *spc;
257 	const pri_t eprio = lwp_eprio(l);
258 
259 	spc = &l->l_cpu->ci_schedstate;
260 	ci_rq = spc->spc_sched_info;
261 	KASSERT(lwp_locked(l, spc->spc_mutex));
262 
263 	KASSERT(eprio <= spc->spc_maxpriority);
264 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
265 	KASSERT(ci_rq->r_count > 0);
266 
267 	ci_rq->r_count--;
268 	if ((l->l_pflag & LP_BOUND) == 0)
269 		ci_rq->r_mcount--;
270 
271 	q_head = sched_getrq(ci_rq, eprio);
272 	TAILQ_REMOVE(q_head, l, l_runq);
273 	if (TAILQ_EMPTY(q_head)) {
274 		u_int i;
275 		uint32_t q;
276 
277 		/* Unmark bit */
278 		i = eprio >> BITMAP_SHIFT;
279 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
280 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
281 		ci_rq->r_bitmap[i] &= ~q;
282 
283 		/*
284 		 * Update the cached highest priority in the runqueue, in
285 		 * case this was the last thread in the highest-priority queue.
286 		 */
287 		if (eprio != spc->spc_maxpriority)
288 			return;
289 
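		/*
		 * Scan the bitmap downwards from the word that held eprio;
		 * the least significant set bit found by ffs() corresponds
		 * to the highest priority remaining in that word.
		 */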
290 		do {
291 			if (ci_rq->r_bitmap[i] != 0) {
292 				q = ffs(ci_rq->r_bitmap[i]);
293 				spc->spc_maxpriority =
294 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
295 				return;
296 			}
297 		} while (i--);
298 
299 		/* If no bit is set, the runqueue is empty - set the lowest value */
300 		spc->spc_maxpriority = 0;
301 	}
302 }
303 
304 /*
305  * Migration and balancing.
306  */
307 
308 #ifdef MULTIPROCESSOR
309 
310 /* Estimate whether the LWP is still cache-hot */
311 static inline bool
312 lwp_cache_hot(const struct lwp *l)
313 {
314 
315 	if (l->l_slptime || l->l_rticks == 0)
316 		return false;
317 
318 	return (hardclock_ticks - l->l_rticks <= cacheht_time);
319 }
320 
321 /* Check if LWP can migrate to the chosen CPU */
322 static inline bool
323 sched_migratable(const struct lwp *l, struct cpu_info *ci)
324 {
325 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
326 
327 	/* CPU is offline */
328 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
329 		return false;
330 
331 	/* Affinity bind */
332 	if (__predict_false(l->l_flag & LW_AFFINITY))
333 		return CPU_ISSET(cpu_index(ci), &l->l_affinity);
334 
335 	/* Processor-set */
336 	return (spc->spc_psid == l->l_psid);
337 }
338 
339 /*
340  * Estimate whether the LWP should migrate to another CPU.
341  * Choose and return the CPU to run on.
342  */
343 struct cpu_info *
344 sched_takecpu(struct lwp *l)
345 {
346 	struct cpu_info *ci, *tci, *first, *next;
347 	struct schedstate_percpu *spc;
348 	runqueue_t *ci_rq, *ici_rq;
349 	pri_t eprio, lpri, pri;
350 
351 	KASSERT(lwp_locked(l, NULL));
352 
353 	ci = l->l_cpu;
354 	spc = &ci->ci_schedstate;
355 	ci_rq = spc->spc_sched_info;
356 
357 	/* If the thread is strictly bound, do not consider other CPUs */
358 	if (l->l_pflag & LP_BOUND)
359 		return ci;
360 
361 	/* The runqueue of this thread's CPU is empty - run there */
362 	if (ci_rq->r_count == 0)
363 		return ci;
364 
365 	eprio = lwp_eprio(l);
366 
367 	/* Stay if thread is cache-hot */
368 	if (__predict_true(l->l_stat != LSIDL) &&
369 	    lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
370 		return ci;
371 
372 	/* Run on the current CPU if the thread's priority is higher */
373 	ci = curcpu();
374 	spc = &ci->ci_schedstate;
375 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
376 		return ci;
377 
378 	/*
379 	 * Look for the CPU with the lowest priority thread.  In case of
380 	 * equal priority, choose the CPU with the fewest threads.
381 	 */
382 	first = l->l_cpu;
383 	ci = first;
384 	tci = first;
385 	lpri = PRI_COUNT;
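	/*
	 * Walk the circular list of CPUs starting from this LWP's CPU,
	 * keeping the best candidate so far in tci, lpri and ci_rq.
	 */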
386 	do {
387 		next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
388 		spc = &ci->ci_schedstate;
389 		ici_rq = spc->spc_sched_info;
390 		pri = max(spc->spc_curpriority, spc->spc_maxpriority);
391 		if (pri > lpri)
392 			continue;
393 
394 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
395 			continue;
396 
397 		if (!sched_migratable(l, ci))
398 			continue;
399 
400 		lpri = pri;
401 		tci = ci;
402 		ci_rq = ici_rq;
403 	} while (ci = next, ci != first);
404 
405 	return tci;
406 }
407 
408 /*
409  * Try to catch an LWP from the runqueue of another CPU.
410  */
411 static struct lwp *
412 sched_catchlwp(void)
413 {
414 	struct cpu_info *curci = curcpu(), *ci = worker_ci;
415 	struct schedstate_percpu *spc;
416 	TAILQ_HEAD(, lwp) *q_head;
417 	runqueue_t *ci_rq;
418 	struct lwp *l;
419 
420 	if (curci == ci)
421 		return NULL;
422 
423 	/* Lockless check */
424 	spc = &ci->ci_schedstate;
425 	ci_rq = spc->spc_sched_info;
426 	if (ci_rq->r_mcount < min_catch)
427 		return NULL;
428 
429 	/*
430 	 * Double-lock the runqueues.
431 	 */
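	/*
	 * Locks are taken in ascending address order.  If our CPU sorts
	 * below the remote one, the remote lock can be taken directly;
	 * otherwise try-lock it and, on failure, drop our own lock, take
	 * both in order and back off if our runqueue gained work meanwhile.
	 */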
432 	if (curci < ci) {
433 		spc_lock(ci);
434 	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
435 		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
436 
437 		spc_unlock(curci);
438 		spc_lock(ci);
439 		spc_lock(curci);
440 
441 		if (cur_rq->r_count) {
442 			spc_unlock(ci);
443 			return NULL;
444 		}
445 	}
446 
447 	if (ci_rq->r_mcount < min_catch) {
448 		spc_unlock(ci);
449 		return NULL;
450 	}
451 
452 	/* Take the highest priority thread */
453 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
454 	l = TAILQ_FIRST(q_head);
455 
456 	for (;;) {
457 		/* Check the first LWP, then each subsequent one in the queue */
458 		if (l == NULL)
459 			break;
460 		KASSERT(l->l_stat == LSRUN);
461 		KASSERT(l->l_flag & LW_INMEM);
462 
463 		/* Look for threads that are allowed to migrate */
464 		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
465 		    !sched_migratable(l, curci)) {
466 			l = TAILQ_NEXT(l, l_runq);
467 			continue;
468 		}
469 
470 		/* Grab the thread, and move to the local run queue */
471 		sched_dequeue(l);
472 		l->l_cpu = curci;
473 		lwp_unlock_to(l, curci->ci_schedstate.spc_mutex);
474 		sched_enqueue(l, false);
475 		return l;
476 	}
477 	spc_unlock(ci);
478 
479 	return l;
480 }
481 
482 /*
483  * Periodic calculations for balancing.
484  */
485 static void
486 sched_balance(void *nocallout)
487 {
488 	struct cpu_info *ci, *hci;
489 	runqueue_t *ci_rq;
490 	CPU_INFO_ITERATOR cii;
491 	u_int highest;
492 
493 	hci = curcpu();
494 	highest = 0;
495 
496 	/* Do the counting locklessly */
497 	for (CPU_INFO_FOREACH(cii, ci)) {
498 		ci_rq = ci->ci_schedstate.spc_sched_info;
499 
500 		/* Average count of migratable threads (moving average, weight 1/2) */
501 		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
502 
503 		/* Look for CPU with the highest average */
504 		if (ci_rq->r_avgcount > highest) {
505 			hci = ci;
506 			highest = ci_rq->r_avgcount;
507 		}
508 	}
509 
510 	/* Update the worker */
511 	worker_ci = hci;
512 
513 	if (nocallout == NULL)
514 		callout_schedule(&balance_ch, balance_period);
515 }
516 
517 #else
518 
519 struct cpu_info *
520 sched_takecpu(struct lwp *l)
521 {
522 
523 	return l->l_cpu;
524 }
525 
526 #endif	/* MULTIPROCESSOR */
527 
528 /*
529  * Scheduling statistics and balancing.
530  */
531 void
532 sched_lwp_stats(struct lwp *l)
533 {
534 	int batch;
535 
536 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
537 	    l->l_stat == LSSUSPENDED)
538 		l->l_slptime++;
539 
540 	/*
541 	 * Mark the thread as CPU-bound if its run time exceeds its sleep
542 	 * time; note whether it was already marked CPU-bound before.
543 	 */
544 	batch = (l->l_rticksum > l->l_slpticksum);
545 	if (batch != 0) {
546 		if ((l->l_flag & LW_BATCH) == 0)
547 			batch = 0;
548 		l->l_flag |= LW_BATCH;
549 	} else
550 		l->l_flag &= ~LW_BATCH;
551 
552 	/*
553 	 * If the thread is CPU-bound and never sleeps, it would occupy the
554 	 * CPU.  In that case reset the last-sleep counter and check it later:
555 	 * if it is still zero, pick a migration target and clear the batch flag.
556 	 */
557 	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
558 		if (l->l_slpticks == 0) {
559 			if (l->l_target_cpu == NULL &&
560 			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
561 				struct cpu_info *ci = sched_takecpu(l);
562 				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
563 			}
564 			l->l_flag &= ~LW_BATCH;
565 		} else {
566 			l->l_slpticks = 0;
567 		}
568 	}
569 
570 	/* Reset the time sums */
571 	l->l_slpticksum = 0;
572 	l->l_rticksum = 0;
573 
574 	/* Scheduler-specific hook */
575 	sched_pstats_hook(l, batch);
576 }
577 
578 /*
579  * Scheduler mill.
580  */
581 struct lwp *
582 sched_nextlwp(void)
583 {
584 	struct cpu_info *ci = curcpu();
585 	struct schedstate_percpu *spc;
586 	TAILQ_HEAD(, lwp) *q_head;
587 	runqueue_t *ci_rq;
588 	struct lwp *l;
589 
590 	spc = &ci->ci_schedstate;
591 	ci_rq = spc->spc_sched_info;
592 
593 #ifdef MULTIPROCESSOR
594 	/* If the runqueue is empty, try to catch a thread from another CPU */
595 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
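		/* An offline CPU does not steal; go idle unless bound LWPs remain. */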
596 		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
597 			return NULL;
598 	} else if (ci_rq->r_count == 0) {
599 		/* Reset the counter, and call the balancer */
600 		ci_rq->r_avgcount = 0;
601 		sched_balance(ci);
602 
603 		/* Any re-locking is done inside sched_catchlwp() */
604 		return sched_catchlwp();
605 	}
606 #else
607 	if (ci_rq->r_count == 0)
608 		return NULL;
609 #endif
610 
611 	/* Take the highest priority thread */
612 	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
613 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
614 	l = TAILQ_FIRST(q_head);
615 	KASSERT(l != NULL);
616 
617 	sched_oncpu(l);
618 	l->l_rticks = hardclock_ticks;
619 
620 	return l;
621 }
622 
623 bool
624 sched_curcpu_runnable_p(void)
625 {
626 	const struct cpu_info *ci;
627 	const struct schedstate_percpu *spc;
628 	const runqueue_t *ci_rq;
629 	bool rv;
630 
631 	kpreempt_disable();
632 	ci = curcpu();
633 	spc = &ci->ci_schedstate;
634 	ci_rq = spc->spc_sched_info;
635 
636 #ifndef __HAVE_FAST_SOFTINTS
637 	if (ci->ci_data.cpu_softints) {
638 		kpreempt_enable();
639 		return true;
640 	}
641 #endif
642 
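	/* On an offline CPU, only LWPs bound to it count as runnable. */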
643 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
644 		rv = (ci_rq->r_count - ci_rq->r_mcount);
645 	else
646 		rv = ci_rq->r_count != 0;
647 	kpreempt_enable();
648 
649 	return rv;
650 }
651 
652 /*
653  * Sysctl nodes and initialization.
654  */
655 
656 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
657 {
658 	const struct sysctlnode *node = NULL;
659 
660 	sysctl_createv(clog, 0, NULL, NULL,
661 		CTLFLAG_PERMANENT,
662 		CTLTYPE_NODE, "kern", NULL,
663 		NULL, 0, NULL, 0,
664 		CTL_KERN, CTL_EOL);
665 	sysctl_createv(clog, 0, NULL, &node,
666 		CTLFLAG_PERMANENT,
667 		CTLTYPE_NODE, "sched",
668 		SYSCTL_DESCR("Scheduler options"),
669 		NULL, 0, NULL, 0,
670 		CTL_KERN, CTL_CREATE, CTL_EOL);
671 
672 	if (node == NULL)
673 		return;
674 
675 	sysctl_createv(clog, 0, &node, NULL,
676 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
677 		CTLTYPE_INT, "cacheht_time",
678 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
679 		NULL, 0, &cacheht_time, 0,
680 		CTL_CREATE, CTL_EOL);
681 	sysctl_createv(clog, 0, &node, NULL,
682 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
683 		CTLTYPE_INT, "balance_period",
684 		SYSCTL_DESCR("Balance period (in ticks)"),
685 		NULL, 0, &balance_period, 0,
686 		CTL_CREATE, CTL_EOL);
687 	sysctl_createv(clog, 0, &node, NULL,
688 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
689 		CTLTYPE_INT, "min_catch",
690 		SYSCTL_DESCR("Minimum count of threads for catching"),
691 		NULL, 0, &min_catch, 0,
692 		CTL_CREATE, CTL_EOL);
693 	sysctl_createv(clog, 0, &node, NULL,
694 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
695 		CTLTYPE_INT, "timesoftints",
696 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
697 		NULL, 0, &softint_timing, 0,
698 		CTL_CREATE, CTL_EOL);
699 	sysctl_createv(clog, 0, &node, NULL,
700 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
701 		CTLTYPE_INT, "kpreempt_pri",
702 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
703 		NULL, 0, &sched_kpreempt_pri, 0,
704 		CTL_CREATE, CTL_EOL);
705 	sysctl_createv(clog, 0, &node, NULL,
706 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
707 		CTLTYPE_INT, "upreempt_pri",
708 		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
709 		NULL, 0, &sched_upreempt_pri, 0,
710 		CTL_CREATE, CTL_EOL);
711 }
712 
713 /*
714  * Debugging.
715  */
716 
717 #ifdef DDB
718 
719 void
720 sched_print_runqueue(void (*pr)(const char *, ...)
721     __attribute__((__format__(__printf__,1,2))))
722 {
723 	runqueue_t *ci_rq;
724 	struct schedstate_percpu *spc;
725 	struct lwp *l;
726 	struct proc *p;
727 	int i;
728 	struct cpu_info *ci;
729 	CPU_INFO_ITERATOR cii;
730 
731 	for (CPU_INFO_FOREACH(cii, ci)) {
732 		spc = &ci->ci_schedstate;
733 		ci_rq = spc->spc_sched_info;
734 
735 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
736 		(*pr)(" pid.lid = %d.%d, thread count = %u, "
737 		    "avgcount = %u, highest pri = %d\n",
738 #ifdef MULTIPROCESSOR
739 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
740 #else
741 		    curlwp->l_proc->p_pid, curlwp->l_lid,
742 #endif
743 		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority);
744 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
745 		do {
746 			uint32_t q;
747 			q = ci_rq->r_bitmap[i];
748 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
749 		} while (i--);
750 	}
751 
752 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %s\n",
753 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "LRTIME");
754 
755 	PROCLIST_FOREACH(p, &allproc) {
756 		if ((p->p_flag & PK_MARKER) != 0)
757 			continue;
758 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
759 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
760 			ci = l->l_cpu;
761 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %u\n",
762 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
763 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
764 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
765 			    l, ci->ci_index,
766 			    (u_int)(hardclock_ticks - l->l_rticks));
767 		}
768 	}
769 }
770 
771 #endif
772