/* $OpenBSD: kern_clockintr.c,v 1.27 2023/07/02 19:02:27 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

uint64_t clockintr_advance(struct clockintr *, uint64_t);
void clockintr_cancel(struct clockintr *);
void clockintr_cancel_locked(struct clockintr *);
struct clockintr *clockintr_establish(struct clockintr_queue *,
    void (*)(struct clockintr *, void *));
uint64_t clockintr_expiration(const struct clockintr *);
void clockintr_hardclock(struct clockintr *, void *);
uint64_t clockintr_nsecuptime(const struct clockintr *);
void clockintr_schedclock(struct clockintr *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockintr_stagger(struct clockintr *, uint64_t, u_int, u_int);
void clockintr_statclock(struct clockintr *, void *);
void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0)
		schedclock_period = 1000000000 / schedhz;

	SET(clockintr_flags, flags | CL_INIT);
}
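
/*
 * Illustration (numbers not from this file): with the usual hz of 100,
 * hardclock_period becomes 1000000000 / 100 = 10000000 ns (10 ms); with
 * a hypothetical schedhz of 16, schedclock_period would be 62500000 ns.
 */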

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	int reset_cq_intrclock = 0;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (ic != NULL && !ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}

	/* TODO: Remove these from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(cq, clockintr_hardclock);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}
	if (cq->cq_statclock == NULL) {
		cq->cq_statclock = clockintr_establish(cq, clockintr_statclock);
		if (cq->cq_statclock == NULL)
			panic("%s: failed to establish statclock", __func__);
	}
	if (schedhz != 0 && cq->cq_schedclock == NULL) {
		cq->cq_schedclock = clockintr_establish(cq,
		    clockintr_schedclock);
		if (cq->cq_schedclock == NULL)
			panic("%s: failed to establish schedclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 * There is no reason to stagger a randomized statclock.
	 */
	if (!ISSET(clockintr_flags, CL_RNDSTAT)) {
		if (cq->cq_statclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(cq->cq_statclock, statclock_avg);
	if (schedhz != 0) {
		if (cq->cq_schedclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_schedclock, schedclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_schedclock, schedclock_period);
	}

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}
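
/*
 * Illustration (numbers invented for this example): with a 10000000 ns
 * hardclock_period and MAXCPUS of, say, 64, the stagger call above gives
 * the CPU with CPU_INFO_UNIT(ci) == 3 an initial offset of
 * 10000000 / 64 * 3 = 468750 ns, so the secondary CPUs do not all take
 * their first hardclock at the same instant.
 */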

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}
		clockintr_cancel_locked(cl);
		cq->cq_shadow.cl_expiration = cl->cl_expiration;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		cl->cl_func(&cq->cq_shadow, frame);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING)) {
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
			clockintr_schedule_locked(cl,
			    cq->cq_shadow.cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}
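
/*
 * Sketch (an assumption about the machine-dependent side, not code from
 * this file): each platform's clock interrupt handler is expected to
 * call clockintr_dispatch() at IPL_CLOCK with its interrupt frame.
 * With a hypothetical handler named md_clock_intr(), roughly:
 *
 *	int
 *	md_clock_intr(void *frame)
 *	{
 *		return clockintr_dispatch(frame);
 *	}
 */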

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return count;
	}

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
	return count;
}
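
/*
 * Example (illustrative numbers): if cl's expiration lags the current
 * uptime by 25 ms and period is 10 ms, nsec_advance() pushes the
 * expiration forward by three periods and clockintr_advance() returns
 * 3, the number of periods that have elapsed since the event last ran.
 */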

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockintr_cancel_locked(cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_cancel_locked(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

struct clockintr *
clockintr_establish(struct clockintr_queue *cq,
    void (*func)(struct clockintr *, void *))
{
	struct clockintr *cl;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}
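
/*
 * Usage sketch (hypothetical caller, not part of this file): a client
 * establishes a callback on a CPU's queue once, then arms it and lets
 * the callback reschedule itself with clockintr_advance(), e.g.:
 *
 *	struct clockintr *cl_example;
 *
 *	cl_example = clockintr_establish(cq, example_func);
 *	if (cl_example == NULL)
 *		panic("out of memory");
 *	clockintr_advance(cl_example, 1000000);
 *
 * Here cl_example, example_func, and the 1 ms period are invented for
 * illustration; clockintr_hardclock() below is the in-tree pattern.
 */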

uint64_t
clockintr_expiration(const struct clockintr *cl)
{
	uint64_t expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow)
		return cl->cl_expiration;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	mtx_leave(&cq->cq_mtx);
	return expiration;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, u_int n, u_int count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}
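
/*
 * Example (illustrative numbers): clockintr_stagger(cl, 10000000, 2, 8)
 * sets cl->cl_expiration to 10000000 / 8 * 2 = 2500000 ns, spreading
 * the count participants evenly across one period.
 */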

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
 */
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}
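
/*
 * Worked example (assuming stathz = 100): avg = 10000000 and
 * half_avg = 5000000, so var = 4194304 (2^22).  Then
 * min = 10000000 - 2097152 = 7902848 and mask = 4194303, giving a
 * statclock period range of [min + 1, min + mask] =
 * [7902849, 12097151] ns, centered on avg.
 */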

/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() initializes both
 * the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}
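
/*
 * The statclock_gen update above is a lockless publication protocol:
 * the writer zeroes the generation, updates the variables, and then
 * publishes a new nonzero generation.  Readers retry until they see a
 * stable nonzero generation, as clockintr_statclock() does:
 *
 *	do {
 *		gen = statclock_gen;
 *		membar_consumer();
 *		min = statclock_min;
 *		mask = statclock_mask;
 *		membar_consumer();
 *	} while (gen == 0 || gen != statclock_gen);
 */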

uint64_t
clockintr_nsecuptime(const struct clockintr *cl)
{
	KASSERT(cl == &cl->cl_queue->cq_shadow);
	return cl->cl_queue->cq_uptime;
}

void
clockintr_hardclock(struct clockintr *cl, void *frame)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockintr_schedclock(struct clockintr *cl, void *unused)
{
	uint64_t count, i;
	struct proc *p = curproc;

	count = clockintr_advance(cl, schedclock_period);
	if (p != NULL) {
		for (i = 0; i < count; i++)
			schedclock(p);
	}
}

void
clockintr_statclock(struct clockintr *cl, void *frame)
{
	uint64_t count, expiration, i, uptime;
	uint32_t mask, min, off;
	u_int gen;

	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		expiration = clockintr_expiration(cl);
		uptime = clockintr_nsecuptime(cl);
		while (expiration <= uptime) {
			while ((off = (random() & mask)) == 0)
				continue;
			expiration += min + off;
			count++;
		}
		clockintr_schedule(cl, expiration);
	} else {
		count = clockintr_advance(cl, statclock_avg);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
}
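
/*
 * Example (using the stathz = 100 numbers above): with min = 7902848
 * and mask = 4194303, each randomized statclock period is
 * min + (random() & mask) with a nonzero offset, i.e. roughly 7.9 ms
 * to 12.1 ms, averaging about 10 ms but hard to predict.
 */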

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_est);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
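
/*
 * Example (illustrative numbers): nsec_advance(&next, 30, 130) with
 * next = 50 computes elapsed = (130 - 50) / 30 + 1 = 3 and sets next
 * to 50 + 3 * 30 = 140, the first step past 130, returning 3.
 */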

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s  %5s  %3s  %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}
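
/*
 * From ddb(4) this is intended to be reached via "show all clockintr";
 * each clock interrupt is printed with its expiration, state (run,
 * pend, or idle), CPU number, and callback symbol name.
 */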

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld  %5s  %3u  %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */