/*	$OpenBSD: kern_clockintr.c,v 1.26 2023/07/02 00:55:18 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

#ifdef __HAVE_CLOCKINTR

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

uint64_t clockintr_advance(struct clockintr *, uint64_t);
void clockintr_cancel(struct clockintr *);
void clockintr_cancel_locked(struct clockintr *);
struct clockintr *clockintr_establish(struct clockintr_queue *,
    void (*)(struct clockintr *, void *));
uint64_t clockintr_expiration(const struct clockintr *);
void clockintr_hardclock(struct clockintr *, void *);
uint64_t clockintr_nsecuptime(const struct clockintr *);
void clockintr_schedclock(struct clockintr *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockintr_stagger(struct clockintr *, uint64_t, u_int, u_int);
void clockintr_statclock(struct clockintr *, void *);
void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0)
		schedclock_period = 1000000000 / schedhz;

	SET(clockintr_flags, flags | CL_INIT);
}

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	int reset_cq_intrclock = 0;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (ic != NULL && !ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}

	/* TODO: Remove these from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(cq, clockintr_hardclock);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}
	if (cq->cq_statclock == NULL) {
		cq->cq_statclock = clockintr_establish(cq, clockintr_statclock);
		if (cq->cq_statclock == NULL)
			panic("%s: failed to establish statclock", __func__);
	}
	if (schedhz != 0 && cq->cq_schedclock == NULL) {
		cq->cq_schedclock = clockintr_establish(cq,
		    clockintr_schedclock);
		if (cq->cq_schedclock == NULL)
			panic("%s: failed to establish schedclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 * There is no reason to stagger a randomized statclock.
	 */
	if (!ISSET(clockintr_flags, CL_RNDSTAT)) {
		if (cq->cq_statclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(cq->cq_statclock, statclock_avg);
	if (schedhz != 0) {
		if (cq->cq_schedclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_schedclock, schedclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_schedclock, schedclock_period);
	}

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}
		clockintr_cancel_locked(cl);
		cq->cq_shadow.cl_expiration = cl->cl_expiration;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		cl->cl_func(&cq->cq_shadow, frame);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING)) {
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
			clockintr_schedule_locked(cl,
			    cq->cq_shadow.cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

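/*
 * Advance the event's expiration time in increments of "period" until
 * it lies in the future, reschedule it, and return the number of
 * periods that have elapsed.  Advancing the shadow copy from within a
 * running callback only records the new expiration; clockintr_dispatch()
 * requeues the real event after the callback returns.
 */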
uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return count;
	}

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
	return count;
}

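/*
 * Cancel a pending clock interrupt.  If the event was the next to
 * expire on the local CPU, rearm the intrclock for whatever is now at
 * the head of the queue.  Cancelling the running event also discards
 * any reschedule made through the shadow copy.
 */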
void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockintr_cancel_locked(cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

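/*
 * Remove the event from the pending queue.  The caller must hold the
 * queue mutex and the event must be pending.
 */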
void
clockintr_cancel_locked(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

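/*
 * Allocate a new clock interrupt, bind it to the given queue and
 * callback, and link it onto the queue's list of established events.
 * The event stays idle until it is scheduled.  Returns NULL if the
 * allocation fails.
 *
 * A minimal usage sketch (the callback and period below are
 * hypothetical, not part of this file):
 *
 *	struct clockintr *cl;
 *
 *	cl = clockintr_establish(cq, example_callback);
 *	if (cl == NULL)
 *		panic("can't establish example clockintr");
 *	clockintr_advance(cl, example_period_ns);
 */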
struct clockintr *
clockintr_establish(struct clockintr_queue *cq,
    void (*func)(struct clockintr *, void *))
{
	struct clockintr *cl;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

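/*
 * Return the event's absolute expiration time in nanoseconds of uptime.
 */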
uint64_t
clockintr_expiration(const struct clockintr *cl)
{
	uint64_t expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow)
		return cl->cl_expiration;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	mtx_leave(&cq->cq_mtx);
	return expiration;
}

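/*
 * Schedule the event to fire at the given absolute uptime, in
 * nanoseconds.  Scheduling the shadow copy from within a callback only
 * records the new expiration; the real event is requeued by
 * clockintr_dispatch() after the callback returns.
 */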
void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

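/*
 * Insert the event into the pending queue, which is kept sorted by
 * expiration time.  The caller must hold the queue mutex and the event
 * must not already be pending.
 */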
void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

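/*
 * Set the event's initial expiration to the nth of count evenly spaced
 * offsets within one period, e.g. so that the per-CPU hardclocks do not
 * all expire at the same instant.  The event must not be pending.
 */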
void
clockintr_stagger(struct clockintr *cl, uint64_t period, u_int n, u_int count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
 */
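/*
 * As a worked example (not part of the original source): for freq = 100,
 * avg = 10000000 ns and var = 4194304 (the largest power of two <=
 * avg / 2), so min = avg - var / 2 = 7902848 ns and mask = var - 1 =
 * 4194303.  Each period is then drawn from [7902849, 12097151] ns and
 * averages out to roughly avg.
 */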
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}

/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() initializes both
 * the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}

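/*
 * Return the uptime cached at the start of the current dispatch.
 * Only valid for the shadow copy, i.e. from within a running callback.
 */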
uint64_t
clockintr_nsecuptime(const struct clockintr *cl)
{
	KASSERT(cl == &cl->cl_queue->cq_shadow);
	return cl->cl_queue->cq_uptime;
}

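/*
 * Run hardclock(9) once for every hardclock period that has elapsed.
 */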
void
clockintr_hardclock(struct clockintr *cl, void *frame)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

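/*
 * Run schedclock() on the current proc, if any, once for every
 * schedclock period that has elapsed.
 */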
void
clockintr_schedclock(struct clockintr *cl, void *unused)
{
	uint64_t count, i;
	struct proc *p = curproc;

	count = clockintr_advance(cl, schedclock_period);
	if (p != NULL) {
		for (i = 0; i < count; i++)
			schedclock(p);
	}
}

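/*
 * Run statclock() once for every statclock period that has elapsed.
 * Under CL_RNDSTAT each period is drawn pseudorandomly from
 * [statclock_min + 1, statclock_min + statclock_mask]; otherwise the
 * fixed average period is used.
 */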
void
clockintr_statclock(struct clockintr *cl, void *frame)
{
	uint64_t count, expiration, i, uptime;
	uint32_t mask, min, off;
	u_int gen;

	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		expiration = clockintr_expiration(cl);
		uptime = clockintr_nsecuptime(cl);
		while (expiration <= uptime) {
			while ((off = (random() & mask)) == 0)
				continue;
			expiration += min + off;
			count++;
		}
		clockintr_schedule(cl, expiration);
	} else {
		count = clockintr_advance(cl, statclock_avg);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
}

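/*
 * One-time initialization of a CPU's clock interrupt queue.  Calling
 * this again after the queue is initialized is a no-op.
 */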
void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_est);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

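/*
 * Return the expiration time of the earliest pending event.  The caller
 * must hold the queue mutex and the pending queue must not be empty.
 */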
uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

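/*
 * Reprogram the interrupt clock for the earliest pending event, or
 * trigger it immediately if that event has already expired.
 */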
void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
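/*
 * For example (not part of the original source): with *next = 5,
 * period = 3 and now = 11, elapsed = (11 - 5) / 3 + 1 = 3, *next
 * becomes 14, and 3 is returned.
 */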
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

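/*
 * sysctl(2) handler for the clockintr node.  KERN_CLOCKINTR_STATS sums
 * the dispatch statistics over all initialized CPUs, reading each
 * per-CPU copy consistently via its generation number.
 */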
int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s  %5s  %3s  %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld  %5s  %3u  %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */
#endif /* __HAVE_CLOCKINTR */