/*	$OpenBSD: kern_clockintr.c,v 1.24 2023/06/18 23:19:01 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

#ifdef __HAVE_CLOCKINTR

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

uint64_t clockintr_advance(struct clockintr *, uint64_t);
void clockintr_cancel(struct clockintr *);
void clockintr_cancel_locked(struct clockintr *);
struct clockintr *clockintr_establish(struct clockintr_queue *,
    void (*)(struct clockintr *, void *));
uint64_t clockintr_expiration(const struct clockintr *);
void clockintr_hardclock(struct clockintr *, void *);
uint64_t clockintr_nsecuptime(const struct clockintr *);
void clockintr_schedclock(struct clockintr *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockintr_stagger(struct clockintr *, uint64_t, u_int, u_int);
void clockintr_statclock(struct clockintr *, void *);
void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0)
		schedclock_period = 1000000000 / schedhz;

	SET(clockintr_flags, flags | CL_INIT);
}

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	int reset_cq_intrclock = 0;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (ic != NULL && !ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}

	/* TODO: Remove these from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(cq, clockintr_hardclock);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}
	if (cq->cq_statclock == NULL) {
		cq->cq_statclock = clockintr_establish(cq, clockintr_statclock);
		if (cq->cq_statclock == NULL)
			panic("%s: failed to establish statclock", __func__);
	}
	if (schedhz != 0 && cq->cq_schedclock == NULL) {
		cq->cq_schedclock = clockintr_establish(cq,
		    clockintr_schedclock);
		if (cq->cq_schedclock == NULL)
			panic("%s: failed to establish schedclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, ncpus);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 */
	if (cq->cq_statclock->cl_expiration == 0) {
		clockintr_stagger(cq->cq_statclock, statclock_avg, multiplier,
		    ncpus);
	}
	clockintr_advance(cq->cq_statclock, statclock_avg);
	if (schedhz != 0) {
		if (cq->cq_schedclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_schedclock, schedclock_period,
			    multiplier, ncpus);
		}
		clockintr_advance(cq->cq_schedclock, schedclock_period);
	}

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}
		clockintr_cancel_locked(cl);
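		/*
		 * Run the callback against the queue's shadow copy so
		 * the callback can reschedule itself without holding
		 * cq_mtx; any shadow state is folded back in below.
		 */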
		cq->cq_shadow.cl_expiration = cl->cl_expiration;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		cl->cl_func(&cq->cq_shadow, frame);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING)) {
			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
			clockintr_schedule_locked(cl,
			    cq->cq_shadow.cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
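	/*
	 * cq_gen is zeroed while the stats are being updated; readers
	 * (see sysctl_clockintr()) retry until they observe the same
	 * nonzero generation before and after copying cq_stat.
	 */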
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

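/*
 * Advance cl's expiration in multiples of period until it lies in
 * the future, then reschedule it.  Returns the number of periods
 * that have elapsed since the previous expiration.
 */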
uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return count;
	}

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
	return count;
}

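/*
 * Cancel cl if it is pending.  If cl was the next event on the local
 * queue and other events remain, reprogram the intrclock for the new
 * head of the queue.
 */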
void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockintr_cancel_locked(cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_cancel_locked(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

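/*
 * Allocate a new clock interrupt, bind it to cq with the given
 * callback, and add it to the queue's list of established events.
 * Returns NULL if memory allocation fails.
 */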
struct clockintr *
clockintr_establish(struct clockintr_queue *cq,
    void (*func)(struct clockintr *, void *))
{
	struct clockintr *cl;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

uint64_t
clockintr_expiration(const struct clockintr *cl)
{
	uint64_t expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow)
		return cl->cl_expiration;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	mtx_leave(&cq->cq_mtx);
	return expiration;
}

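/*
 * Schedule cl to expire at the given absolute uptime, in nanoseconds,
 * replacing any pending expiration.
 */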
void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockintr_cancel_locked(cl);
	clockintr_schedule_locked(cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

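/*
 * Give cl an initial expiration n/count of the way through a period
 * so that the count consumers are spread evenly across it.  For
 * example, with a 10000000 ns period and count 4, slot n starts
 * n * 2500000 ns into the period.  cl must not be pending.
 */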
void
clockintr_stagger(struct clockintr *cl, uint64_t period, u_int n, u_int count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
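 *
 * For example, freq = 100 yields avg = 10000000 ns.  The largest
 * power of two not exceeding avg / 2 is var = 4194304, so
 * min = 7902848 and mask = 4194303, and deadlines are drawn from
 * [7902849, 12097151] ns, a range centered on avg.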
 */
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}

/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() initializes both
 * the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}

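/*
 * Return the uptime cached at the start of the current dispatch.
 * Only the shadow copy handed to a running callback may use this.
 */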
uint64_t
clockintr_nsecuptime(const struct clockintr *cl)
{
	KASSERT(cl == &cl->cl_queue->cq_shadow);
	return cl->cl_queue->cq_uptime;
}

void
clockintr_hardclock(struct clockintr *cl, void *frame)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockintr_schedclock(struct clockintr *cl, void *unused)
{
	uint64_t count, i;
	struct proc *p = curproc;

	count = clockintr_advance(cl, schedclock_period);
	if (p != NULL) {
		for (i = 0; i < count; i++)
			schedclock(p);
	}
}

void
clockintr_statclock(struct clockintr *cl, void *frame)
{
	uint64_t count, expiration, i, uptime;
	uint32_t mask, min, off;
	u_int gen;

	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
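		/*
		 * Take a lockless snapshot of the statclock variables.
		 * clockintr_setstatclockrate() zeroes statclock_gen
		 * while it updates them, so retry until the generation
		 * is nonzero and unchanged across the read.
		 */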
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		expiration = clockintr_expiration(cl);
		uptime = clockintr_nsecuptime(cl);
		while (expiration <= uptime) {
			while ((off = (random() & mask)) == 0)
				continue;
			expiration += min + off;
			count++;
		}
		clockintr_schedule(cl, expiration);
	} else {
		count = clockintr_advance(cl, statclock_avg);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_est);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

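/*
 * Reprogram the intrclock for the earliest pending expiration, or
 * trigger it immediately if that deadline has already passed.
 */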
void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
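 *
 * For example, with *next = 5, period = 10 and now = 27, three
 * periods have elapsed: *next becomes 35 and 3 is returned.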
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

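/*
 * Sum the per-CPU dispatch statistics for KERN_CLOCKINTR_STATS.
 * Each queue's generation number is used to take a consistent,
 * lock-free snapshot of its cq_stat counters.
 */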
int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s  %5s  %3s  %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld  %5s  %3u  %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */
#endif /* __HAVE_CLOCKINTR */