1 /* $OpenBSD: kern_clockintr.c,v 1.21 2023/04/23 00:08:36 cheloha Exp $ */
2 /*
3  * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
4  * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
5  * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <sys/atomic.h>
23 #include <sys/clockintr.h>
24 #include <sys/kernel.h>
25 #include <sys/malloc.h>
26 #include <sys/mutex.h>
27 #include <sys/queue.h>
28 #include <sys/stdint.h>
29 #include <sys/sysctl.h>
30 #include <sys/time.h>
31 
32 #ifdef __HAVE_CLOCKINTR
33 
34 /*
35  * Protection for global variables in this file:
36  *
37  *	C	Global clockintr configuration mutex (clockintr_mtx).
38  *	I	Immutable after initialization.
39  */
40 struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
41 
42 u_int clockintr_flags;			/* [I] global state + behavior flags */
43 uint32_t hardclock_period;		/* [I] hardclock period (ns) */
44 uint32_t schedclock_period;		/* [I] schedclock period (ns) */
45 volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
46 volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
47 uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
48 uint32_t statclock_mask;		/* [C] set of allowed offsets */
49 uint32_t stat_avg;			/* [I] average stathz period (ns) */
50 uint32_t stat_min;			/* [I] minimum stathz period (ns) */
51 uint32_t stat_mask;			/* [I] set of allowed offsets */
52 uint32_t prof_avg;			/* [I] average profhz period (ns) */
53 uint32_t prof_min;			/* [I] minimum profhz period (ns) */
54 uint32_t prof_mask;			/* [I] set of allowed offsets */
55 
56 uint64_t clockintr_advance(struct clockintr *, uint64_t);
57 void clockintr_cancel(struct clockintr *);
58 void clockintr_cancel_locked(struct clockintr *);
59 struct clockintr *clockintr_establish(struct clockintr_queue *,
60     void (*)(struct clockintr *, void *));
61 uint64_t clockintr_expiration(const struct clockintr *);
62 void clockintr_hardclock(struct clockintr *, void *);
63 uint64_t clockintr_nsecuptime(const struct clockintr *);
64 void clockintr_schedclock(struct clockintr *, void *);
65 void clockintr_schedule(struct clockintr *, uint64_t);
66 void clockintr_schedule_locked(struct clockintr *, uint64_t);
67 void clockintr_statclock(struct clockintr *, void *);
68 void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
69 void clockqueue_init(struct clockintr_queue *);
70 uint64_t clockqueue_next(const struct clockintr_queue *);
71 void clockqueue_reset_intrclock(struct clockintr_queue *);
72 uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);
73 
74 /*
75  * Initialize global state.  Set flags and compute intervals.
76  */
77 void
78 clockintr_init(u_int flags)
79 {
80 	KASSERT(CPU_IS_PRIMARY(curcpu()));
81 	KASSERT(clockintr_flags == 0);
82 	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
83 
84 	KASSERT(hz > 0 && hz <= 1000000000);
85 	hardclock_period = 1000000000 / hz;
86 
87 	KASSERT(stathz >= 1 && stathz <= 1000000000);
88 	KASSERT(profhz >= stathz && profhz <= 1000000000);
89 	KASSERT(profhz % stathz == 0);
90 	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
91 	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
92 	SET(clockintr_flags, CL_STATCLOCK);
93 	clockintr_setstatclockrate(stathz);
94 
95 	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
96 	if (schedhz != 0)
97 		schedclock_period = 1000000000 / schedhz;
98 
99 	SET(clockintr_flags, flags | CL_INIT);
100 }
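
/*
 * For example, with hz = 100 the hardclock_period computed above is
 * 1000000000 / 100 = 10000000 ns (10 ms).  With stathz = 100 and
 * profhz = 1000, clockintr_statvar_init() yields a stat_avg of 10 ms
 * and a prof_avg of 1 ms, and clockintr_setstatclockrate(stathz)
 * publishes the stathz variant in the statclock_* variables.
 */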
101 
102 /*
103  * Ready the calling CPU for clockintr_dispatch().  If this is our
104  * first time here, install the intrclock, if any, and set necessary
105  * flags.  Advance the schedule as needed.
106  */
107 void
108 clockintr_cpu_init(const struct intrclock *ic)
109 {
110 	uint64_t multiplier = 0, offset;
111 	struct cpu_info *ci = curcpu();
112 	struct clockintr_queue *cq = &ci->ci_queue;
113 	int reset_cq_intrclock = 0;
114 
115 	KASSERT(ISSET(clockintr_flags, CL_INIT));
116 
117 	clockqueue_init(cq);
118 	if (ic != NULL && !ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
119 		cq->cq_intrclock = *ic;
120 		SET(cq->cq_flags, CQ_INTRCLOCK);
121 	}
122 
123 	/* TODO: Remove these from struct clockintr_queue. */
124 	if (cq->cq_hardclock == NULL) {
125 		cq->cq_hardclock = clockintr_establish(cq, clockintr_hardclock);
126 		if (cq->cq_hardclock == NULL)
127 			panic("%s: failed to establish hardclock", __func__);
128 	}
129 	if (cq->cq_statclock == NULL) {
130 		cq->cq_statclock = clockintr_establish(cq, clockintr_statclock);
131 		if (cq->cq_statclock == NULL)
132 			panic("%s: failed to establish statclock", __func__);
133 	}
134 	if (schedhz != 0 && cq->cq_schedclock == NULL) {
135 		cq->cq_schedclock = clockintr_establish(cq,
136 		    clockintr_schedclock);
137 		if (cq->cq_schedclock == NULL)
138 			panic("%s: failed to establish schedclock", __func__);
139 	}
140 
141 	/*
142 	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
143 	 * interrupts.  We don't want the intrclock to fire until this
144 	 * thread reaches clockintr_trigger().
145 	 */
146 	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
147 		CLR(cq->cq_flags, CQ_INTRCLOCK);
148 		reset_cq_intrclock = 1;
149 	}
150 
151 	/*
152 	 * Until we understand scheduler lock contention better, stagger
153 	 * the hardclock and statclock across CPUs so they don't all fire
154 	 * at once.  If we have no intrclock it doesn't matter: we have no
155 	 * control anyway.  The primary CPU's starting offset is always
156 	 * zero, so leave the multiplier zero.
157 	 */
158 	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
159 		multiplier = CPU_INFO_UNIT(ci);
160 
161 	/*
162 	 * The first time we do this, the primary CPU cannot skip any
163 	 * hardclocks.  We can skip hardclocks on subsequent calls because
164 	 * the global tick value is advanced during inittodr(9) on our
165 	 * behalf.
166 	 */
167 	if (CPU_IS_PRIMARY(ci)) {
168 		if (cq->cq_hardclock->cl_expiration == 0)
169 			clockintr_schedule(cq->cq_hardclock, 0);
170 		else
171 			clockintr_advance(cq->cq_hardclock, hardclock_period);
172 	} else {
173 		if (cq->cq_hardclock->cl_expiration == 0) {
174 			offset = hardclock_period / ncpus * multiplier;
175 			cq->cq_hardclock->cl_expiration = offset;
176 		}
177 		clockintr_advance(cq->cq_hardclock, hardclock_period);
178 	}
179 
180 	/*
181 	 * We can always advance the statclock and schedclock.
182 	 */
183 	offset = statclock_avg / ncpus * multiplier;
184 	clockintr_schedule(cq->cq_statclock, offset);
185 	clockintr_advance(cq->cq_statclock, statclock_avg);
186 	if (schedhz != 0) {
187 		offset = schedclock_period / ncpus * multiplier;
188 		clockintr_schedule(cq->cq_schedclock, offset);
189 		clockintr_advance(cq->cq_schedclock, schedclock_period);
190 	}
191 
192 	if (reset_cq_intrclock)
193 		SET(cq->cq_flags, CQ_INTRCLOCK);
194 }
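
/*
 * Example of the staggering above: with hz = 100 (a 10 ms
 * hardclock_period), ncpus = 4, and a secondary CPU with an intrclock
 * whose CPU_INFO_UNIT() is 2, the hardclock's initial expiration is
 * 10000000 / 4 * 2 = 5000000 ns; clockintr_advance() then moves it
 * forward in hardclock_period steps until it lies in the future,
 * preserving that 5 ms phase relative to the primary CPU.  The
 * statclock and schedclock receive analogous offsets from their own
 * periods.
 */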
195 
196 /*
197  * If we have an intrclock, trigger it to start the dispatch cycle.
198  */
199 void
200 clockintr_trigger(void)
201 {
202 	struct clockintr_queue *cq = &curcpu()->ci_queue;
203 
204 	KASSERT(ISSET(cq->cq_flags, CQ_INIT));
205 
206 	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
207 		intrclock_trigger(&cq->cq_intrclock);
208 }
209 
210 /*
211  * Run all expired events scheduled on the calling CPU.
212  */
213 int
214 clockintr_dispatch(void *frame)
215 {
216 	uint64_t lateness, run = 0, start;
217 	struct cpu_info *ci = curcpu();
218 	struct clockintr *cl;
219 	struct clockintr_queue *cq = &ci->ci_queue;
220 	u_int ogen;
221 
222 	if (cq->cq_dispatch != 0)
223 		panic("%s: recursive dispatch", __func__);
224 	cq->cq_dispatch = 1;
225 
226 	splassert(IPL_CLOCK);
227 	KASSERT(ISSET(cq->cq_flags, CQ_INIT));
228 
229 	mtx_enter(&cq->cq_mtx);
230 
231 	/*
232 	 * If nothing is scheduled or we arrived too early, we have
233 	 * nothing to do.
234 	 */
235 	start = nsecuptime();
236 	cq->cq_uptime = start;
237 	if (TAILQ_EMPTY(&cq->cq_pend))
238 		goto stats;
239 	if (cq->cq_uptime < clockqueue_next(cq))
240 		goto rearm;
241 	lateness = start - clockqueue_next(cq);
242 
243 	/*
244 	 * Dispatch expired events.
245 	 */
246 	for (;;) {
247 		cl = TAILQ_FIRST(&cq->cq_pend);
248 		if (cl == NULL)
249 			break;
250 		if (cq->cq_uptime < cl->cl_expiration) {
251 			/* Double-check the time before giving up. */
252 			cq->cq_uptime = nsecuptime();
253 			if (cq->cq_uptime < cl->cl_expiration)
254 				break;
255 		}
256 		clockintr_cancel_locked(cl);
257 		cq->cq_shadow.cl_expiration = cl->cl_expiration;
258 		cq->cq_running = cl;
259 		mtx_leave(&cq->cq_mtx);
260 
261 		cl->cl_func(&cq->cq_shadow, frame);
262 
263 		mtx_enter(&cq->cq_mtx);
264 		cq->cq_running = NULL;
265 		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
266 			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
267 			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
268 		}
269 		if (ISSET(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING)) {
270 			CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
271 			clockintr_schedule_locked(cl,
272 			    cq->cq_shadow.cl_expiration);
273 		}
274 		run++;
275 	}
276 
277 	/*
278 	 * Dispatch complete.
279 	 */
280 rearm:
281 	/* Rearm the interrupt clock if we have one. */
282 	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
283 		if (!TAILQ_EMPTY(&cq->cq_pend)) {
284 			intrclock_rearm(&cq->cq_intrclock,
285 			    clockqueue_next(cq) - cq->cq_uptime);
286 		}
287 	}
288 stats:
289 	/* Update our stats. */
290 	ogen = cq->cq_gen;
291 	cq->cq_gen = 0;
292 	membar_producer();
293 	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
294 	if (run > 0) {
295 		cq->cq_stat.cs_lateness += lateness;
296 		cq->cq_stat.cs_prompt++;
297 		cq->cq_stat.cs_run += run;
298 	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
299 		cq->cq_stat.cs_early++;
300 		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
301 	} else
302 		cq->cq_stat.cs_spurious++;
303 	membar_producer();
304 	cq->cq_gen = MAX(1, ogen + 1);
305 
306 	mtx_leave(&cq->cq_mtx);
307 
308 	if (cq->cq_dispatch != 1)
309 		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
310 	cq->cq_dispatch = 0;
311 
312 	return run > 0;
313 }
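
/*
 * Note on the dispatch loop above: callbacks run against cq_shadow, a
 * per-queue copy of the expired clockintr, with cq_mtx released.  A
 * clockintr_advance() or clockintr_schedule() performed by the callback
 * on the shadow only sets CLST_SHADOW_PENDING; the real clockintr is
 * requeued after the callback returns, unless another CPU rescheduled
 * or cancelled it directly in the meantime (CLST_IGNORE_SHADOW), in
 * which case the shadow's pending state is discarded.
 */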
314 
315 uint64_t
316 clockintr_advance(struct clockintr *cl, uint64_t period)
317 {
318 	uint64_t count, expiration;
319 	struct clockintr_queue *cq = cl->cl_queue;
320 
321 	if (cl == &cq->cq_shadow) {
322 		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
323 		SET(cl->cl_flags, CLST_SHADOW_PENDING);
324 		return count;
325 	}
326 
327 	mtx_enter(&cq->cq_mtx);
328 	expiration = cl->cl_expiration;
329 	count = nsec_advance(&expiration, period, nsecuptime());
330 	if (ISSET(cl->cl_flags, CLST_PENDING))
331 		clockintr_cancel_locked(cl);
332 	clockintr_schedule_locked(cl, expiration);
333 	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
334 		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
335 			if (cq == &curcpu()->ci_queue)
336 				clockqueue_reset_intrclock(cq);
337 		}
338 	}
339 	if (cl == cq->cq_running)
340 		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
341 	mtx_leave(&cq->cq_mtx);
342 	return count;
343 }
344 
345 void
346 clockintr_cancel(struct clockintr *cl)
347 {
348 	struct clockintr_queue *cq = cl->cl_queue;
349 	int was_next;
350 
351 	if (cl == &cq->cq_shadow) {
352 		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
353 		return;
354 	}
355 
356 	mtx_enter(&cq->cq_mtx);
357 	if (ISSET(cl->cl_flags, CLST_PENDING)) {
358 		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
359 		clockintr_cancel_locked(cl);
360 		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
361 			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
362 				if (cq == &curcpu()->ci_queue)
363 					clockqueue_reset_intrclock(cq);
364 			}
365 		}
366 	}
367 	if (cl == cq->cq_running)
368 		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
369 	mtx_leave(&cq->cq_mtx);
370 }
371 
372 void
373 clockintr_cancel_locked(struct clockintr *cl)
374 {
375 	struct clockintr_queue *cq = cl->cl_queue;
376 
377 	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
378 	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));
379 
380 	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
381 	CLR(cl->cl_flags, CLST_PENDING);
382 }
383 
384 struct clockintr *
385 clockintr_establish(struct clockintr_queue *cq,
386     void (*func)(struct clockintr *, void *))
387 {
388 	struct clockintr *cl;
389 
390 	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
391 	if (cl == NULL)
392 		return NULL;
393 	cl->cl_func = func;
394 	cl->cl_queue = cq;
395 
396 	mtx_enter(&cq->cq_mtx);
397 	TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
398 	mtx_leave(&cq->cq_mtx);
399 	return cl;
400 }
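
/*
 * Usage sketch (hypothetical caller, not in-tree code): a CPU-local
 * periodic event could be set up from clockintr_cpu_init() roughly as
 * follows, where myclock_fire() and MYCLOCK_PERIOD_NS are illustrative
 * names only:
 *
 *	struct clockintr_queue *cq = &curcpu()->ci_queue;
 *	struct clockintr *cl;
 *
 *	cl = clockintr_establish(cq, myclock_fire);
 *	if (cl == NULL)
 *		panic("failed to establish myclock");
 *	clockintr_advance(cl, MYCLOCK_PERIOD_NS);
 *
 * clockintr_advance() both sets the initial expiration here and, when
 * called again from myclock_fire() on the shadow copy, keeps the event
 * periodic.
 */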
401 
402 uint64_t
403 clockintr_expiration(const struct clockintr *cl)
404 {
405 	uint64_t expiration;
406 	struct clockintr_queue *cq = cl->cl_queue;
407 
408 	if (cl == &cq->cq_shadow)
409 		return cl->cl_expiration;
410 
411 	mtx_enter(&cq->cq_mtx);
412 	expiration = cl->cl_expiration;
413 	mtx_leave(&cq->cq_mtx);
414 	return expiration;
415 }
416 
417 void
418 clockintr_schedule(struct clockintr *cl, uint64_t expiration)
419 {
420 	struct clockintr_queue *cq = cl->cl_queue;
421 
422 	if (cl == &cq->cq_shadow) {
423 		cl->cl_expiration = expiration;
424 		SET(cl->cl_flags, CLST_SHADOW_PENDING);
425 		return;
426 	}
427 
428 	mtx_enter(&cq->cq_mtx);
429 	if (ISSET(cl->cl_flags, CLST_PENDING))
430 		clockintr_cancel_locked(cl);
431 	clockintr_schedule_locked(cl, expiration);
432 	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
433 		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
434 			if (cq == &curcpu()->ci_queue)
435 				clockqueue_reset_intrclock(cq);
436 		}
437 	}
438 	if (cl == cq->cq_running)
439 		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
440 	mtx_leave(&cq->cq_mtx);
441 }
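
/*
 * For a one-shot event, a caller can schedule an absolute expiration
 * directly; e.g. (sketch, with "cl" established as in the sketch
 * above) to fire roughly one millisecond from now:
 *
 *	clockintr_schedule(cl, nsecuptime() + 1000000);
 *
 * Expirations are absolute uptimes in nanoseconds, the same timescale
 * clockintr_dispatch() compares against cq_uptime.
 */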
442 
443 void
444 clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
445 {
446 	struct clockintr *elm;
447 	struct clockintr_queue *cq = cl->cl_queue;
448 
449 	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
450 	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));
451 
452 	cl->cl_expiration = expiration;
453 	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
454 		if (cl->cl_expiration < elm->cl_expiration)
455 			break;
456 	}
457 	if (elm == NULL)
458 		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
459 	else
460 		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
461 	SET(cl->cl_flags, CLST_PENDING);
462 }
463 
464 /*
465  * Compute the period (avg) for the given frequency and a range around
466  * that period.  The range is [min + 1, min + mask].  The range is used
467  * during dispatch to choose a new pseudorandom deadline for each statclock
468  * event.
469  */
470 void
471 clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
472 {
473 	uint32_t half_avg, var;
474 
475 	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
476 	KASSERT(freq > 0 && freq <= 1000000000);
477 
478 	/* Compute avg, the average period. */
479 	*avg = 1000000000 / freq;
480 
481 	/* Find var, the largest power of two such that var <= avg / 2. */
482 	half_avg = *avg / 2;
483 	for (var = 1U << 31; var > half_avg; var /= 2)
484 		continue;
485 
486 	/* Using avg and var, set a lower bound for the range. */
487 	*min = *avg - (var / 2);
488 
489 	/* The mask is just (var - 1). */
490 	*mask = var - 1;
491 }
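
/*
 * Worked example: for freq = 100, avg = 10000000 ns.  half_avg is
 * 5000000, so var = 4194304 (the largest power of two <= 5000000),
 * min = 10000000 - 2097152 = 7902848 ns, and mask = 4194303.  The
 * resulting statclock intervals fall in [min + 1, min + mask], i.e.
 * roughly 7.9 ms to 12.1 ms, averaging about 10 ms.
 */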
492 
493 /*
494  * Update the statclock_* variables according to the given frequency.
495  * Must only be called after clockintr_statvar_init() has initialized
496  * both the stat_* and prof_* variables.
497  */
498 void
499 clockintr_setstatclockrate(int freq)
500 {
501 	u_int ogen;
502 
503 	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));
504 
505 	mtx_enter(&clockintr_mtx);
506 
507 	ogen = statclock_gen;
508 	statclock_gen = 0;
509 	membar_producer();
510 	if (freq == stathz) {
511 		statclock_avg = stat_avg;
512 		statclock_min = stat_min;
513 		statclock_mask = stat_mask;
514 	} else if (freq == profhz) {
515 		statclock_avg = prof_avg;
516 		statclock_min = prof_min;
517 		statclock_mask = prof_mask;
518 	} else {
519 		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
520 		    __func__, stathz, profhz, freq);
521 	}
522 	membar_producer();
523 	statclock_gen = MAX(1, ogen + 1);
524 
525 	mtx_leave(&clockintr_mtx);
526 }
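
/*
 * Readers such as clockintr_statclock() below do not take
 * clockintr_mtx; they reread statclock_min and statclock_mask under
 * the statclock_gen generation protocol, retrying while the generation
 * is zero or changes across the read.  The membar_producer() pair
 * above orders the variable updates against the generation stores for
 * those readers.
 */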
527 
528 uint64_t
529 clockintr_nsecuptime(const struct clockintr *cl)
530 {
531 	KASSERT(cl == &cl->cl_queue->cq_shadow);
532 	return cl->cl_queue->cq_uptime;
533 }
534 
535 void
536 clockintr_hardclock(struct clockintr *cl, void *frame)
537 {
538 	uint64_t count, i;
539 
540 	count = clockintr_advance(cl, hardclock_period);
541 	for (i = 0; i < count; i++)
542 		hardclock(frame);
543 }
544 
545 void
546 clockintr_schedclock(struct clockintr *cl, void *unused)
547 {
548 	uint64_t count, i;
549 	struct proc *p = curproc;
550 
551 	count = clockintr_advance(cl, schedclock_period);
552 	if (p != NULL) {
553 		for (i = 0; i < count; i++)
554 			schedclock(p);
555 	}
556 }
557 
558 void
559 clockintr_statclock(struct clockintr *cl, void *frame)
560 {
561 	uint64_t count, expiration, i, uptime;
562 	uint32_t mask, min, off;
563 	u_int gen;
564 
565 	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
566 		do {
567 			gen = statclock_gen;
568 			membar_consumer();
569 			min = statclock_min;
570 			mask = statclock_mask;
571 			membar_consumer();
572 		} while (gen == 0 || gen != statclock_gen);
573 		count = 0;
574 		expiration = clockintr_expiration(cl);
575 		uptime = clockintr_nsecuptime(cl);
576 		while (expiration <= uptime) {
577 			while ((off = (random() & mask)) == 0)
578 				continue;
579 			expiration += min + off;
580 			count++;
581 		}
582 		clockintr_schedule(cl, expiration);
583 	} else {
584 		count = clockintr_advance(cl, statclock_avg);
585 	}
586 	for (i = 0; i < count; i++)
587 		statclock(frame);
588 }
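
/*
 * Note on the CL_RNDSTAT branch above: if the CPU was idle or the
 * interrupt was delayed, the while loop may advance "expiration" past
 * several missed deadlines; "count" records how many, and statclock()
 * is then called once per missed interval, mirroring the count that
 * clockintr_advance() returns in the non-randomized case.
 */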
589 
590 void
591 clockqueue_init(struct clockintr_queue *cq)
592 {
593 	if (ISSET(cq->cq_flags, CQ_INIT))
594 		return;
595 
596 	cq->cq_shadow.cl_queue = cq;
597 	mtx_init(&cq->cq_mtx, IPL_CLOCK);
598 	TAILQ_INIT(&cq->cq_est);
599 	TAILQ_INIT(&cq->cq_pend);
600 	cq->cq_gen = 1;
601 	SET(cq->cq_flags, CQ_INIT);
602 }
603 
604 uint64_t
605 clockqueue_next(const struct clockintr_queue *cq)
606 {
607 	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
608 	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
609 }
610 
611 void
612 clockqueue_reset_intrclock(struct clockintr_queue *cq)
613 {
614 	uint64_t exp, now;
615 
616 	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
617 	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));
618 
619 	exp = clockqueue_next(cq);
620 	now = nsecuptime();
621 	if (now < exp)
622 		intrclock_rearm(&cq->cq_intrclock, exp - now);
623 	else
624 		intrclock_trigger(&cq->cq_intrclock);
625 }
626 
627 /*
628  * Advance *next in increments of period until it exceeds now.
629  * Returns the number of increments by which *next was advanced.
630  *
631  * We check the common cases first to avoid division if possible.
632  * This does no overflow checking.
633  */
634 uint64_t
635 nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
636 {
637 	uint64_t elapsed;
638 
639 	if (now < *next)
640 		return 0;
641 
642 	if (now < *next + period) {
643 		*next += period;
644 		return 1;
645 	}
646 
647 	elapsed = (now - *next) / period + 1;
648 	*next += period * elapsed;
649 	return elapsed;
650 }
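
/*
 * Worked example: with *next = 5000000, period = 10000000, and
 * now = 27000000, the slow path computes
 * elapsed = (27000000 - 5000000) / 10000000 + 1 = 3 and leaves
 * *next = 35000000, the first period step beyond "now" that preserves
 * the original 5 ms phase.
 */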
651 
652 int
653 sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
654     void *newp, size_t newlen)
655 {
656 	struct clockintr_stat sum, tmp;
657 	struct clockintr_queue *cq;
658 	struct cpu_info *ci;
659 	CPU_INFO_ITERATOR cii;
660 	u_int gen;
661 
662 	if (namelen != 1)
663 		return ENOTDIR;
664 
665 	switch (name[0]) {
666 	case KERN_CLOCKINTR_STATS:
667 		memset(&sum, 0, sizeof sum);
668 		CPU_INFO_FOREACH(cii, ci) {
669 			cq = &ci->ci_queue;
670 			if (!ISSET(cq->cq_flags, CQ_INIT))
671 				continue;
672 			do {
673 				gen = cq->cq_gen;
674 				membar_consumer();
675 				tmp = cq->cq_stat;
676 				membar_consumer();
677 			} while (gen == 0 || gen != cq->cq_gen);
678 			sum.cs_dispatched += tmp.cs_dispatched;
679 			sum.cs_early += tmp.cs_early;
680 			sum.cs_earliness += tmp.cs_earliness;
681 			sum.cs_lateness += tmp.cs_lateness;
682 			sum.cs_prompt += tmp.cs_prompt;
683 			sum.cs_run += tmp.cs_run;
684 			sum.cs_spurious += tmp.cs_spurious;
685 		}
686 		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
687 	default:
688 		break;
689 	}
690 
691 	return EINVAL;
692 }
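
/*
 * Userland sketch (not in-tree code; assumes the kern.clockintr MIB
 * node under CTL_KERN that routes to the handler above): the summed
 * per-CPU statistics could be read roughly like this:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/clockintr.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	struct clockintr_stat cs;
 *	size_t len = sizeof(cs);
 *	int mib[3] = { CTL_KERN, KERN_CLOCKINTR, KERN_CLOCKINTR_STATS };
 *
 *	if (sysctl(mib, 3, &cs, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 *	printf("events run: %llu\n", (unsigned long long)cs.cs_run);
 */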
693 
694 #ifdef DDB
695 
696 #include <machine/db_machdep.h>
697 
698 #include <ddb/db_interface.h>
699 #include <ddb/db_output.h>
700 #include <ddb/db_sym.h>
701 
702 void db_show_clockintr(const struct clockintr *, const char *, u_int);
703 void db_show_clockintr_cpu(struct cpu_info *);
704 
705 void
706 db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
707 {
708 	struct timespec now;
709 	struct cpu_info *ci;
710 	CPU_INFO_ITERATOR cii;
711 
712 	nanouptime(&now);
713 	db_printf("%20s\n", "UPTIME");
714 	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
715 	db_printf("\n");
716 	db_printf("%20s  %5s  %3s  %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
717 	CPU_INFO_FOREACH(cii, ci) {
718 		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
719 			db_show_clockintr_cpu(ci);
720 	}
721 }
722 
723 void
724 db_show_clockintr_cpu(struct cpu_info *ci)
725 {
726 	struct clockintr *elm;
727 	struct clockintr_queue *cq = &ci->ci_queue;
728 	u_int cpu = CPU_INFO_UNIT(ci);
729 
730 	if (cq->cq_running != NULL)
731 		db_show_clockintr(cq->cq_running, "run", cpu);
732 	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
733 		db_show_clockintr(elm, "pend", cpu);
734 	TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
735 		if (!ISSET(elm->cl_flags, CLST_PENDING))
736 			db_show_clockintr(elm, "idle", cpu);
737 	}
738 }
739 
740 void
741 db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
742 {
743 	struct timespec ts;
744 	char *name;
745 	db_expr_t offset;
746 
747 	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
748 	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
749 	if (name == NULL)
750 		name = "?";
751 	db_printf("%10lld.%09ld  %5s  %3u  %s\n",
752 	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
753 }
754 
755 #endif /* DDB */
756 #endif /* __HAVE_CLOCKINTR */
757