/* $OpenBSD: kern_clockintr.c,v 1.2 2022/12/31 00:48:53 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

#ifdef __HAVE_CLOCKINTR

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0) {
		schedclock_period = 1000000000 / schedhz;
		SET(clockintr_flags, CL_SCHEDCLOCK);
	}

	SET(clockintr_flags, flags | CL_INIT);
}

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier, now;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (!ISSET(cq->cq_flags, CL_CPU_INIT)) {
		if (ic != NULL) {
			cq->cq_intrclock = *ic;
			SET(cq->cq_flags, CL_CPU_INTRCLOCK);
		}
		cq->cq_gen = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * set multiplier to zero.
	 */
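	/*
	 * Illustrative example (values assumed, not from the source):
	 * with hz = 100 the hardclock period is 10000000 ns, so on a
	 * 4-CPU system each secondary CPU's first hardclock below is
	 * staggered by 10000000 / 4 * CPU_INFO_UNIT(ci) nanoseconds.
	 */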
	if (!CPU_IS_PRIMARY(ci) && ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		multiplier = CPU_INFO_UNIT(ci);
	else
		multiplier = 0;

	now = nsecuptime();

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT)) {
		cq->cq_next_hardclock = hardclock_period / ncpus * multiplier;
		nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 */
	cq->cq_next_statclock = stat_avg / ncpus * multiplier;
	nsec_advance(&cq->cq_next_statclock, stat_avg, now);
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		cq->cq_next_schedclock = schedclock_period / ncpus * multiplier;
		nsec_advance(&cq->cq_next_schedclock, schedclock_period, now);
	}

	SET(cq->cq_flags, CL_CPU_INIT);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t count, i, lateness, now, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct proc *p = curproc;
	uint32_t mask, min, off;
	u_int gen, ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	/*
	 * If we arrived too early we have nothing to do.
	 */
	start = nsecuptime();
	now = start;
	if (now < cq->cq_next)
		goto done;
	lateness = now - cq->cq_next;

	/*
	 * Dispatch expired events.
	 */
again:
	/* hardclock */
	count = nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	for (i = 0; i < count; i++)
		hardclock(frame);
	run += count;

	/* statclock */
	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
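		/*
		 * Lockless read of the statclock parameters: retry if
		 * an update is in progress (statclock_gen == 0) or if
		 * the generation changed while we were reading.  The
		 * writer is clockintr_setstatclockrate().
		 */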
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		while (cq->cq_next_statclock <= now) {
			while ((off = (random() & mask)) == 0)
				continue;
			cq->cq_next_statclock += min + off;
			count++;
		}
	} else {
		count = nsec_advance(&cq->cq_next_statclock, statclock_avg,
		    now);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
	run += count;

	/* schedclock */
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		count = nsec_advance(&cq->cq_next_schedclock,
		    schedclock_period, now);
		if (p != NULL) {
			for (i = 0; i < count; i++)
				schedclock(p);
		}
		run += count;
	}

	/* Run the dispatch again if the next event has already expired. */
	cq->cq_next = cq->cq_next_hardclock;
	if (cq->cq_next_statclock < cq->cq_next)
		cq->cq_next = cq->cq_next_statclock;
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		if (cq->cq_next_schedclock < cq->cq_next)
			cq->cq_next = cq->cq_next_schedclock;
	}
	now = nsecuptime();
	if (cq->cq_next <= now)
		goto again;

	/*
	 * Dispatch complete.
	 */
done:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_rearm(&cq->cq_intrclock, cq->cq_next - now);

	/* Update our stats. */
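	/*
	 * Zero cq_gen while we write so that sysctl_clockintr() can
	 * detect a torn read of cq_stat and retry.
	 */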
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += now - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += cq->cq_next - now;
	}
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
 */
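/*
 * Worked example (illustrative only, assuming freq = 100):
 *
 *	avg  = 1000000000 / 100 = 10000000 ns
 *	var  = 4194304 (largest power of two <= avg / 2)
 *	min  = 10000000 - 4194304 / 2 = 7902848 ns
 *	mask = 4194303
 *
 * so each pseudorandom statclock period falls in [7902849, 12097151]
 * nanoseconds and the midpoint of the range is the 10 ms average.
 */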
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}

/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() has initialized
 * both the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
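/*
 * Illustrative example (values assumed, not from the source): with
 * *next = 100, period = 10, and now = 137, the slow path computes
 * elapsed = (137 - 100) / 10 + 1 = 4, advances *next to 140 (the
 * first schedule point beyond now), and returns 4.
 */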
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CL_CPU_INIT))
				continue;
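			/*
			 * Take a lockless snapshot of cq_stat; retry
			 * while clockintr_dispatch() on that CPU is
			 * mid-update (cq_gen == 0) or if the generation
			 * changed while we were copying.
			 */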
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(uint64_t, u_int, const char *);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s  %3s  %s\n", "EXPIRATION", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CL_CPU_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	db_show_clockintr(cq->cq_next_hardclock, cpu, "hardclock");
	db_show_clockintr(cq->cq_next_statclock, cpu, "statclock");
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK))
		db_show_clockintr(cq->cq_next_schedclock, cpu, "schedclock");
}

void
db_show_clockintr(uint64_t expiration, u_int cpu, const char *name)
{
	struct timespec ts;

	NSEC_TO_TIMESPEC(expiration, &ts);
	db_printf("%10lld.%09ld  %3u  %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
}

#endif /* DDB */
#endif /* __HAVE_CLOCKINTR */