/*	$OpenBSD: kern_clockintr.c,v 1.2 2022/12/31 00:48:53 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

#ifdef __HAVE_CLOCKINTR

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0) {
		schedclock_period = 1000000000 / schedhz;
		SET(clockintr_flags, CL_SCHEDCLOCK);
	}

	SET(clockintr_flags, flags | CL_INIT);
}
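
/*
 * Illustration of the interval computation above, assuming hz = 100,
 * stathz = 128, and profhz = 1024 (the real values are machine- and
 * configuration-dependent):
 *
 *	hardclock_period = 1000000000 / 100  = 10000000 ns (10 ms)
 *	stat_avg         = 1000000000 / 128  =  7812500 ns (~7.8 ms)
 *	prof_avg         = 1000000000 / 1024 =   976562 ns (~1 ms)
 *
 * stat_avg and prof_avg are filled in by clockintr_statvar_init(),
 * along with the corresponding min/mask pairs used for CL_RNDSTAT.
 */
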
/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier, now;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (!ISSET(cq->cq_flags, CL_CPU_INIT)) {
		if (ic != NULL) {
			cq->cq_intrclock = *ic;
			SET(cq->cq_flags, CL_CPU_INTRCLOCK);
		}
		cq->cq_gen = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * set multiplier to zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		multiplier = CPU_INFO_UNIT(ci);
	else
		multiplier = 0;

	now = nsecuptime();

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT)) {
		cq->cq_next_hardclock = hardclock_period / ncpus * multiplier;
		nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 */
	cq->cq_next_statclock = stat_avg / ncpus * multiplier;
	nsec_advance(&cq->cq_next_statclock, stat_avg, now);
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		cq->cq_next_schedclock = schedclock_period / ncpus *
		    multiplier;
		nsec_advance(&cq->cq_next_schedclock, schedclock_period, now);
	}

	SET(cq->cq_flags, CL_CPU_INIT);
}
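
/*
 * Illustration of the stagger above, assuming 4 CPUs with intrclocks,
 * hardclock_period = 10000000 ns, and CPU_INFO_UNIT() values 0-3:
 *
 *	CPU0 (primary):	multiplier 0 -> first hardclock at offset 0.0 ms
 *	CPU1:		multiplier 1 -> first hardclock at offset 2.5 ms
 *	CPU2:		multiplier 2 -> first hardclock at offset 5.0 ms
 *	CPU3:		multiplier 3 -> first hardclock at offset 7.5 ms
 *
 * nsec_advance() then pushes each deadline forward in whole periods
 * until it lies in the future, so the offsets describe each CPU's phase
 * within the hardclock period, not an absolute time.
 */
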
/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t count, i, lateness, now, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct proc *p = curproc;
	uint32_t mask, min, off;
	u_int gen, ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	/*
	 * If we arrived too early we have nothing to do.
	 */
	start = nsecuptime();
	now = start;
	if (now < cq->cq_next)
		goto done;
	lateness = now - cq->cq_next;

	/*
	 * Dispatch expired events.
	 */
again:
	/* hardclock */
	count = nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	for (i = 0; i < count; i++)
		hardclock(frame);
	run += count;

	/* statclock */
	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		while (cq->cq_next_statclock <= now) {
			while ((off = (random() & mask)) == 0)
				continue;
			cq->cq_next_statclock += min + off;
			count++;
		}
	} else {
		count = nsec_advance(&cq->cq_next_statclock, statclock_avg,
		    now);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
	run += count;

	/* schedclock */
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		count = nsec_advance(&cq->cq_next_schedclock,
		    schedclock_period, now);
		if (p != NULL) {
			for (i = 0; i < count; i++)
				schedclock(p);
		}
		run += count;
	}

	/* Run the dispatch again if the next event has already expired. */
	cq->cq_next = cq->cq_next_hardclock;
	if (cq->cq_next_statclock < cq->cq_next)
		cq->cq_next = cq->cq_next_statclock;
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		if (cq->cq_next_schedclock < cq->cq_next)
			cq->cq_next = cq->cq_next_schedclock;
	}
	now = nsecuptime();
	if (cq->cq_next <= now)
		goto again;

	/*
	 * Dispatch complete.
	 */
done:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_rearm(&cq->cq_intrclock, cq->cq_next - now);

	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += now - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += cq->cq_next - now;
	}
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
 */
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}
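
/*
 * Worked example of the computation above, assuming freq = 128 (stathz):
 *
 *	avg      = 1000000000 / 128 = 7812500 ns
 *	half_avg = 3906250; the largest power of two <= half_avg is
 *	var      = 2097152 (2^21)
 *	min      = 7812500 - 2097152 / 2 = 6763924 ns
 *	mask     = 2097151 (0x1fffff)
 *
 * During dispatch, CL_RNDSTAT then draws each statclock period from
 * [min + 1, min + mask] = [6763925, 8861075] ns, which averages out
 * to roughly avg.
 */
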
/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() initializes both
 * the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
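
/*
 * Example of nsec_advance(), assuming *next = 5000000, period = 10000000,
 * and now = 27000000 (all in ns):
 *
 *	now >= *next + period, so
 *	elapsed = (27000000 - 5000000) / 10000000 + 1 = 3
 *	*next becomes 5000000 + 3 * 10000000 = 35000000 > now
 *
 * i.e. three periods expired and the deadline again lies in the future.
 */
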
int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CL_CPU_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(uint64_t, u_int, const char *);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %3s %s\n", "EXPIRATION", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CL_CPU_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	db_show_clockintr(cq->cq_next_hardclock, cpu, "hardclock");
	db_show_clockintr(cq->cq_next_statclock, cpu, "statclock");
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK))
		db_show_clockintr(cq->cq_next_schedclock, cpu, "schedclock");
}

void
db_show_clockintr(uint64_t expiration, u_int cpu, const char *name)
{
	struct timespec ts;

	NSEC_TO_TIMESPEC(expiration, &ts);
	db_printf("%10lld.%09ld %3u %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
}

#endif /* DDB */
#endif /* __HAVE_CLOCKINTR */