/*	$OpenBSD: kern_clockintr.c,v 1.56 2023/09/17 15:24:35 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_hardclock(struct clockintr *, void *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(ci, clockintr_hardclock,
		    NULL);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock.  There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock->cl_expiration == 0) {
			clockintr_stagger(spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this.  We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer->cl_expiration == 0) {
		clockintr_stagger(spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock->cl_expiration == 0) {
		clockintr_stagger(spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin->cl_expiration == 0) {
		clockintr_stagger(spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}
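
/*
 * Informal usage sketch (illustrative only, pieced together from the
 * routines in this file): a periodic event is created once with
 * clockintr_establish() and thereafter rearmed with
 * clockintr_advance(), as clockintr_cpu_init() does above for the
 * hardclock.  The names myclock_func, sc and period_nsec below are
 * hypothetical placeholders:
 *
 *	cl = clockintr_establish(ci, myclock_func, sc);
 *	if (cl == NULL)
 *		panic("out of memory");
 *	clockintr_advance(cl, period_nsec);
 */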

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl, *shadow;
	struct clockintr_queue *cq = &ci->ci_queue;
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired.  Initialize a shadow copy
		 * and execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		shadow = &cq->cq_shadow;
		shadow->cl_expiration = cl->cl_expiration;
		shadow->cl_arg = cl->cl_arg;
		shadow->cl_func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		shadow->cl_func(shadow, frame, shadow->cl_arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(shadow->cl_flags, CLST_SHADOW_PENDING)) {
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
			clockqueue_pend_insert(cq, cl, shadow->cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}
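
/*
 * A note on the shadow copy used above (descriptive only): cl_func
 * runs against cq_shadow, a private copy, so the queue mutex can be
 * dropped for the duration of the callback.  A callback that wants to
 * run again calls clockintr_advance() or clockintr_schedule() on the
 * shadow, which merely sets CLST_SHADOW_PENDING; the request is
 * applied to the real clockintr after the callback returns, unless
 * the real clockintr was cancelled or rescheduled in the meantime and
 * CLST_IGNORE_SHADOW was set.
 */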

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		expiration = cl->cl_expiration;
		count = nsec_advance(&expiration, period, nsecuptime());
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
	return count;
}

uint64_t
clockintr_advance_random(struct clockintr *cl, uint64_t min, uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cl->cl_queue;
	uint32_t off;

	KASSERT(cl == &cq->cq_shadow);

	while (cl->cl_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cl->cl_expiration += min + off;
		count++;
	}
	SET(cl->cl_flags, CLST_SHADOW_PENDING);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

struct clockintr *
clockintr_establish(struct cpu_info *ci,
    void (*func)(struct clockintr *, void *, void *), void *arg)
{
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t n,
    uint32_t count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}
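
/*
 * Illustration of the staggering above (informal): with
 * period = 1000000000 (one second) and count = 4, n = 0, 1, 2, 3
 * yield initial expirations of 0, 250000000, 500000000 and 750000000
 * nanoseconds.  Because nsec_advance() only ever adds whole multiples
 * of the period, later clockintr_advance() calls with the same period
 * preserve these offsets, keeping the per-CPU events spread across
 * the period.
 */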

void
clockintr_hardclock(struct clockintr *cl, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
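
/*
 * Worked example (informal): with *next = 50, period = 10 and
 * now = 75, neither early return is taken, so
 * elapsed = (75 - 50) / 10 + 1 = 3, *next becomes 80 (the first
 * step past now) and 3 is returned.
 */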

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}
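
/*
 * A note on the lockless read above (descriptive only): the writer,
 * clockintr_dispatch(), zeroes cq_gen, issues membar_producer(),
 * updates cq_stat, issues membar_producer() again and then stores a
 * new nonzero generation.  The reader retries until it observes the
 * same nonzero generation before and after copying cq_stat, so it
 * never returns a torn snapshot.
 */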

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */