/*	$OpenBSD: kern_clockintr.c,v 1.53 2023/09/15 11:48:49 deraadt Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

/*
 * Protection for global variables in this file:
 *
 *	I	Immutable after initialization.
 */
uint32_t clockintr_flags;		/* [I] global state + behavior flags */

void clockintr_hardclock(struct clockintr *, void *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(uint32_t flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	SET(clockintr_flags, flags | CL_INIT);
}

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(ci, clockintr_hardclock,
		    NULL);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock.  There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock->cl_expiration == 0) {
			clockintr_stagger(spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this.  We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer->cl_expiration == 0) {
		clockintr_stagger(spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock->cl_expiration == 0) {
		clockintr_stagger(spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin->cl_expiration == 0) {
		clockintr_stagger(spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl, *shadow;
	struct clockintr_queue *cq = &ci->ci_queue;
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired.  Initialize a shadow copy
		 * and execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		shadow = &cq->cq_shadow;
		shadow->cl_expiration = cl->cl_expiration;
		shadow->cl_arg = cl->cl_arg;
		shadow->cl_func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		shadow->cl_func(shadow, frame, shadow->cl_arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(shadow->cl_flags, CLST_SHADOW_PENDING)) {
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
			clockqueue_pend_insert(cq, cl, shadow->cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		expiration = cl->cl_expiration;
		count = nsec_advance(&expiration, period, nsecuptime());
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
	return count;
}

uint64_t
clockintr_advance_random(struct clockintr *cl, uint64_t min, uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cl->cl_queue;
	uint32_t off;

	KASSERT(cl == &cq->cq_shadow);

	while (cl->cl_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cl->cl_expiration += min + off;
		count++;
	}
	SET(cl->cl_flags, CLST_SHADOW_PENDING);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
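				/*
				 * The canceled event was the next to expire.
				 * Reprogram the interrupt clock for the new
				 * head of the queue, but only if the queue
				 * belongs to the calling CPU.
				 */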
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

struct clockintr *
clockintr_establish(void *vci,
    void (*func)(struct clockintr *, void *, void *), void *arg)
{
	struct cpu_info *ci = vci;
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_est, cl, cl_elink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t n,
    uint32_t count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockintr *cl, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_est);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
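	/* The event must not already be on the pending queue. */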
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_est, cl_elink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */