/*	$OpenBSD: kern_clockintr.c,v 1.55 2023/09/17 15:05:45 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_hardclock(struct clockintr *, void *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch(). If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags. Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(ci, clockintr_hardclock,
		    NULL);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts. We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway. The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks. We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock. There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock->cl_expiration == 0) {
			clockintr_stagger(spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this. We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer->cl_expiration == 0) {
		clockintr_stagger(spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock->cl_expiration == 0) {
		clockintr_stagger(spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin->cl_expiration == 0) {
		clockintr_stagger(spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl, *shadow;
	struct clockintr_queue *cq = &ci->ci_queue;
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired. Initialize a shadow copy
		 * and execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		shadow = &cq->cq_shadow;
		shadow->cl_expiration = cl->cl_expiration;
		shadow->cl_arg = cl->cl_arg;
		shadow->cl_func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		shadow->cl_func(shadow, frame, shadow->cl_arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(shadow->cl_flags, CLST_SHADOW_PENDING)) {
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
			clockqueue_pend_insert(cq, cl, shadow->cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

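/*
 * Advance cl's expiration in increments of period until it lies in the
 * future, then reschedule it. Returns the number of increments applied.
 * Advancing the shadow copy only marks it pending: the real clockintr
 * is requeued by clockintr_dispatch() after the callback returns.
 */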
uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		expiration = cl->cl_expiration;
		count = nsec_advance(&expiration, period, nsecuptime());
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
	return count;
}

uint64_t
clockintr_advance_random(struct clockintr *cl, uint64_t min, uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cl->cl_queue;
	uint32_t off;

	KASSERT(cl == &cq->cq_shadow);

	while (cl->cl_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cl->cl_expiration += min + off;
		count++;
	}
	SET(cl->cl_flags, CLST_SHADOW_PENDING);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

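/*
 * Allocate a new clockintr bound to the given CPU's queue and record
 * func/arg as its callback. The new clockintr is not scheduled.
 * Returns NULL if allocation fails.
 */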
struct clockintr *
clockintr_establish(void *vci,
    void (*func)(struct clockintr *, void *, void *), void *arg)
{
	struct cpu_info *ci = vci;
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t n,
    uint32_t count)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(n < count);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / count * n;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockintr *cl, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

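/*
 * Reprogram the interrupt clock to fire at the queue's next expiration
 * time, or trigger it immediately if that deadline has already passed.
 */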
void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %s\n", "EXPIRATION", "STATE", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

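/*
 * Print one line describing the given clockintr: expiration time,
 * state, owning CPU, and the name of its callback function.
 */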
void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, name);
}

#endif /* DDB */