/*	$OpenBSD: kern_clockintr.c,v 1.64 2024/01/24 19:23:38 cheloha Exp $	*/
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock.cl_expiration == 0) {
		clockintr_bind(&cq->cq_hardclock, ci, clockintr_hardclock,
		    NULL);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts.  We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock.cl_expiration == 0)
			clockintr_schedule(&cq->cq_hardclock, 0);
		else
			clockintr_advance(&cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock.cl_expiration == 0) {
			clockintr_stagger(&cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(&cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock.  There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock.cl_expiration == 0) {
			clockintr_stagger(&spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(&spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this.  We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(&spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}
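
/*
 * A worked example of the staggering above, assuming a 10 ms
 * hardclock_period (hz = 100): clockintr_stagger() primes the initial
 * expiration with period / MAXCPUS * CPU_INFO_UNIT(ci), so each
 * secondary CPU's first hardclock lands at a distinct offset within
 * the 10000000 ns period.  clockintr_advance() then pushes that
 * expiration forward in whole periods past the current uptime, which
 * preserves the offset modulo the period and keeps the CPUs ticking
 * out of phase with one another.
 */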

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
	struct clockrequest *request = &cq->cq_request;
	void *arg;
	void (*func)(struct clockrequest *, void *, void *);
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired.  Execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		request->cr_expiration = cl->cl_expiration;
		arg = cl->cl_arg;
		func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		func(request, frame, arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cq->cq_flags, CQ_IGNORE_REQUEST)) {
			CLR(cq->cq_flags, CQ_IGNORE_REQUEST);
			CLR(request->cr_flags, CR_RESCHEDULE);
		}
		if (ISSET(request->cr_flags, CR_RESCHEDULE)) {
			CLR(request->cr_flags, CR_RESCHEDULE);
			clockqueue_pend_insert(cq, cl, request->cr_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}
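
/*
 * Note that cq_mtx is dropped around the func() call above.  The
 * callback runs unlocked and requests rescheduling indirectly by
 * setting CR_RESCHEDULE in the per-queue clockrequest (see
 * clockrequest_advance() below).  If clockintr_cancel() or
 * clockintr_schedule_locked() targets the running clockintr in the
 * meantime, it sets CQ_IGNORE_REQUEST so the dispatch loop discards
 * the callback's reschedule request.
 */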

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);

	return count;
}

uint64_t
clockrequest_advance(struct clockrequest *cr, uint64_t period)
{
	struct clockintr_queue *cq = cr->cr_queue;

	KASSERT(cr == &cq->cq_request);

	SET(cr->cr_flags, CR_RESCHEDULE);
	return nsec_advance(&cr->cr_expiration, period, cq->cq_uptime);
}

uint64_t
clockrequest_advance_random(struct clockrequest *cr, uint64_t min,
    uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cr->cr_queue;
	uint32_t off;

	KASSERT(cr == &cq->cq_request);

	while (cr->cr_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cr->cr_expiration += min + off;
		count++;
	}
	SET(cr->cr_flags, CR_RESCHEDULE);
	return count;
}
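
/*
 * For example, a call to clockrequest_advance_random() with
 * min = 1000000 and mask = 0xfffff adds a pseudorandom interval of
 * 1000001 to 2048575 nanoseconds per step: the masked random() value
 * is redrawn until it is nonzero, then added to min.  Steps repeat
 * until the expiration passes the dispatch uptime, and the number of
 * steps taken is returned.
 */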

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_bind(struct clockintr *cl, struct cpu_info *ci,
    void (*func)(struct clockrequest *, void *, void *), void *arg)
{
	struct clockintr_queue *cq = &ci->ci_queue;

	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
    uint32_t denom)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(numer < denom);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / denom * numer;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockrequest *cr, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockrequest_advance(cr, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_request.cr_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
	ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
	ic->ic_trigger(ic->ic_cookie);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
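
/*
 * A worked example of nsec_advance(): with *next = 100, period = 30,
 * and now = 175, neither fast path applies, so elapsed is
 * (175 - 100) / 30 + 1 = 3, *next becomes 100 + 3 * 30 = 190, and 3
 * is returned.  The "+ 1" guarantees that the new *next strictly
 * exceeds now, even when now - *next is an exact multiple of period.
 */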

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}
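
/*
 * The do/while loop above pairs with the stats update in
 * clockintr_dispatch(): cq_gen is zeroed while the dispatcher
 * modifies cq_stat and set to a new nonzero value afterward, so a
 * reader that observes cq_gen == 0 or a generation change mid-copy
 * retries until it has a consistent snapshot.
 */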

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	int width = sizeof(long) * 2 + 2;	/* +2 for "0x" prefix */

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %*s %s\n",
	    "EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;
	int width = sizeof(long) * 2;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu,
	    width, (unsigned long)cl->cl_arg, name);
}

#endif /* DDB */