/*	$OpenBSD: kern_clockintr.c,v 1.62 2023/10/17 00:04:02 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch(). If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags. Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(ci, clockintr_hardclock,
		    NULL);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts. We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway. The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
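	 *
	 * For example (illustrative figures: with hz=100 the
	 * hardclock_period is 10000000 nsec, and MAXCPUS is 64), the CPU
	 * with CPU_INFO_UNIT(ci) == 2 starts its hardclock at offset
	 * 10000000 / 64 * 2 = 312500 nsec via clockintr_stagger() below.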
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks. We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock. There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock->cl_expiration == 0) {
			clockintr_stagger(spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this. We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer->cl_expiration == 0) {
		clockintr_stagger(spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock->cl_expiration == 0) {
		clockintr_stagger(spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin->cl_expiration == 0) {
		clockintr_stagger(spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
	struct clockrequest *request = &cq->cq_request;
	void *arg;
	void (*func)(struct clockrequest *, void *, void *);
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired. Execute it.
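		 * The callback runs with cq_mtx released; it may use the
		 * clockrequest interface (see CR_RESCHEDULE below) to ask
		 * to be rescheduled.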
		 */
		clockqueue_pend_delete(cq, cl);
		request->cr_expiration = cl->cl_expiration;
		arg = cl->cl_arg;
		func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		func(request, frame, arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_REQUEST)) {
			CLR(cl->cl_flags, CLST_IGNORE_REQUEST);
			CLR(request->cr_flags, CR_RESCHEDULE);
		}
		if (ISSET(request->cr_flags, CR_RESCHEDULE)) {
			CLR(request->cr_flags, CR_RESCHEDULE);
			clockqueue_pend_insert(cq, cl, request->cr_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);

	return count;
}

uint64_t
clockrequest_advance(struct clockrequest *cr, uint64_t period)
{
	struct clockintr_queue *cq = cr->cr_queue;

	KASSERT(cr == &cq->cq_request);

	SET(cr->cr_flags, CR_RESCHEDULE);
	return nsec_advance(&cr->cr_expiration, period, cq->cq_uptime);
}

uint64_t
clockrequest_advance_random(struct clockrequest *cr, uint64_t min,
    uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cr->cr_queue;
	uint32_t off;

	KASSERT(cr == &cq->cq_request);

	while (cr->cr_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cr->cr_expiration += min + off;
		count++;
	}
	SET(cr->cr_flags, CR_RESCHEDULE);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_REQUEST);
	mtx_leave(&cq->cq_mtx);
}

struct clockintr *
clockintr_establish(struct cpu_info *ci,
    void (*func)(struct clockrequest *, void *, void *), void *arg)
{
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;
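
	/* The M_NOWAIT allocation can fail; callers must check for NULL. */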
	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_REQUEST);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
    uint32_t denom)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(numer < denom);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / denom * numer;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockrequest *cr, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockrequest_advance(cr, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_request.cr_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));
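
	/*
	 * Program the intrclock for the next pending expiration. If it
	 * is already due, trigger the interrupt instead so the dispatch
	 * runs as soon as possible.
	 */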
	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
	ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
	ic->ic_trigger(ic->ic_cookie);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	int width = sizeof(long) * 2 + 2;	/* +2 for "0x" prefix */

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %*s %s\n",
	    "EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}
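
/*
 * Print one clockintr: its expiration in seconds.nanoseconds of uptime,
 * its state, its CPU, its argument pointer, and the symbol name of its
 * callback.
 */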
void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;
	int width = sizeof(long) * 2;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu,
	    width, (unsigned long)cl->cl_arg, name);
}

#endif /* DDB */