1 /* $NetBSD: kern_heartbeat.c,v 1.14 2024/08/25 01:14:01 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2023 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress 31 * 32 * Manual tests to run when changing this file. Magic numbers are for 33 * evbarm; adjust for other platforms. Tests involving cpuctl 34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU 35 * system, offline all but one CPU. 36 * 37 * 1. cpuctl offline 0 38 * sleep 20 39 * cpuctl online 0 40 * 41 * 2. cpuctl offline 1 42 * sleep 20 43 * cpuctl online 1 44 * 45 * 3. 
cpuctl offline 0 46 * sysctl -w kern.heartbeat.max_period=5 47 * sleep 10 48 * sysctl -w kern.heartbeat.max_period=0 49 * sleep 10 50 * sysctl -w kern.heartbeat.max_period=5 51 * sleep 10 52 * cpuctl online 0 53 * 54 * 4. sysctl -w debug.crashme_enable=1 55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK 56 * # verify system panics after 15sec, with a stack trace through 57 * # crashme_spl_spinout 58 * 59 * 5. sysctl -w debug.crashme_enable=1 60 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED 61 * # verify system panics after 15sec, with a stack trace through 62 * # crashme_spl_spinout 63 * 64 * 6. cpuctl offline 0 65 * sysctl -w debug.crashme_enable=1 66 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK 67 * # verify system panics after 15sec, with a stack trace through 68 * # crashme_spl_spinout 69 * 70 * 7. cpuctl offline 0 71 * sysctl -w debug.crashme_enable=1 72 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM 73 * # verify system panics after 15sec, with a stack trace through 74 * # crashme_spl_spinout 75 * 76 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU 77 * # require a hardware watchdog timer. 78 * #cpuctl offline 0 79 * #sysctl -w debug.crashme_enable=1 80 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED 81 * # hope watchdog timer kicks in 82 */ 83 84 #include <sys/cdefs.h> 85 __KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.14 2024/08/25 01:14:01 riastradh Exp $"); 86 87 #ifdef _KERNEL_OPT 88 #include "opt_ddb.h" 89 #include "opt_heartbeat.h" 90 #endif 91 92 #include "heartbeat.h" 93 94 #include <sys/param.h> 95 #include <sys/types.h> 96 97 #include <sys/atomic.h> 98 #include <sys/cpu.h> 99 #include <sys/errno.h> 100 #include <sys/heartbeat.h> 101 #include <sys/ipi.h> 102 #include <sys/kernel.h> 103 #include <sys/mutex.h> 104 #include <sys/sysctl.h> 105 #include <sys/systm.h> 106 #include <sys/xcall.h> 107 108 #ifdef DDB 109 #include <ddb/ddb.h> 110 #endif 111 112 /* 113 * Global state.
114 * 115 * heartbeat_lock serializes access to heartbeat_max_period_secs 116 * and heartbeat_max_period_ticks. Two separate variables so we 117 * can avoid multiplication or division in the heartbeat routine. 118 * 119 * heartbeat_sih is stable after initialization in 120 * heartbeat_start. 121 */ 122 kmutex_t heartbeat_lock __cacheline_aligned; 123 unsigned heartbeat_max_period_secs __read_mostly; 124 unsigned heartbeat_max_period_ticks __read_mostly; 125 126 void *heartbeat_sih __read_mostly; 127 128 /* 129 * heartbeat_suspend() 130 * 131 * Suspend heartbeat monitoring of the current CPU. 132 * 133 * Called after the current CPU has been marked offline but before 134 * it has stopped running, or after IPL has been raised for 135 * polling-mode console input. Nestable (but only 2^32 times, so 136 * don't do this in a loop). Reversed by heartbeat_resume. 137 * 138 * Caller must be bound to the CPU, i.e., curcpu_stable() must be 139 * true. This function does not assert curcpu_stable() since it 140 * is used in the ddb entry path, where any assertions risk 141 * infinite regress into undebuggable chaos, so callers must be 142 * careful. 143 */ 144 void 145 heartbeat_suspend(void) 146 { 147 unsigned *p; 148 149 p = &curcpu()->ci_heartbeat_suspend; 150 atomic_store_relaxed(p, *p + 1); 151 } 152 153 /* 154 * heartbeat_resume_cpu(ci) 155 * 156 * Resume heartbeat monitoring of ci. 157 * 158 * Called at startup while cold, and whenever heartbeat monitoring 159 * is re-enabled after being disabled or the period is changed. 160 * When not cold, ci must be the current CPU. 161 * 162 * Must be run at splsched. 
163 */ 164 static void 165 heartbeat_resume_cpu(struct cpu_info *ci) 166 { 167 168 KASSERT(__predict_false(cold) || curcpu_stable()); 169 KASSERT(__predict_false(cold) || ci == curcpu()); 170 /* XXX KASSERT IPL_SCHED */ 171 172 ci->ci_heartbeat_count = 0; 173 ci->ci_heartbeat_uptime_cache = time_uptime32; 174 ci->ci_heartbeat_uptime_stamp = 0; 175 } 176 177 /* 178 * heartbeat_resume() 179 * 180 * Resume heartbeat monitoring of the current CPU. 181 * 182 * Called after the current CPU has started running but before it 183 * has been marked online, or when ending polling-mode input 184 * before IPL is restored. Reverses heartbeat_suspend. 185 * 186 * Caller must be bound to the CPU, i.e., curcpu_stable() must be 187 * true. 188 */ 189 void 190 heartbeat_resume(void) 191 { 192 struct cpu_info *ci = curcpu(); 193 unsigned *p; 194 int s; 195 196 KASSERT(curcpu_stable()); 197 198 /* 199 * Reset the state so nobody spuriously thinks we had a heart 200 * attack as soon as the heartbeat checks resume. 201 */ 202 s = splsched(); 203 heartbeat_resume_cpu(ci); 204 splx(s); 205 206 p = &ci->ci_heartbeat_suspend; 207 atomic_store_relaxed(p, *p - 1); 208 } 209 210 /* 211 * heartbeat_timecounter_suspended() 212 * 213 * True if timecounter heartbeat checks are suspended because the 214 * timecounter may not be advancing, false if heartbeat checks 215 * should check for timecounter progress. 216 */ 217 static bool 218 heartbeat_timecounter_suspended(void) 219 { 220 CPU_INFO_ITERATOR cii; 221 struct cpu_info *ci; 222 223 /* 224 * The timecounter ticks only on the primary CPU. Check 225 * whether it's suspended. 226 * 227 * XXX Would be nice if we could find the primary CPU without 228 * iterating over all CPUs. 229 */ 230 for (CPU_INFO_FOREACH(cii, ci)) { 231 if (CPU_IS_PRIMARY(ci)) 232 return atomic_load_relaxed(&ci->ci_heartbeat_suspend); 233 } 234 235 /* 236 * This should be unreachable -- there had better be a primary 237 * CPU in the system! 
If not, the timecounter will be busted 238 * anyway. 239 */ 240 panic("no primary CPU"); 241 } 242 243 /* 244 * heartbeat_reset_xc(a, b) 245 * 246 * Cross-call handler to reset heartbeat state just prior to 247 * enabling heartbeat checks. 248 */ 249 static void 250 heartbeat_reset_xc(void *a, void *b) 251 { 252 int s; 253 254 s = splsched(); 255 heartbeat_resume_cpu(curcpu()); 256 splx(s); 257 } 258 259 /* 260 * set_max_period(max_period) 261 * 262 * Set the maximum period, in seconds, for heartbeat checks. 263 * 264 * - If max_period is zero, disable them. 265 * 266 * - If the max period was zero and max_period is nonzero, ensure 267 * all CPUs' heartbeat uptime caches are up-to-date before 268 * re-enabling them. 269 * 270 * max_period must be below UINT_MAX/4/hz to avoid arithmetic 271 * overflow and give room for slop. 272 * 273 * Caller must hold heartbeat_lock. 274 */ 275 static void 276 set_max_period(unsigned max_period) 277 { 278 279 KASSERTMSG(max_period <= UINT_MAX/4/hz, 280 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)", 281 max_period, UINT_MAX/4/hz, hz); 282 KASSERT(mutex_owned(&heartbeat_lock)); 283 284 /* 285 * If we're enabling heartbeat checks, make sure we have a 286 * reasonably up-to-date time_uptime32 cache on all CPUs so we 287 * don't think we had an instant heart attack. 288 */ 289 if (heartbeat_max_period_secs == 0 && max_period != 0) { 290 if (cold) { 291 CPU_INFO_ITERATOR cii; 292 struct cpu_info *ci; 293 294 for (CPU_INFO_FOREACH(cii, ci)) 295 heartbeat_resume_cpu(ci); 296 } else { 297 const uint64_t ticket = 298 xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL); 299 xc_wait(ticket); 300 } 301 } 302 303 /* 304 * Once the heartbeat state has been updated on all (online) 305 * CPUs, set the period. At this point, heartbeat checks can 306 * begin. 
307 */ 308 atomic_store_relaxed(&heartbeat_max_period_secs, max_period); 309 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz); 310 } 311 312 /* 313 * heartbeat_max_period_ticks(SYSCTLFN_ARGS) 314 * 315 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies 316 * it lies within a reasonable interval and sets it. 317 */ 318 static int 319 heartbeat_max_period_sysctl(SYSCTLFN_ARGS) 320 { 321 struct sysctlnode node; 322 unsigned max_period; 323 int error; 324 325 mutex_enter(&heartbeat_lock); 326 327 max_period = heartbeat_max_period_secs; 328 node = *rnode; 329 node.sysctl_data = &max_period; 330 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 331 if (error || newp == NULL) 332 goto out; 333 334 /* 335 * Ensure there's plenty of slop between heartbeats. 336 */ 337 if (max_period > UINT_MAX/4/hz) { 338 error = EOVERFLOW; 339 goto out; 340 } 341 342 /* 343 * Success! Set the period. This enables heartbeat checks if 344 * we went from zero period to nonzero period, or disables them 345 * if the other way around. 346 */ 347 set_max_period(max_period); 348 error = 0; 349 350 out: mutex_exit(&heartbeat_lock); 351 return error; 352 } 353 354 /* 355 * sysctl_heartbeat_setup() 356 * 357 * Set up the kern.heartbeat.* sysctl subtree. 
358 */ 359 SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup") 360 { 361 const struct sysctlnode *rnode; 362 int error; 363 364 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE); 365 366 /* kern.heartbeat */ 367 error = sysctl_createv(NULL, 0, NULL, &rnode, 368 CTLFLAG_PERMANENT, 369 CTLTYPE_NODE, "heartbeat", 370 SYSCTL_DESCR("Kernel heartbeat parameters"), 371 NULL, 0, NULL, 0, 372 CTL_KERN, CTL_CREATE, CTL_EOL); 373 if (error) { 374 printf("%s: failed to create kern.heartbeat: %d\n", 375 __func__, error); 376 return; 377 } 378 379 /* kern.heartbeat.max_period */ 380 error = sysctl_createv(NULL, 0, &rnode, NULL, 381 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 382 CTLTYPE_INT, "max_period", 383 SYSCTL_DESCR("Max seconds between heartbeats before panic"), 384 &heartbeat_max_period_sysctl, 0, NULL, 0, 385 CTL_CREATE, CTL_EOL); 386 if (error) { 387 printf("%s: failed to create kern.heartbeat.max_period: %d\n", 388 __func__, error); 389 return; 390 } 391 } 392 393 /* 394 * heartbeat_intr(cookie) 395 * 396 * Soft interrupt handler to update the local CPU's view of the 397 * system uptime. This runs at the same priority level as 398 * callouts, so if callouts are stuck on this CPU, it won't run, 399 * and eventually another CPU will notice that this one is stuck. 400 * 401 * Don't do spl* here -- keep it to a minimum so if anything goes 402 * wrong we don't end up with hard interrupts blocked and unable 403 * to detect a missed heartbeat. 404 */ 405 static void 406 heartbeat_intr(void *cookie) 407 { 408 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count); 409 unsigned uptime = time_uptime32; 410 411 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count); 412 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime); 413 } 414 415 /* 416 * heartbeat_start() 417 * 418 * Start system heartbeat monitoring. 
419 */ 420 void 421 heartbeat_start(void) 422 { 423 enum { max_period = HEARTBEAT_MAX_PERIOD_DEFAULT }; 424 425 /* 426 * Ensure the maximum period is small enough that we never have 427 * to worry about 32-bit wraparound even if there's a lot of 428 * slop. (In fact this is required to be less than 429 * UINT_MAX/4/hz, but that's not a compile-time constant.) 430 */ 431 __CTASSERT(max_period < UINT_MAX/4); 432 433 /* 434 * Establish a softint so we can schedule it once ready. This 435 * should be at the lowest softint priority level so that we 436 * ensure all softint priorities are making progress. 437 */ 438 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE, 439 &heartbeat_intr, NULL); 440 441 /* 442 * Now that the softint is established, kick off heartbeat 443 * monitoring with the default period. This will initialize 444 * the per-CPU state to an up-to-date cache of time_uptime32. 445 */ 446 mutex_enter(&heartbeat_lock); 447 set_max_period(max_period); 448 mutex_exit(&heartbeat_lock); 449 } 450 451 /* 452 * defibrillator(cookie) 453 * 454 * IPI handler for defibrillation. If the CPU's heart has stopped 455 * beating normally, but the CPU can still execute things, 456 * acknowledge the IPI to the doctor and then panic so we at least 457 * get a stack trace from whatever the current CPU is stuck doing, 458 * if not a core dump. 459 * 460 * (This metaphor is a little stretched, since defibrillation is 461 * usually administered when the heart is beating errattically but 462 * hasn't stopped, and causes the heart to stop temporarily, and 463 * one hopes it is not fatal. But we're (software) engineers, so 464 * we can stretch metaphors like silly putty in a blender.) 465 */ 466 static void 467 defibrillator(void *cookie) 468 { 469 bool *ack = cookie; 470 471 /* 472 * Acknowledge the interrupt so the doctor CPU won't trigger a 473 * new panic for defibrillation timeout. 
474 */ 475 atomic_store_relaxed(ack, true); 476 477 /* 478 * If a panic is already in progress, we may have interrupted 479 * the logic that prints a stack trace on this CPU -- so let's 480 * not make it worse by giving the misapprehension of a 481 * recursive panic. 482 */ 483 if (atomic_load_relaxed(&panicstr) != NULL) 484 return; 485 486 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()), 487 curlwp->l_lid, 488 curlwp->l_name ? curlwp->l_name : curproc->p_comm); 489 } 490 491 /* 492 * defibrillate(ci, unsigned d) 493 * 494 * The patient CPU ci's heart has stopped beating after d seconds. 495 * Force the patient CPU ci to panic, or panic on this CPU if the 496 * patient CPU doesn't respond within 1sec. 497 */ 498 static void __noinline 499 defibrillate(struct cpu_info *ci, unsigned d) 500 { 501 bool ack = false; 502 ipi_msg_t msg = { 503 .func = &defibrillator, 504 .arg = &ack, 505 }; 506 unsigned countdown = 1000; /* 1sec */ 507 508 KASSERT(curcpu_stable()); 509 510 /* 511 * First notify the console that the patient CPU's heart seems 512 * to have stopped beating. 513 */ 514 printf("%s: found %s heart stopped beating after %u seconds\n", 515 cpu_name(curcpu()), cpu_name(ci), d); 516 517 /* 518 * Next, give the patient CPU a chance to panic, so we get a 519 * stack trace on that CPU even if we don't get a crash dump. 520 */ 521 ipi_unicast(&msg, ci); 522 523 /* 524 * Busy-wait up to 1sec for the patient CPU to print a stack 525 * trace and panic. If the patient CPU acknowledges the IPI, 526 * just give up and stop here -- the system is coming down soon 527 * and we should avoid getting in the way. 528 */ 529 while (countdown --> 0) { 530 if (atomic_load_relaxed(&ack)) 531 return; 532 DELAY(1000); /* 1ms */ 533 } 534 535 /* 536 * The patient CPU failed to acknowledge the panic request. 537 * Panic now; with any luck, we'll get a crash dump. 
538 */ 539 panic("%s: found %s heart stopped beating and unresponsive", 540 cpu_name(curcpu()), cpu_name(ci)); 541 } 542 543 /* 544 * select_patient() 545 * 546 * Select another CPU to check the heartbeat of. Returns NULL if 547 * there are no other online CPUs. Never returns curcpu(). 548 * Caller must have kpreemption disabled. 549 */ 550 static struct cpu_info * 551 select_patient(void) 552 { 553 CPU_INFO_ITERATOR cii; 554 struct cpu_info *first = NULL, *patient = NULL, *ci; 555 bool passedcur = false; 556 557 KASSERT(curcpu_stable()); 558 559 /* 560 * In the iteration order of all CPUs, find the next online CPU 561 * after curcpu(), or the first online one if curcpu() is last 562 * in the iteration order. 563 */ 564 for (CPU_INFO_FOREACH(cii, ci)) { 565 if (atomic_load_relaxed(&ci->ci_heartbeat_suspend)) 566 continue; 567 if (passedcur) { 568 /* 569 * (...|curcpu()|ci|...) 570 * 571 * Found the patient right after curcpu(). 572 */ 573 KASSERT(patient != ci); 574 patient = ci; 575 break; 576 } 577 if (ci == curcpu()) { 578 /* 579 * (...|prev|ci=curcpu()|next|...) 580 * 581 * Note that we want next (or first, if there's 582 * nothing after curcpu()). 583 */ 584 passedcur = true; 585 continue; 586 } 587 if (first == NULL) { 588 /* 589 * (ci|...|curcpu()|...) 590 * 591 * Record ci as first in case there's nothing 592 * after curcpu(). 593 */ 594 first = ci; 595 continue; 596 } 597 } 598 599 /* 600 * If we hit the end, wrap around to the beginning. 601 */ 602 if (patient == NULL) { 603 KASSERT(passedcur); 604 patient = first; 605 } 606 607 return patient; 608 } 609 610 /* 611 * heartbeat() 612 * 613 * 1. Count a heartbeat on the local CPU. 614 * 615 * 2. Panic if the system uptime doesn't seem to have advanced in 616 * a while. 617 * 618 * 3. Panic if the soft interrupt on this CPU hasn't advanced the 619 * local view of the system uptime. 620 * 621 * 4. Schedule the soft interrupt to advance the local view of the 622 * system uptime. 623 * 624 * 5. 
Select another CPU to check the heartbeat of. 625 * 626 * 6. Panic if the other CPU hasn't advanced its view of the 627 * system uptime in a while. 628 */ 629 void 630 heartbeat(void) 631 { 632 unsigned period_ticks, period_secs; 633 unsigned count, uptime, cache, stamp, d; 634 struct cpu_info *patient; 635 636 KASSERT(curcpu_stable()); 637 638 /* 639 * If heartbeat checks are disabled globally, or if they are 640 * suspended locally, or if we're already panicking so it's not 641 * helpful to trigger more panics for more reasons, do nothing. 642 */ 643 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks); 644 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs); 645 if (__predict_false(period_ticks == 0) || 646 __predict_false(period_secs == 0) || 647 __predict_false(curcpu()->ci_heartbeat_suspend) || 648 __predict_false(panicstr != NULL)) 649 return; 650 651 /* 652 * Count a heartbeat on this CPU. 653 */ 654 count = curcpu()->ci_heartbeat_count++; 655 656 /* 657 * If the uptime hasn't changed, make sure that we haven't 658 * counted too many of our own heartbeats since the uptime last 659 * changed, and stop here -- we only do the cross-CPU work once 660 * per second. 661 */ 662 uptime = time_uptime32; 663 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache); 664 if (__predict_true(cache == uptime)) { 665 /* 666 * Timecounter hasn't advanced by more than a second. 667 * Make sure the timecounter isn't stuck according to 668 * our heartbeats -- unless timecounter heartbeats are 669 * suspended too. 670 * 671 * Our own heartbeat count can't roll back, and 672 * time_uptime32 should be updated before it wraps 673 * around, so d should never go negative; hence no 674 * check for d < UINT_MAX/2. 
675 */ 676 stamp = 677 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp); 678 d = count - stamp; 679 if (__predict_false(d > period_ticks) && 680 !heartbeat_timecounter_suspended()) { 681 panic("%s: time has not advanced in %u heartbeats", 682 cpu_name(curcpu()), d); 683 } 684 return; 685 } 686 687 /* 688 * If the uptime has changed, make sure that it hasn't changed 689 * so much that softints must be stuck on this CPU. Since 690 * time_uptime32 is monotonic and our cache of it is updated at 691 * most every UINT_MAX/4/hz sec (hence no concern about 692 * wraparound even after 68 or 136 years), this can't go 693 * negative, hence no check for d < UINT_MAX/2. 694 * 695 * This uses the hard timer interrupt handler on the current 696 * CPU to ensure soft interrupts at all priority levels have 697 * made progress. 698 */ 699 d = uptime - cache; 700 if (__predict_false(d > period_secs)) { 701 panic("%s: softints stuck for %u seconds", 702 cpu_name(curcpu()), d); 703 } 704 705 /* 706 * Schedule a softint to update our cache of the system uptime 707 * so the next call to heartbeat, on this or another CPU, can 708 * detect progress on this one. 709 */ 710 softint_schedule(heartbeat_sih); 711 712 /* 713 * Select a patient to check the heartbeat of. If there's no 714 * other online CPU, nothing to do. 715 */ 716 patient = select_patient(); 717 if (patient == NULL) 718 return; 719 720 /* 721 * Verify that time is advancing on the patient CPU. If the 722 * delta exceeds UINT_MAX/2, that means it is already ahead by 723 * a little on the other CPU, and the subtraction went 724 * negative, which is OK. If the CPU's heartbeats have been 725 * suspended since we selected it, no worries. 726 * 727 * This uses the current CPU to ensure the other CPU has made 728 * progress, even if the other CPU's hard timer interrupt 729 * handler is stuck for some reason. 730 * 731 * XXX Maybe confirm it hasn't gone negative by more than 732 * max_period? 
733 */ 734 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache); 735 if (__predict_false(d > period_secs) && 736 __predict_false(d < UINT_MAX/2) && 737 atomic_load_relaxed(&patient->ci_heartbeat_suspend) == 0) 738 defibrillate(patient, d); 739 } 740 741 /* 742 * heartbeat_dump() 743 * 744 * Print the heartbeat data of all CPUs. Can be called from ddb. 745 */ 746 #ifdef DDB 747 static unsigned 748 db_read_unsigned(const volatile unsigned *p) 749 { 750 unsigned x; 751 752 db_read_bytes((db_addr_t)(uintptr_t)p, sizeof(x), (char *)&x); 753 754 return x; 755 } 756 757 void 758 heartbeat_dump(void) 759 { 760 struct cpu_info *ci; 761 762 db_printf("Heartbeats:\n"); 763 for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) { 764 db_printf("cpu%u: count %u uptime %u stamp %u suspend %u\n", 765 db_read_unsigned(&ci->ci_index), 766 db_read_unsigned(&ci->ci_heartbeat_count), 767 db_read_unsigned(&ci->ci_heartbeat_uptime_cache), 768 db_read_unsigned(&ci->ci_heartbeat_uptime_stamp), 769 db_read_unsigned(&ci->ci_heartbeat_suspend)); 770 } 771 } 772 #endif 773