/*	$NetBSD: kern_heartbeat.c,v 1.5 2023/07/16 10:18:19 riastradh Exp $	*/

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
 *
 * Manual tests to run when changing this file.  Magic numbers are for
 * evbarm; adjust for other platforms.  Tests involving cpuctl
 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
 * system, offline all but one CPU.
 *
 * 1. cpuctl offline 0
 *    sleep 20
 *    cpuctl online 0
 *
 * 2. cpuctl offline 1
 *    sleep 20
 *    cpuctl online 1
 *
 * 3. cpuctl offline 0
 *    sysctl -w kern.heartbeat.max_period=5
 *    sleep 10
 *    sysctl -w kern.heartbeat.max_period=0
 *    sleep 10
 *    sysctl -w kern.heartbeat.max_period=5
 *    sleep 10
 *    cpuctl online 0
 *
 * 4. sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=1	# IPL_SOFTCLOCK
 *    # verify system panics after 15sec
 *
 * 5. sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=6	# IPL_SCHED
 *    # verify system panics after 15sec
 *
 * 6. cpuctl offline 0
 *    sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=1	# IPL_SOFTCLOCK
 *    # verify system panics after 15sec
 *
 * 7. cpuctl offline 0
 *    sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=5	# IPL_VM
 *    # verify system panics after 15sec
 *
 *    # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
 *    # require a hardware watchdog timer.
 *    #cpuctl offline 0
 *    #sysctl -w debug.crashme_enable=1
 *    #sysctl -w debug.crashme.spl_spinout=6	# IPL_SCHED
 *    # hope watchdog timer kicks in
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.5 2023/07/16 10:18:19 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_heartbeat.h"
#endif

#include "heartbeat.h"

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/heartbeat.h>
#include <sys/ipi.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/xcall.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * Global state.
 *
 *	heartbeat_lock serializes access to heartbeat_max_period_secs
 *	and heartbeat_max_period_ticks.  Two separate variables so we
 *	can avoid multiplication or division in the heartbeat routine.
 *
 *	heartbeat_sih is stable after initialization in
 *	heartbeat_start.
 */
kmutex_t heartbeat_lock __cacheline_aligned;
unsigned heartbeat_max_period_secs __read_mostly;
unsigned heartbeat_max_period_ticks __read_mostly;

void *heartbeat_sih __read_mostly;

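/*
 * Illustrative numbers, not taken from this file: with hz = 100 and
 * kern.heartbeat.max_period = 15, heartbeat_max_period_secs is 15 and
 * heartbeat_max_period_ticks is 1500.  heartbeat() below can then
 * compare a raw tick delta against 1500 and a raw uptime delta against
 * 15 directly, without multiplying or dividing by hz on every hard
 * clock tick.
 */
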
/*
 * heartbeat_suspend()
 *
 *	Suspend heartbeat monitoring of the current CPU.
 *
 *	Called after the current CPU has been marked offline but before
 *	it has stopped running.  Caller must have preemption disabled.
 */
void
heartbeat_suspend(void)
{

	KASSERT(curcpu_stable());

	/*
	 * Nothing to do -- we just check the SPCF_OFFLINE flag.
	 */
}

/*
 * heartbeat_resume_cpu(ci)
 *
 *	Resume heartbeat monitoring of ci.
 *
 *	Called at startup while cold, and whenever heartbeat monitoring
 *	is re-enabled after being disabled or the period is changed.
 *	When not cold, ci must be the current CPU.
 */
static void
heartbeat_resume_cpu(struct cpu_info *ci)
{

	KASSERT(__predict_false(cold) || curcpu_stable());
	KASSERT(__predict_false(cold) || ci == curcpu());

	ci->ci_heartbeat_count = 0;
	ci->ci_heartbeat_uptime_cache = time_uptime;
	ci->ci_heartbeat_uptime_stamp = 0;
}

/*
 * heartbeat_resume()
 *
 *	Resume heartbeat monitoring of the current CPU.
 *
 *	Called after the current CPU has started running but before it
 *	has been marked online.  Also used internally when starting up
 *	heartbeat monitoring at boot or when the maximum period is set
 *	from zero to nonzero.  Caller must have preemption disabled.
 */
void
heartbeat_resume(void)
{
	struct cpu_info *ci = curcpu();
	int s;

	KASSERT(curcpu_stable());

	/*
	 * Block heartbeats while we reset the state so we don't
	 * spuriously think we had a heart attack in the middle of
	 * resetting the count and the uptime stamp.
	 */
	s = splsched();
	heartbeat_resume_cpu(ci);
	splx(s);
}

/*
 * heartbeat_reset_xc(a, b)
 *
 *	Cross-call handler to reset heartbeat state just prior to
 *	enabling heartbeat checks.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{

	heartbeat_resume();
}

/*
 * set_max_period(max_period)
 *
 *	Set the maximum period, in seconds, for heartbeat checks.
 *
 *	- If max_period is zero, disable them.
 *
 *	- If the max period was zero and max_period is nonzero, ensure
 *	  all CPUs' heartbeat uptime caches are up-to-date before
 *	  re-enabling them.
 *
 *	max_period must be below UINT_MAX/4/hz to avoid arithmetic
 *	overflow and give room for slop.
 *
 *	Caller must hold heartbeat_lock.
 */
static void
set_max_period(unsigned max_period)
{

	KASSERTMSG(max_period <= UINT_MAX/4/hz,
	    "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
	    max_period, UINT_MAX/4/hz, hz);
	KASSERT(mutex_owned(&heartbeat_lock));

	/*
	 * If we're enabling heartbeat checks, make sure we have a
	 * reasonably up-to-date time_uptime cache on all CPUs so we
	 * don't think we had an instant heart attack.
	 */
	if (heartbeat_max_period_secs == 0 && max_period != 0) {
		if (cold) {
			CPU_INFO_ITERATOR cii;
			struct cpu_info *ci;

			for (CPU_INFO_FOREACH(cii, ci))
				heartbeat_resume_cpu(ci);
		} else {
			const uint64_t ticket =
			    xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL);
			xc_wait(ticket);
		}
	}

	/*
	 * Once the heartbeat state has been updated on all (online)
	 * CPUs, set the period.  At this point, heartbeat checks can
	 * begin.
	 */
	atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
	atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
}

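/*
 * The UINT_MAX/4/hz bound above, with assumed values for illustration:
 * for hz = 100, UINT_MAX/4/hz = 4294967295/4/100 = 10737418 seconds,
 * so any plausible period passes, while max_period*hz stays at or
 * below roughly UINT_MAX/4 and leaves ample headroom before unsigned
 * overflow.
 */
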
/*
 * heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
 *
 *	Sysctl handler for sysctl kern.heartbeat.max_period.  Verifies
 *	it lies within a reasonable interval and sets it.
 */
static int
heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	unsigned max_period;
	int error;

	mutex_enter(&heartbeat_lock);

	max_period = heartbeat_max_period_secs;
	node = *rnode;
	node.sysctl_data = &max_period;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		goto out;

	/*
	 * Ensure there's plenty of slop between heartbeats.
	 */
	if (max_period > UINT_MAX/4/hz) {
		error = EOVERFLOW;
		goto out;
	}

	/*
	 * Success!  Set the period.  This enables heartbeat checks if
	 * we went from zero period to nonzero period, or disables them
	 * if the other way around.
	 */
	set_max_period(max_period);
	error = 0;

out:	mutex_exit(&heartbeat_lock);
	return error;
}

/*
 * sysctl_heartbeat_setup()
 *
 *	Set up the kern.heartbeat.* sysctl subtree.
 */
SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
{
	const struct sysctlnode *rnode;
	int error;

	mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);

	/* kern.heartbeat */
	error = sysctl_createv(NULL, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "heartbeat",
	    SYSCTL_DESCR("Kernel heartbeat parameters"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
	if (error) {
		printf("%s: failed to create kern.heartbeat: %d\n",
		    __func__, error);
		return;
	}

	/* kern.heartbeat.max_period */
	error = sysctl_createv(NULL, 0, &rnode, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "max_period",
	    SYSCTL_DESCR("Max seconds between heartbeats before panic"),
	    &heartbeat_max_period_sysctl, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (error) {
		printf("%s: failed to create kern.heartbeat.max_period: %d\n",
		    __func__, error);
		return;
	}
}

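/*
 * Example use of the knob from userland (illustrative commands only):
 *
 *	sysctl kern.heartbeat.max_period		# query
 *	sysctl -w kern.heartbeat.max_period=30		# allow up to 30sec
 *	sysctl -w kern.heartbeat.max_period=0		# disable checks
 */
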
/*
 * heartbeat_intr(cookie)
 *
 *	Soft interrupt handler to update the local CPU's view of the
 *	system uptime.  This runs at the same priority level as
 *	callouts, so if callouts are stuck on this CPU, it won't run,
 *	and eventually another CPU will notice that this one is stuck.
 *
 *	Don't do spl* here -- keep it to a minimum so if anything goes
 *	wrong we don't end up with hard interrupts blocked and unable
 *	to detect a missed heartbeat.
 */
static void
heartbeat_intr(void *cookie)
{
	unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
	unsigned uptime = time_uptime;

	atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
	atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
}

/*
 * heartbeat_start()
 *
 *	Start system heartbeat monitoring.
 */
void
heartbeat_start(void)
{
	const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;

	/*
	 * Establish a softint so we can schedule it once ready.  This
	 * should be at the lowest softint priority level so that we
	 * ensure all softint priorities are making progress.
	 */
	heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
	    &heartbeat_intr, NULL);

	/*
	 * Now that the softint is established, kick off heartbeat
	 * monitoring with the default period.  This will initialize
	 * the per-CPU state to an up-to-date cache of time_uptime.
	 */
	mutex_enter(&heartbeat_lock);
	set_max_period(max_period);
	mutex_exit(&heartbeat_lock);
}

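/*
 * Note: nothing here schedules heartbeat() itself.  The expectation --
 * an assumption about the surrounding kernel, not spelled out in this
 * file -- is that the hard clock interrupt path calls heartbeat() on
 * each CPU once per tick after heartbeat_start() has run, which is why
 * the checks below count in units of ticks and seconds.
 */
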
/*
 * defibrillator(cookie)
 *
 *	IPI handler for defibrillation.  If the CPU's heart has stopped
 *	beating normally, but the CPU can still execute things,
 *	acknowledge the IPI to the doctor and then panic so we at least
 *	get a stack trace from whatever the current CPU is stuck doing,
 *	if not a core dump.
 *
 *	(This metaphor is a little stretched, since defibrillation is
 *	usually administered when the heart is beating erratically but
 *	hasn't stopped, and causes the heart to stop temporarily, and
 *	one hopes it is not fatal.  But we're (software) engineers, so
 *	we can stretch metaphors like silly putty in a blender.)
 */
static void
defibrillator(void *cookie)
{
	bool *ack = cookie;

	atomic_store_relaxed(ack, true);
	panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
	    curlwp->l_lid,
	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
}

/*
 * defibrillate(ci, unsigned d)
 *
 *	The patient CPU ci's heart has stopped beating after d seconds.
 *	Force the patient CPU ci to panic, or panic on this CPU if the
 *	patient CPU doesn't respond within 1sec.
 */
static void __noinline
defibrillate(struct cpu_info *ci, unsigned d)
{
	bool ack = false;
	ipi_msg_t msg = {
		.func = &defibrillator,
		.arg = &ack,
	};
	unsigned countdown = 1000;	/* 1sec */

	KASSERT(curcpu_stable());

	/*
	 * First notify the console that the patient CPU's heart seems
	 * to have stopped beating.
	 */
	printf("%s: found %s heart stopped beating after %u seconds\n",
	    cpu_name(curcpu()), cpu_name(ci), d);

	/*
	 * Next, give the patient CPU a chance to panic, so we get a
	 * stack trace on that CPU even if we don't get a crash dump.
	 */
	ipi_unicast(&msg, ci);

	/*
	 * Busy-wait up to 1sec for the patient CPU to print a stack
	 * trace and panic.  If the patient CPU acknowledges the IPI,
	 * or if we're panicking anyway, just give up and stop here --
	 * the system is coming down soon and we should avoid getting
	 * in the way.
	 */
	while (countdown --> 0) {
		if (atomic_load_relaxed(&ack) ||
		    atomic_load_relaxed(&panicstr) != NULL)
			return;
		DELAY(1000);	/* 1ms */
	}

	/*
	 * The patient CPU failed to acknowledge the panic request.
	 * Panic now; with any luck, we'll get a crash dump.
	 */
	panic("%s: found %s heart stopped beating and unresponsive",
	    cpu_name(curcpu()), cpu_name(ci));
}

/*
 * select_patient()
 *
 *	Select another CPU to check the heartbeat of.  Returns NULL if
 *	there are no other online CPUs.  Never returns curcpu().
 *	Caller must have kpreemption disabled.
 */
static struct cpu_info *
select_patient(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *first = NULL, *patient = NULL, *ci;
	bool passedcur = false;

	KASSERT(curcpu_stable());

	/*
	 * In the iteration order of all CPUs, find the next online CPU
	 * after curcpu(), or the first online one if curcpu() is last
	 * in the iteration order.
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
			continue;
		if (passedcur) {
			/*
			 * (...|curcpu()|ci|...)
			 *
			 * Found the patient right after curcpu().
			 */
			KASSERT(patient != ci);
			patient = ci;
			break;
		}
		if (ci == curcpu()) {
			/*
			 * (...|prev|ci=curcpu()|next|...)
			 *
			 * Note that we want next (or first, if there's
			 * nothing after curcpu()).
			 */
			passedcur = true;
			continue;
		}
		if (first == NULL) {
			/*
			 * (ci|...|curcpu()|...)
			 *
			 * Record ci as first in case there's nothing
			 * after curcpu().
			 */
			first = ci;
			continue;
		}
	}

	/*
	 * If we hit the end, wrap around to the beginning.
	 */
	if (patient == NULL) {
		KASSERT(passedcur);
		patient = first;
	}

	return patient;
}

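/*
 * For example, assuming a hypothetical box whose CPUs are iterated in
 * index order: with cpu0-cpu3 all online, cpu1 picks cpu2 as its
 * patient, and cpu3, being last in the iteration order, wraps around
 * and picks cpu0.  If cpu0 is the only online CPU, there is no patient
 * and select_patient() returns NULL.
 */
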
/*
 * heartbeat()
 *
 *	1. Count a heartbeat on the local CPU.
 *
 *	2. Panic if the system uptime doesn't seem to have advanced in
 *	   a while.
 *
 *	3. Panic if the soft interrupt on this CPU hasn't advanced the
 *	   local view of the system uptime.
 *
 *	4. Schedule the soft interrupt to advance the local view of the
 *	   system uptime.
 *
 *	5. Select another CPU to check the heartbeat of.
 *
 *	6. Panic if the other CPU hasn't advanced its view of the
 *	   system uptime in a while.
 */
void
heartbeat(void)
{
	unsigned period_ticks, period_secs;
	unsigned count, uptime, cache, stamp, d;
	struct cpu_info *patient;

	KASSERT(curcpu_stable());

	period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
	period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
	if (__predict_false(period_ticks == 0) ||
	    __predict_false(period_secs == 0) ||
	    __predict_false(curcpu()->ci_schedstate.spc_flags & SPCF_OFFLINE))
		return;

	/*
	 * Count a heartbeat on this CPU.
	 */
	count = curcpu()->ci_heartbeat_count++;

	/*
	 * If the uptime hasn't changed, make sure that we haven't
	 * counted too many of our own heartbeats since the uptime last
	 * changed, and stop here -- we only do the cross-CPU work once
	 * per second.
	 */
	uptime = time_uptime;
	cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
	if (__predict_true(cache == uptime)) {
		/*
		 * Timecounter hasn't advanced by more than a second.
		 * Make sure the timecounter isn't stuck according to
		 * our heartbeats.
		 *
		 * Our own heartbeat count can't roll back, and
		 * time_uptime should be updated before it wraps
		 * around, so d should never go negative; hence no
		 * check for d < UINT_MAX/2.
		 */
		stamp =
		    atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
		d = count - stamp;
		if (__predict_false(d > period_ticks)) {
			panic("%s: time has not advanced in %u heartbeats",
			    cpu_name(curcpu()), d);
		}
		return;
	}

	/*
	 * If the uptime has changed, make sure that it hasn't changed
	 * so much that softints must be stuck on this CPU.  Since
	 * time_uptime is monotonic, this can't go negative, hence no
	 * check for d < UINT_MAX/2.
	 *
	 * This uses the hard timer interrupt handler on the current
	 * CPU to ensure soft interrupts at all priority levels have
	 * made progress.
	 */
	d = uptime - cache;
	if (__predict_false(d > period_secs)) {
		panic("%s: softints stuck for %u seconds",
		    cpu_name(curcpu()), d);
	}

	/*
	 * Schedule a softint to update our cache of the system uptime
	 * so the next call to heartbeat, on this or another CPU, can
	 * detect progress on this one.
	 */
	softint_schedule(heartbeat_sih);

	/*
	 * Select a patient to check the heartbeat of.  If there's no
	 * other online CPU, nothing to do.
	 */
	patient = select_patient();
	if (patient == NULL)
		return;

	/*
	 * Verify that time is advancing on the patient CPU.  If the
	 * delta exceeds UINT_MAX/2, that means it is already ahead by
	 * a little on the other CPU, and the subtraction went
	 * negative, which is OK.  If the CPU has been offlined since
	 * we selected it, no worries.
	 *
	 * This uses the current CPU to ensure the other CPU has made
	 * progress, even if the other CPU's hard timer interrupt
	 * handler is stuck for some reason.
	 *
	 * XXX Maybe confirm it hasn't gone negative by more than
	 * max_period?
	 */
	d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
	if (__predict_false(d > period_secs) &&
	    __predict_false(d < UINT_MAX/2) &&
	    ((patient->ci_schedstate.spc_flags & SPCF_OFFLINE) == 0))
		defibrillate(patient, d);
}

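/*
 * Worked example with assumed numbers (hz = 100, max period 15, so
 * period_ticks = 1500): if this CPU counts 1501 ticks without its
 * softint restamping the count while time_uptime stays put, the first
 * check above panics.  In the cross-CPU check, a patient whose cached
 * uptime happens to be 1sec ahead of ours gives d = (unsigned)-1,
 * which fails the d < UINT_MAX/2 test and is ignored rather than
 * treated as a stall.
 */
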
/*
 * heartbeat_dump()
 *
 *	Print the heartbeat data of all CPUs.  Can be called from ddb.
 */
#ifdef DDB
static unsigned
db_read_unsigned(const unsigned *p)
{
	unsigned x;

	db_read_bytes((db_addr_t)p, sizeof(x), (char *)&x);

	return x;
}

void
heartbeat_dump(void)
{
	struct cpu_info *ci;

	db_printf("Heartbeats:\n");
	for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
		db_printf("cpu%u: count %u uptime %u stamp %u\n",
		    db_read_unsigned(&ci->ci_index),
		    db_read_unsigned(&ci->ci_heartbeat_count),
		    db_read_unsigned(&ci->ci_heartbeat_uptime_cache),
		    db_read_unsigned(&ci->ci_heartbeat_uptime_stamp));
	}
}
#endif