1 /* $NetBSD: apei_hest.c,v 1.7 2025/01/05 22:11:18 andvar Exp $ */ 2 3 /*- 4 * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * APEI HEST -- Hardware Error Source Table 31 * 32 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#acpi-error-source 33 * 34 * XXX uncorrectable error NMI comes in on all CPUs at once, what to do? 35 * 36 * XXX AMD MCA 37 * 38 * XXX IA32 machine check stuff 39 * 40 * XXX switch-to-polling for GHES notifications 41 * 42 * XXX error threshold for GHES notifications 43 * 44 * XXX sort out interrupt notification types, e.g. do we ever need to 45 * do acpi_intr_establish? 46 * 47 * XXX sysctl knob to force polling each particular error source that 48 * supports it 49 * 50 * XXX consider a lighter-weight polling schedule for machines with 51 * thousands of polled GHESes 52 */ 53 54 #include <sys/cdefs.h> 55 __KERNEL_RCSID(0, "$NetBSD: apei_hest.c,v 1.7 2025/01/05 22:11:18 andvar Exp $"); 56 57 #include <sys/types.h> 58 59 #include <sys/atomic.h> 60 #include <sys/kmem.h> 61 #include <sys/lock.h> 62 #include <sys/systm.h> 63 64 #include <dev/acpi/acpivar.h> 65 #include <dev/acpi/apei_cper.h> 66 #include <dev/acpi/apei_hestvar.h> 67 #include <dev/acpi/apei_hed.h> 68 #include <dev/acpi/apei_mapreg.h> 69 #include <dev/acpi/apeivar.h> 70 71 #if defined(__i386__) || defined(__x86_64__) 72 #include <x86/nmi.h> 73 #endif 74 75 #include "ioconf.h" 76 77 #define _COMPONENT ACPI_RESOURCE_COMPONENT 78 ACPI_MODULE_NAME ("apei") 79 80 /* 81 * apei_hest_ghes_handle(sc, src) 82 * 83 * Check for, report, and acknowledge any error from a Generic 84 * Hardware Error Source (GHES, not GHESv2). Return true if there 85 * was any error to report, false if not. 86 */ 87 static bool 88 apei_hest_ghes_handle(struct apei_softc *sc, struct apei_source *src) 89 { 90 ACPI_HEST_GENERIC *ghes = container_of(src->as_header, 91 ACPI_HEST_GENERIC, Header); 92 ACPI_HEST_GENERIC_STATUS *gesb = src->as_ghes.gesb; 93 char ctx[sizeof("error source 65535")]; 94 uint32_t status; 95 bool fatal = false; 96 97 /* 98 * Process and report any error. 99 */ 100 snprintf(ctx, sizeof(ctx), "error source %"PRIu16, 101 ghes->Header.SourceId); 102 status = apei_gesb_report(sc, src->as_ghes.gesb, 103 ghes->ErrorBlockLength, ctx, &fatal); 104 105 /* 106 * Acknowledge the error by clearing the block status. To 107 * avoid races, we probably have to avoid further access to the 108 * GESB until we get another notification. 109 * 110 * As a precaution, we zero this with atomic compare-and-swap 111 * so at least we can see if the status changed while we were 112 * working on it. 113 * 114 * It is tempting to clear bits with atomic and-complement, but 115 * the BlockStatus is not just a bit mask -- bits [13:4] are a 116 * count of Generic Error Data Entries, and who knows what bits 117 * [31:14] might be used for in the future. 118 * 119 * XXX The GHES(v1) protocol is unclear from the specification 120 * here. The GHESv2 protocol has a separate register write to 121 * acknowledge, which is a bit clearer. 122 */ 123 membar_release(); 124 const uint32_t status1 = atomic_cas_32(&gesb->BlockStatus, status, 0); 125 if (status1 != status) { 126 device_printf(sc->sc_dev, "%s: status changed from" 127 " 0x%"PRIx32" to 0x%"PRIx32"\n", 128 ctx, status, status1); 129 } 130 131 /* 132 * If the error was fatal, panic now. 133 */ 134 if (fatal) 135 panic("fatal hardware error"); 136 137 return status != 0; 138 } 139 140 /* 141 * apei_hest_ghes_v2_handle(sc, src) 142 * 143 * Check for, report, and acknowledge any error from a Generic 144 * Hardware Error Source v2. Return true if there was any error 145 * to report, false if not. 146 */ 147 static bool 148 apei_hest_ghes_v2_handle(struct apei_softc *sc, struct apei_source *src) 149 { 150 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(src->as_header, 151 ACPI_HEST_GENERIC_V2, Header); 152 ACPI_HEST_GENERIC_STATUS *gesb = src->as_ghes.gesb; 153 char ctx[sizeof("error source 65535")]; 154 uint64_t X; 155 uint32_t status; 156 bool fatal; 157 158 /* 159 * Process and report any error. 160 */ 161 snprintf(ctx, sizeof(ctx), "error source %"PRIu16, 162 ghes_v2->Header.SourceId); 163 status = apei_gesb_report(sc, src->as_ghes.gesb, 164 ghes_v2->ErrorBlockLength, ctx, &fatal); 165 166 /* 167 * First clear the block status. As a precaution, we zero this 168 * with atomic compare-and-swap so at least we can see if the 169 * status changed while we were working on it. 170 */ 171 membar_release(); 172 const uint32_t status1 = atomic_cas_32(&gesb->BlockStatus, status, 0); 173 if (status1 != status) { 174 device_printf(sc->sc_dev, "%s: status changed from" 175 " 0x%"PRIx32" to 0x%"PRIx32"\n", 176 ctx, status, status1); 177 } 178 179 /* 180 * Next, do the Read Ack dance. 181 * 182 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#generic-hardware-error-source-version-2-ghesv2-type-10 183 */ 184 X = apei_mapreg_read(&ghes_v2->ReadAckRegister, 185 src->as_ghes_v2.read_ack); 186 X &= ghes_v2->ReadAckPreserve; 187 X |= ghes_v2->ReadAckWrite; 188 apei_mapreg_write(&ghes_v2->ReadAckRegister, 189 src->as_ghes_v2.read_ack, X); 190 191 /* 192 * If the error was fatal, panic now. 193 */ 194 if (fatal) 195 panic("fatal hardware error"); 196 197 return status != 0; 198 } 199 200 /* 201 * apei_hest_ghes_poll(cookie) 202 * 203 * Callout handler for periodic polling of a Generic Hardware 204 * Error Source (GHES, not GHESv2), using Notification Type `0 - 205 * Polled'. 206 * 207 * cookie is the struct apei_source pointer for a single source; 208 * if there are multiple sources there will be multiple callouts. 209 */ 210 static void 211 apei_hest_ghes_poll(void *cookie) 212 { 213 struct apei_source *src = cookie; 214 struct apei_softc *sc = src->as_sc; 215 ACPI_HEST_GENERIC *ghes = container_of(src->as_header, 216 ACPI_HEST_GENERIC, Header); 217 218 /* 219 * Process and acknowledge any error. 220 */ 221 (void)apei_hest_ghes_handle(sc, src); 222 223 /* 224 * Schedule polling again after the firmware-suggested 225 * interval. 226 */ 227 callout_schedule(&src->as_ch, 228 MAX(1, mstohz(ghes->Notify.PollInterval))); 229 } 230 231 /* 232 * apei_hest_ghes_v2_poll(cookie) 233 * 234 * Callout handler for periodic polling of a Generic Hardware 235 * Error Source v2, using Notification Type `0 - Polled'. 236 * 237 * cookie is the struct apei_source pointer for a single source; 238 * if there are multiple sources there will be multiple callouts. 239 */ 240 static void 241 apei_hest_ghes_v2_poll(void *cookie) 242 { 243 struct apei_source *src = cookie; 244 struct apei_softc *sc = src->as_sc; 245 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(src->as_header, 246 ACPI_HEST_GENERIC_V2, Header); 247 248 /* 249 * Process and acknowledge any error. 250 */ 251 (void)apei_hest_ghes_v2_handle(sc, src); 252 253 /* 254 * Schedule polling again after the firmware-suggested 255 * interval. 256 */ 257 callout_schedule(&src->as_ch, 258 MAX(1, mstohz(ghes_v2->Notify.PollInterval))); 259 } 260 261 #if defined(__i386__) || defined(__x86_64__) 262 263 /* 264 * The NMI is (sometimes?) delivered to all CPUs at once. To reduce 265 * confusion, let's try to have only one CPU process error 266 * notifications at a time. 267 */ 268 static __cpu_simple_lock_t apei_hest_nmi_lock = __SIMPLELOCK_UNLOCKED; 269 270 /* 271 * apei_hest_ghes_nmi(tf, cookie) 272 * 273 * Nonmaskable interrupt handler for Generic Hardware Error 274 * Sources (GHES, not GHESv2) with Notification Type `4 - NMI'. 275 */ 276 static int 277 apei_hest_ghes_nmi(const struct trapframe *tf, void *cookie) 278 { 279 struct apei_source *src = cookie; 280 struct apei_softc *sc = src->as_sc; 281 282 __cpu_simple_lock(&apei_hest_nmi_lock); 283 const bool mine = apei_hest_ghes_handle(sc, src); 284 __cpu_simple_unlock(&apei_hest_nmi_lock); 285 286 /* 287 * Tell the NMI subsystem whether this interrupt could have 288 * been for us or not. 289 */ 290 return mine; 291 } 292 293 /* 294 * apei_hest_ghes_v2_nmi(tf, cookie) 295 * 296 * Nonmaskable interrupt handler for Generic Hardware Error 297 * Sources v2 with Notification Type `4 - NMI'. 298 */ 299 static int 300 apei_hest_ghes_v2_nmi(const struct trapframe *tf, void *cookie) 301 { 302 struct apei_source *src = cookie; 303 struct apei_softc *sc = src->as_sc; 304 305 __cpu_simple_lock(&apei_hest_nmi_lock); 306 const bool mine = apei_hest_ghes_v2_handle(sc, src); 307 __cpu_simple_unlock(&apei_hest_nmi_lock); 308 309 /* 310 * Tell the NMI subsystem whether this interrupt could have 311 * been for us or not. 312 */ 313 return mine; 314 } 315 316 #endif /* defined(__i386__) || defined(__x86_64__) */ 317 318 /* 319 * apei_hest_attach_ghes(sc, ghes, i) 320 * 321 * Attach a Generic Hardware Error Source (GHES, not GHESv2) as 322 * the ith source in the Hardware Error Source Table. 323 * 324 * After this point, the system will check for and handle errors 325 * when notified by this source. 326 */ 327 static void 328 apei_hest_attach_ghes(struct apei_softc *sc, ACPI_HEST_GENERIC *ghes, 329 uint32_t i) 330 { 331 struct apei_hest_softc *hsc = &sc->sc_hest; 332 struct apei_source *src = &hsc->hsc_source[i]; 333 uint64_t addr; 334 ACPI_STATUS rv; 335 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 336 337 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 338 i, ghes->Header.SourceId); 339 340 /* 341 * Verify the source is enabled before proceeding. The Enabled 342 * field is 8 bits with 256 possibilities, but only two of the 343 * possibilities, 0 and 1, have semantics defined in the spec, 344 * so out of an abundance of caution let's tread carefully in 345 * case anything changes and noisily reject any values other 346 * than 1. 347 */ 348 switch (ghes->Enabled) { 349 case 1: 350 break; 351 case 0: 352 aprint_debug_dev(sc->sc_dev, "%s: disabled\n", ctx); 353 return; 354 default: 355 aprint_error_dev(sc->sc_dev, "%s: unknown GHES Enabled state:" 356 " 0x%"PRIx8"\n", ctx, ghes->Enabled); 357 return; 358 } 359 360 /* 361 * Verify the Error Status Address bit width is at most 64 bits 362 * before proceeding with this source. When we get 128-bit 363 * addressing, this code will have to be updated. 364 */ 365 if (ghes->ErrorStatusAddress.BitWidth > 64) { 366 aprint_error_dev(sc->sc_dev, "%s: excessive address bits:" 367 " %"PRIu8"\n", ctx, ghes->ErrorStatusAddress.BitWidth); 368 return; 369 } 370 371 /* 372 * Read the GHES Error Status Address. This is the physical 373 * address of a GESB, Generic Error Status Block. Why the 374 * physical address is exposed via this indirection, and not 375 * simply stored directly in the GHES, is unclear to me. 376 * Hoping it's not because the address can change dynamically, 377 * because the error handling path shouldn't involve mapping 378 * anything. 379 */ 380 rv = AcpiRead(&addr, &ghes->ErrorStatusAddress); 381 if (ACPI_FAILURE(rv)) { 382 aprint_error_dev(sc->sc_dev, "%s:" 383 " failed to read error status address: %s", ctx, 384 AcpiFormatException(rv)); 385 return; 386 } 387 aprint_debug_dev(sc->sc_dev, "%s: error status @ 0x%"PRIx64"\n", ctx, 388 addr); 389 390 /* 391 * Initialize the source and map the GESB so we can get at it 392 * in the error handling path. 393 */ 394 src->as_sc = sc; 395 src->as_header = &ghes->Header; 396 src->as_ghes.gesb = AcpiOsMapMemory(addr, ghes->ErrorBlockLength); 397 398 /* 399 * Arrange to receive notifications. 400 */ 401 switch (ghes->Notify.Type) { 402 case ACPI_HEST_NOTIFY_POLLED: 403 if (ghes->Notify.PollInterval == 0) /* paranoia */ 404 break; 405 callout_init(&src->as_ch, CALLOUT_MPSAFE); 406 callout_setfunc(&src->as_ch, &apei_hest_ghes_poll, src); 407 callout_schedule(&src->as_ch, 0); 408 break; 409 case ACPI_HEST_NOTIFY_SCI: 410 case ACPI_HEST_NOTIFY_GPIO: 411 /* 412 * SCI and GPIO notifications are delivered through 413 * Hardware Error Device (PNP0C33) events. 414 * 415 * XXX Where is this spelled out? The text at 416 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#event-notification-for-generic-error-sources 417 * is vague. 418 */ 419 SIMPLEQ_INSERT_TAIL(&hsc->hsc_hed_list, src, as_entry); 420 break; 421 #if defined(__i386__) || defined(__x86_64__) 422 case ACPI_HEST_NOTIFY_NMI: 423 src->as_nmi = nmi_establish(&apei_hest_ghes_nmi, src); 424 break; 425 #endif 426 } 427 428 /* 429 * Now that we have notification set up, process and 430 * acknowledge the initial GESB report if any. 431 */ 432 apei_hest_ghes_handle(sc, src); 433 } 434 435 /* 436 * apei_hest_detach_ghes(sc, ghes, i) 437 * 438 * Detach the ith source, which is a Generic Hardware Error Source 439 * (GHES, not GHESv2). 440 * 441 * After this point, the system will ignore notifications from 442 * this source. 443 */ 444 static void 445 apei_hest_detach_ghes(struct apei_softc *sc, ACPI_HEST_GENERIC *ghes, 446 uint32_t i) 447 { 448 struct apei_hest_softc *hsc = &sc->sc_hest; 449 struct apei_source *src = &hsc->hsc_source[i]; 450 451 /* 452 * Arrange to stop receiving notifications. 453 */ 454 switch (ghes->Notify.Type) { 455 case ACPI_HEST_NOTIFY_POLLED: 456 if (ghes->Notify.PollInterval == 0) /* paranoia */ 457 break; 458 callout_halt(&src->as_ch, NULL); 459 callout_destroy(&src->as_ch); 460 break; 461 case ACPI_HEST_NOTIFY_SCI: 462 case ACPI_HEST_NOTIFY_GPIO: 463 /* 464 * No need to spend time removing the entry; no further 465 * calls via apei_hed_notify are possible at this 466 * point, now that detach has begun. 467 */ 468 break; 469 #if defined(__i386__) || defined(__x86_64__) 470 case ACPI_HEST_NOTIFY_NMI: 471 nmi_disestablish(src->as_nmi); 472 src->as_nmi = NULL; 473 break; 474 #endif 475 } 476 477 /* 478 * No more notifications. Unmap the GESB and destroy the 479 * interrupt source now that it will no longer be used in 480 * error handling path. 481 */ 482 AcpiOsUnmapMemory(src->as_ghes.gesb, ghes->ErrorBlockLength); 483 src->as_ghes.gesb = NULL; 484 src->as_header = NULL; 485 src->as_sc = NULL; 486 } 487 488 489 /* 490 * apei_hest_attach_ghes_v2(sc, ghes_v2, i) 491 * 492 * Attach a Generic Hardware Error Source v2 as the ith source in 493 * the Hardware Error Source Table. 494 * 495 * After this point, the system will check for and handle errors 496 * when notified by this source. 497 */ 498 static void 499 apei_hest_attach_ghes_v2(struct apei_softc *sc, ACPI_HEST_GENERIC_V2 *ghes_v2, 500 uint32_t i) 501 { 502 struct apei_hest_softc *hsc = &sc->sc_hest; 503 struct apei_source *src = &hsc->hsc_source[i]; 504 uint64_t addr; 505 struct apei_mapreg *read_ack; 506 ACPI_STATUS rv; 507 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 508 509 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 510 i, ghes_v2->Header.SourceId); 511 512 /* 513 * Verify the source is enabled before proceeding. The Enabled 514 * field is 8 bits with 256 possibilities, but only two of the 515 * possibilities, 0 and 1, have semantics defined in the spec, 516 * so out of an abundance of caution let's tread carefully in 517 * case anything changes and noisily reject any values other 518 * than 1. 519 */ 520 switch (ghes_v2->Enabled) { 521 case 1: 522 break; 523 case 0: 524 aprint_debug_dev(sc->sc_dev, "%s: disabled\n", ctx); 525 return; 526 default: 527 aprint_error_dev(sc->sc_dev, "%s:" 528 " unknown GHESv2 Enabled state: 0x%"PRIx8"\n", ctx, 529 ghes_v2->Enabled); 530 return; 531 } 532 533 /* 534 * Verify the Error Status Address bit width is at most 64 bits 535 * before proceeding with this source. When we get 128-bit 536 * addressing, this code will have to be updated. 537 */ 538 if (ghes_v2->ErrorStatusAddress.BitWidth > 64) { 539 aprint_error_dev(sc->sc_dev, "%s: excessive address bits:" 540 " %"PRIu8"\n", ctx, ghes_v2->ErrorStatusAddress.BitWidth); 541 return; 542 } 543 544 /* 545 * Read the GHESv2 Error Status Address. This is the physical 546 * address of a GESB, Generic Error Status Block. Why the 547 * physical address is exposed via this indirection, and not 548 * simply stored directly in the GHESv2, is unclear to me. 549 * Hoping it's not because the address can change dynamically, 550 * because the error handling path shouldn't involve mapping 551 * anything. 552 */ 553 rv = AcpiRead(&addr, &ghes_v2->ErrorStatusAddress); 554 if (ACPI_FAILURE(rv)) { 555 aprint_error_dev(sc->sc_dev, "%s:" 556 " failed to read error status address: %s", ctx, 557 AcpiFormatException(rv)); 558 return; 559 } 560 aprint_debug_dev(sc->sc_dev, "%s: error status @ 0x%"PRIx64"\n", ctx, 561 addr); 562 563 /* 564 * Try to map the Read Ack register up front, so we don't have 565 * to allocate and free kva in AcpiRead/AcpiWrite at the time 566 * we're handling an error. Bail if we can't. 567 */ 568 read_ack = apei_mapreg_map(&ghes_v2->ReadAckRegister); 569 if (read_ack == NULL) { 570 aprint_error_dev(sc->sc_dev, "%s:" 571 " unable to map Read Ack register\n", ctx); 572 return; 573 } 574 575 /* 576 * Initialize the source and map the GESB it in the error 577 * handling path. 578 */ 579 src->as_sc = sc; 580 src->as_header = &ghes_v2->Header; 581 src->as_ghes_v2.gesb = AcpiOsMapMemory(addr, 582 ghes_v2->ErrorBlockLength); 583 src->as_ghes_v2.read_ack = read_ack; 584 585 /* 586 * Arrange to receive notifications. 587 */ 588 switch (ghes_v2->Notify.Type) { 589 case ACPI_HEST_NOTIFY_POLLED: 590 if (ghes_v2->Notify.PollInterval == 0) /* paranoia */ 591 break; 592 callout_init(&src->as_ch, CALLOUT_MPSAFE); 593 callout_setfunc(&src->as_ch, &apei_hest_ghes_v2_poll, src); 594 callout_schedule(&src->as_ch, 0); 595 break; 596 case ACPI_HEST_NOTIFY_SCI: 597 case ACPI_HEST_NOTIFY_GPIO: 598 /* 599 * SCI and GPIO notifications are delivered through 600 * Hardware Error Device (PNP0C33) events. 601 * 602 * XXX Where is this spelled out? The text at 603 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#event-notification-for-generic-error-sources 604 * is vague. 605 */ 606 SIMPLEQ_INSERT_TAIL(&hsc->hsc_hed_list, src, as_entry); 607 break; 608 #if defined(__i386__) || defined(__x86_64__) 609 case ACPI_HEST_NOTIFY_NMI: 610 src->as_nmi = nmi_establish(&apei_hest_ghes_v2_nmi, src); 611 break; 612 #endif 613 } 614 615 /* 616 * Now that we have notification set up, process and 617 * acknowledge the initial GESB report if any. 618 */ 619 apei_hest_ghes_handle(sc, src); 620 } 621 622 /* 623 * apei_hest_detach_ghes_v2(sc, ghes_v2, i) 624 * 625 * Detach the ith source, which is a Generic Hardware Error Source 626 * v2. 627 * 628 * After this point, the system will ignore notifications from 629 * this source. 630 */ 631 static void 632 apei_hest_detach_ghes_v2(struct apei_softc *sc, ACPI_HEST_GENERIC_V2 *ghes_v2, 633 uint32_t i) 634 { 635 struct apei_hest_softc *hsc = &sc->sc_hest; 636 struct apei_source *src = &hsc->hsc_source[i]; 637 638 /* 639 * Arrange to stop receiving notifications. 640 */ 641 switch (ghes_v2->Notify.Type) { 642 case ACPI_HEST_NOTIFY_POLLED: 643 if (ghes_v2->Notify.PollInterval == 0) /* paranoia */ 644 break; 645 callout_halt(&src->as_ch, NULL); 646 callout_destroy(&src->as_ch); 647 break; 648 case ACPI_HEST_NOTIFY_SCI: 649 case ACPI_HEST_NOTIFY_GPIO: 650 /* 651 * No need to spend time removing the entry; no further 652 * calls via apei_hed_notify are possible at this 653 * point, now that detach has begun. 654 */ 655 break; 656 #if defined(__i386__) || defined(__x86_64__) 657 case ACPI_HEST_NOTIFY_NMI: 658 nmi_disestablish(src->as_nmi); 659 src->as_nmi = NULL; 660 break; 661 #endif 662 } 663 664 /* 665 * No more notifications. Unmap the GESB and read ack register 666 * now that it will no longer be used in error handling path. 667 */ 668 AcpiOsUnmapMemory(src->as_ghes_v2.gesb, ghes_v2->ErrorBlockLength); 669 src->as_ghes_v2.gesb = NULL; 670 apei_mapreg_unmap(&ghes_v2->ReadAckRegister, src->as_ghes_v2.read_ack); 671 src->as_ghes_v2.read_ack = NULL; 672 src->as_header = NULL; 673 src->as_sc = NULL; 674 } 675 676 /* 677 * apei_hest_attach_source(sc, header, i, size_t maxlen) 678 * 679 * Attach the ith source in the Hardware Error Source Table given 680 * its header, and return a pointer to the header of the next 681 * source in the table, provided it is no more than maxlen bytes 682 * past header. Return NULL if the size of the source is unknown 683 * or would exceed maxlen bytes. 684 */ 685 static ACPI_HEST_HEADER * 686 apei_hest_attach_source(struct apei_softc *sc, ACPI_HEST_HEADER *header, 687 uint32_t i, size_t maxlen) 688 { 689 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 690 691 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 692 i, header->SourceId); 693 694 switch (header->Type) { 695 case ACPI_HEST_TYPE_IA32_CHECK: { 696 ACPI_HEST_IA_MACHINE_CHECK *const imc = container_of(header, 697 ACPI_HEST_IA_MACHINE_CHECK, Header); 698 699 aprint_error_dev(sc->sc_dev, "%s:" 700 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 701 702 if (maxlen < sizeof(*imc)) 703 return NULL; 704 maxlen -= sizeof(*imc); 705 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imc + 1); 706 if (maxlen < imc->NumHardwareBanks*sizeof(*bank)) 707 return NULL; 708 return (ACPI_HEST_HEADER *)(bank + imc->NumHardwareBanks); 709 } 710 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: { 711 ACPI_HEST_IA_CORRECTED *const imcc = container_of(header, 712 ACPI_HEST_IA_CORRECTED, Header); 713 714 aprint_error_dev(sc->sc_dev, "%s:" 715 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 716 717 if (maxlen < sizeof(*imcc)) 718 return NULL; 719 maxlen -= sizeof(*imcc); 720 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imcc + 1); 721 if (maxlen < imcc->NumHardwareBanks*sizeof(*bank)) 722 return NULL; 723 return (ACPI_HEST_HEADER *)(bank + imcc->NumHardwareBanks); 724 } 725 case ACPI_HEST_TYPE_IA32_NMI: { 726 ACPI_HEST_IA_NMI *const ianmi = container_of(header, 727 ACPI_HEST_IA_NMI, Header); 728 729 aprint_error_dev(sc->sc_dev, "%s:" 730 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 731 732 if (maxlen < sizeof(*ianmi)) 733 return NULL; 734 return (ACPI_HEST_HEADER *)(ianmi + 1); 735 } 736 case ACPI_HEST_TYPE_AER_ROOT_PORT: { 737 ACPI_HEST_AER_ROOT *const aerroot = container_of(header, 738 ACPI_HEST_AER_ROOT, Header); 739 740 aprint_error_dev(sc->sc_dev, "%s:" 741 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 742 743 if (maxlen < sizeof(*aerroot)) 744 return NULL; 745 return (ACPI_HEST_HEADER *)(aerroot + 1); 746 } 747 case ACPI_HEST_TYPE_AER_ENDPOINT: { 748 ACPI_HEST_AER *const aer = container_of(header, 749 ACPI_HEST_AER, Header); 750 751 aprint_error_dev(sc->sc_dev, "%s:" 752 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 753 754 if (maxlen < sizeof(*aer)) 755 return NULL; 756 return (ACPI_HEST_HEADER *)(aer + 1); 757 } 758 case ACPI_HEST_TYPE_AER_BRIDGE: { 759 ACPI_HEST_AER_BRIDGE *const aerbridge = container_of(header, 760 ACPI_HEST_AER_BRIDGE, Header); 761 762 aprint_error_dev(sc->sc_dev, "%s:" 763 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 764 765 if (maxlen < sizeof(*aerbridge)) 766 return NULL; 767 return (ACPI_HEST_HEADER *)(aerbridge + 1); 768 } 769 case ACPI_HEST_TYPE_GENERIC_ERROR: { 770 ACPI_HEST_GENERIC *const ghes = container_of(header, 771 ACPI_HEST_GENERIC, Header); 772 773 if (maxlen < sizeof(*ghes)) 774 return NULL; 775 apei_hest_attach_ghes(sc, ghes, i); 776 return (ACPI_HEST_HEADER *)(ghes + 1); 777 } 778 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: { 779 ACPI_HEST_GENERIC_V2 *const ghes_v2 = container_of(header, 780 ACPI_HEST_GENERIC_V2, Header); 781 782 if (maxlen < sizeof(*ghes_v2)) 783 return NULL; 784 apei_hest_attach_ghes_v2(sc, ghes_v2, i); 785 return (ACPI_HEST_HEADER *)(ghes_v2 + 1); 786 } 787 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: { 788 ACPI_HEST_IA_DEFERRED_CHECK *const imdc = container_of(header, 789 ACPI_HEST_IA_DEFERRED_CHECK, Header); 790 791 aprint_error_dev(sc->sc_dev, "%s:" 792 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 793 794 if (maxlen < sizeof(*imdc)) 795 return NULL; 796 maxlen -= sizeof(*imdc); 797 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imdc + 1); 798 if (maxlen < imdc->NumHardwareBanks*sizeof(*bank)) 799 return NULL; 800 return (ACPI_HEST_HEADER *)(bank + imdc->NumHardwareBanks); 801 } 802 case ACPI_HEST_TYPE_NOT_USED3: 803 case ACPI_HEST_TYPE_NOT_USED4: 804 case ACPI_HEST_TYPE_NOT_USED5: 805 default: 806 aprint_error_dev(sc->sc_dev, "%s: unknown type:" 807 " 0x%04"PRIx16"\n", ctx, header->Type); 808 if (header->Type >= 12) { 809 /* 810 * `Beginning with error source type 12 and 811 * onward, each Error Source Structure must 812 * use the standard Error Source Structure 813 * Header as defined below.' 814 * 815 * Not yet in acpica, though, so we copy this 816 * down manually. 817 */ 818 struct { 819 UINT16 Type; 820 UINT16 Length; 821 } *const essh = (void *)header; 822 823 if (maxlen < sizeof(*essh) || maxlen < essh->Length) 824 return NULL; 825 return (ACPI_HEST_HEADER *)((char *)header + 826 essh->Length); 827 } 828 return NULL; 829 } 830 } 831 832 /* 833 * apei_hest_detach_source(sc, header, i) 834 * 835 * Detach the ith source in the Hardware Error Status Table. 836 * Caller is assumed to have stored where each source's header is, 837 * so no need to return the pointer to the header of the next 838 * source in the table. 839 */ 840 static void 841 apei_hest_detach_source(struct apei_softc *sc, ACPI_HEST_HEADER *header, 842 uint32_t i) 843 { 844 845 switch (header->Type) { 846 case ACPI_HEST_TYPE_GENERIC_ERROR: { 847 ACPI_HEST_GENERIC *ghes = container_of(header, 848 ACPI_HEST_GENERIC, Header); 849 850 apei_hest_detach_ghes(sc, ghes, i); 851 break; 852 } 853 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: { 854 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(header, 855 ACPI_HEST_GENERIC_V2, Header); 856 857 apei_hest_detach_ghes_v2(sc, ghes_v2, i); 858 break; 859 } 860 case ACPI_HEST_TYPE_IA32_CHECK: 861 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: 862 case ACPI_HEST_TYPE_IA32_NMI: 863 case ACPI_HEST_TYPE_NOT_USED3: 864 case ACPI_HEST_TYPE_NOT_USED4: 865 case ACPI_HEST_TYPE_NOT_USED5: 866 case ACPI_HEST_TYPE_AER_ROOT_PORT: 867 case ACPI_HEST_TYPE_AER_ENDPOINT: 868 case ACPI_HEST_TYPE_AER_BRIDGE: 869 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: 870 default: 871 /* XXX shouldn't happen */ 872 break; 873 } 874 } 875 876 /* 877 * apei_hest_attach(sc) 878 * 879 * Scan the Hardware Error Source Table and attach sources 880 * enumerated in it so we can receive and process hardware errors 881 * during operation. 882 */ 883 void 884 apei_hest_attach(struct apei_softc *sc) 885 { 886 ACPI_TABLE_HEST *hest = sc->sc_tab.hest; 887 struct apei_hest_softc *hsc = &sc->sc_hest; 888 ACPI_HEST_HEADER *header, *next; 889 uint32_t i, n; 890 size_t resid; 891 892 /* 893 * Initialize the HED (Hardware Error Device, PNP0C33) 894 * notification list so apei_hed_notify becomes a noop with no 895 * extra effort even if we fail to attach anything. 896 */ 897 SIMPLEQ_INIT(&hsc->hsc_hed_list); 898 899 /* 900 * Verify the table is large enough. 901 */ 902 if (hest->Header.Length < sizeof(*hest)) { 903 aprint_error_dev(sc->sc_dev, "HEST: truncated table:" 904 " %"PRIu32" < %zu minimum bytes\n", 905 hest->Header.Length, sizeof(*hest)); 906 return; 907 } 908 909 n = hest->ErrorSourceCount; 910 aprint_normal_dev(sc->sc_dev, "HEST: %"PRIu32 911 " hardware error source%s\n", n, n == 1 ? "" : "s"); 912 913 /* 914 * This could be SIZE_MAX but let's put a smaller arbitrary 915 * limit on it; if you have gigabytes of HEST something is 916 * probably wrong. 917 */ 918 if (n > MIN(SIZE_MAX, INT32_MAX)/sizeof(hsc->hsc_source[0])) { 919 aprint_error_dev(sc->sc_dev, "HEST: too many error sources\n"); 920 return; 921 } 922 hsc->hsc_source = kmem_zalloc(n * sizeof(hsc->hsc_source[0]), 923 KM_SLEEP); 924 925 header = (ACPI_HEST_HEADER *)(hest + 1); 926 resid = hest->Header.Length - sizeof(*hest); 927 for (i = 0; i < n && resid; i++, header = next) { 928 next = apei_hest_attach_source(sc, header, i, resid); 929 if (next == NULL) { 930 aprint_error_dev(sc->sc_dev, "truncated source:" 931 " %"PRIu32"\n", i); 932 break; 933 } 934 KASSERT(header < next); 935 KASSERT((size_t)((const char *)next - (const char *)header) <= 936 resid); 937 resid -= (const char *)next - (const char *)header; 938 } 939 if (resid) { 940 aprint_error_dev(sc->sc_dev, "HEST:" 941 " %zu bytes of trailing garbage after %"PRIu32" entries\n", 942 resid, n); 943 } 944 } 945 946 /* 947 * apei_hest_detach(sc) 948 * 949 * Stop receiving and processing hardware error notifications and 950 * free resources set up from the Hardware Error Source Table. 951 */ 952 void 953 apei_hest_detach(struct apei_softc *sc) 954 { 955 ACPI_TABLE_HEST *hest = sc->sc_tab.hest; 956 struct apei_hest_softc *hsc = &sc->sc_hest; 957 uint32_t i, n; 958 959 if (hsc->hsc_source) { 960 n = hest->ErrorSourceCount; 961 for (i = 0; i < n; i++) { 962 struct apei_source *src = &hsc->hsc_source[i]; 963 ACPI_HEST_HEADER *header = src->as_header; 964 965 if (src->as_header == NULL) 966 continue; 967 apei_hest_detach_source(sc, header, i); 968 } 969 kmem_free(hsc->hsc_source, n * sizeof(hsc->hsc_source[0])); 970 hsc->hsc_source = NULL; 971 } 972 } 973 974 void 975 apei_hed_notify(void) 976 { 977 device_t apei0; 978 struct apei_softc *sc; 979 struct apei_hest_softc *hsc; 980 struct apei_source *src; 981 982 /* 983 * Take a reference to the apei0 device so it doesn't go away 984 * while we're working. 985 */ 986 if ((apei0 = device_lookup_acquire(&apei_cd, 0)) == NULL) 987 goto out; 988 sc = device_private(apei0); 989 990 /* 991 * If there's no HEST, nothing to do. 992 */ 993 if (sc->sc_tab.hest == NULL) 994 goto out; 995 hsc = &sc->sc_hest; 996 997 /* 998 * Walk through the HED-notified hardware error sources and 999 * check them. The list is stable until we release apei0. 1000 */ 1001 SIMPLEQ_FOREACH(src, &hsc->hsc_hed_list, as_entry) { 1002 ACPI_HEST_HEADER *const header = src->as_header; 1003 1004 switch (header->Type) { 1005 case ACPI_HEST_TYPE_GENERIC_ERROR: 1006 apei_hest_ghes_handle(sc, src); 1007 break; 1008 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: 1009 apei_hest_ghes_v2_handle(sc, src); 1010 break; 1011 case ACPI_HEST_TYPE_IA32_CHECK: 1012 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: 1013 case ACPI_HEST_TYPE_IA32_NMI: 1014 case ACPI_HEST_TYPE_NOT_USED3: 1015 case ACPI_HEST_TYPE_NOT_USED4: 1016 case ACPI_HEST_TYPE_NOT_USED5: 1017 case ACPI_HEST_TYPE_AER_ROOT_PORT: 1018 case ACPI_HEST_TYPE_AER_ENDPOINT: 1019 case ACPI_HEST_TYPE_AER_BRIDGE: 1020 // case ACPI_HEST_TYPE_GENERIC_ERROR: 1021 // case ACPI_HEST_TYPE_GENERIC_ERROR_V2: 1022 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: 1023 default: 1024 /* XXX shouldn't happen */ 1025 break; 1026 } 1027 } 1028 1029 out: if (apei0) { 1030 device_release(apei0); 1031 apei0 = NULL; 1032 } 1033 } 1034