1/* $NetBSD: apei_hest.c,v 1.3 2024/03/21 02:35:09 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29/* 30 * APEI HEST -- Hardware Error Source Table 31 * 32 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#acpi-error-source 33 * 34 * XXX uncorrectable error NMI comes in on all CPUs at once, what to do? 35 * 36 * XXX AMD MCA 37 * 38 * XXX IA32 machine check stuff 39 * 40 * XXX switch-to-polling for GHES notifications 41 * 42 * XXX error threshold for GHES notifications 43 * 44 * XXX sort out interrupt notification types, e.g. do we ever need to 45 * do acpi_intr_establish? 46 * 47 * XXX sysctl knob to force polling each particular error source that 48 * supports it 49 * 50 * XXX consider a lighter-weight polling schedule for machines with 51 * thousands of polled GHESes 52 */ 53 54#include <sys/cdefs.h> 55__KERNEL_RCSID(0, "$NetBSD: apei_hest.c,v 1.3 2024/03/21 02:35:09 riastradh Exp $"); 56 57#include <sys/types.h> 58 59#include <sys/atomic.h> 60#include <sys/kmem.h> 61#include <sys/lock.h> 62#include <sys/systm.h> 63 64#include <dev/acpi/acpivar.h> 65#include <dev/acpi/apei_cper.h> 66#include <dev/acpi/apei_hestvar.h> 67#include <dev/acpi/apei_hed.h> 68#include <dev/acpi/apei_mapreg.h> 69#include <dev/acpi/apeivar.h> 70 71#if defined(__i386__) || defined(__x86_64__) 72#include <x86/nmi.h> 73#endif 74 75#include "ioconf.h" 76 77#define _COMPONENT ACPI_RESOURCE_COMPONENT 78ACPI_MODULE_NAME ("apei") 79 80/* 81 * apei_hest_ghes_handle(sc, src) 82 * 83 * Check for, report, and acknowledge any error from a Generic 84 * Hardware Error Source (GHES, not GHESv2). Return true if there 85 * was any error to report, false if not. 86 */ 87static bool 88apei_hest_ghes_handle(struct apei_softc *sc, struct apei_source *src) 89{ 90 ACPI_HEST_GENERIC *ghes = container_of(src->as_header, 91 ACPI_HEST_GENERIC, Header); 92 ACPI_HEST_GENERIC_STATUS *gesb = src->as_ghes.gesb; 93 char ctx[sizeof("error source 65535")]; 94 uint32_t status; 95 bool fatal = false; 96 97 /* 98 * Process and report any error. 99 */ 100 snprintf(ctx, sizeof(ctx), "error source %"PRIu16, 101 ghes->Header.SourceId); 102 status = apei_gesb_report(sc, src->as_ghes.gesb, 103 ghes->ErrorBlockLength, ctx, &fatal); 104 105 /* 106 * Acknowledge the error by clearing the block status. To 107 * avoid races, we probably have to avoid further access to the 108 * GESB until we get another notification. 109 * 110 * As a precaution, we zero this with atomic compare-and-swap 111 * so at least we can see if the status changed while we were 112 * working on it. 113 * 114 * It is tempting to clear bits with atomic and-complement, but 115 * the BlockStatus is not just a bit mask -- bits [13:4] are a 116 * count of Generic Error Data Entries, and who knows what bits 117 * [31:14] might be used for in the future. 118 * 119 * XXX The GHES(v1) protocol is unclear from the specification 120 * here. The GHESv2 protocol has a separate register write to 121 * acknowledge, which is a bit clearer. 122 */ 123 membar_release(); 124 const uint32_t status1 = atomic_cas_32(&gesb->BlockStatus, status, 0); 125 if (status1 != status) { 126 device_printf(sc->sc_dev, "%s: status changed from" 127 " 0x%"PRIx32" to 0x%"PRIx32"\n", 128 ctx, status, status1); 129 } 130 131 /* 132 * If the error was fatal, panic now. 133 */ 134 if (fatal) 135 panic("fatal hardware error"); 136 137 return status != 0; 138} 139 140/* 141 * apei_hest_ghes_v2_handle(sc, src) 142 * 143 * Check for, report, and acknowledge any error from a Generic 144 * Hardware Error Source v2. Return true if there was any error 145 * to report, false if not. 146 */ 147static bool 148apei_hest_ghes_v2_handle(struct apei_softc *sc, struct apei_source *src) 149{ 150 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(src->as_header, 151 ACPI_HEST_GENERIC_V2, Header); 152 ACPI_HEST_GENERIC_STATUS *gesb = src->as_ghes.gesb; 153 char ctx[sizeof("error source 65535")]; 154 uint64_t X; 155 uint32_t status; 156 bool fatal; 157 158 /* 159 * Process and report any error. 160 */ 161 snprintf(ctx, sizeof(ctx), "error source %"PRIu16, 162 ghes_v2->Header.SourceId); 163 status = apei_gesb_report(sc, src->as_ghes.gesb, 164 ghes_v2->ErrorBlockLength, ctx, &fatal); 165 166 /* 167 * First clear the block status. As a precaution, we zero this 168 * with atomic compare-and-swap so at least we can see if the 169 * status changed while we were working on it. 170 */ 171 membar_release(); 172 const uint32_t status1 = atomic_cas_32(&gesb->BlockStatus, status, 0); 173 if (status1 != status) { 174 device_printf(sc->sc_dev, "%s: status changed from" 175 " 0x%"PRIx32" to 0x%"PRIx32"\n", 176 ctx, status, status1); 177 } 178 179 /* 180 * Next, do the Read Ack dance. 181 * 182 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#generic-hardware-error-source-version-2-ghesv2-type-10 183 */ 184 X = apei_mapreg_read(&ghes_v2->ReadAckRegister, 185 src->as_ghes_v2.read_ack); 186 X &= ghes_v2->ReadAckPreserve; 187 X |= ghes_v2->ReadAckWrite; 188 apei_mapreg_write(&ghes_v2->ReadAckRegister, 189 src->as_ghes_v2.read_ack, X); 190 191 /* 192 * If the error was fatal, panic now. 193 */ 194 if (fatal) 195 panic("fatal hardware error"); 196 197 return status != 0; 198} 199 200/* 201 * apei_hest_ghes_poll(cookie) 202 * 203 * Callout handler for periodic polling of a Generic Hardware 204 * Error Source (GHES, not GHESv2), using Notification Type `0 - 205 * Polled'. 206 * 207 * cookie is the struct apei_source pointer for a single source; 208 * if there are multiple sources there will be multiple callouts. 209 */ 210static void 211apei_hest_ghes_poll(void *cookie) 212{ 213 struct apei_source *src = cookie; 214 struct apei_softc *sc = src->as_sc; 215 ACPI_HEST_GENERIC *ghes = container_of(src->as_header, 216 ACPI_HEST_GENERIC, Header); 217 218 /* 219 * Process and acknowledge any error. 220 */ 221 (void)apei_hest_ghes_handle(sc, src); 222 223 /* 224 * Schedule polling again after the firmware-suggested 225 * interval. 226 */ 227 callout_schedule(&src->as_ch, 228 MAX(1, mstohz(ghes->Notify.PollInterval))); 229} 230 231/* 232 * apei_hest_ghes_v2_poll(cookie) 233 * 234 * Callout handler for periodic polling of a Generic Hardware 235 * Error Source v2, using Notification Type `0 - Polled'. 236 * 237 * cookie is the struct apei_source pointer for a single source; 238 * if there are multiple sources there will be multiple callouts. 239 */ 240static void 241apei_hest_ghes_v2_poll(void *cookie) 242{ 243 struct apei_source *src = cookie; 244 struct apei_softc *sc = src->as_sc; 245 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(src->as_header, 246 ACPI_HEST_GENERIC_V2, Header); 247 248 /* 249 * Process and acknowledge any error. 250 */ 251 (void)apei_hest_ghes_v2_handle(sc, src); 252 253 /* 254 * Schedule polling again after the firmware-suggested 255 * interval. 256 */ 257 callout_schedule(&src->as_ch, 258 MAX(1, mstohz(ghes_v2->Notify.PollInterval))); 259} 260 261#if defined(__i386__) || defined(__x86_64__) 262 263/* 264 * The NMI is (sometimes?) delivered to all CPUs at once. To reduce 265 * confusion, let's try to have only one CPU process error 266 * notifications at a time. 267 */ 268static __cpu_simple_lock_t apei_hest_nmi_lock; 269 270/* 271 * apei_hest_ghes_nmi(tf, cookie) 272 * 273 * Nonmaskable interrupt handler for Generic Hardware Error 274 * Sources (GHES, not GHESv2) with Notification Type `4 - NMI'. 275 */ 276static int 277apei_hest_ghes_nmi(const struct trapframe *tf, void *cookie) 278{ 279 struct apei_source *src = cookie; 280 struct apei_softc *sc = src->as_sc; 281 282 __cpu_simple_lock(&apei_hest_nmi_lock); 283 const bool mine = apei_hest_ghes_handle(sc, src); 284 __cpu_simple_unlock(&apei_hest_nmi_lock); 285 286 /* 287 * Tell the NMI subsystem whether this interrupt could have 288 * been for us or not. 289 */ 290 return mine; 291} 292 293/* 294 * apei_hest_ghes_v2_nmi(tf, cookie) 295 * 296 * Nonmaskable interrupt handler for Generic Hardware Error 297 * Sources v2 with Notification Type `4 - NMI'. 298 */ 299static int 300apei_hest_ghes_v2_nmi(const struct trapframe *tf, void *cookie) 301{ 302 struct apei_source *src = cookie; 303 struct apei_softc *sc = src->as_sc; 304 305 __cpu_simple_lock(&apei_hest_nmi_lock); 306 const bool mine = apei_hest_ghes_v2_handle(sc, src); 307 __cpu_simple_unlock(&apei_hest_nmi_lock); 308 309 /* 310 * Tell the NMI subsystem whether this interrupt could have 311 * been for us or not. 312 */ 313 return mine; 314} 315 316#endif /* defined(__i386__) || defined(__x86_64__) */ 317 318/* 319 * apei_hest_attach_ghes(sc, ghes, i) 320 * 321 * Attach a Generic Hardware Error Source (GHES, not GHESv2) as 322 * the ith source in the Hardware Error Source Table. 323 * 324 * After this point, the system will check for and handle errors 325 * when notified by this source. 326 */ 327static void 328apei_hest_attach_ghes(struct apei_softc *sc, ACPI_HEST_GENERIC *ghes, 329 uint32_t i) 330{ 331 struct apei_hest_softc *hsc = &sc->sc_hest; 332 struct apei_source *src = &hsc->hsc_source[i]; 333 uint64_t addr; 334 ACPI_STATUS rv; 335 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 336 337 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 338 i, ghes->Header.SourceId); 339 340 /* 341 * Verify the source is enabled before proceeding. The Enabled 342 * field is 8 bits with 256 possibilities, but only two of the 343 * possibilities, 0 and 1, have semantics defined in the spec, 344 * so out of an abundance of caution let's tread carefully in 345 * case anything changes and noisily reject any values other 346 * than 1. 347 */ 348 switch (ghes->Enabled) { 349 case 1: 350 break; 351 case 0: 352 aprint_debug_dev(sc->sc_dev, "%s: disabled\n", ctx); 353 return; 354 default: 355 aprint_error_dev(sc->sc_dev, "%s: unknown GHES Enabled state:" 356 " 0x%"PRIx8"\n", ctx, ghes->Enabled); 357 return; 358 } 359 360 /* 361 * Verify the Error Status Address bit width is at most 64 bits 362 * before proceeding with this source. When we get 128-bit 363 * addressing, this code will have to be updated. 364 */ 365 if (ghes->ErrorStatusAddress.BitWidth > 64) { 366 aprint_error_dev(sc->sc_dev, "%s: excessive address bits:" 367 " %"PRIu8"\n", ctx, ghes->ErrorStatusAddress.BitWidth); 368 return; 369 } 370 371 /* 372 * Read the GHES Error Status Addresss. This is the physical 373 * address of a GESB, Generic Error Status Block. Why the 374 * physical address is exposed via this indirection, and not 375 * simply stored directly in the GHES, is unclear to me. 376 * Hoping it's not because the address can change dynamically, 377 * because the error handling path shouldn't involve mapping 378 * anything. 379 */ 380 rv = AcpiRead(&addr, &ghes->ErrorStatusAddress); 381 if (ACPI_FAILURE(rv)) { 382 aprint_error_dev(sc->sc_dev, "%s:" 383 " failed to read error status address: %s", ctx, 384 AcpiFormatException(rv)); 385 return; 386 } 387 aprint_debug_dev(sc->sc_dev, "%s: error status @ 0x%"PRIx64"\n", ctx, 388 addr); 389 390 /* 391 * Initialize the source and map the GESB so we can get at it 392 * in the error handling path. 393 */ 394 src->as_sc = sc; 395 src->as_header = &ghes->Header; 396 src->as_ghes.gesb = AcpiOsMapMemory(addr, ghes->ErrorBlockLength); 397 398 /* 399 * Arrange to receive notifications. 400 */ 401 switch (ghes->Notify.Type) { 402 case ACPI_HEST_NOTIFY_POLLED: 403 callout_init(&src->as_ch, CALLOUT_MPSAFE); 404 callout_setfunc(&src->as_ch, &apei_hest_ghes_poll, src); 405 callout_schedule(&src->as_ch, 0); 406 break; 407 case ACPI_HEST_NOTIFY_SCI: 408 case ACPI_HEST_NOTIFY_GPIO: 409 /* 410 * SCI and GPIO notifications are delivered through 411 * Hardware Error Device (PNP0C33) events. 412 * 413 * XXX Where is this spelled out? The text at 414 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#event-notification-for-generic-error-sources 415 * is vague. 416 */ 417 SIMPLEQ_INSERT_TAIL(&hsc->hsc_hed_list, src, as_entry); 418 break; 419#if defined(__i386__) || defined(__x86_64__) 420 case ACPI_HEST_NOTIFY_NMI: 421 src->as_nmi = nmi_establish(&apei_hest_ghes_nmi, src); 422 break; 423#endif 424 } 425 426 /* 427 * Now that we have notification set up, process and 428 * acknowledge the initial GESB report if any. 429 */ 430 apei_hest_ghes_handle(sc, src); 431} 432 433/* 434 * apei_hest_detach_ghes(sc, ghes, i) 435 * 436 * Detach the ith source, which is a Generic Hardware Error Source 437 * (GHES, not GHESv2). 438 * 439 * After this point, the system will ignore notifications from 440 * this source. 441 */ 442static void 443apei_hest_detach_ghes(struct apei_softc *sc, ACPI_HEST_GENERIC *ghes, 444 uint32_t i) 445{ 446 struct apei_hest_softc *hsc = &sc->sc_hest; 447 struct apei_source *src = &hsc->hsc_source[i]; 448 449 /* 450 * Arrange to stop receiving notifications. 451 */ 452 switch (ghes->Notify.Type) { 453 case ACPI_HEST_NOTIFY_POLLED: 454 callout_halt(&src->as_ch, NULL); 455 callout_destroy(&src->as_ch); 456 break; 457 case ACPI_HEST_NOTIFY_SCI: 458 case ACPI_HEST_NOTIFY_GPIO: 459 /* 460 * No need to spend time removing the entry; no further 461 * calls via apei_hed_notify are possible at this 462 * point, now that detach has begun. 463 */ 464 break; 465#if defined(__i386__) || defined(__x86_64__) 466 case ACPI_HEST_NOTIFY_NMI: 467 nmi_disestablish(src->as_nmi); 468 src->as_nmi = NULL; 469 break; 470#endif 471 } 472 473 /* 474 * No more notifications. Unmap the GESB and destroy the 475 * interrupt source now that it will no longer be used in 476 * error handling path. 477 */ 478 AcpiOsUnmapMemory(src->as_ghes.gesb, ghes->ErrorBlockLength); 479 src->as_ghes.gesb = NULL; 480 src->as_header = NULL; 481 src->as_sc = NULL; 482} 483 484 485/* 486 * apei_hest_attach_ghes_v2(sc, ghes_v2, i) 487 * 488 * Attach a Generic Hardware Error Source v2 as the ith source in 489 * the Hardware Error Source Table. 490 * 491 * After this point, the system will check for and handle errors 492 * when notified by this source. 493 */ 494static void 495apei_hest_attach_ghes_v2(struct apei_softc *sc, ACPI_HEST_GENERIC_V2 *ghes_v2, 496 uint32_t i) 497{ 498 struct apei_hest_softc *hsc = &sc->sc_hest; 499 struct apei_source *src = &hsc->hsc_source[i]; 500 uint64_t addr; 501 struct apei_mapreg *read_ack; 502 ACPI_STATUS rv; 503 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 504 505 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 506 i, ghes_v2->Header.SourceId); 507 508 /* 509 * Verify the source is enabled before proceeding. The Enabled 510 * field is 8 bits with 256 possibilities, but only two of the 511 * possibilities, 0 and 1, have semantics defined in the spec, 512 * so out of an abundance of caution let's tread carefully in 513 * case anything changes and noisily reject any values other 514 * than 1. 515 */ 516 switch (ghes_v2->Enabled) { 517 case 1: 518 break; 519 case 0: 520 aprint_debug_dev(sc->sc_dev, "%s: disabled\n", ctx); 521 return; 522 default: 523 aprint_error_dev(sc->sc_dev, "%s:" 524 " unknown GHESv2 Enabled state: 0x%"PRIx8"\n", ctx, 525 ghes_v2->Enabled); 526 return; 527 } 528 529 /* 530 * Verify the Error Status Address bit width is at most 64 bits 531 * before proceeding with this source. When we get 128-bit 532 * addressing, this code will have to be updated. 533 */ 534 if (ghes_v2->ErrorStatusAddress.BitWidth > 64) { 535 aprint_error_dev(sc->sc_dev, "%s: excessive address bits:" 536 " %"PRIu8"\n", ctx, ghes_v2->ErrorStatusAddress.BitWidth); 537 return; 538 } 539 540 /* 541 * Read the GHESv2 Error Status Addresss. This is the physical 542 * address of a GESB, Generic Error Status Block. Why the 543 * physical address is exposed via this indirection, and not 544 * simply stored directly in the GHESv2, is unclear to me. 545 * Hoping it's not because the address can change dynamically, 546 * because the error handling path shouldn't involve mapping 547 * anything. 548 */ 549 rv = AcpiRead(&addr, &ghes_v2->ErrorStatusAddress); 550 if (ACPI_FAILURE(rv)) { 551 aprint_error_dev(sc->sc_dev, "%s:" 552 " failed to read error status address: %s", ctx, 553 AcpiFormatException(rv)); 554 return; 555 } 556 aprint_debug_dev(sc->sc_dev, "%s: error status @ 0x%"PRIx64"\n", ctx, 557 addr); 558 559 /* 560 * Try to map the Read Ack register up front, so we don't have 561 * to allocate and free kva in AcpiRead/AcpiWrite at the time 562 * we're handling an error. Bail if we can't. 563 */ 564 read_ack = apei_mapreg_map(&ghes_v2->ReadAckRegister); 565 if (read_ack == NULL) { 566 aprint_error_dev(sc->sc_dev, "%s:" 567 " unable to map Read Ack register\n", ctx); 568 return; 569 } 570 571 /* 572 * Initialize the source and map the GESB it in the error 573 * handling path. 574 */ 575 src->as_sc = sc; 576 src->as_header = &ghes_v2->Header; 577 src->as_ghes_v2.gesb = AcpiOsMapMemory(addr, 578 ghes_v2->ErrorBlockLength); 579 src->as_ghes_v2.read_ack = read_ack; 580 581 /* 582 * Arrange to receive notifications. 583 */ 584 switch (ghes_v2->Notify.Type) { 585 case ACPI_HEST_NOTIFY_POLLED: 586 callout_init(&src->as_ch, CALLOUT_MPSAFE); 587 callout_setfunc(&src->as_ch, &apei_hest_ghes_v2_poll, src); 588 callout_schedule(&src->as_ch, 0); 589 break; 590 case ACPI_HEST_NOTIFY_SCI: 591 case ACPI_HEST_NOTIFY_GPIO: 592 /* 593 * SCI and GPIO notifications are delivered through 594 * Hardware Error Device (PNP0C33) events. 595 * 596 * XXX Where is this spelled out? The text at 597 * https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#event-notification-for-generic-error-sources 598 * is vague. 599 */ 600 SIMPLEQ_INSERT_TAIL(&hsc->hsc_hed_list, src, as_entry); 601 break; 602#if defined(__i386__) || defined(__x86_64__) 603 case ACPI_HEST_NOTIFY_NMI: 604 src->as_nmi = nmi_establish(&apei_hest_ghes_v2_nmi, src); 605 break; 606#endif 607 } 608 609 /* 610 * Now that we have notification set up, process and 611 * acknowledge the initial GESB report if any. 612 */ 613 apei_hest_ghes_handle(sc, src); 614} 615 616/* 617 * apei_hest_detach_ghes_v2(sc, ghes_v2, i) 618 * 619 * Detach the ith source, which is a Generic Hardware Error Source 620 * v2. 621 * 622 * After this point, the system will ignore notifications from 623 * this source. 624 */ 625static void 626apei_hest_detach_ghes_v2(struct apei_softc *sc, ACPI_HEST_GENERIC_V2 *ghes_v2, 627 uint32_t i) 628{ 629 struct apei_hest_softc *hsc = &sc->sc_hest; 630 struct apei_source *src = &hsc->hsc_source[i]; 631 632 /* 633 * Arrange to stop receiving notifications. 634 */ 635 switch (ghes_v2->Notify.Type) { 636 case ACPI_HEST_NOTIFY_POLLED: 637 callout_halt(&src->as_ch, NULL); 638 callout_destroy(&src->as_ch); 639 break; 640 case ACPI_HEST_NOTIFY_SCI: 641 case ACPI_HEST_NOTIFY_GPIO: 642 /* 643 * No need to spend time removing the entry; no further 644 * calls via apei_hed_notify are possible at this 645 * point, now that detach has begun. 646 */ 647 break; 648#if defined(__i386__) || defined(__x86_64__) 649 case ACPI_HEST_NOTIFY_NMI: 650 nmi_disestablish(src->as_nmi); 651 src->as_nmi = NULL; 652 break; 653#endif 654 } 655 656 /* 657 * No more notifications. Unmap the GESB and read ack register 658 * now that it will no longer be used in error handling path. 659 */ 660 AcpiOsUnmapMemory(src->as_ghes_v2.gesb, ghes_v2->ErrorBlockLength); 661 src->as_ghes_v2.gesb = NULL; 662 apei_mapreg_unmap(&ghes_v2->ReadAckRegister, src->as_ghes_v2.read_ack); 663 src->as_ghes_v2.read_ack = NULL; 664 src->as_header = NULL; 665 src->as_sc = NULL; 666} 667 668/* 669 * apei_hest_attach_source(sc, header, i, size_t maxlen) 670 * 671 * Attach the ith source in the Hardware Error Source Table given 672 * its header, and return a pointer to the header of the next 673 * source in the table, provided it is no more than maxlen bytes 674 * past header. Return NULL if the size of the source is unknown 675 * or would exceed maxlen bytes. 676 */ 677static ACPI_HEST_HEADER * 678apei_hest_attach_source(struct apei_softc *sc, ACPI_HEST_HEADER *header, 679 uint32_t i, size_t maxlen) 680{ 681 char ctx[sizeof("HEST[4294967295, Id=65535]")]; 682 683 snprintf(ctx, sizeof(ctx), "HEST[%"PRIu32", Id=%"PRIu16"]", 684 i, header->SourceId); 685 686 switch (header->Type) { 687 case ACPI_HEST_TYPE_IA32_CHECK: { 688 ACPI_HEST_IA_MACHINE_CHECK *const imc = container_of(header, 689 ACPI_HEST_IA_MACHINE_CHECK, Header); 690 691 aprint_error_dev(sc->sc_dev, "%s:" 692 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 693 694 if (maxlen < sizeof(*imc)) 695 return NULL; 696 maxlen -= sizeof(*imc); 697 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imc + 1); 698 if (maxlen < imc->NumHardwareBanks*sizeof(*bank)) 699 return NULL; 700 return (ACPI_HEST_HEADER *)(bank + imc->NumHardwareBanks); 701 } 702 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: { 703 ACPI_HEST_IA_CORRECTED *const imcc = container_of(header, 704 ACPI_HEST_IA_CORRECTED, Header); 705 706 aprint_error_dev(sc->sc_dev, "%s:" 707 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 708 709 if (maxlen < sizeof(*imcc)) 710 return NULL; 711 maxlen -= sizeof(*imcc); 712 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imcc + 1); 713 if (maxlen < imcc->NumHardwareBanks*sizeof(*bank)) 714 return NULL; 715 return (ACPI_HEST_HEADER *)(bank + imcc->NumHardwareBanks); 716 } 717 case ACPI_HEST_TYPE_IA32_NMI: { 718 ACPI_HEST_IA_NMI *const ianmi = container_of(header, 719 ACPI_HEST_IA_NMI, Header); 720 721 aprint_error_dev(sc->sc_dev, "%s:" 722 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 723 724 if (maxlen < sizeof(*ianmi)) 725 return NULL; 726 return (ACPI_HEST_HEADER *)(ianmi + 1); 727 } 728 case ACPI_HEST_TYPE_AER_ROOT_PORT: { 729 ACPI_HEST_AER_ROOT *const aerroot = container_of(header, 730 ACPI_HEST_AER_ROOT, Header); 731 732 aprint_error_dev(sc->sc_dev, "%s:" 733 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 734 735 if (maxlen < sizeof(*aerroot)) 736 return NULL; 737 return (ACPI_HEST_HEADER *)(aerroot + 1); 738 } 739 case ACPI_HEST_TYPE_AER_ENDPOINT: { 740 ACPI_HEST_AER *const aer = container_of(header, 741 ACPI_HEST_AER, Header); 742 743 aprint_error_dev(sc->sc_dev, "%s:" 744 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 745 746 if (maxlen < sizeof(*aer)) 747 return NULL; 748 return (ACPI_HEST_HEADER *)(aer + 1); 749 } 750 case ACPI_HEST_TYPE_AER_BRIDGE: { 751 ACPI_HEST_AER_BRIDGE *const aerbridge = container_of(header, 752 ACPI_HEST_AER_BRIDGE, Header); 753 754 aprint_error_dev(sc->sc_dev, "%s:" 755 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 756 757 if (maxlen < sizeof(*aerbridge)) 758 return NULL; 759 return (ACPI_HEST_HEADER *)(aerbridge + 1); 760 } 761 case ACPI_HEST_TYPE_GENERIC_ERROR: { 762 ACPI_HEST_GENERIC *const ghes = container_of(header, 763 ACPI_HEST_GENERIC, Header); 764 765 if (maxlen < sizeof(*ghes)) 766 return NULL; 767 apei_hest_attach_ghes(sc, ghes, i); 768 return (ACPI_HEST_HEADER *)(ghes + 1); 769 } 770 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: { 771 ACPI_HEST_GENERIC_V2 *const ghes_v2 = container_of(header, 772 ACPI_HEST_GENERIC_V2, Header); 773 774 if (maxlen < sizeof(*ghes_v2)) 775 return NULL; 776 apei_hest_attach_ghes_v2(sc, ghes_v2, i); 777 return (ACPI_HEST_HEADER *)(ghes_v2 + 1); 778 } 779 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: { 780 ACPI_HEST_IA_DEFERRED_CHECK *const imdc = container_of(header, 781 ACPI_HEST_IA_DEFERRED_CHECK, Header); 782 783 aprint_error_dev(sc->sc_dev, "%s:" 784 " unimplemented type: 0x%04"PRIx16"\n", ctx, header->Type); 785 786 if (maxlen < sizeof(*imdc)) 787 return NULL; 788 maxlen -= sizeof(*imdc); 789 ACPI_HEST_IA_ERROR_BANK *const bank = (void *)(imdc + 1); 790 if (maxlen < imdc->NumHardwareBanks*sizeof(*bank)) 791 return NULL; 792 return (ACPI_HEST_HEADER *)(bank + imdc->NumHardwareBanks); 793 } 794 case ACPI_HEST_TYPE_NOT_USED3: 795 case ACPI_HEST_TYPE_NOT_USED4: 796 case ACPI_HEST_TYPE_NOT_USED5: 797 default: 798 aprint_error_dev(sc->sc_dev, "%s: unknown type:" 799 " 0x%04"PRIx16"\n", ctx, header->Type); 800 if (header->Type >= 12) { 801 /* 802 * `Beginning with error source type 12 and 803 * onward, each Error Source Structure must 804 * use the standard Error Source Structure 805 * Header as defined below.' 806 * 807 * Not yet in acpica, though, so we copy this 808 * down manually. 809 */ 810 struct { 811 UINT16 Type; 812 UINT16 Length; 813 } *const essh = (void *)header; 814 815 if (maxlen < sizeof(*essh) || maxlen < essh->Length) 816 return NULL; 817 return (ACPI_HEST_HEADER *)((char *)header + 818 essh->Length); 819 } 820 return NULL; 821 } 822} 823 824/* 825 * apei_hest_detach_source(sc, header, i) 826 * 827 * Detach the ith source in the Hardware Error Status Table. 828 * Caller is assumed to have stored where each source's header is, 829 * so no need to return the pointer to the header of the next 830 * source in the table. 831 */ 832static void 833apei_hest_detach_source(struct apei_softc *sc, ACPI_HEST_HEADER *header, 834 uint32_t i) 835{ 836 837 switch (header->Type) { 838 case ACPI_HEST_TYPE_GENERIC_ERROR: { 839 ACPI_HEST_GENERIC *ghes = container_of(header, 840 ACPI_HEST_GENERIC, Header); 841 842 apei_hest_detach_ghes(sc, ghes, i); 843 break; 844 } 845 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: { 846 ACPI_HEST_GENERIC_V2 *ghes_v2 = container_of(header, 847 ACPI_HEST_GENERIC_V2, Header); 848 849 apei_hest_detach_ghes_v2(sc, ghes_v2, i); 850 break; 851 } 852 case ACPI_HEST_TYPE_IA32_CHECK: 853 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: 854 case ACPI_HEST_TYPE_IA32_NMI: 855 case ACPI_HEST_TYPE_NOT_USED3: 856 case ACPI_HEST_TYPE_NOT_USED4: 857 case ACPI_HEST_TYPE_NOT_USED5: 858 case ACPI_HEST_TYPE_AER_ROOT_PORT: 859 case ACPI_HEST_TYPE_AER_ENDPOINT: 860 case ACPI_HEST_TYPE_AER_BRIDGE: 861 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: 862 default: 863 /* XXX shouldn't happen */ 864 break; 865 } 866} 867 868/* 869 * apei_hest_attach(sc) 870 * 871 * Scan the Hardware Error Source Table and attach sources 872 * enumerated in it so we can receive and process hardware errors 873 * during operation. 874 */ 875void 876apei_hest_attach(struct apei_softc *sc) 877{ 878 ACPI_TABLE_HEST *hest = sc->sc_tab.hest; 879 struct apei_hest_softc *hsc = &sc->sc_hest; 880 ACPI_HEST_HEADER *header, *next; 881 uint32_t i, n; 882 size_t resid; 883 884 /* 885 * Initialize the HED (Hardware Error Device, PNP0C33) 886 * notification list so apei_hed_notify becomes a noop with no 887 * extra effort even if we fail to attach anything. 888 */ 889 SIMPLEQ_INIT(&hsc->hsc_hed_list); 890 891 /* 892 * Verify the table is large enough. 893 */ 894 if (hest->Header.Length < sizeof(*hest)) { 895 aprint_error_dev(sc->sc_dev, "HEST: truncated table:" 896 " %"PRIu32" < %zu minimum bytes\n", 897 hest->Header.Length, sizeof(*hest)); 898 return; 899 } 900 901 n = hest->ErrorSourceCount; 902 aprint_normal_dev(sc->sc_dev, "HEST: %"PRIu32 903 " hardware error source%s\n", n, n == 1 ? "" : "s"); 904 905 /* 906 * This could be SIZE_MAX but let's put a smaller arbitrary 907 * limit on it; if you have gigabytes of HEST something is 908 * probably wrong. 909 */ 910 if (n > INT32_MAX/sizeof(hsc->hsc_source[0])) { 911 aprint_error_dev(sc->sc_dev, "HEST: too many error sources\n"); 912 return; 913 } 914 hsc->hsc_source = kmem_zalloc(n * sizeof(hsc->hsc_source[0]), 915 KM_SLEEP); 916 917 header = (ACPI_HEST_HEADER *)(hest + 1); 918 resid = hest->Header.Length - sizeof(*hest); 919 for (i = 0; i < n && resid; i++, header = next) { 920 next = apei_hest_attach_source(sc, header, i, resid); 921 if (next == NULL) { 922 aprint_error_dev(sc->sc_dev, "truncated source:" 923 " %"PRIu32"\n", i); 924 break; 925 } 926 KASSERT(header < next); 927 KASSERT((size_t)((const char *)next - (const char *)header) <= 928 resid); 929 resid -= (const char *)next - (const char *)header; 930 } 931 if (resid) { 932 aprint_error_dev(sc->sc_dev, "HEST:" 933 " %zu bytes of trailing garbage after %"PRIu32" entries\n", 934 resid, n); 935 } 936} 937 938/* 939 * apei_hest_detach(sc) 940 * 941 * Stop receiving and processing hardware error notifications and 942 * free resources set up from the Hardware Error Source Table. 943 */ 944void 945apei_hest_detach(struct apei_softc *sc) 946{ 947 ACPI_TABLE_HEST *hest = sc->sc_tab.hest; 948 struct apei_hest_softc *hsc = &sc->sc_hest; 949 uint32_t i, n; 950 951 if (hsc->hsc_source) { 952 n = hest->ErrorSourceCount; 953 for (i = 0; i < n; i++) { 954 struct apei_source *src = &hsc->hsc_source[i]; 955 ACPI_HEST_HEADER *header = src->as_header; 956 957 if (src->as_header == NULL) 958 continue; 959 apei_hest_detach_source(sc, header, i); 960 } 961 kmem_free(hsc->hsc_source, n * sizeof(hsc->hsc_source[0])); 962 hsc->hsc_source = NULL; 963 } 964} 965 966void 967apei_hed_notify(void) 968{ 969 device_t apei0; 970 struct apei_softc *sc; 971 struct apei_hest_softc *hsc; 972 struct apei_source *src; 973 974 /* 975 * Take a reference to the apei0 device so it doesn't go away 976 * while we're working. 977 */ 978 if ((apei0 = device_lookup_acquire(&apei_cd, 0)) == NULL) 979 goto out; 980 sc = device_private(apei0); 981 982 /* 983 * If there's no HEST, nothing to do. 984 */ 985 if (sc->sc_tab.hest == NULL) 986 goto out; 987 hsc = &sc->sc_hest; 988 989 /* 990 * Walk through the HED-notified hardware error sources and 991 * check them. The list is stable until we release apei0. 992 */ 993 SIMPLEQ_FOREACH(src, &hsc->hsc_hed_list, as_entry) { 994 ACPI_HEST_HEADER *const header = src->as_header; 995 996 switch (header->Type) { 997 case ACPI_HEST_TYPE_GENERIC_ERROR: 998 apei_hest_ghes_handle(sc, src); 999 break; 1000 case ACPI_HEST_TYPE_GENERIC_ERROR_V2: 1001 apei_hest_ghes_v2_handle(sc, src); 1002 break; 1003 case ACPI_HEST_TYPE_IA32_CHECK: 1004 case ACPI_HEST_TYPE_IA32_CORRECTED_CHECK: 1005 case ACPI_HEST_TYPE_IA32_NMI: 1006 case ACPI_HEST_TYPE_NOT_USED3: 1007 case ACPI_HEST_TYPE_NOT_USED4: 1008 case ACPI_HEST_TYPE_NOT_USED5: 1009 case ACPI_HEST_TYPE_AER_ROOT_PORT: 1010 case ACPI_HEST_TYPE_AER_ENDPOINT: 1011 case ACPI_HEST_TYPE_AER_BRIDGE: 1012// case ACPI_HEST_TYPE_GENERIC_ERROR: 1013// case ACPI_HEST_TYPE_GENERIC_ERROR_V2: 1014 case ACPI_HEST_TYPE_IA32_DEFERRED_CHECK: 1015 default: 1016 /* XXX shouldn't happen */ 1017 break; 1018 } 1019 } 1020 1021out: if (apei0) { 1022 device_release(apei0); 1023 apei0 = NULL; 1024 } 1025} 1026