/*-
 * Copyright (c) 2009 Advanced Computing Technologies LLC
 * Written by: John H. Baldwin <jhb@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for x86 machine check architecture.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/x86/x86/mca.c 267985 2014-06-27 22:05:21Z gjb $");

#ifdef __amd64__
#define	DEV_APIC
#else
#include "opt_apic.h"
#endif

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/* Modes for mca_scan() */
enum scan_mode {
	POLLED,
	MCE,
	CMCI,
};

#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;
	int	last_intr;
};
#endif

struct mca_internal {
	struct mca_record rec;
	int	logged;
	STAILQ_ENTRY(mca_internal) link;
};

static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static volatile int mca_count;	/* Number of records stored. */
static int mca_banks;		/* Number of per-CPU register banks. */

static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
    "Machine Check Architecture");

static int mca_enabled = 1;
TUNABLE_INT("hw.mca.enabled", &mca_enabled);
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

static int amd10h_L1TP = 1;
TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

static STAILQ_HEAD(, mca_internal) mca_freelist;
static int mca_freecount;
static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct taskqueue *mca_tq;
static struct task mca_refill_task, mca_scan_task;
static struct mtx mca_lock;

#ifdef DEV_APIC
static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank. */
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
#endif

static int
sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
	int error, value;

	value = *(int *)arg1;
	error = sysctl_handle_int(oidp, &value, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if (value <= 0)
		return (EINVAL);
	*(int *)arg1 = value;
	return (0);
}

static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}

static const char *
mca_error_ttype(uint16_t mca_error)
{

	switch ((mca_error & 0x000c) >> 2) {
	case 0:
		return ("I");
	case 1:
		return ("D");
	case 2:
		return ("G");
	}
	return ("?");
}

static const char *
mca_error_level(uint16_t mca_error)
{

	switch (mca_error & 0x0003) {
	case 0:
		return ("L0");
	case 1:
		return ("L1");
	case 2:
		return ("L2");
	case 3:
		return ("LG");
	}
	return ("L?");
}

static const char *
mca_error_request(uint16_t mca_error)
{

	switch ((mca_error & 0x00f0) >> 4) {
	case 0x0:
		return ("ERR");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("DRD");
	case 0x4:
		return ("DWR");
	case 0x5:
		return ("IRD");
	case 0x6:
		return ("PREFETCH");
	case 0x7:
		return ("EVICT");
	case 0x8:
		return ("SNOOP");
	}
	return ("???");
}

static const char *
mca_error_mmtype(uint16_t mca_error)
{

	switch ((mca_error & 0x70) >> 4) {
	case 0x0:
		return ("GEN");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("AC");
	case 0x4:
		return ("MS");
	}
	return ("???");
}

/* Dump details about a single machine check. */
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		if (rec->mr_mcg_cap & MCG_CAP_CMCI_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}

static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		do_cpuid(0, p);
	}
	return (1);
}

static void
mca_fill_freelist(void)
{
	struct mca_internal *rec;
	int desired;

	/*
	 * Ensure we have at least one record for each bank and one
	 * record per CPU.
	 */
	desired = imax(mp_ncpus, mca_banks);
	mtx_lock_spin(&mca_lock);
	while (mca_freecount < desired) {
		mtx_unlock_spin(&mca_lock);
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
		mca_freecount++;
	}
	mtx_unlock_spin(&mca_lock);
}

static void
mca_refill(void *context, int pending)
{

	mca_fill_freelist();
}

static void __nonnull(2)
mca_record_entry(enum scan_mode mode, const struct mca_record *record)
{
	struct mca_internal *rec;

	if (mode == POLLED) {
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
	} else {
		mtx_lock_spin(&mca_lock);
		rec = STAILQ_FIRST(&mca_freelist);
		if (rec == NULL) {
			printf("MCA: Unable to allocate space for an event.\n");
			mca_log(record);
			mtx_unlock_spin(&mca_lock);
			return;
		}
		STAILQ_REMOVE_HEAD(&mca_freelist, link);
		mca_freecount--;
	}

	rec->rec = *record;
	rec->logged = 0;
	STAILQ_INSERT_TAIL(&mca_records, rec, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);
	if (mode == CMCI)
		taskqueue_enqueue_fast(mca_tq, &mca_refill_task);
}

#ifdef DEV_APIC
/*
 * Update the interrupt threshold for a CMCI.  The strategy is to use
 * a low trigger that interrupts as soon as the first event occurs.
 * However, if a steady stream of events arrives, the threshold is
 * increased until the interrupts are throttled to once every
 * cmc_throttle seconds or the periodic scan.  If a periodic scan
 * finds that the threshold is too high, it is lowered.
 */
static void
cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
{
	struct cmc_state *cc;
	uint64_t ctl;
	u_int delta;
	int count, limit;

	/* Fetch the current limit for this bank. */
	cc = &cmc_state[PCPU_GET(cpuid)][bank];
	ctl = rdmsr(MSR_MC_CTL2(bank));
	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
	delta = (u_int)(ticks - cc->last_intr);

	/*
	 * If an interrupt was received less than cmc_throttle seconds
	 * since the previous interrupt and the count from the current
	 * event is greater than or equal to the current threshold,
	 * double the threshold up to the max.
	 */
	if (mode == CMCI && valid) {
		limit = ctl & MC_CTL2_THRESHOLD;
		if (delta < cmc_throttle && count >= limit &&
		    limit < cc->max_threshold) {
			limit = min(limit << 1, cc->max_threshold);
			ctl &= ~MC_CTL2_THRESHOLD;
			ctl |= limit;
			wrmsr(MSR_MC_CTL2(bank), ctl);
		}
		cc->last_intr = ticks;
		return;
	}

	/*
	 * When the banks are polled, check to see if the threshold
	 * should be lowered.
	 */
	if (mode != POLLED)
		return;

	/* If a CMCI occurred recently, do nothing for now. */
	if (delta < cmc_throttle)
		return;

	/*
	 * Compute a new limit based on the average rate of events per
	 * cmc_throttle seconds since the last interrupt.
	 */
	if (valid) {
		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
		limit = count * cmc_throttle / delta;
		if (limit <= 0)
			limit = 1;
		else if (limit > cc->max_threshold)
			limit = cc->max_threshold;
	} else
		limit = 1;
	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
		ctl &= ~MC_CTL2_THRESHOLD;
		ctl |= limit;
		wrmsr(MSR_MC_CTL2(bank), ctl);
	}
}
#endif

/*
 * This scans all the machine check banks of the current CPU to see if
 * there are any machine checks.  Any non-recoverable errors are
 * reported immediately via mca_log().  The current thread must be
 * pinned when this is called.  The 'mode' parameter indicates if we
 * are being called from the MC exception handler, the CMCI handler,
 * or the periodic poller.  In the MC exception case this function
 * returns true if the system is restartable.  Otherwise, it returns a
 * count of the number of valid MC records found.
 */
static int
mca_scan(enum scan_mode mode)
{
	struct mca_record rec;
	uint64_t mcg_cap, ucmask;
	int count, i, recoverable, valid;

	count = 0;
	recoverable = 1;
	ucmask = MC_STATUS_UC | MC_STATUS_PCC;

	/* When handling an MCE#, treat the OVER flag as non-restartable. */
	if (mode == MCE)
		ucmask |= MC_STATUS_OVER;
	mcg_cap = rdmsr(MSR_MCG_CAP);
	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
#ifdef DEV_APIC
		/*
		 * For a CMCI, only check banks this CPU is
		 * responsible for.
		 */
		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
			continue;
#endif

		valid = mca_check_status(i, &rec);
		if (valid) {
			count++;
			if (rec.mr_status & ucmask) {
				recoverable = 0;
				mtx_lock_spin(&mca_lock);
				mca_log(&rec);
				mtx_unlock_spin(&mca_lock);
			}
			mca_record_entry(mode, &rec);
		}

#ifdef DEV_APIC
		/*
		 * If this is a bank this CPU monitors via CMCI,
		 * update the threshold.
		 */
		if (PCPU_GET(cmci_mask) & 1 << i)
			cmci_update(mode, i, valid, &rec);
#endif
	}
	if (mode == POLLED)
		mca_fill_freelist();
	return (mode == MCE ? recoverable : count);
}

/*
 * Scan the machine check banks on all CPUs by binding to each CPU in
 * turn.  If any of the CPUs contained new machine check records, log
 * them to the console.
 */
static void
mca_scan_cpus(void *context, int pending)
{
	struct mca_internal *mca;
	struct thread *td;
	int count, cpu;

	mca_fill_freelist();
	td = curthread;
	count = 0;
	thread_lock(td);
	CPU_FOREACH(cpu) {
		sched_bind(td, cpu);
		thread_unlock(td);
		count += mca_scan(POLLED);
		thread_lock(td);
		sched_unbind(td);
	}
	thread_unlock(td);
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}

static void
mca_periodic_scan(void *arg)
{

	taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}

static int
sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
	return (0);
}

static void
mca_createtq(void *dummy)
{
	if (mca_banks <= 0)
		return;

	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
	    taskqueue_thread_enqueue, &mca_tq);
	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
}
SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);

static void
mca_startup(void *dummy)
{

	if (mca_banks <= 0)
		return;

	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);

#ifdef DEV_APIC
static void
cmci_setup(void)
{
	int i;

	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
	    M_WAITOK);
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
		    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
#endif

static void
mca_setup(uint64_t mcg_cap)
{

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	mca_banks = mcg_cap & MCG_CAP_COUNT;
	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
	callout_init(&mca_timer, CALLOUT_MPSAFE);
	STAILQ_INIT(&mca_freelist);
	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
	mca_fill_freelist();
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
	    "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup();
#endif
}

#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/* Determine maximum threshold. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}

/*
 * For resume, reset the threshold for any banks we monitor back to
 * one and throw away the timestamp of the last interrupt.
 */
static void
cmci_resume(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];
	cc->last_intr = -ticks;
	ctl = rdmsr(MSR_MC_CTL2(i));
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
}
#endif

/*
 * Initializes per-CPU machine check registers and enables corrected
 * machine check interrupts.
 */
static void
_mca_init(int boot)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int i, skip;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	if (cpu_feature & CPUID_MCA) {
		if (boot)
			PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (PCPU_GET(cpuid) == 0 && boot)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			if (mcg_cap & MCG_CAP_CMCI_P) {
				if (boot)
					cmci_monitor(i);
				else
					cmci_resume(i);
			}
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}

#ifdef DEV_APIC
		if (PCPU_GET(cmci_mask) != 0 && boot)
			lapic_enable_cmc();
#endif
	}

	load_cr4(rcr4() | CR4_MCE);
}

/* Must be executed on each CPU during boot. */
void
mca_init(void)
{

	_mca_init(1);
}

/* Must be executed on each CPU during resume. */
void
mca_resume(void)
{

	_mca_init(0);
}

/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);

/* Called when a machine check exception fires. */
void
mca_intr(void)
{
	uint64_t mcg_status;
	int old_count, recoverable;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx  Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		panic("Machine check");
	}

	/* Scan the banks and check for any non-recoverable errors. */
	old_count = mca_count;
	recoverable = mca_scan(MCE);
	mcg_status = rdmsr(MSR_MCG_STATUS);
	if (!(mcg_status & MCG_STATUS_RIPV))
		recoverable = 0;

	if (!recoverable) {
		/*
		 * Wait for at least one error to be logged before
		 * panicking.  Some errors will assert a machine check
		 * on all CPUs, but only certain CPUs will find a valid
		 * bank to log.
		 */
		while (mca_count == old_count)
			cpu_spinwait();

		panic("Unrecoverable machine check exception");
	}

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
}

#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 */
	count = mca_scan(CMCI);

	/* If we found anything, log it to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
#endif