/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			rcu_read_lock_sched_held() || \
			lockdep_is_held(&mce_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks	__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long	mce_need_notify;
static char		mce_helper[128];
static char		*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int		cpu_missing;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

static int default_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
	pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");

	return NOTIFY_STOP;
}

static struct notifier_block mce_dec_nb = {
	.notifier_call = default_decode_mce,
	.priority      = -1,
};

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {
			/*
			 * If edac_mce is enabled, it will check the error type
			 * and will process it, if it is a known error.
			 * Otherwise, the error will be sent through the mcelog
			 * interface.
			 */
			if (edac_mce_parse(mce))
				return;

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void print_mce(struct mce *m)
{
	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
: "", 220 m->cs, m->ip); 221 222 if (m->cs == __KERNEL_CS) 223 print_symbol("{%s}", m->ip); 224 pr_cont("\n"); 225 } 226 227 pr_emerg(HW_ERR "TSC %llx ", m->tsc); 228 if (m->addr) 229 pr_cont("ADDR %llx ", m->addr); 230 if (m->misc) 231 pr_cont("MISC %llx ", m->misc); 232 233 pr_cont("\n"); 234 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 235 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); 236 237 /* 238 * Print out human-readable details about the MCE error, 239 * (if the CPU has an implementation for that) 240 */ 241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 242} 243 244#define PANIC_TIMEOUT 5 /* 5 seconds */ 245 246static atomic_t mce_paniced; 247 248static int fake_panic; 249static atomic_t mce_fake_paniced; 250 251/* Panic in progress. Enable interrupts and wait for final IPI */ 252static void wait_for_panic(void) 253{ 254 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 255 256 preempt_disable(); 257 local_irq_enable(); 258 while (timeout-- > 0) 259 udelay(1); 260 if (panic_timeout == 0) 261 panic_timeout = mce_panic_timeout; 262 panic("Panicing machine check CPU died"); 263} 264 265static void mce_panic(char *msg, struct mce *final, char *exp) 266{ 267 int i, apei_err = 0; 268 269 if (!fake_panic) { 270 /* 271 * Make sure only one CPU runs in machine check panic 272 */ 273 if (atomic_inc_return(&mce_paniced) > 1) 274 wait_for_panic(); 275 barrier(); 276 277 bust_spinlocks(1); 278 console_verbose(); 279 } else { 280 /* Don't log too much for fake panic */ 281 if (atomic_inc_return(&mce_fake_paniced) > 1) 282 return; 283 } 284 /* First print corrected ones that are still unlogged */ 285 for (i = 0; i < MCE_LOG_LEN; i++) { 286 struct mce *m = &mcelog.entry[i]; 287 if (!(m->status & MCI_STATUS_VAL)) 288 continue; 289 if (!(m->status & MCI_STATUS_UC)) { 290 print_mce(m); 291 if (!apei_err) 292 apei_err = apei_write_mce(m); 293 } 294 } 295 /* Now print uncorrected but with the final one last */ 296 for (i = 0; i < MCE_LOG_LEN; i++) { 297 struct mce *m = &mcelog.entry[i]; 298 if (!(m->status & MCI_STATUS_VAL)) 299 continue; 300 if (!(m->status & MCI_STATUS_UC)) 301 continue; 302 if (!final || memcmp(m, final, sizeof(struct mce))) { 303 print_mce(m); 304 if (!apei_err) 305 apei_err = apei_write_mce(m); 306 } 307 } 308 if (final) { 309 print_mce(final); 310 if (!apei_err) 311 apei_err = apei_write_mce(final); 312 } 313 if (cpu_missing) 314 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); 315 if (exp) 316 pr_emerg(HW_ERR "Machine check: %s\n", exp); 317 if (!fake_panic) { 318 if (panic_timeout == 0) 319 panic_timeout = mce_panic_timeout; 320 panic(msg); 321 } else 322 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 323} 324 325/* Support code for software error injection */ 326 327static int msr_to_offset(u32 msr) 328{ 329 unsigned bank = __get_cpu_var(injectm.bank); 330 331 if (msr == rip_msr) 332 return offsetof(struct mce, ip); 333 if (msr == MSR_IA32_MCx_STATUS(bank)) 334 return offsetof(struct mce, status); 335 if (msr == MSR_IA32_MCx_ADDR(bank)) 336 return offsetof(struct mce, addr); 337 if (msr == MSR_IA32_MCx_MISC(bank)) 338 return offsetof(struct mce, misc); 339 if (msr == MSR_IA32_MCG_STATUS) 340 return offsetof(struct mce, mcgstatus); 341 return -1; 342} 343 344/* MSR access wrappers used for error injection */ 345static u64 mce_rdmsrl(u32 msr) 346{ 347 u64 v; 348 349 if (__get_cpu_var(injectm).finished) { 350 int offset = msr_to_offset(msr); 351 352 if (offset < 0) 353 return 0; 354 return *(u64 *)((char 
*)&__get_cpu_var(injectm) + offset); 355 } 356 357 if (rdmsrl_safe(msr, &v)) { 358 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); 359 /* 360 * Return zero in case the access faulted. This should 361 * not happen normally but can happen if the CPU does 362 * something weird, or if the code is buggy. 363 */ 364 v = 0; 365 } 366 367 return v; 368} 369 370static void mce_wrmsrl(u32 msr, u64 v) 371{ 372 if (__get_cpu_var(injectm).finished) { 373 int offset = msr_to_offset(msr); 374 375 if (offset >= 0) 376 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 377 return; 378 } 379 wrmsrl(msr, v); 380} 381 382/* 383 * Simple lockless ring to communicate PFNs from the exception handler with the 384 * process context work function. This is vastly simplified because there's 385 * only a single reader and a single writer. 386 */ 387#define MCE_RING_SIZE 16 /* we use one entry less */ 388 389struct mce_ring { 390 unsigned short start; 391 unsigned short end; 392 unsigned long ring[MCE_RING_SIZE]; 393}; 394static DEFINE_PER_CPU(struct mce_ring, mce_ring); 395 396/* Runs with CPU affinity in workqueue */ 397static int mce_ring_empty(void) 398{ 399 struct mce_ring *r = &__get_cpu_var(mce_ring); 400 401 return r->start == r->end; 402} 403 404static int mce_ring_get(unsigned long *pfn) 405{ 406 struct mce_ring *r; 407 int ret = 0; 408 409 *pfn = 0; 410 get_cpu(); 411 r = &__get_cpu_var(mce_ring); 412 if (r->start == r->end) 413 goto out; 414 *pfn = r->ring[r->start]; 415 r->start = (r->start + 1) % MCE_RING_SIZE; 416 ret = 1; 417out: 418 put_cpu(); 419 return ret; 420} 421 422/* Always runs in MCE context with preempt off */ 423static int mce_ring_add(unsigned long pfn) 424{ 425 struct mce_ring *r = &__get_cpu_var(mce_ring); 426 unsigned next; 427 428 next = (r->end + 1) % MCE_RING_SIZE; 429 if (next == r->start) 430 return -1; 431 r->ring[r->end] = pfn; 432 wmb(); 433 r->end = next; 434 return 0; 435} 436 437int mce_available(struct cpuinfo_x86 *c) 438{ 439 if (mce_disabled) 440 return 0; 441 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 442} 443 444static void mce_schedule_work(void) 445{ 446 if (!mce_ring_empty()) { 447 struct work_struct *work = &__get_cpu_var(mce_work); 448 if (!work_pending(work)) 449 schedule_work(work); 450 } 451} 452 453/* 454 * Get the address of the instruction at the time of the machine check 455 * error. 456 */ 457static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 458{ 459 460 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { 461 m->ip = regs->ip; 462 m->cs = regs->cs; 463 } else { 464 m->ip = 0; 465 m->cs = 0; 466 } 467 if (rip_msr) 468 m->ip = mce_rdmsrl(rip_msr); 469} 470 471#ifdef CONFIG_X86_LOCAL_APIC 472/* 473 * Called after interrupts have been reenabled again 474 * when a MCE happened during an interrupts off region 475 * in the kernel. 
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	percpu_inc(mce_poll_count);

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	percpu_inc(mce_exception_count);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When no restart IP must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+.
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	__mcheck_cpu_ancient_init(c);

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference_check_mce(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);

	mcheck_intel_therm_init();

	return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable_error_reporting();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					__get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif