1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/init.h>
31#include <linux/kmod.h>
32#include <linux/poll.h>
33#include <linux/nmi.h>
34#include <linux/cpu.h>
35#include <linux/smp.h>
36#include <linux/fs.h>
37#include <linux/mm.h>
38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
40
41#include <asm/processor.h>
42#include <asm/hw_irq.h>
43#include <asm/apic.h>
44#include <asm/idle.h>
45#include <asm/ipi.h>
46#include <asm/mce.h>
47#include <asm/msr.h>
48
49#include "mce-internal.h"
50
51static DEFINE_MUTEX(mce_read_mutex);
52
53#define rcu_dereference_check_mce(p) \
54	rcu_dereference_index_check((p), \
55			      rcu_read_lock_sched_held() || \
56			      lockdep_is_held(&mce_read_mutex))
57
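/*
 * Illustrative sketch (not part of the original file): how a reader of
 * mcelog.next is expected to use the helper above.  Holding
 * mce_read_mutex (as mce_read() does) or being inside an RCU-sched
 * read-side critical section satisfies the lockdep condition, so
 * lockless readers do not trigger false positives.  The function name
 * below is hypothetical.
 */
#if 0
static unsigned mce_log_peek_next_example(void)
{
	unsigned next;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference_check_mce(mcelog.next);
	mutex_unlock(&mce_read_mutex);

	return next;
}
#endif
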
58#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h>
60
61int mce_disabled __read_mostly;
62
63#define MISC_MCELOG_MINOR	227
64
65#define SPINUNIT 100	/* 100ns */
66
67atomic_t mce_entry;
68
69DEFINE_PER_CPU(unsigned, mce_exception_count);
70
71/*
72 * Tolerant levels:
73 *   0: always panic on uncorrected errors, log corrected errors
74 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
75 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
76 *   3: never panic or SIGBUS, log all errors (for testing only)
77 */
78static int			tolerant		__read_mostly = 1;
79static int			banks			__read_mostly;
80static int			rip_msr			__read_mostly;
81static int			mce_bootlog		__read_mostly = -1;
82static int			monarch_timeout		__read_mostly = -1;
83static int			mce_panic_timeout	__read_mostly;
84static int			mce_dont_log_ce		__read_mostly;
85int				mce_cmci_disabled	__read_mostly;
86int				mce_ignore_ce		__read_mostly;
87int				mce_ser			__read_mostly;
88
89struct mce_bank                *mce_banks		__read_mostly;
90
91/* User mode helper program triggered by machine check event */
92static unsigned long		mce_need_notify;
93static char			mce_helper[128];
94static char			*mce_helper_argv[2] = { mce_helper, NULL };
95
96static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
97static DEFINE_PER_CPU(struct mce, mces_seen);
98static int			cpu_missing;
99
100/*
101 * CPU/chipset specific EDAC code can register a notifier call here to print
102 * MCE errors in a human-readable form.
103 */
104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106
107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108			       void *data)
109{
110	pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111	pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112
113	return NOTIFY_STOP;
114}
115
116static struct notifier_block mce_dec_nb = {
117	.notifier_call = default_decode_mce,
118	.priority      = -1,
119};
120
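/*
 * Illustrative sketch (not part of the original file): how an EDAC
 * driver is expected to hook into the decoder chain above.  A real
 * decoder registers with a priority above the fallback's -1 so it runs
 * first, and returns NOTIFY_STOP so the generic "no decoding support"
 * message is not printed.  All names below are hypothetical.
 */
#if 0
static int example_edac_decode(struct notifier_block *nb, unsigned long val,
			       void *data)
{
	struct mce *m = data;

	pr_emerg(HW_ERR "Decoded MCE: bank %d status %016llx\n",
		 m->bank, m->status);
	return NOTIFY_STOP;
}

static struct notifier_block example_edac_decoder_nb = {
	.notifier_call	= example_edac_decode,
};

/* atomic_notifier_chain_register(&x86_mce_decoder_chain, &example_edac_decoder_nb); */
#endif
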
121/* MCA banks polled by the periodic polling timer for corrected events */
122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
123	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
124};
125
126static DEFINE_PER_CPU(struct work_struct, mce_work);
127
128/* Do initial initialization of a struct mce */
129void mce_setup(struct mce *m)
130{
131	memset(m, 0, sizeof(struct mce));
132	m->cpu = m->extcpu = smp_processor_id();
133	rdtscll(m->tsc);
134	/* We hope get_seconds stays lockless */
135	m->time = get_seconds();
136	m->cpuvendor = boot_cpu_data.x86_vendor;
137	m->cpuid = cpuid_eax(1);
138#ifdef CONFIG_SMP
139	m->socketid = cpu_data(m->extcpu).phys_proc_id;
140#endif
141	m->apicid = cpu_data(m->extcpu).initial_apicid;
142	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
143}
144
145DEFINE_PER_CPU(struct mce, injectm);
146EXPORT_PER_CPU_SYMBOL_GPL(injectm);
147
148/*
149 * Lockless MCE logging infrastructure.
150 * This avoids deadlocks on printk locks without having to break locks. It
151 * also separates MCEs from kernel messages to avoid bogus bug reports.
152 */
153
154static struct mce_log mcelog = {
155	.signature	= MCE_LOG_SIGNATURE,
156	.len		= MCE_LOG_LEN,
157	.recordlen	= sizeof(struct mce),
158};
159
160void mce_log(struct mce *mce)
161{
162	unsigned next, entry;
163
164	/* Emit the trace record: */
165	trace_mce_record(mce);
166
167	mce->finished = 0;
168	wmb();
169	for (;;) {
170		entry = rcu_dereference_check_mce(mcelog.next);
171		for (;;) {
172			/*
173			 * If edac_mce is enabled, it will check the error type
174			 * and process it if it is a known error.
175			 * Otherwise, the error will be sent through the mcelog
176			 * interface.
177			 */
178			if (edac_mce_parse(mce))
179				return;
180
181			/*
182			 * When the buffer fills up discard new entries.
183			 * Assume that the earlier errors are the more
184			 * interesting ones:
185			 */
186			if (entry >= MCE_LOG_LEN) {
187				set_bit(MCE_OVERFLOW,
188					(unsigned long *)&mcelog.flags);
189				return;
190			}
191			/* Old left over entry. Skip: */
192			if (mcelog.entry[entry].finished) {
193				entry++;
194				continue;
195			}
196			break;
197		}
198		smp_rmb();
199		next = entry + 1;
200		if (cmpxchg(&mcelog.next, entry, next) == entry)
201			break;
202	}
203	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
204	wmb();
205	mcelog.entry[entry].finished = 1;
206	wmb();
207
208	mce->finished = 1;
209	set_bit(0, &mce_need_notify);
210}
211
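/*
 * Illustrative sketch (not part of the original file): the slot
 * reservation protocol used by mce_log() above, reduced to its core.
 * A writer claims index 'entry' by advancing mcelog.next with
 * cmpxchg(); only after the record has been copied does it set
 * ->finished, which is what the reader in mce_read() waits on.  The
 * helper name is hypothetical.
 */
#if 0
static int mce_log_reserve_slot_example(void)
{
	unsigned entry, next;

	do {
		entry = mcelog.next;
		if (entry >= MCE_LOG_LEN)
			return -ENOSPC;	/* buffer full, entry is dropped */
		next = entry + 1;
	} while (cmpxchg(&mcelog.next, entry, next) != entry);

	return entry;		/* this slot now belongs to the caller */
}
#endif
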
212static void print_mce(struct mce *m)
213{
214	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215	       m->extcpu, m->mcgstatus, m->bank, m->status);
216
217	if (m->ip) {
218		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
219			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
220				m->cs, m->ip);
221
222		if (m->cs == __KERNEL_CS)
223			print_symbol("{%s}", m->ip);
224		pr_cont("\n");
225	}
226
227	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
228	if (m->addr)
229		pr_cont("ADDR %llx ", m->addr);
230	if (m->misc)
231		pr_cont("MISC %llx ", m->misc);
232
233	pr_cont("\n");
234	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
235		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
236
237	/*
238	 * Print out human-readable details about the MCE error,
239	 * (if the CPU has an implementation for that)
240	 */
241	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
242}
243
244#define PANIC_TIMEOUT 5 /* 5 seconds */
245
246static atomic_t mce_paniced;
247
248static int fake_panic;
249static atomic_t mce_fake_paniced;
250
251/* Panic in progress. Enable interrupts and wait for final IPI */
252static void wait_for_panic(void)
253{
254	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
255
256	preempt_disable();
257	local_irq_enable();
258	while (timeout-- > 0)
259		udelay(1);
260	if (panic_timeout == 0)
261		panic_timeout = mce_panic_timeout;
262	panic("Panicking machine check CPU died");
263}
264
265static void mce_panic(char *msg, struct mce *final, char *exp)
266{
267	int i, apei_err = 0;
268
269	if (!fake_panic) {
270		/*
271		 * Make sure only one CPU runs in machine check panic
272		 */
273		if (atomic_inc_return(&mce_paniced) > 1)
274			wait_for_panic();
275		barrier();
276
277		bust_spinlocks(1);
278		console_verbose();
279	} else {
280		/* Don't log too much for fake panic */
281		if (atomic_inc_return(&mce_fake_paniced) > 1)
282			return;
283	}
284	/* First print corrected ones that are still unlogged */
285	for (i = 0; i < MCE_LOG_LEN; i++) {
286		struct mce *m = &mcelog.entry[i];
287		if (!(m->status & MCI_STATUS_VAL))
288			continue;
289		if (!(m->status & MCI_STATUS_UC)) {
290			print_mce(m);
291			if (!apei_err)
292				apei_err = apei_write_mce(m);
293		}
294	}
295	/* Now print uncorrected but with the final one last */
296	for (i = 0; i < MCE_LOG_LEN; i++) {
297		struct mce *m = &mcelog.entry[i];
298		if (!(m->status & MCI_STATUS_VAL))
299			continue;
300		if (!(m->status & MCI_STATUS_UC))
301			continue;
302		if (!final || memcmp(m, final, sizeof(struct mce))) {
303			print_mce(m);
304			if (!apei_err)
305				apei_err = apei_write_mce(m);
306		}
307	}
308	if (final) {
309		print_mce(final);
310		if (!apei_err)
311			apei_err = apei_write_mce(final);
312	}
313	if (cpu_missing)
314		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
315	if (exp)
316		pr_emerg(HW_ERR "Machine check: %s\n", exp);
317	if (!fake_panic) {
318		if (panic_timeout == 0)
319			panic_timeout = mce_panic_timeout;
320		panic(msg);
321	} else
322		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
323}
324
325/* Support code for software error injection */
326
327static int msr_to_offset(u32 msr)
328{
329	unsigned bank = __get_cpu_var(injectm.bank);
330
331	if (msr == rip_msr)
332		return offsetof(struct mce, ip);
333	if (msr == MSR_IA32_MCx_STATUS(bank))
334		return offsetof(struct mce, status);
335	if (msr == MSR_IA32_MCx_ADDR(bank))
336		return offsetof(struct mce, addr);
337	if (msr == MSR_IA32_MCx_MISC(bank))
338		return offsetof(struct mce, misc);
339	if (msr == MSR_IA32_MCG_STATUS)
340		return offsetof(struct mce, mcgstatus);
341	return -1;
342}
343
344/* MSR access wrappers used for error injection */
345static u64 mce_rdmsrl(u32 msr)
346{
347	u64 v;
348
349	if (__get_cpu_var(injectm).finished) {
350		int offset = msr_to_offset(msr);
351
352		if (offset < 0)
353			return 0;
354		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
355	}
356
357	if (rdmsrl_safe(msr, &v)) {
358		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
359		/*
360		 * Return zero in case the access faulted. This should
361		 * not happen normally but can happen if the CPU does
362		 * something weird, or if the code is buggy.
363		 */
364		v = 0;
365	}
366
367	return v;
368}
369
370static void mce_wrmsrl(u32 msr, u64 v)
371{
372	if (__get_cpu_var(injectm).finished) {
373		int offset = msr_to_offset(msr);
374
375		if (offset >= 0)
376			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
377		return;
378	}
379	wrmsrl(msr, v);
380}
381
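/*
 * Illustrative sketch (not part of the original file): how the wrappers
 * above cooperate with software error injection.  An injector (e.g. the
 * mce-inject module) fills the per-CPU 'injectm' record and sets
 * ->finished; from then on mce_rdmsrl()/mce_wrmsrl() operate on the
 * struct mce fields instead of the real MSRs.  The function name is
 * hypothetical.
 */
#if 0
static void mce_inject_status_example(int bank, u64 status)
{
	struct mce *i = &__get_cpu_var(injectm);

	i->bank = bank;
	i->status = status;
	i->finished = 1;	/* redirect MSR accesses to 'injectm' */

	/* now reads back the injected value instead of the hardware MSR */
	WARN_ON(mce_rdmsrl(MSR_IA32_MCx_STATUS(bank)) != status);
}
#endif
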
382/*
383 * Simple lockless ring to communicate PFNs from the exception handler to the
384 * process context work function. This is vastly simplified because there's
385 * only a single reader and a single writer.
386 */
387#define MCE_RING_SIZE 16	/* we use one entry less */
388
389struct mce_ring {
390	unsigned short start;
391	unsigned short end;
392	unsigned long ring[MCE_RING_SIZE];
393};
394static DEFINE_PER_CPU(struct mce_ring, mce_ring);
395
396/* Runs with CPU affinity in workqueue */
397static int mce_ring_empty(void)
398{
399	struct mce_ring *r = &__get_cpu_var(mce_ring);
400
401	return r->start == r->end;
402}
403
404static int mce_ring_get(unsigned long *pfn)
405{
406	struct mce_ring *r;
407	int ret = 0;
408
409	*pfn = 0;
410	get_cpu();
411	r = &__get_cpu_var(mce_ring);
412	if (r->start == r->end)
413		goto out;
414	*pfn = r->ring[r->start];
415	r->start = (r->start + 1) % MCE_RING_SIZE;
416	ret = 1;
417out:
418	put_cpu();
419	return ret;
420}
421
422/* Always runs in MCE context with preempt off */
423static int mce_ring_add(unsigned long pfn)
424{
425	struct mce_ring *r = &__get_cpu_var(mce_ring);
426	unsigned next;
427
428	next = (r->end + 1) % MCE_RING_SIZE;
429	if (next == r->start)
430		return -1;
431	r->ring[r->end] = pfn;
432	wmb();
433	r->end = next;
434	return 0;
435}
436
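/*
 * Illustrative sketch (not part of the original file): how the ring
 * above is used.  The machine check handler (single producer, MCE
 * context) queues action-optional page frame numbers with
 * mce_ring_add(); mce_notify_process() (single consumer, process
 * context) drains them with mce_ring_get().  One slot is kept free, so
 * at most MCE_RING_SIZE - 1 PFNs can be pending per CPU.  The two
 * halves below run in different contexts and are only shown together
 * for illustration.
 */
#if 0
static void mce_ring_usage_example(unsigned long bad_pfn)
{
	unsigned long pfn;

	/* producer side: MCE context, preemption off */
	if (mce_ring_add(bad_pfn) < 0) {
		/* ring full: the action-optional event is dropped */
	}

	/* consumer side: process context (workqueue) */
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}
#endif
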
437int mce_available(struct cpuinfo_x86 *c)
438{
439	if (mce_disabled)
440		return 0;
441	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
442}
443
444static void mce_schedule_work(void)
445{
446	if (!mce_ring_empty()) {
447		struct work_struct *work = &__get_cpu_var(mce_work);
448		if (!work_pending(work))
449			schedule_work(work);
450	}
451}
452
453/*
454 * Get the address of the instruction at the time of the machine check
455 * error.
456 */
457static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
458{
459
460	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
461		m->ip = regs->ip;
462		m->cs = regs->cs;
463	} else {
464		m->ip = 0;
465		m->cs = 0;
466	}
467	if (rip_msr)
468		m->ip = mce_rdmsrl(rip_msr);
469}
470
471#ifdef CONFIG_X86_LOCAL_APIC
472/*
473 * Called after interrupts have been re-enabled,
474 * when an MCE happened during an interrupts-off region
475 * in the kernel.
476 */
477asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
478{
479	ack_APIC_irq();
480	exit_idle();
481	irq_enter();
482	mce_notify_irq();
483	mce_schedule_work();
484	irq_exit();
485}
486#endif
487
488static void mce_report_event(struct pt_regs *regs)
489{
490	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
491		mce_notify_irq();
492		/*
493		 * Triggering the work queue here is just an insurance
494		 * policy in case the syscall exit notify handler
495		 * doesn't run soon enough or ends up running on the
496		 * wrong CPU (can happen when audit sleeps)
497		 */
498		mce_schedule_work();
499		return;
500	}
501
502#ifdef CONFIG_X86_LOCAL_APIC
503	/*
504	 * Without APIC do not notify. The event will be picked
505	 * up eventually.
506	 */
507	if (!cpu_has_apic)
508		return;
509
510	/*
511	 * When interrupts are disabled we cannot use
512	 * kernel services safely. Trigger a self-interrupt
513	 * through the APIC instead, so that the notification
514	 * happens after interrupts are re-enabled.
515	 */
516	apic->send_IPI_self(MCE_SELF_VECTOR);
517
518	/*
519	 * Wait for idle afterwards again so that we don't leave the
520	 * APIC in a non idle state because the normal APIC writes
521	 * cannot exclude us.
522	 */
523	apic_wait_icr_idle();
524#endif
525}
526
527DEFINE_PER_CPU(unsigned, mce_poll_count);
528
529/*
530 * Poll for corrected events or events that happened before reset.
531 * Those are just logged through /dev/mcelog.
532 *
533 * This is executed in standard interrupt context.
534 *
535 * Note: the spec recommends panicking for fatal unsignalled
536 * errors here. However, this would be quite problematic --
537 * we would need to reimplement the Monarch handling and
538 * it would mess up the exclusion between the exception handler
539 * and the poll handler -- so we skip this for now.
540 * These cases should not happen anyway, or only when the CPU
541 * is already totally confused. In that case it is unlikely to
542 * fully execute the machine check handler either.
543 */
544void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
545{
546	struct mce m;
547	int i;
548
549	percpu_inc(mce_poll_count);
550
551	mce_setup(&m);
552
553	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
554	for (i = 0; i < banks; i++) {
555		if (!mce_banks[i].ctl || !test_bit(i, *b))
556			continue;
557
558		m.misc = 0;
559		m.addr = 0;
560		m.bank = i;
561		m.tsc = 0;
562
563		barrier();
564		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
565		if (!(m.status & MCI_STATUS_VAL))
566			continue;
567
568		/*
569		 * Uncorrected or signalled events are handled by the exception
570		 * handler when it is enabled, so don't process those here.
571		 *
572		 * TBD do the same check for MCI_STATUS_EN here?
573		 */
574		if (!(flags & MCP_UC) &&
575		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
576			continue;
577
578		if (m.status & MCI_STATUS_MISCV)
579			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
580		if (m.status & MCI_STATUS_ADDRV)
581			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
582
583		if (!(flags & MCP_TIMESTAMP))
584			m.tsc = 0;
585		/*
586		 * Don't get the IP here because it's unlikely to
587		 * have anything to do with the actual error location.
588		 */
589		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
590			mce_log(&m);
591			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
592			add_taint(TAINT_MACHINE_CHECK);
593		}
594
595		/*
596		 * Clear state for this bank.
597		 */
598		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
599	}
600
601	/*
602	 * Don't clear MCG_STATUS here because it's only defined for
603	 * exceptions.
604	 */
605
606	sync_core();
607}
608EXPORT_SYMBOL_GPL(machine_check_poll);
609
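/*
 * Illustrative sketch (not part of the original file): typical callers
 * of machine_check_poll(), mirroring the periodic timer below and the
 * boot-time log collection in __mcheck_cpu_init_generic().  The
 * function name is hypothetical.
 */
#if 0
static void machine_check_poll_usage_example(void)
{
	mce_banks_t all_banks;

	/* periodic/CMCI-style poll of the banks this CPU owns */
	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));

	/* boot-style poll: pick up everything left over from before reset */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC, &all_banks);
}
#endif
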
610/*
611 * Do a quick check if any of the events requires a panic.
612 * This decides if we keep the events around or clear them.
613 */
614static int mce_no_way_out(struct mce *m, char **msg)
615{
616	int i;
617
618	for (i = 0; i < banks; i++) {
619		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
620		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
621			return 1;
622	}
623	return 0;
624}
625
626/*
627 * Variable to establish order between CPUs while scanning.
628 * Each CPU spins initially until mce_executing equals its number.
629 */
630static atomic_t mce_executing;
631
632/*
633 * Defines order of CPUs on entry. First CPU becomes Monarch.
634 */
635static atomic_t mce_callin;
636
637/*
638 * Check if a timeout waiting for other CPUs happened.
639 */
640static int mce_timed_out(u64 *t)
641{
642	/*
643	 * The others already did panic for some reason.
644	 * Bail out like in a timeout.
645	 * rmb() to tell the compiler that system_state
646	 * might have been modified by someone else.
647	 */
648	rmb();
649	if (atomic_read(&mce_paniced))
650		wait_for_panic();
651	if (!monarch_timeout)
652		goto out;
653	if ((s64)*t < SPINUNIT) {
654		/* CHECKME: Make panic default for 1 too? */
655		if (tolerant < 1)
656			mce_panic("Timeout synchronizing machine check over CPUs",
657				  NULL, NULL);
658		cpu_missing = 1;
659		return 1;
660	}
661	*t -= SPINUNIT;
662out:
663	touch_nmi_watchdog();
664	return 0;
665}
666
667/*
668 * The Monarch's reign.  The Monarch is the CPU that entered
669 * the machine check handler first. It waits for the others to
670 * raise the exception too and then grades them. If any
671 * error is fatal, it panics. Only then does it let the others continue.
672 *
673 * The other CPUs entering the MCE handler will be controlled by the
674 * Monarch. They are called Subjects.
675 *
676 * This way we prevent any potential data corruption in an unrecoverable
677 * case and also make sure that all CPUs' errors are examined.
678 *
679 * This also detects the case of a machine check event coming from outer
680 * space (not detected by any CPU). In this case some external agent wants
681 * us to shut down, so panic too.
682 *
683 * The other CPUs might still decide to panic if the handler happens
684 * in an unrecoverable place, but in this case the system is in a semi-stable
685 * state and won't corrupt anything by itself. It's OK to let the others
686 * continue for a bit first.
687 *
688 * All the spin loops have timeouts; when a timeout happens a CPU
689 * typically elects itself to be Monarch.
690 */
691static void mce_reign(void)
692{
693	int cpu;
694	struct mce *m = NULL;
695	int global_worst = 0;
696	char *msg = NULL;
697	char *nmsg = NULL;
698
699	/*
700	 * This CPU is the Monarch and the other CPUs have run
701	 * through their handlers.
702	 * Grade the severity of the errors of all the CPUs.
703	 */
704	for_each_possible_cpu(cpu) {
705		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
706					    &nmsg);
707		if (severity > global_worst) {
708			msg = nmsg;
709			global_worst = severity;
710			m = &per_cpu(mces_seen, cpu);
711		}
712	}
713
714	/*
715	 * Cannot recover? Panic here then.
716	 * This dumps all the mces in the log buffer and stops the
717	 * other CPUs.
718	 */
719	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
720		mce_panic("Fatal Machine check", m, msg);
721
722	/*
723	 * For UC somewhere we let the CPU who detects it handle it.
724	 * Also must let continue the others, otherwise the handling
725	 * CPU could deadlock on a lock.
726	 */
727
728	/*
729	 * No machine check event found. Must be some external
730	 * source or one CPU is hung. Panic.
731	 */
732	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
733		mce_panic("Machine check from unknown source", NULL, NULL);
734
735	/*
736	 * Now clear all the mces_seen so that they don't reappear on
737	 * the next mce.
738	 */
739	for_each_possible_cpu(cpu)
740		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
741}
742
743static atomic_t global_nwo;
744
745/*
746 * Start of Monarch synchronization. This waits until all CPUs have
747 * entered the exception handler and then determines if any of them
748 * saw a fatal event that requires panic. Then it executes them
749 * in the entry order.
750 * TBD double check parallel CPU hotunplug
751 */
752static int mce_start(int *no_way_out)
753{
754	int order;
755	int cpus = num_online_cpus();
756	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758	if (!timeout)
759		return -1;
760
761	atomic_add(*no_way_out, &global_nwo);
762	/*
763	 * global_nwo should be updated before mce_callin
764	 */
765	smp_wmb();
766	order = atomic_inc_return(&mce_callin);
767
768	/*
769	 * Wait for everyone.
770	 */
771	while (atomic_read(&mce_callin) != cpus) {
772		if (mce_timed_out(&timeout)) {
773			atomic_set(&global_nwo, 0);
774			return -1;
775		}
776		ndelay(SPINUNIT);
777	}
778
779	/*
780	 * mce_callin should be read before global_nwo
781	 */
782	smp_rmb();
783
784	if (order == 1) {
785		/*
786		 * Monarch: Starts executing now, the others wait.
787		 */
788		atomic_set(&mce_executing, 1);
789	} else {
790		/*
791		 * Subject: Now start the scanning loop one by one in
792		 * the original callin order.
793		 * This way when there are any shared banks it will be
794		 * only seen by one CPU before cleared, avoiding duplicates.
795		 */
796		while (atomic_read(&mce_executing) < order) {
797			if (mce_timed_out(&timeout)) {
798				atomic_set(&global_nwo, 0);
799				return -1;
800			}
801			ndelay(SPINUNIT);
802		}
803	}
804
805	/*
806	 * Cache the global no_way_out state.
807	 */
808	*no_way_out = atomic_read(&global_nwo);
809
810	return order;
811}
812
813/*
814 * Synchronize between CPUs after main scanning loop.
815 * This invokes the bulk of the Monarch processing.
816 */
817static int mce_end(int order)
818{
819	int ret = -1;
820	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
821
822	if (!timeout)
823		goto reset;
824	if (order < 0)
825		goto reset;
826
827	/*
828	 * Allow others to run.
829	 */
830	atomic_inc(&mce_executing);
831
832	if (order == 1) {
833		/* CHECKME: Can this race with a parallel hotplug? */
834		int cpus = num_online_cpus();
835
836		/*
837		 * Monarch: Wait for everyone to go through their scanning
838		 * loops.
839		 */
840		while (atomic_read(&mce_executing) <= cpus) {
841			if (mce_timed_out(&timeout))
842				goto reset;
843			ndelay(SPINUNIT);
844		}
845
846		mce_reign();
847		barrier();
848		ret = 0;
849	} else {
850		/*
851		 * Subject: Wait for Monarch to finish.
852		 */
853		while (atomic_read(&mce_executing) != 0) {
854			if (mce_timed_out(&timeout))
855				goto reset;
856			ndelay(SPINUNIT);
857		}
858
859		/*
860		 * Don't reset anything. That's done by the Monarch.
861		 */
862		return 0;
863	}
864
865	/*
866	 * Reset all global state.
867	 */
868reset:
869	atomic_set(&global_nwo, 0);
870	atomic_set(&mce_callin, 0);
871	barrier();
872
873	/*
874	 * Let others run again.
875	 */
876	atomic_set(&mce_executing, 0);
877	return ret;
878}
879
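/*
 * Illustrative sketch (not part of the original file): the rendezvous
 * pattern implemented by mce_start()/mce_end() above, as seen from one
 * CPU inside do_machine_check().  order == 1 identifies the Monarch; a
 * negative order means synchronization was not possible (no timeout
 * configured, or other CPUs never called in) and only local state can
 * be trusted.  The function name is hypothetical.
 */
#if 0
static void mce_rendezvous_example(void)
{
	int no_way_out = 0;
	int order;

	order = mce_start(&no_way_out);	/* wait for all CPUs to call in */

	/* ... scan the banks owned by this CPU here ... */

	if (mce_end(order) < 0) {
		/* fall back to this CPU's local no_way_out decision */
	}
}
#endif
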
880/*
881 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses up to page granularity for now.
885 */
886static int mce_usable_address(struct mce *m)
887{
888	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
889		return 0;
890	if ((m->misc & 0x3f) > PAGE_SHIFT)
891		return 0;
892	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
893		return 0;
894	return 1;
895}
896
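/*
 * Illustrative sketch (not part of the original file): once
 * mce_usable_address() accepts a record, MCi_MISC bits 0-5 give the
 * recoverable granularity (LSB) and bits 6-8 the address mode, and the
 * handler only needs the page frame number of MCi_ADDR.  The helper
 * name is hypothetical.
 */
#if 0
static unsigned long mce_addr_to_pfn_example(struct mce *m)
{
	if (!mce_usable_address(m))
		return 0;
	return m->addr >> PAGE_SHIFT;
}
#endif
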
897static void mce_clear_state(unsigned long *toclear)
898{
899	int i;
900
901	for (i = 0; i < banks; i++) {
902		if (test_bit(i, toclear))
903			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
904	}
905}
906
907/*
908 * The actual machine check handler. This only handles real
909 * exceptions when something got corrupted coming in through int 18.
910 *
911 * This is executed in NMI context not subject to normal locking rules. This
912 * implies that most kernel services cannot be safely used. Don't even
913 * think about putting a printk in there!
914 *
915 * On Intel systems this is entered on all CPUs in parallel through
916 * MCE broadcast. However some CPUs might be broken beyond repair,
917 * so be always careful when synchronizing with others.
918 */
919void do_machine_check(struct pt_regs *regs, long error_code)
920{
921	struct mce m, *final;
922	int i;
923	int worst = 0;
924	int severity;
925	/*
926	 * Establish sequential order between the CPUs entering the machine
927	 * check handler.
928	 */
929	int order;
930	/*
931	 * If no_way_out gets set, there is no safe way to recover from this
932	 * MCE.  If tolerant is cranked up, we'll try anyway.
933	 */
934	int no_way_out = 0;
935	/*
936	 * If kill_it gets set, there might be a way to recover from this
937	 * error.
938	 */
939	int kill_it = 0;
940	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
941	char *msg = "Unknown";
942
943	atomic_inc(&mce_entry);
944
945	percpu_inc(mce_exception_count);
946
947	if (notify_die(DIE_NMI, "machine check", regs, error_code,
948			   18, SIGKILL) == NOTIFY_STOP)
949		goto out;
950	if (!banks)
951		goto out;
952
953	mce_setup(&m);
954
955	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
956	final = &__get_cpu_var(mces_seen);
957	*final = m;
958
959	no_way_out = mce_no_way_out(&m, &msg);
960
961	barrier();
962
963	/*
964	 * When there is no restart IP we must always kill or panic.
965	 */
966	if (!(m.mcgstatus & MCG_STATUS_RIPV))
967		kill_it = 1;
968
969	/*
970	 * Go through all the banks in exclusion of the other CPUs.
971	 * This way we don't report duplicated events on shared banks
972	 * because the first one to see it will clear it.
973	 */
974	order = mce_start(&no_way_out);
975	for (i = 0; i < banks; i++) {
976		__clear_bit(i, toclear);
977		if (!mce_banks[i].ctl)
978			continue;
979
980		m.misc = 0;
981		m.addr = 0;
982		m.bank = i;
983
984		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
985		if ((m.status & MCI_STATUS_VAL) == 0)
986			continue;
987
988		/*
989		 * Errors that are not uncorrected or not signalled are handled
990		 * by machine_check_poll(). Leave them alone, unless this panics.
991		 */
992		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
993			!no_way_out)
994			continue;
995
996		/*
997		 * Set taint even when machine check was not enabled.
998		 */
999		add_taint(TAINT_MACHINE_CHECK);
1000
1001		severity = mce_severity(&m, tolerant, NULL);
1002
1003		/*
1004		 * When the machine check was meant for the corrected-error
1005		 * handler, don't touch it here unless we're panicking.
1006		 */
1007		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1008			continue;
1009		__set_bit(i, toclear);
1010		if (severity == MCE_NO_SEVERITY) {
1011			/*
1012			 * Machine check event was not enabled. Clear, but
1013			 * ignore.
1014			 */
1015			continue;
1016		}
1017
1018		/*
1019		 * Kill on action required.
1020		 */
1021		if (severity == MCE_AR_SEVERITY)
1022			kill_it = 1;
1023
1024		if (m.status & MCI_STATUS_MISCV)
1025			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1026		if (m.status & MCI_STATUS_ADDRV)
1027			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1028
1029		/*
1030		 * Action optional error. Queue address for later processing.
1031		 * When the ring overflows we just ignore the AO error.
1032		 * RED-PEN add some logging mechanism when
1033		 * usable_address or mce_add_ring fails.
1034		 * RED-PEN don't ignore overflow for tolerant == 0
1035		 */
1036		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1037			mce_ring_add(m.addr >> PAGE_SHIFT);
1038
1039		mce_get_rip(&m, regs);
1040		mce_log(&m);
1041
1042		if (severity > worst) {
1043			*final = m;
1044			worst = severity;
1045		}
1046	}
1047
1048	if (!no_way_out)
1049		mce_clear_state(toclear);
1050
1051	/*
1052	 * Do most of the synchronization with other CPUs.
1053	 * When there's any problem use only local no_way_out state.
1054	 */
1055	if (mce_end(order) < 0)
1056		no_way_out = worst >= MCE_PANIC_SEVERITY;
1057
1058	/*
1059	 * If we have decided that we just CAN'T continue, and the user
1060	 * has not set tolerant to an insane level, give up and die.
1061	 *
1062	 * This is mainly used in the case when the system doesn't
1063	 * support MCE broadcasting or it has been disabled.
1064	 */
1065	if (no_way_out && tolerant < 3)
1066		mce_panic("Fatal machine check on current CPU", final, msg);
1067
1068	/*
1069	 * If the error seems to be unrecoverable, something should be
1070	 * done.  Try to kill as little as possible.  If we can kill just
1071	 * one task, do that.  If the user has set the tolerance very
1072	 * high, don't try to do anything at all.
1073	 */
1074
1075	if (kill_it && tolerant < 3)
1076		force_sig(SIGBUS, current);
1077
1078	/* notify userspace ASAP */
1079	set_thread_flag(TIF_MCE_NOTIFY);
1080
1081	if (worst > 0)
1082		mce_report_event(regs);
1083	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1084out:
1085	atomic_dec(&mce_entry);
1086	sync_core();
1087}
1088EXPORT_SYMBOL_GPL(do_machine_check);
1089
1090/* dummy to break dependency. actual code is in mm/memory-failure.c */
1091void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1092{
1093	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1094}
1095
1096/*
1097 * Called after mce notification in process context. This code
1098 * is allowed to sleep. Call the high level VM handler to process
1099 * any corrupted pages.
1100 * Assume that the work queue code only calls this one at a time
1101 * per CPU.
1102 * Note we don't disable preemption, so this code might run on the wrong
1103 * CPU. In this case the event is picked up by the scheduled work queue.
1104 * This is merely a fast path to expedite processing in some common
1105 * cases.
1106 */
1107void mce_notify_process(void)
1108{
1109	unsigned long pfn;
1110	mce_notify_irq();
1111	while (mce_ring_get(&pfn))
1112		memory_failure(pfn, MCE_VECTOR);
1113}
1114
1115static void mce_process_work(struct work_struct *dummy)
1116{
1117	mce_notify_process();
1118}
1119
1120#ifdef CONFIG_X86_MCE_INTEL
1121/**
1122 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1123 * @cpu: The CPU on which the event occurred.
1124 * @status: Event status information
1125 *
1126 * This function should be called by the thermal interrupt after the
1127 * event has been processed and the decision was made to log the event
1128 * further.
1129 *
1130 * The status parameter will be saved to the 'status' field of 'struct mce'
1131 * and historically has been the register value of the
1132 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1133 */
1134void mce_log_therm_throt_event(__u64 status)
1135{
1136	struct mce m;
1137
1138	mce_setup(&m);
1139	m.bank = MCE_THERMAL_BANK;
1140	m.status = status;
1141	mce_log(&m);
1142}
1143#endif /* CONFIG_X86_MCE_INTEL */
1144
1145/*
1146 * Periodic polling timer for "silent" machine check errors.  If the
1147 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1148 * errors, poll 2x slower (up to check_interval seconds).
1149 */
1150static int check_interval = 5 * 60; /* 5 minutes */
1151
1152static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1153static DEFINE_PER_CPU(struct timer_list, mce_timer);
1154
1155static void mce_start_timer(unsigned long data)
1156{
1157	struct timer_list *t = &per_cpu(mce_timer, data);
1158	int *n;
1159
1160	WARN_ON(smp_processor_id() != data);
1161
1162	if (mce_available(&current_cpu_data)) {
1163		machine_check_poll(MCP_TIMESTAMP,
1164				&__get_cpu_var(mce_poll_banks));
1165	}
1166
1167	/*
1168	 * Alert userspace if needed.  If we logged an MCE, reduce the
1169	 * polling interval, otherwise increase the polling interval.
1170	 */
1171	n = &__get_cpu_var(mce_next_interval);
1172	if (mce_notify_irq())
1173		*n = max(*n/2, HZ/100);
1174	else
1175		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1176
1177	t->expires = jiffies + *n;
1178	add_timer_on(t, smp_processor_id());
1179}
1180
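/*
 * Illustrative sketch (not part of the original file): the adaptive
 * polling interval logic from mce_start_timer() above, isolated.  The
 * interval halves (down to HZ/100 jiffies) while events keep being
 * found and doubles (up to check_interval seconds) while the machine
 * stays quiet.  The helper name is hypothetical.
 */
#if 0
static int mce_next_poll_interval_example(int cur, int found_event)
{
	if (found_event)
		return max(cur / 2, HZ / 100);
	return min(cur * 2, (int)round_jiffies_relative(check_interval * HZ));
}
#endif
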
1181static void mce_do_trigger(struct work_struct *work)
1182{
1183	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1184}
1185
1186static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1187
1188/*
1189 * Notify the user(s) about new machine check events.
1190 * Can be called from interrupt context, but not from machine check/NMI
1191 * context.
1192 */
1193int mce_notify_irq(void)
1194{
1195	/* Not more than two messages every minute */
1196	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1197
1198	clear_thread_flag(TIF_MCE_NOTIFY);
1199
1200	if (test_and_clear_bit(0, &mce_need_notify)) {
1201		wake_up_interruptible(&mce_wait);
1202
1203		/*
1204		 * There is no risk of missing notifications because
1205		 * work_pending is always cleared before the function is
1206		 * executed.
1207		 */
1208		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1209			schedule_work(&mce_trigger_work);
1210
1211		if (__ratelimit(&ratelimit))
1212			pr_info(HW_ERR "Machine check events logged\n");
1213
1214		return 1;
1215	}
1216	return 0;
1217}
1218EXPORT_SYMBOL_GPL(mce_notify_irq);
1219
1220static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1221{
1222	int i;
1223
1224	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1225	if (!mce_banks)
1226		return -ENOMEM;
1227	for (i = 0; i < banks; i++) {
1228		struct mce_bank *b = &mce_banks[i];
1229
1230		b->ctl = -1ULL;
1231		b->init = 1;
1232	}
1233	return 0;
1234}
1235
1236/*
1237 * Initialize Machine Checks for a CPU.
1238 */
1239static int __cpuinit __mcheck_cpu_cap_init(void)
1240{
1241	unsigned b;
1242	u64 cap;
1243
1244	rdmsrl(MSR_IA32_MCG_CAP, cap);
1245
1246	b = cap & MCG_BANKCNT_MASK;
1247	if (!banks)
1248		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1249
1250	if (b > MAX_NR_BANKS) {
1251		printk(KERN_WARNING
1252		       "MCE: Using only %u machine check banks out of %u\n",
1253			MAX_NR_BANKS, b);
1254		b = MAX_NR_BANKS;
1255	}
1256
1257	/* Don't support asymmetric configurations today */
1258	WARN_ON(banks != 0 && b != banks);
1259	banks = b;
1260	if (!mce_banks) {
1261		int err = __mcheck_cpu_mce_banks_init();
1262
1263		if (err)
1264			return err;
1265	}
1266
1267	/* Use accurate RIP reporting if available. */
1268	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1269		rip_msr = MSR_IA32_MCG_EIP;
1270
1271	if (cap & MCG_SER_P)
1272		mce_ser = 1;
1273
1274	return 0;
1275}
1276
1277static void __mcheck_cpu_init_generic(void)
1278{
1279	mce_banks_t all_banks;
1280	u64 cap;
1281	int i;
1282
1283	/*
1284	 * Log the machine checks left over from the previous reset.
1285	 */
1286	bitmap_fill(all_banks, MAX_NR_BANKS);
1287	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1288
1289	set_in_cr4(X86_CR4_MCE);
1290
1291	rdmsrl(MSR_IA32_MCG_CAP, cap);
1292	if (cap & MCG_CTL_P)
1293		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1294
1295	for (i = 0; i < banks; i++) {
1296		struct mce_bank *b = &mce_banks[i];
1297
1298		if (!b->init)
1299			continue;
1300		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1301		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1302	}
1303}
1304
1305/* Add per CPU specific workarounds here */
1306static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1307{
1308	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1309		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1310		return -EOPNOTSUPP;
1311	}
1312
1313	/* This should be disabled by the BIOS, but isn't always */
1314	if (c->x86_vendor == X86_VENDOR_AMD) {
1315		if (c->x86 == 15 && banks > 4) {
1316			/*
1317			 * disable GART TBL walk error reporting, which
1318			 * trips off incorrectly with the IOMMU & 3ware
1319			 * & Cerberus:
1320			 */
1321			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1322		}
1323		if (c->x86 <= 17 && mce_bootlog < 0) {
1324			/*
1325			 * Lots of broken BIOSes around that don't clear them
1326			 * by default and leave crap in there. Don't log:
1327			 */
1328			mce_bootlog = 0;
1329		}
1330		/*
1331		 * Various K7s with broken bank 0 around. Always disable
1332		 * by default.
1333		 */
1334		 if (c->x86 == 6 && banks > 0)
1335			mce_banks[0].ctl = 0;
1336	}
1337
1338	if (c->x86_vendor == X86_VENDOR_INTEL) {
1339		/*
1340		 * SDM documents that on family 6 bank 0 should not be written
1341		 * because it aliases to another special BIOS controlled
1342		 * register.
1343		 * But it's not aliased anymore on model 0x1a+.
1344		 * Don't ignore bank 0 completely because there could be a
1345		 * valid event later, merely don't write CTL0.
1346		 */
1347
1348		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1349			mce_banks[0].init = 0;
1350
1351		/*
1352		 * All newer Intel systems support MCE broadcasting. Enable
1353		 * synchronization with a one second timeout.
1354		 */
1355		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1356			monarch_timeout < 0)
1357			monarch_timeout = USEC_PER_SEC;
1358
1359		/*
1360		 * There are also broken BIOSes on some Pentium M and
1361		 * earlier systems:
1362		 */
1363		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1364			mce_bootlog = 0;
1365	}
1366	if (monarch_timeout < 0)
1367		monarch_timeout = 0;
1368	if (mce_bootlog != 0)
1369		mce_panic_timeout = 30;
1370
1371	return 0;
1372}
1373
1374static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1375{
1376	if (c->x86 != 5)
1377		return;
1378	switch (c->x86_vendor) {
1379	case X86_VENDOR_INTEL:
1380		intel_p5_mcheck_init(c);
1381		break;
1382	case X86_VENDOR_CENTAUR:
1383		winchip_mcheck_init(c);
1384		break;
1385	}
1386}
1387
1388static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1389{
1390	switch (c->x86_vendor) {
1391	case X86_VENDOR_INTEL:
1392		mce_intel_feature_init(c);
1393		break;
1394	case X86_VENDOR_AMD:
1395		mce_amd_feature_init(c);
1396		break;
1397	default:
1398		break;
1399	}
1400}
1401
1402static void __mcheck_cpu_init_timer(void)
1403{
1404	struct timer_list *t = &__get_cpu_var(mce_timer);
1405	int *n = &__get_cpu_var(mce_next_interval);
1406
1407	setup_timer(t, mce_start_timer, smp_processor_id());
1408
1409	if (mce_ignore_ce)
1410		return;
1411
1412	*n = check_interval * HZ;
1413	if (!*n)
1414		return;
1415	t->expires = round_jiffies(jiffies + *n);
1416	add_timer_on(t, smp_processor_id());
1417}
1418
1419/* Handle unconfigured int18 (should never happen) */
1420static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1421{
1422	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1423	       smp_processor_id());
1424}
1425
1426/* Call the installed machine check handler for this CPU setup. */
1427void (*machine_check_vector)(struct pt_regs *, long error_code) =
1428						unexpected_machine_check;
1429
1430/*
1431 * Called for each booted CPU to set up machine checks.
1432 * Must be called with preempt off:
1433 */
1434void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1435{
1436	if (mce_disabled)
1437		return;
1438
1439	__mcheck_cpu_ancient_init(c);
1440
1441	if (!mce_available(c))
1442		return;
1443
1444	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1445		mce_disabled = 1;
1446		return;
1447	}
1448
1449	machine_check_vector = do_machine_check;
1450
1451	__mcheck_cpu_init_generic();
1452	__mcheck_cpu_init_vendor(c);
1453	__mcheck_cpu_init_timer();
1454	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1455
1456}
1457
1458/*
1459 * Character device to read and clear the MCE log.
1460 */
1461
1462static DEFINE_SPINLOCK(mce_state_lock);
1463static int		open_count;		/* #times opened */
1464static int		open_exclu;		/* already open exclusive? */
1465
1466static int mce_open(struct inode *inode, struct file *file)
1467{
1468	spin_lock(&mce_state_lock);
1469
1470	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1471		spin_unlock(&mce_state_lock);
1472
1473		return -EBUSY;
1474	}
1475
1476	if (file->f_flags & O_EXCL)
1477		open_exclu = 1;
1478	open_count++;
1479
1480	spin_unlock(&mce_state_lock);
1481
1482	return nonseekable_open(inode, file);
1483}
1484
1485static int mce_release(struct inode *inode, struct file *file)
1486{
1487	spin_lock(&mce_state_lock);
1488
1489	open_count--;
1490	open_exclu = 0;
1491
1492	spin_unlock(&mce_state_lock);
1493
1494	return 0;
1495}
1496
1497static void collect_tscs(void *data)
1498{
1499	unsigned long *cpu_tsc = (unsigned long *)data;
1500
1501	rdtscll(cpu_tsc[smp_processor_id()]);
1502}
1503
1504static int mce_apei_read_done;
1505
1506/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1507static int __mce_read_apei(char __user **ubuf, size_t usize)
1508{
1509	int rc;
1510	u64 record_id;
1511	struct mce m;
1512
1513	if (usize < sizeof(struct mce))
1514		return -EINVAL;
1515
1516	rc = apei_read_mce(&m, &record_id);
1517	/* Error or no more MCE record */
1518	if (rc <= 0) {
1519		mce_apei_read_done = 1;
1520		return rc;
1521	}
1522	rc = -EFAULT;
1523	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1524		return rc;
1525	/*
1526	 * In fact, we should clear the record only after it has
1527	 * been flushed to disk or sent over the network by
1528	 * /sbin/mcelog, but we have no interface to support that now,
1529	 * so just clear it here to avoid duplication.
1530	 */
1531	rc = apei_clear_mce(record_id);
1532	if (rc) {
1533		mce_apei_read_done = 1;
1534		return rc;
1535	}
1536	*ubuf += sizeof(struct mce);
1537
1538	return 0;
1539}
1540
1541static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542			loff_t *off)
1543{
1544	char __user *buf = ubuf;
1545	unsigned long *cpu_tsc;
1546	unsigned prev, next;
1547	int i, err;
1548
1549	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1550	if (!cpu_tsc)
1551		return -ENOMEM;
1552
1553	mutex_lock(&mce_read_mutex);
1554
1555	if (!mce_apei_read_done) {
1556		err = __mce_read_apei(&buf, usize);
1557		if (err || buf != ubuf)
1558			goto out;
1559	}
1560
1561	next = rcu_dereference_check_mce(mcelog.next);
1562
1563	/* Only supports full reads right now */
1564	err = -EINVAL;
1565	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1566		goto out;
1567
1568	err = 0;
1569	prev = 0;
1570	do {
1571		for (i = prev; i < next; i++) {
1572			unsigned long start = jiffies;
1573
1574			while (!mcelog.entry[i].finished) {
1575				if (time_after_eq(jiffies, start + 2)) {
1576					memset(mcelog.entry + i, 0,
1577					       sizeof(struct mce));
1578					goto timeout;
1579				}
1580				cpu_relax();
1581			}
1582			smp_rmb();
1583			err |= copy_to_user(buf, mcelog.entry + i,
1584					    sizeof(struct mce));
1585			buf += sizeof(struct mce);
1586timeout:
1587			;
1588		}
1589
1590		memset(mcelog.entry + prev, 0,
1591		       (next - prev) * sizeof(struct mce));
1592		prev = next;
1593		next = cmpxchg(&mcelog.next, prev, 0);
1594	} while (next != prev);
1595
1596	synchronize_sched();
1597
1598	/*
1599	 * Collect entries that were still getting written before the
1600	 * synchronize.
1601	 */
1602	on_each_cpu(collect_tscs, cpu_tsc, 1);
1603
1604	for (i = next; i < MCE_LOG_LEN; i++) {
1605		if (mcelog.entry[i].finished &&
1606		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1607			err |= copy_to_user(buf, mcelog.entry+i,
1608					    sizeof(struct mce));
1609			smp_rmb();
1610			buf += sizeof(struct mce);
1611			memset(&mcelog.entry[i], 0, sizeof(struct mce));
1612		}
1613	}
1614
1615	if (err)
1616		err = -EFAULT;
1617
1618out:
1619	mutex_unlock(&mce_read_mutex);
1620	kfree(cpu_tsc);
1621
1622	return err ? err : buf - ubuf;
1623}
1624
1625static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{
1627	poll_wait(file, &mce_wait, wait);
1628	if (rcu_dereference_check_mce(mcelog.next))
1629		return POLLIN | POLLRDNORM;
1630	if (!mce_apei_read_done && apei_check_mce())
1631		return POLLIN | POLLRDNORM;
1632	return 0;
1633}
1634
1635static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1636{
1637	int __user *p = (int __user *)arg;
1638
1639	if (!capable(CAP_SYS_ADMIN))
1640		return -EPERM;
1641
1642	switch (cmd) {
1643	case MCE_GET_RECORD_LEN:
1644		return put_user(sizeof(struct mce), p);
1645	case MCE_GET_LOG_LEN:
1646		return put_user(MCE_LOG_LEN, p);
1647	case MCE_GETCLEAR_FLAGS: {
1648		unsigned flags;
1649
1650		do {
1651			flags = mcelog.flags;
1652		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1653
1654		return put_user(flags, p);
1655	}
1656	default:
1657		return -ENOTTY;
1658	}
1659}
1660
1661/* Modified in mce-inject.c, so not static or const */
1662struct file_operations mce_chrdev_ops = {
1663	.open			= mce_open,
1664	.release		= mce_release,
1665	.read			= mce_read,
1666	.poll			= mce_poll,
1667	.unlocked_ioctl		= mce_ioctl,
1668};
1669EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1670
1671static struct miscdevice mce_log_device = {
1672	MISC_MCELOG_MINOR,
1673	"mcelog",
1674	&mce_chrdev_ops,
1675};
1676
1677/*
1678 * mce=off Disables machine check
1679 * mce=no_cmci Disables CMCI
1680 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1681 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1682 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1683 *	monarchtimeout is how long to wait for other CPUs on machine
1684 *	check, or 0 to not wait
1685 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1686 * mce=nobootlog Don't log MCEs from before booting.
1687 */
1688static int __init mcheck_enable(char *str)
1689{
1690	if (*str == 0) {
1691		enable_p5_mce();
1692		return 1;
1693	}
1694	if (*str == '=')
1695		str++;
1696	if (!strcmp(str, "off"))
1697		mce_disabled = 1;
1698	else if (!strcmp(str, "no_cmci"))
1699		mce_cmci_disabled = 1;
1700	else if (!strcmp(str, "dont_log_ce"))
1701		mce_dont_log_ce = 1;
1702	else if (!strcmp(str, "ignore_ce"))
1703		mce_ignore_ce = 1;
1704	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1705		mce_bootlog = (str[0] == 'b');
1706	else if (isdigit(str[0])) {
1707		get_option(&str, &tolerant);
1708		if (*str == ',') {
1709			++str;
1710			get_option(&str, &monarch_timeout);
1711		}
1712	} else {
1713		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1714		       str);
1715		return 0;
1716	}
1717	return 1;
1718}
1719__setup("mce", mcheck_enable);
1720
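/*
 * Illustrative examples (not part of the original file) of how
 * mcheck_enable() above maps a boot line onto the variables declared
 * earlier:
 *
 *   mce=off         ->  mce_disabled = 1
 *   mce=no_cmci     ->  mce_cmci_disabled = 1
 *   mce=dont_log_ce ->  mce_dont_log_ce = 1
 *   mce=bootlog     ->  mce_bootlog = 1
 *   mce=2,100       ->  tolerant = 2, monarch_timeout = 100
 *                       (interpreted as microseconds by mce_start())
 */
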
1721int __init mcheck_init(void)
1722{
1723	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1724
1725	mcheck_intel_therm_init();
1726
1727	return 0;
1728}
1729
1730/*
1731 * Sysfs support
1732 */
1733
1734/*
1735 * Disable machine checks on suspend and shutdown. We can't really handle
1736 * them later.
1737 */
1738static int mce_disable_error_reporting(void)
1739{
1740	int i;
1741
1742	for (i = 0; i < banks; i++) {
1743		struct mce_bank *b = &mce_banks[i];
1744
1745		if (b->init)
1746			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1747	}
1748	return 0;
1749}
1750
1751static int mce_suspend(struct sys_device *dev, pm_message_t state)
1752{
1753	return mce_disable_error_reporting();
1754}
1755
1756static int mce_shutdown(struct sys_device *dev)
1757{
1758	return mce_disable_error_reporting();
1759}
1760
1761/*
1762 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1763 * Only one CPU is active at this time, the others get re-added later using
1764 * CPU hotplug:
1765 */
1766static int mce_resume(struct sys_device *dev)
1767{
1768	__mcheck_cpu_init_generic();
1769	__mcheck_cpu_init_vendor(&current_cpu_data);
1770
1771	return 0;
1772}
1773
1774static void mce_cpu_restart(void *data)
1775{
1776	del_timer_sync(&__get_cpu_var(mce_timer));
1777	if (!mce_available(&current_cpu_data))
1778		return;
1779	__mcheck_cpu_init_generic();
1780	__mcheck_cpu_init_timer();
1781}
1782
1783/* Reinit MCEs after user configuration changes */
1784static void mce_restart(void)
1785{
1786	on_each_cpu(mce_cpu_restart, NULL, 1);
1787}
1788
1789/* Toggle features for corrected errors */
1790static void mce_disable_ce(void *all)
1791{
1792	if (!mce_available(&current_cpu_data))
1793		return;
1794	if (all)
1795		del_timer_sync(&__get_cpu_var(mce_timer));
1796	cmci_clear();
1797}
1798
1799static void mce_enable_ce(void *all)
1800{
1801	if (!mce_available(&current_cpu_data))
1802		return;
1803	cmci_reenable();
1804	cmci_recheck();
1805	if (all)
1806		__mcheck_cpu_init_timer();
1807}
1808
1809static struct sysdev_class mce_sysclass = {
1810	.suspend	= mce_suspend,
1811	.shutdown	= mce_shutdown,
1812	.resume		= mce_resume,
1813	.name		= "machinecheck",
1814};
1815
1816DEFINE_PER_CPU(struct sys_device, mce_dev);
1817
1818__cpuinitdata
1819void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1820
1821static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1822{
1823	return container_of(attr, struct mce_bank, attr);
1824}
1825
1826static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1827			 char *buf)
1828{
1829	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1830}
1831
1832static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1833			const char *buf, size_t size)
1834{
1835	u64 new;
1836
1837	if (strict_strtoull(buf, 0, &new) < 0)
1838		return -EINVAL;
1839
1840	attr_to_bank(attr)->ctl = new;
1841	mce_restart();
1842
1843	return size;
1844}
1845
1846static ssize_t
1847show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1848{
1849	strcpy(buf, mce_helper);
1850	strcat(buf, "\n");
1851	return strlen(mce_helper) + 1;
1852}
1853
1854static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1855				const char *buf, size_t siz)
1856{
1857	char *p;
1858
1859	strncpy(mce_helper, buf, sizeof(mce_helper));
1860	mce_helper[sizeof(mce_helper)-1] = 0;
1861	p = strchr(mce_helper, '\n');
1862
1863	if (p)
1864		*p = 0;
1865
1866	return strlen(mce_helper) + !!p;
1867}
1868
1869static ssize_t set_ignore_ce(struct sys_device *s,
1870			     struct sysdev_attribute *attr,
1871			     const char *buf, size_t size)
1872{
1873	u64 new;
1874
1875	if (strict_strtoull(buf, 0, &new) < 0)
1876		return -EINVAL;
1877
1878	if (mce_ignore_ce ^ !!new) {
1879		if (new) {
1880			/* disable ce features */
1881			on_each_cpu(mce_disable_ce, (void *)1, 1);
1882			mce_ignore_ce = 1;
1883		} else {
1884			/* enable ce features */
1885			mce_ignore_ce = 0;
1886			on_each_cpu(mce_enable_ce, (void *)1, 1);
1887		}
1888	}
1889	return size;
1890}
1891
1892static ssize_t set_cmci_disabled(struct sys_device *s,
1893				 struct sysdev_attribute *attr,
1894				 const char *buf, size_t size)
1895{
1896	u64 new;
1897
1898	if (strict_strtoull(buf, 0, &new) < 0)
1899		return -EINVAL;
1900
1901	if (mce_cmci_disabled ^ !!new) {
1902		if (new) {
1903			/* disable cmci */
1904			on_each_cpu(mce_disable_ce, NULL, 1);
1905			mce_cmci_disabled = 1;
1906		} else {
1907			/* enable cmci */
1908			mce_cmci_disabled = 0;
1909			on_each_cpu(mce_enable_ce, NULL, 1);
1910		}
1911	}
1912	return size;
1913}
1914
1915static ssize_t store_int_with_restart(struct sys_device *s,
1916				      struct sysdev_attribute *attr,
1917				      const char *buf, size_t size)
1918{
1919	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1920	mce_restart();
1921	return ret;
1922}
1923
1924static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1925static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1926static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1927static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1928
1929static struct sysdev_ext_attribute attr_check_interval = {
1930	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1931		     store_int_with_restart),
1932	&check_interval
1933};
1934
1935static struct sysdev_ext_attribute attr_ignore_ce = {
1936	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1937	&mce_ignore_ce
1938};
1939
1940static struct sysdev_ext_attribute attr_cmci_disabled = {
1941	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1942	&mce_cmci_disabled
1943};
1944
1945static struct sysdev_attribute *mce_attrs[] = {
1946	&attr_tolerant.attr,
1947	&attr_check_interval.attr,
1948	&attr_trigger,
1949	&attr_monarch_timeout.attr,
1950	&attr_dont_log_ce.attr,
1951	&attr_ignore_ce.attr,
1952	&attr_cmci_disabled.attr,
1953	NULL
1954};
1955
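/*
 * Illustrative note (not part of the original file): with the sysdev
 * class named "machinecheck" and one sysdev registered per CPU, the
 * attributes above show up under paths such as
 * /sys/devices/system/machinecheck/machinecheck0/ (tolerant,
 * check_interval, trigger, monarch_timeout, ...), plus one bankN file
 * per MCA bank.
 */
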
1956static cpumask_var_t mce_dev_initialized;
1957
1958/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1959static __cpuinit int mce_create_device(unsigned int cpu)
1960{
1961	int err;
1962	int i, j;
1963
1964	if (!mce_available(&boot_cpu_data))
1965		return -EIO;
1966
1967	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1968	per_cpu(mce_dev, cpu).id	= cpu;
1969	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1970
1971	err = sysdev_register(&per_cpu(mce_dev, cpu));
1972	if (err)
1973		return err;
1974
1975	for (i = 0; mce_attrs[i]; i++) {
1976		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1977		if (err)
1978			goto error;
1979	}
1980	for (j = 0; j < banks; j++) {
1981		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1982					&mce_banks[j].attr);
1983		if (err)
1984			goto error2;
1985	}
1986	cpumask_set_cpu(cpu, mce_dev_initialized);
1987
1988	return 0;
1989error2:
1990	while (--j >= 0)
1991		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1992error:
1993	while (--i >= 0)
1994		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1995
1996	sysdev_unregister(&per_cpu(mce_dev, cpu));
1997
1998	return err;
1999}
2000
2001static __cpuinit void mce_remove_device(unsigned int cpu)
2002{
2003	int i;
2004
2005	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
2006		return;
2007
2008	for (i = 0; mce_attrs[i]; i++)
2009		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
2010
2011	for (i = 0; i < banks; i++)
2012		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
2013
2014	sysdev_unregister(&per_cpu(mce_dev, cpu));
2015	cpumask_clear_cpu(cpu, mce_dev_initialized);
2016}
2017
2018/* Make sure there are no machine checks on offlined CPUs. */
2019static void __cpuinit mce_disable_cpu(void *h)
2020{
2021	unsigned long action = *(unsigned long *)h;
2022	int i;
2023
2024	if (!mce_available(&current_cpu_data))
2025		return;
2026
2027	if (!(action & CPU_TASKS_FROZEN))
2028		cmci_clear();
2029	for (i = 0; i < banks; i++) {
2030		struct mce_bank *b = &mce_banks[i];
2031
2032		if (b->init)
2033			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2034	}
2035}
2036
2037static void __cpuinit mce_reenable_cpu(void *h)
2038{
2039	unsigned long action = *(unsigned long *)h;
2040	int i;
2041
2042	if (!mce_available(&current_cpu_data))
2043		return;
2044
2045	if (!(action & CPU_TASKS_FROZEN))
2046		cmci_reenable();
2047	for (i = 0; i < banks; i++) {
2048		struct mce_bank *b = &mce_banks[i];
2049
2050		if (b->init)
2051			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2052	}
2053}
2054
2055/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2056static int __cpuinit
2057mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2058{
2059	unsigned int cpu = (unsigned long)hcpu;
2060	struct timer_list *t = &per_cpu(mce_timer, cpu);
2061
2062	switch (action) {
2063	case CPU_ONLINE:
2064	case CPU_ONLINE_FROZEN:
2065		mce_create_device(cpu);
2066		if (threshold_cpu_callback)
2067			threshold_cpu_callback(action, cpu);
2068		break;
2069	case CPU_DEAD:
2070	case CPU_DEAD_FROZEN:
2071		if (threshold_cpu_callback)
2072			threshold_cpu_callback(action, cpu);
2073		mce_remove_device(cpu);
2074		break;
2075	case CPU_DOWN_PREPARE:
2076	case CPU_DOWN_PREPARE_FROZEN:
2077		del_timer_sync(t);
2078		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2079		break;
2080	case CPU_DOWN_FAILED:
2081	case CPU_DOWN_FAILED_FROZEN:
2082		if (!mce_ignore_ce && check_interval) {
2083			t->expires = round_jiffies(jiffies +
2084					   __get_cpu_var(mce_next_interval));
2085			add_timer_on(t, cpu);
2086		}
2087		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2088		break;
2089	case CPU_POST_DEAD:
2090		/* intentionally ignoring frozen here */
2091		cmci_rediscover(cpu);
2092		break;
2093	}
2094	return NOTIFY_OK;
2095}
2096
2097static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2098	.notifier_call = mce_cpu_callback,
2099};
2100
2101static __init void mce_init_banks(void)
2102{
2103	int i;
2104
2105	for (i = 0; i < banks; i++) {
2106		struct mce_bank *b = &mce_banks[i];
2107		struct sysdev_attribute *a = &b->attr;
2108
2109		sysfs_attr_init(&a->attr);
2110		a->attr.name	= b->attrname;
2111		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2112
2113		a->attr.mode	= 0644;
2114		a->show		= show_bank;
2115		a->store	= set_bank;
2116	}
2117}
2118
2119static __init int mcheck_init_device(void)
2120{
2121	int err;
2122	int i = 0;
2123
2124	if (!mce_available(&boot_cpu_data))
2125		return -EIO;
2126
2127	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
2128
2129	mce_init_banks();
2130
2131	err = sysdev_class_register(&mce_sysclass);
2132	if (err)
2133		return err;
2134
2135	for_each_online_cpu(i) {
2136		err = mce_create_device(i);
2137		if (err)
2138			return err;
2139	}
2140
2141	register_hotcpu_notifier(&mce_cpu_notifier);
2142	misc_register(&mce_log_device);
2143
2144	return err;
2145}
2146
2147device_initcall(mcheck_init_device);
2148
2149/*
2150 * Old style boot options parsing. Only for compatibility.
2151 */
2152static int __init mcheck_disable(char *str)
2153{
2154	mce_disabled = 1;
2155	return 1;
2156}
2157__setup("nomce", mcheck_disable);
2158
2159#ifdef CONFIG_DEBUG_FS
2160struct dentry *mce_get_debugfs_dir(void)
2161{
2162	static struct dentry *dmce;
2163
2164	if (!dmce)
2165		dmce = debugfs_create_dir("mce", NULL);
2166
2167	return dmce;
2168}
2169
2170static void mce_reset(void)
2171{
2172	cpu_missing = 0;
2173	atomic_set(&mce_fake_paniced, 0);
2174	atomic_set(&mce_executing, 0);
2175	atomic_set(&mce_callin, 0);
2176	atomic_set(&global_nwo, 0);
2177}
2178
2179static int fake_panic_get(void *data, u64 *val)
2180{
2181	*val = fake_panic;
2182	return 0;
2183}
2184
2185static int fake_panic_set(void *data, u64 val)
2186{
2187	mce_reset();
2188	fake_panic = val;
2189	return 0;
2190}
2191
2192DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2193			fake_panic_set, "%llu\n");
2194
2195static int __init mcheck_debugfs_init(void)
2196{
2197	struct dentry *dmce, *ffake_panic;
2198
2199	dmce = mce_get_debugfs_dir();
2200	if (!dmce)
2201		return -ENOMEM;
2202	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2203					  &fake_panic_fops);
2204	if (!ffake_panic)
2205		return -ENOMEM;
2206
2207	return 0;
2208}
2209late_initcall(mcheck_debugfs_init);
2210#endif
2211