mca.c revision 209212
/*-
 * Copyright (c) 2009 Advanced Computing Technologies LLC
 * Written by: John H. Baldwin <jhb@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for x86 machine check architecture.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/x86/x86/mca.c 209212 2010-06-15 18:51:41Z jhb $");

#ifdef __amd64__
#define	DEV_APIC
#else
#include "opt_apic.h"
#endif

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/* Modes for mca_scan() */
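/*
 * POLLED is used by the periodic scan (and the hw.mca.force_scan
 * sysctl), MCE by the machine check exception handler, and CMCI by
 * the corrected machine check interrupt handler.
 */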
enum scan_mode {
	POLLED,
	MCE,
	CMCI,
};

#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;
	time_t	last_intr;	/* Uptime (in seconds) of the last CMCI. */
};
#endif

struct mca_internal {
	struct mca_record rec;
	int		logged;
	STAILQ_ENTRY(mca_internal) link;
};

static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static int mca_count;		/* Number of records stored. */

SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");

static int mca_enabled = 1;
TUNABLE_INT("hw.mca.enabled", &mca_enabled);
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

static int amd10h_L1TP = 1;
TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;

#ifdef DEV_APIC
static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
static int cmc_banks;
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
#endif

static int
sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
	int error, value;

	value = *(int *)arg1;
	error = sysctl_handle_int(oidp, &value, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if (value <= 0)
		return (EINVAL);
	*(int *)arg1 = value;
	return (0);
}

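/*
 * Copy a single machine check record out to userland; the record
 * index is the final name component (hw.mca.records.<index>).
 */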
static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}

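/*
 * Helpers to decode the fields of a compound MCA error code (see the
 * Intel SDM, "Interpreting the MCA Error Codes"): bits 1:0 encode the
 * cache level (LL), bits 3:2 the transaction type (TT), and bits 7:4
 * the request type (RRRR) or memory transaction type (MMM).
 */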
static const char *
mca_error_ttype(uint16_t mca_error)
{

	switch ((mca_error & 0x000c) >> 2) {
	case 0:
		return ("I");
	case 1:
		return ("D");
	case 2:
		return ("G");
	}
	return ("?");
}

static const char *
mca_error_level(uint16_t mca_error)
{

	switch (mca_error & 0x0003) {
	case 0:
		return ("L0");
	case 1:
		return ("L1");
	case 2:
		return ("L2");
	case 3:
		return ("LG");
	}
	return ("L?");
}

static const char *
mca_error_request(uint16_t mca_error)
{

	switch ((mca_error & 0x00f0) >> 4) {
	case 0x0:
		return ("ERR");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("DRD");
	case 0x4:
		return ("DWR");
	case 0x5:
		return ("IRD");
	case 0x6:
		return ("PREFETCH");
	case 0x7:
		return ("EVICT");
	case 0x8:
		return ("SNOOP");
	}
	return ("???");
}

static const char *
mca_error_mmtype(uint16_t mca_error)
{

	switch ((mca_error & 0x70) >> 4) {
	case 0x0:
		return ("GEN");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("AC");
	case 0x4:
		return ("MS");
	}
	return ("???");
}

/* Dump details about a single machine check. */
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		if (rec->mr_mcg_cap & MCG_CAP_TES_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}

static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
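		/*
		 * CPUID is a serializing instruction, so this forces
		 * the preceding write to MC_STATUS to complete.
		 */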
		do_cpuid(0, p);
	}
	return (1);
}

static void __nonnull(1)
mca_record_entry(const struct mca_record *record)
{
	struct mca_internal *rec;

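	/*
	 * This may run from the machine check or CMCI handlers, so
	 * the allocation must not sleep.
	 */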
	rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
	if (rec == NULL) {
		printf("MCA: Unable to allocate space for an event.\n");
		mca_log(record);
		return;
	}

	rec->rec = *record;
	rec->logged = 0;
	mtx_lock_spin(&mca_lock);
	STAILQ_INSERT_TAIL(&mca_records, rec, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);
}

#ifdef DEV_APIC
/*
 * Update the interrupt threshold for a CMCI.  The strategy is to use
 * a low trigger that interrupts as soon as the first event occurs.
 * However, if a steady stream of events arrives, the threshold is
 * increased until the interrupts are throttled to once every
 * cmc_throttle seconds or the periodic scan.  If a periodic scan
 * finds that the threshold is too high, it is lowered.
 */
static void
cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
{
	struct cmc_state *cc;
	uint64_t ctl;
	u_int delta;
	int count, limit;

	/* Fetch the current limit for this bank. */
	cc = &cmc_state[PCPU_GET(cpuid)][bank];
	ctl = rdmsr(MSR_MC_CTL2(bank));
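	/* The corrected-error count lives in bits 52:38 of MC_STATUS. */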
	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
	delta = (u_int)(time_uptime - cc->last_intr);

	/*
	 * If an interrupt was received less than cmc_throttle seconds
	 * since the previous interrupt and the count from the current
	 * event is greater than or equal to the current threshold,
	 * double the threshold up to the max.
	 */
	if (mode == CMCI && valid) {
		limit = ctl & MC_CTL2_THRESHOLD;
		if (delta < cmc_throttle && count >= limit &&
		    limit < cc->max_threshold) {
			limit = min(limit << 1, cc->max_threshold);
			ctl &= ~MC_CTL2_THRESHOLD;
			ctl |= limit;
			wrmsr(MSR_MC_CTL2(bank), ctl);
		}
		cc->last_intr = time_uptime;
		return;
	}

	/*
	 * When the banks are polled, check to see if the threshold
	 * should be lowered.
	 */
	if (mode != POLLED)
		return;

	/* If a CMCI occurred recently, do nothing for now. */
	if (delta < cmc_throttle)
		return;

	/*
	 * Compute a new limit based on the average rate of events per
	 * cmc_throttle seconds since the last interrupt.
	 */
	if (valid) {
		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
		limit = count * cmc_throttle / delta;
		if (limit <= 0)
			limit = 1;
		else if (limit > cc->max_threshold)
			limit = cc->max_threshold;
	} else
		limit = 1;
	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
		ctl &= ~MC_CTL2_THRESHOLD;
		ctl |= limit;
		wrmsr(MSR_MC_CTL2(bank), ctl);
	}
}
#endif

/*
 * This scans all the machine check banks of the current CPU to see if
 * there are any machine checks.  Any non-recoverable errors are
 * reported immediately via mca_log().  The current thread must be
 * pinned when this is called.  The 'mode' parameter indicates if we
 * are being called from the MC exception handler, the CMCI handler,
 * or the periodic poller.  In the MC exception case this function
 * returns true if the system is restartable.  Otherwise, it returns a
 * count of the number of valid MC records found.
 */
static int
mca_scan(enum scan_mode mode)
{
	struct mca_record rec;
	uint64_t mcg_cap, ucmask;
	int count, i, recoverable, valid;

	count = 0;
	recoverable = 1;
	ucmask = MC_STATUS_UC | MC_STATUS_PCC;

	/* When handling an MCE#, treat the OVER flag as non-restartable. */
	if (mode == MCE)
		ucmask |= MC_STATUS_OVER;
	mcg_cap = rdmsr(MSR_MCG_CAP);
	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
#ifdef DEV_APIC
		/*
		 * For a CMCI, only check banks this CPU is
		 * responsible for.
		 */
		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
			continue;
#endif

		valid = mca_check_status(i, &rec);
		if (valid) {
			count++;
			if (rec.mr_status & ucmask) {
				recoverable = 0;
				mca_log(&rec);
			}
			mca_record_entry(&rec);
		}

#ifdef DEV_APIC
		/*
		 * If this is a bank this CPU monitors via CMCI,
		 * update the threshold.
		 */
		if (PCPU_GET(cmci_mask) & 1 << i)
			cmci_update(mode, i, valid, &rec);
#endif
	}
	return (mode == MCE ? recoverable : count);
}

/*
 * Scan the machine check banks on all CPUs by binding to each CPU in
 * turn.  If any of the CPUs contained new machine check records, log
 * them to the console.
 */
static void
mca_scan_cpus(void *context, int pending)
{
	struct mca_internal *mca;
	struct thread *td;
	int count, cpu;

	td = curthread;
	count = 0;
	thread_lock(td);
	CPU_FOREACH(cpu) {
		sched_bind(td, cpu);
		thread_unlock(td);
		count += mca_scan(POLLED);
		thread_lock(td);
		sched_unbind(td);
	}
	thread_unlock(td);
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mtx_unlock_spin(&mca_lock);
				mca_log(&mca->rec);
				mtx_lock_spin(&mca_lock);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}

static void
mca_periodic_scan(void *arg)
{

	taskqueue_enqueue(taskqueue_thread, &mca_task);
	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}

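/*
 * Writing a non-zero value to hw.mca.force_scan (e.g.
 * "sysctl hw.mca.force_scan=1") queues an immediate scan of all banks.
 */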
static int
sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		taskqueue_enqueue(taskqueue_thread, &mca_task);
	return (0);
}

static void
mca_startup(void *dummy)
{

	if (!mca_enabled || !(cpu_feature & CPUID_MCA))
		return;

	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);

#ifdef DEV_APIC
static void
cmci_setup(uint64_t mcg_cap)
{
	int i;

	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
	    M_MCA, M_WAITOK);
	cmc_banks = mcg_cap & MCG_CAP_COUNT;
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
		    M_MCA, M_WAITOK | M_ZERO);
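	/*
	 * The throttle window can be tuned at runtime, e.g.
	 * "sysctl hw.mca.cmc_throttle=120".
	 */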
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
#endif

static void
mca_setup(uint64_t mcg_cap)
{

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_task, 0x8000, mca_scan_cpus, NULL);
	callout_init(&mca_timer, CALLOUT_MPSAFE);
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup(mcg_cap);
#endif
}

#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/* Determine maximum threshold. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
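	/*
	 * Bits of the threshold field the hardware does not implement
	 * read back as zero, so this is the maximum usable threshold.
	 */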
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}

/*
 * For resume, reset the threshold for any banks we monitor back to
 * one and throw away the timestamp of the last interrupt.
 */
static void
cmci_resume(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];
	cc->last_intr = 0;
	ctl = rdmsr(MSR_MC_CTL2(i));
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
}
#endif

/*
 * Initializes per-CPU machine check registers and enables corrected
 * machine check interrupts.
 */
static void
_mca_init(int boot)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int i, skip;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	if (cpu_feature & CPUID_MCA) {
		if (boot)
			PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (PCPU_GET(cpuid) == 0 && boot)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1ULL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			if (mcg_cap & MCG_CAP_CMCI_P) {
				if (boot)
					cmci_monitor(i);
				else
					cmci_resume(i);
			}
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}

#ifdef DEV_APIC
		if (PCPU_GET(cmci_mask) != 0 && boot)
			lapic_enable_cmc();
#endif
	}

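	/*
	 * A machine check that arrives while CR4.MCE is clear sends
	 * the processor into shutdown rather than raising #MC.
	 */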
	load_cr4(rcr4() | CR4_MCE);
}

/* Must be executed on each CPU during boot. */
void
mca_init(void)
{

	_mca_init(1);
}

/* Must be executed on each CPU during resume. */
void
mca_resume(void)
{

	_mca_init(0);
}

/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);

/* Called when a machine check exception fires. */
int
mca_intr(void)
{
	uint64_t mcg_status;
	int recoverable;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx  Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		return (0);
	}

	/* Scan the banks and check for any non-recoverable errors. */
	recoverable = mca_scan(MCE);
	mcg_status = rdmsr(MSR_MCG_STATUS);
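	/*
	 * If MCG_STATUS.RIPV is clear, the interrupted context cannot
	 * be safely restarted, so treat the event as non-recoverable.
	 */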
	if (!(mcg_status & MCG_STATUS_RIPV))
		recoverable = 0;

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
	return (recoverable);
}

#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Bank ownership is partitioned among CPUs via cmci_mask (see
	 * cmci_monitor()), so sibling threads never scan the same
	 * bank concurrently.
	 */
	count = mca_scan(CMCI);

	/* If we found any records, log them to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mtx_unlock_spin(&mca_lock);
				mca_log(&mca->rec);
				mtx_lock_spin(&mca_lock);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
#endif
935