/* mca.c — FreeBSD head revision 208556 */
1/*-
2 * Copyright (c) 2009 Advanced Computing Technologies LLC
3 * Written by: John H. Baldwin <jhb@FreeBSD.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28/*
29 * Support for x86 machine check architecture.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/i386/i386/mca.c 208556 2010-05-25 21:39:30Z jhb $");
34
35#include "opt_apic.h"
36
37#include <sys/param.h>
38#include <sys/bus.h>
39#include <sys/interrupt.h>
40#include <sys/kernel.h>
41#include <sys/lock.h>
42#include <sys/malloc.h>
43#include <sys/mutex.h>
44#include <sys/proc.h>
45#include <sys/sched.h>
46#include <sys/smp.h>
47#include <sys/sysctl.h>
48#include <sys/systm.h>
49#include <sys/taskqueue.h>
50#include <machine/intr_machdep.h>
51#include <machine/apicvar.h>
52#include <machine/cputypes.h>
53#include <machine/mca.h>
54#include <machine/md_var.h>
55#include <machine/specialreg.h>
56
/* Modes for mca_scan(): the context the scan is invoked from. */
enum scan_mode {
	POLLED,		/* Periodic timer or sysctl-forced poll. */
	MCE,		/* Machine check exception handler. */
	CMCI,		/* Corrected machine check interrupt handler. */
};
63
#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;	/* Largest threshold the bank supports
				 * (probed in cmci_monitor()). */
	int	last_intr;	/* 'ticks' value at the last CMCI. */
};
#endif
74
/* A stored machine check event: hardware record plus bookkeeping. */
struct mca_internal {
	struct mca_record rec;			/* Snapshot of the MSRs. */
	int		logged;			/* Non-zero once printed. */
	STAILQ_ENTRY(mca_internal) link;	/* Entry on mca_records. */
};
80
static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static int mca_count;		/* Number of records stored. */

SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");

/* Administrative kill switch; checked by mca_init()/mca_startup(). */
static int mca_enabled = 1;
TUNABLE_INT("hw.mca.enabled", &mca_enabled);
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

/* When clear, mca_init() masks L1TP errors instead (AMD Family 10h). */
static int amd10h_L1TP = 1;
TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

/* Set by mca_init() when the Erratum 383 workaround should be applied. */
int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

/* Append-only list of collected records; mca_lock protects it and mca_count. */
static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;	/* Drives the periodic rescan. */
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct task mca_task;	/* Runs mca_scan_cpus() on taskqueue_thread. */
static struct mtx mca_lock;	/* Spin lock: taken from interrupt context. */

#ifdef DEV_APIC
static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
static int cmc_banks;		/* Banks per cmc_state[] row. */
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
#endif
112
113static int
114sysctl_positive_int(SYSCTL_HANDLER_ARGS)
115{
116	int error, value;
117
118	value = *(int *)arg1;
119	error = sysctl_handle_int(oidp, &value, 0, req);
120	if (error || req->newptr == NULL)
121		return (error);
122	if (value <= 0)
123		return (EINVAL);
124	*(int *)arg1 = value;
125	return (0);
126}
127
/*
 * Sysctl handler for hw.mca.records.<n>: copies the n'th stored
 * machine check record out to userland.  The OID name carries a
 * single integer index.
 */
static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	/* Cheap unlocked pre-check; the index is re-validated under the lock. */
	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			/*
			 * Copy to a stack buffer so the spin lock can be
			 * dropped before SYSCTL_OUT touches userland.
			 */
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}
159
160static const char *
161mca_error_ttype(uint16_t mca_error)
162{
163
164	switch ((mca_error & 0x000c) >> 2) {
165	case 0:
166		return ("I");
167	case 1:
168		return ("D");
169	case 2:
170		return ("G");
171	}
172	return ("?");
173}
174
175static const char *
176mca_error_level(uint16_t mca_error)
177{
178
179	switch (mca_error & 0x0003) {
180	case 0:
181		return ("L0");
182	case 1:
183		return ("L1");
184	case 2:
185		return ("L2");
186	case 3:
187		return ("LG");
188	}
189	return ("L?");
190}
191
192static const char *
193mca_error_request(uint16_t mca_error)
194{
195
196	switch ((mca_error & 0x00f0) >> 4) {
197	case 0x0:
198		return ("ERR");
199	case 0x1:
200		return ("RD");
201	case 0x2:
202		return ("WR");
203	case 0x3:
204		return ("DRD");
205	case 0x4:
206		return ("DWR");
207	case 0x5:
208		return ("IRD");
209	case 0x6:
210		return ("PREFETCH");
211	case 0x7:
212		return ("EVICT");
213	case 0x8:
214		return ("SNOOP");
215	}
216	return ("???");
217}
218
219static const char *
220mca_error_mmtype(uint16_t mca_error)
221{
222
223	switch ((mca_error & 0x70) >> 4) {
224	case 0x0:
225		return ("GEN");
226	case 0x1:
227		return ("RD");
228	case 0x2:
229		return ("WR");
230	case 0x3:
231		return ("AC");
232	case 0x4:
233		return ("MS");
234	}
235	return ("???");
236}
237
/*
 * Dump details about a single machine check to the console.  The
 * low 16 bits of the status word (the MCA error code) are decoded
 * first against the simple error codes, then against the compound
 * code families (internal, memory hierarchy, TLB, memory controller,
 * cache, bus/interconnect) by bit pattern.
 */
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		/*
		 * With threshold-based error status reporting (TES),
		 * also show the corrected-error count from bits 52:38.
		 */
		if (rec->mr_mcg_cap & MCG_CAP_TES_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		/* Other model-specific internal errors: 0000 01xx xxxx xxxx. */
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error: 0000 0000 0000 11LL. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error: 0000 0000 0001 TTLL. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error: 0000 0000 1MMM CCCC. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			/* Channel 0xf means the channel is unspecified. */
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error: 0000 0001 RRRR TTLL. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error: 0000 1PPT RRRR IILL. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			/* PP field: participation of this processor. */
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			/* II field: memory/IO/other. */
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			/* T bit: request timed out. */
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}
374
/*
 * Read the status of MC bank 'bank' on the current CPU and, if the
 * VALID bit is set, snapshot the error into *rec.  Returns 1 when a
 * valid record was captured, 0 otherwise.  Corrected errors are
 * cleared from the bank; uncorrectable (UC/PCC) errors are left in
 * place so the BIOS can also see them.  Must run on the CPU being
 * examined: it reads that CPU's MSRs and PCPU data.
 */
static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	/* ADDR/MISC are only architecturally valid when flagged in status. */
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.  The cpuid acts as a
	 * serializing instruction after the MSR write.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		do_cpuid(0, p);
	}
	return (1);
}
412
413static void __nonnull(1)
414mca_record_entry(const struct mca_record *record)
415{
416	struct mca_internal *rec;
417
418	rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
419	if (rec == NULL) {
420		printf("MCA: Unable to allocate space for an event.\n");
421		mca_log(record);
422		return;
423	}
424
425	rec->rec = *record;
426	rec->logged = 0;
427	mtx_lock_spin(&mca_lock);
428	STAILQ_INSERT_TAIL(&mca_records, rec, link);
429	mca_count++;
430	mtx_unlock_spin(&mca_lock);
431}
432
#ifdef DEV_APIC
/*
 * Update the interrupt threshold for a CMCI.  The strategy is to use
 * a low trigger that interrupts as soon as the first event occurs.
 * However, if a steady stream of events arrive, the threshold is
 * increased until the interrupts are throttled to once every
 * cmc_throttle seconds or the periodic scan.  If a periodic scan
 * finds that the threshold is too high, it is lowered.
 *
 * 'mode' is the context of the current scan, 'bank' the bank being
 * examined, 'valid' whether *rec holds a freshly captured record.
 * Runs on the CPU that owns the bank (reads per-CPU cmc_state).
 */
static void
cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
{
	struct cmc_state *cc;
	uint64_t ctl;
	u_int delta;
	int count, limit;

	/* Fetch the current limit for this bank. */
	cc = &cmc_state[PCPU_GET(cpuid)][bank];
	ctl = rdmsr(MSR_MC_CTL2(bank));
	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
	delta = (u_int)(ticks - cc->last_intr);

	/*
	 * If an interrupt was received less than cmc_throttle seconds
	 * since the previous interrupt and the count from the current
	 * event is greater than or equal to the current threshold,
	 * double the threshold up to the max.
	 */
	if (mode == CMCI && valid) {
		limit = ctl & MC_CTL2_THRESHOLD;
		if (delta < cmc_throttle && count >= limit &&
		    limit < cc->max_threshold) {
			limit = min(limit << 1, cc->max_threshold);
			ctl &= ~MC_CTL2_THRESHOLD;
			ctl |= limit;
			/*
			 * Write the full control word back.  The previous
			 * code wrote only 'limit' here, which cleared
			 * MC_CTL2_CMCI_EN and disabled CMCI on this bank.
			 */
			wrmsr(MSR_MC_CTL2(bank), ctl);
		}
		cc->last_intr = ticks;
		return;
	}

	/*
	 * When the banks are polled, check to see if the threshold
	 * should be lowered.
	 */
	if (mode != POLLED)
		return;

	/* If a CMCI occured recently, do nothing for now. */
	if (delta < cmc_throttle)
		return;

	/*
	 * Compute a new limit based on the average rate of events per
	 * cmc_throttle seconds since the last interrupt.
	 */
	if (valid) {
		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
		limit = count * cmc_throttle / delta;
		if (limit <= 0)
			limit = 1;
		else if (limit > cc->max_threshold)
			limit = cc->max_threshold;
	} else
		limit = 1;
	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
		ctl &= ~MC_CTL2_THRESHOLD;
		ctl |= limit;
		/* Same fix as above: preserve MC_CTL2_CMCI_EN. */
		wrmsr(MSR_MC_CTL2(bank), ctl);
	}
}
#endif
506
/*
 * This scans all the machine check banks of the current CPU to see if
 * there are any machine checks.  Any non-recoverable errors are
 * reported immediately via mca_log().  The current thread must be
 * pinned when this is called.  The 'mode' parameter indicates if we
 * are being called from the MC exception handler, the CMCI handler,
 * or the periodic poller.  In the MC exception case this function
 * returns true if the system is restartable.  Otherwise, it returns a
 * count of the number of valid MC records found.
 */
static int
mca_scan(enum scan_mode mode)
{
	struct mca_record rec;
	uint64_t mcg_cap, ucmask;
	int count, i, recoverable, valid;

	count = 0;
	recoverable = 1;
	/* Status bits that mark an error as non-recoverable. */
	ucmask = MC_STATUS_UC | MC_STATUS_PCC;

	/* When handling a MCE#, treat the OVER flag as non-restartable. */
	if (mode == MCE)
		ucmask |= MC_STATUS_OVER;
	mcg_cap = rdmsr(MSR_MCG_CAP);
	/* MCG_CAP_COUNT gives the number of banks on this CPU. */
	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
#ifdef DEV_APIC
		/*
		 * For a CMCI, only check banks this CPU is
		 * responsible for.
		 */
		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
			continue;
#endif

		valid = mca_check_status(i, &rec);
		if (valid) {
			count++;
			/* Log non-recoverable errors to console right away. */
			if (rec.mr_status & ucmask) {
				recoverable = 0;
				mca_log(&rec);
			}
			mca_record_entry(&rec);
		}

#ifdef DEV_APIC
		/*
		 * If this is a bank this CPU monitors via CMCI,
		 * update the threshold.
		 */
		if (PCPU_GET(cmci_mask) & (1 << i))
			cmci_update(mode, i, valid, &rec);
#endif
	}
	return (mode == MCE ? recoverable : count);
}
563
/*
 * Scan the machine check banks on all CPUs by binding to each CPU in
 * turn.  If any of the CPUs contained new machine check records, log
 * them to the console.  Runs in taskqueue_thread context (queued by
 * mca_periodic_scan() or the hw.mca.force_scan sysctl).
 */
static void
mca_scan_cpus(void *context, int pending)
{
	struct mca_internal *mca;
	struct thread *td;
	int count, cpu;

	td = curthread;
	count = 0;
	thread_lock(td);
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (CPU_ABSENT(cpu))
			continue;
		/* Pin to 'cpu': mca_scan() requires a pinned thread. */
		sched_bind(td, cpu);
		thread_unlock(td);
		count += mca_scan(POLLED);
		thread_lock(td);
		sched_unbind(td);
	}
	thread_unlock(td);
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				/*
				 * Drop the spin lock around the console
				 * output.  The list is append-only, so the
				 * iterator remains valid across the unlock.
				 */
				mtx_unlock_spin(&mca_lock);
				mca_log(&mca->rec);
				mtx_lock_spin(&mca_lock);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
602
603static void
604mca_periodic_scan(void *arg)
605{
606
607	taskqueue_enqueue(taskqueue_thread, &mca_task);
608	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
609}
610
611static int
612sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
613{
614	int error, i;
615
616	i = 0;
617	error = sysctl_handle_int(oidp, &i, 0, req);
618	if (error)
619		return (error);
620	if (i)
621		taskqueue_enqueue(taskqueue_thread, &mca_task);
622	return (0);
623}
624
625static void
626mca_startup(void *dummy)
627{
628
629	if (!mca_enabled || !(cpu_feature & CPUID_MCA))
630		return;
631
632	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan,
633		    NULL);
634}
635SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
636
#ifdef DEV_APIC
/*
 * Allocate the per-CPU, per-bank CMCI state and attach the
 * hw.mca.cmc_throttle sysctl.  Called once from mca_setup() on the
 * boot CPU when MCG_CAP reports CMCI support.
 */
static void
cmci_setup(uint64_t mcg_cap)
{
	int i;

	/*
	 * cmc_state is an array of per-CPU row pointers, so size it by
	 * the element type (struct cmc_state *).  The previous
	 * sizeof(struct cmc_state **) only worked because all pointer
	 * types share a size.
	 */
	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
	    M_MCA, M_WAITOK);
	cmc_banks = mcg_cap & MCG_CAP_COUNT;
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
		    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
#endif
655
/*
 * One-time initialization of the MCA machinery: the record list and
 * its spin lock, the rescan task and periodic-scan callout, and the
 * hw.mca sysctl tree.  Called from mca_init() on CPU 0 only.
 */
static void
mca_setup(uint64_t mcg_cap)
{

	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	/* 0x8000 is the task priority; the value is otherwise arbitrary. */
	TASK_INIT(&mca_task, 0x8000, mca_scan_cpus, NULL);
	callout_init(&mca_timer, CALLOUT_MPSAFE);
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup(mcg_cap);
#endif
}
680
#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.  Called from mca_init() on each CPU.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	/* Read back to see whether the enable bit stuck. */
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/* Determine maximum threshold: write all-ones, read back. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
#endif
729
/*
 * Must be executed on each CPU.  Enables the machine check hardware:
 * writes MCG_CTL if present, programs each bank's MC_CTL, clears
 * stale MC_STATUS values, optionally claims banks for CMCI, and
 * finally sets CR4.MCE.  CPU 0 additionally performs the one-time
 * mca_setup().
 */
void
mca_init(void)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int skip;
	int i;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	if (cpu_feature & CPUID_MCA) {
		PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (PCPU_GET(cpuid) == 0)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			if (mcg_cap & MCG_CAP_CMCI_P)
				cmci_monitor(i);
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}

#ifdef DEV_APIC
		/* If this CPU claimed any banks, enable the CMCI LVT entry. */
		if (PCPU_GET(cmci_mask) != 0)
			lapic_enable_cmc();
#endif
	}

	/* Finally, turn on machine check exceptions globally. */
	load_cr4(rcr4() | CR4_MCE);
}
814
815/* Called when a machine check exception fires. */
816int
817mca_intr(void)
818{
819	uint64_t mcg_status;
820	int recoverable;
821
822	if (!(cpu_feature & CPUID_MCA)) {
823		/*
824		 * Just print the values of the old Pentium registers
825		 * and panic.
826		 */
827		printf("MC Type: 0x%llx  Address: 0x%llx\n",
828		    rdmsr(MSR_P5_MC_TYPE), rdmsr(MSR_P5_MC_ADDR));
829		return (0);
830	}
831
832	/* Scan the banks and check for any non-recoverable errors. */
833	recoverable = mca_scan(MCE);
834	mcg_status = rdmsr(MSR_MCG_STATUS);
835	if (!(mcg_status & MCG_STATUS_RIPV))
836		recoverable = 0;
837
838	/* Clear MCIP. */
839	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
840	return (recoverable);
841}
842
#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 *
	 * NOTE(review): no lock is taken here.  The serialization
	 * appears to rely on mca_scan(CMCI) touching only banks in
	 * this CPU's cmci_mask, and on cmci_monitor() letting exactly
	 * one CPU claim each bank — confirm this is sufficient for
	 * banks shared between SMT siblings.
	 */
	count = mca_scan(CMCI);

	/* If we found anything, log them to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				/* Drop the spin lock around console output. */
				mtx_unlock_spin(&mca_lock);
				mca_log(&mca->rec);
				mtx_lock_spin(&mca_lock);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
#endif
872