/*-
 * Copyright (c) 2009 Advanced Computing Technologies LLC
 * Written by: John H. Baldwin <jhb@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for x86 machine check architecture.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/x86/x86/mca.c 208921 2010-06-08 18:04:07Z jhb $");

#ifdef __amd64__
#define	DEV_APIC
#else
#include "opt_apic.h"
#endif

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/* Modes for mca_scan() */
enum scan_mode {
	POLLED,
	MCE,
	CMCI,
};

#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;
	int	last_intr;
};
#endif

struct mca_internal {
	struct mca_record rec;
	int		logged;
	STAILQ_ENTRY(mca_internal) link;
};

static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static int mca_count;		/* Number of records stored. */

SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");

static int mca_enabled = 1;
TUNABLE_INT("hw.mca.enabled", &mca_enabled);
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

static int amd10h_L1TP = 1;
TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;

#ifdef DEV_APIC
static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
static int cmc_banks;
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
#endif

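/*
 * Sysctl handler shared by the tunable intervals below: accept a new
 * value only if it is a positive integer.
 */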
static int
sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
	int error, value;

	value = *(int *)arg1;
	error = sysctl_handle_int(oidp, &value, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if (value <= 0)
		return (EINVAL);
	*(int *)arg1 = value;
	return (0);
}

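/*
 * Copy the Nth machine check record out to userland.  The record
 * index is supplied as the final sysctl name component.
 */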
static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}

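/*
 * The helpers below decode the subfields of the architectural
 * compound MCA error codes: transaction type, memory hierarchy
 * level, request type, and memory controller transaction type.
 */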
static const char *
mca_error_ttype(uint16_t mca_error)
{

	switch ((mca_error & 0x000c) >> 2) {
	case 0:
		return ("I");
	case 1:
		return ("D");
	case 2:
		return ("G");
	}
	return ("?");
}

static const char *
mca_error_level(uint16_t mca_error)
{

	switch (mca_error & 0x0003) {
	case 0:
		return ("L0");
	case 1:
		return ("L1");
	case 2:
		return ("L2");
	case 3:
		return ("LG");
	}
	return ("L?");
}

static const char *
mca_error_request(uint16_t mca_error)
{

	switch ((mca_error & 0x00f0) >> 4) {
	case 0x0:
		return ("ERR");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("DRD");
	case 0x4:
		return ("DWR");
	case 0x5:
		return ("IRD");
	case 0x6:
		return ("PREFETCH");
	case 0x7:
		return ("EVICT");
	case 0x8:
		return ("SNOOP");
	}
	return ("???");
}

static const char *
mca_error_mmtype(uint16_t mca_error)
{

	switch ((mca_error & 0x70) >> 4) {
	case 0x0:
		return ("GEN");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("AC");
	case 0x4:
		return ("MS");
	}
	return ("???");
}

/* Dump details about a single machine check. */
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		if (rec->mr_mcg_cap & MCG_CAP_TES_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}

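/*
 * Read the status of a single machine check bank.  If the bank holds
 * a valid event, capture its details in *rec and return non-zero.
 * Corrected errors are cleared from the bank so that it can latch a
 * new event; uncorrected errors are left in place for the BIOS.
 */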
static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
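		/* CPUID acts as a serializing instruction here. */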
		do_cpuid(0, p);
	}
	return (1);
}

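/*
 * Queue a machine check record on the global list.  M_NOWAIT is used
 * since this may run from interrupt context; if the allocation fails,
 * the event is logged to the console immediately instead.
 */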
static void __nonnull(1)
mca_record_entry(const struct mca_record *record)
{
	struct mca_internal *rec;

	rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
	if (rec == NULL) {
		printf("MCA: Unable to allocate space for an event.\n");
		mca_log(record);
		return;
	}

	rec->rec = *record;
	rec->logged = 0;
	mtx_lock_spin(&mca_lock);
	STAILQ_INSERT_TAIL(&mca_records, rec, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);
}

#ifdef DEV_APIC
/*
 * Update the interrupt threshold for a CMCI.  The strategy is to use
 * a low trigger that interrupts as soon as the first event occurs.
 * However, if a steady stream of events arrives, the threshold is
 * increased until the interrupts are throttled to once every
 * cmc_throttle seconds or the periodic scan.  If a periodic scan
 * finds that the threshold is too high, it is lowered.
 */
static void
cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
{
	struct cmc_state *cc;
	uint64_t ctl;
	u_int delta;
	int count, limit;

	/* Fetch the current limit for this bank. */
	cc = &cmc_state[PCPU_GET(cpuid)][bank];
	ctl = rdmsr(MSR_MC_CTL2(bank));
	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
	delta = (u_int)(ticks - cc->last_intr);

	/*
	 * If an interrupt was received less than cmc_throttle seconds
	 * since the previous interrupt and the count from the current
	 * event is greater than or equal to the current threshold,
	 * double the threshold up to the max.
	 */
	if (mode == CMCI && valid) {
		limit = ctl & MC_CTL2_THRESHOLD;
		if (delta < cmc_throttle && count >= limit &&
		    limit < cc->max_threshold) {
			limit = min(limit << 1, cc->max_threshold);
			ctl &= ~MC_CTL2_THRESHOLD;
			ctl |= limit;
			wrmsr(MSR_MC_CTL2(bank), ctl);
		}
		cc->last_intr = ticks;
		return;
	}

	/*
	 * When the banks are polled, check to see if the threshold
	 * should be lowered.
	 */
	if (mode != POLLED)
		return;

	/* If a CMCI occurred recently, do nothing for now. */
	if (delta < cmc_throttle)
		return;

	/*
	 * Compute a new limit based on the average rate of events per
	 * cmc_throttle seconds since the last interrupt.
	 */
	if (valid) {
		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
		limit = count * cmc_throttle / delta;
		if (limit <= 0)
			limit = 1;
		else if (limit > cc->max_threshold)
			limit = cc->max_threshold;
	} else
		limit = 1;
	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
		ctl &= ~MC_CTL2_THRESHOLD;
		ctl |= limit;
		wrmsr(MSR_MC_CTL2(bank), ctl);
	}
}
#endif

510
511/*
512 * This scans all the machine check banks of the current CPU to see if
513 * there are any machine checks.  Any non-recoverable errors are
514 * reported immediately via mca_log().  The current thread must be
515 * pinned when this is called.  The 'mode' parameter indicates if we
516 * are being called from the MC exception handler, the CMCI handler,
517 * or the periodic poller.  In the MC exception case this function
518 * returns true if the system is restartable.  Otherwise, it returns a
519 * count of the number of valid MC records found.
520 */
521static int
522mca_scan(enum scan_mode mode)
523{
524	struct mca_record rec;
525	uint64_t mcg_cap, ucmask;
526	int count, i, recoverable, valid;
527
528	count = 0;
529	recoverable = 1;
530	ucmask = MC_STATUS_UC | MC_STATUS_PCC;
531
532	/* When handling a MCE#, treat the OVER flag as non-restartable. */
533	if (mode == MCE)
534		ucmask |= MC_STATUS_OVER;
535	mcg_cap = rdmsr(MSR_MCG_CAP);
536	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
537#ifdef DEV_APIC
538		/*
539		 * For a CMCI, only check banks this CPU is
540		 * responsible for.
541		 */
542		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
543			continue;
544#endif
545
546		valid = mca_check_status(i, &rec);
547		if (valid) {
548			count++;
549			if (rec.mr_status & ucmask) {
550				recoverable = 0;
551				mca_log(&rec);
552			}
553			mca_record_entry(&rec);
554		}
555
556#ifdef DEV_APIC
557		/*
558		 * If this is a bank this CPU monitors via CMCI,
559		 * update the threshold.
560		 */
561		if (PCPU_GET(cmci_mask) & (1 << i))
562			cmci_update(mode, i, valid, &rec);
563#endif
564	}
565	return (mode == MCE ? recoverable : count);
566}
567
568/*
569 * Scan the machine check banks on all CPUs by binding to each CPU in
570 * turn.  If any of the CPUs contained new machine check records, log
571 * them to the console.
572 */
573static void
574mca_scan_cpus(void *context, int pending)
575{
576	struct mca_internal *mca;
577	struct thread *td;
578	int count, cpu;
579
580	td = curthread;
581	count = 0;
582	thread_lock(td);
583	for (cpu = 0; cpu <= mp_maxid; cpu++) {
584		if (CPU_ABSENT(cpu))
585			continue;
586		sched_bind(td, cpu);
587		thread_unlock(td);
588		count += mca_scan(POLLED);
589		thread_lock(td);
590		sched_unbind(td);
591	}
592	thread_unlock(td);
593	if (count != 0) {
594		mtx_lock_spin(&mca_lock);
595		STAILQ_FOREACH(mca, &mca_records, link) {
596			if (!mca->logged) {
597				mca->logged = 1;
598				mtx_unlock_spin(&mca_lock);
599				mca_log(&mca->rec);
600				mtx_lock_spin(&mca_lock);
601			}
602		}
603		mtx_unlock_spin(&mca_lock);
604	}
605}
606
607static void
608mca_periodic_scan(void *arg)
609{
610
611	taskqueue_enqueue(taskqueue_thread, &mca_task);
612	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
613}
614
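/*
 * Writing a non-zero value to the force_scan sysctl triggers an
 * immediate scan of all banks via the taskqueue.
 */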
static int
sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		taskqueue_enqueue(taskqueue_thread, &mca_task);
	return (0);
}

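/* Arm the periodic scan timer once all CPUs are up. */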
static void
mca_startup(void *dummy)
{

	if (!mca_enabled || !(cpu_feature & CPUID_MCA))
		return;

	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan,
	    NULL);
}
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);

#ifdef DEV_APIC
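/*
 * Allocate the per-CPU array of per-bank CMCI state and register the
 * cmc_throttle sysctl.
 */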
static void
cmci_setup(uint64_t mcg_cap)
{
	int i;

	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
	    M_MCA, M_WAITOK);
	cmc_banks = mcg_cap & MCG_CAP_COUNT;
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
		    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
#endif

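/*
 * One-time initialization performed on the BSP: the record list, the
 * spin lock, the rescan task and timer, and the hw.mca sysctl tree.
 */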
static void
mca_setup(uint64_t mcg_cap)
{

	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_task, 0x8000, mca_scan_cpus, NULL);
	callout_init(&mca_timer, CALLOUT_MPSAFE);
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup(mcg_cap);
#endif
}

#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/* Determine maximum threshold. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
#endif

/* Must be executed on each CPU. */
void
mca_init(void)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int skip;
	int i;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	if (cpu_feature & CPUID_MCA) {
		PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (PCPU_GET(cpuid) == 0)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			if (mcg_cap & MCG_CAP_CMCI_P)
				cmci_monitor(i);
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}

#ifdef DEV_APIC
		if (PCPU_GET(cmci_mask) != 0)
			lapic_enable_cmc();
#endif
	}

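	/* Enable machine check exceptions. */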
	load_cr4(rcr4() | CR4_MCE);
}

/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);

/* Called when a machine check exception fires. */
int
mca_intr(void)
{
	uint64_t mcg_status;
	int recoverable;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx  Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		return (0);
	}

	/* Scan the banks and check for any non-recoverable errors. */
	recoverable = mca_scan(MCE);
	mcg_status = rdmsr(MSR_MCG_STATUS);
	if (!(mcg_status & MCG_STATUS_RIPV))
		recoverable = 0;

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
	return (recoverable);
}

#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 */
	count = mca_scan(CMCI);

	/* If we found any records, log them to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mtx_unlock_spin(&mca_lock);
				mca_log(&mca->rec);
				mtx_lock_spin(&mca_lock);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
#endif
