1192050Sjhb/*-
2283927Sjhb * Copyright (c) 2009 Hudson River Trading LLC
3192050Sjhb * Written by: John H. Baldwin <jhb@FreeBSD.org>
4192050Sjhb * All rights reserved.
5192050Sjhb *
6192050Sjhb * Redistribution and use in source and binary forms, with or without
7192050Sjhb * modification, are permitted provided that the following conditions
8192050Sjhb * are met:
9192050Sjhb * 1. Redistributions of source code must retain the above copyright
10192050Sjhb *    notice, this list of conditions and the following disclaimer.
11192050Sjhb * 2. Redistributions in binary form must reproduce the above copyright
12192050Sjhb *    notice, this list of conditions and the following disclaimer in the
13192050Sjhb *    documentation and/or other materials provided with the distribution.
14192050Sjhb *
15192050Sjhb * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16192050Sjhb * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17192050Sjhb * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18192050Sjhb * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19192050Sjhb * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20192050Sjhb * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21192050Sjhb * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22192050Sjhb * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23192050Sjhb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24192050Sjhb * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25192050Sjhb * SUCH DAMAGE.
26192050Sjhb */
27192050Sjhb
28192050Sjhb/*
29192050Sjhb * Support for x86 machine check architecture.
30192050Sjhb */
31192050Sjhb
32192050Sjhb#include <sys/cdefs.h>
33192050Sjhb__FBSDID("$FreeBSD: releng/10.3/sys/x86/x86/mca.c 285174 2015-07-05 19:32:10Z marius $");
34192050Sjhb
35208921Sjhb#ifdef __amd64__
36208921Sjhb#define	DEV_APIC
37208921Sjhb#else
38208556Sjhb#include "opt_apic.h"
39208921Sjhb#endif
40208556Sjhb
41192050Sjhb#include <sys/param.h>
42208507Sjhb#include <sys/bus.h>
43208507Sjhb#include <sys/interrupt.h>
44192050Sjhb#include <sys/kernel.h>
45192050Sjhb#include <sys/lock.h>
46192050Sjhb#include <sys/malloc.h>
47192050Sjhb#include <sys/mutex.h>
48192050Sjhb#include <sys/proc.h>
49192050Sjhb#include <sys/sched.h>
50192050Sjhb#include <sys/smp.h>
51192050Sjhb#include <sys/sysctl.h>
52192050Sjhb#include <sys/systm.h>
53233793Sjhb#include <sys/taskqueue.h>
54208507Sjhb#include <machine/intr_machdep.h>
55208507Sjhb#include <machine/apicvar.h>
56269592Smarius#include <machine/cpu.h>
57200064Savg#include <machine/cputypes.h>
58214630Sjhb#include <x86/mca.h>
59192050Sjhb#include <machine/md_var.h>
60192050Sjhb#include <machine/specialreg.h>
61192050Sjhb
/* Modes for mca_scan(): the context the scan is invoked from. */
enum scan_mode {
	POLLED,	/* Periodic timer or sysctl scan; sleepable context. */
	MCE,	/* Machine check exception (MC#) handler. */
	CMCI,	/* Corrected machine check interrupt handler. */
};
68208507Sjhb
69208556Sjhb#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;	/* Largest threshold the bank latches. */
	int	last_intr;	/* Time (in ticks) of the last CMCI. */
};
78208556Sjhb#endif
79208507Sjhb
/* A recorded machine check event plus list/log bookkeeping. */
struct mca_internal {
	struct mca_record rec;		/* Snapshot of the hardware event. */
	int		logged;		/* Non-zero once printed to console. */
	STAILQ_ENTRY(mca_internal) link; /* On mca_records or mca_freelist. */
};
85192050Sjhb
86192050Sjhbstatic MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
87192050Sjhb
88269592Smariusstatic volatile int mca_count;	/* Number of records stored. */
89233709Sjhbstatic int mca_banks;		/* Number of per-CPU register banks. */
90192050Sjhb
91227309Sedstatic SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
92227309Sed    "Machine Check Architecture");
93192343Sjhb
94205573Salcstatic int mca_enabled = 1;
95192343SjhbTUNABLE_INT("hw.mca.enabled", &mca_enabled);
96192343SjhbSYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
97192343Sjhb    "Administrative toggle for machine check support");
98192343Sjhb
99205573Salcstatic int amd10h_L1TP = 1;
100205573SalcTUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
101205573SalcSYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
102205573Salc    "Administrative toggle for logging of level one TLB parity (L1TP) errors");
103205573Salc
104269592Smariusstatic int intel6h_HSD131;
105269592SmariusTUNABLE_INT("hw.mca.intel6h_hsd131", &intel6h_HSD131);
106269592SmariusSYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
107269592Smarius    "Administrative toggle for logging of spurious corrected errors");
108269592Smarius
109205573Salcint workaround_erratum383;
110205573SalcSYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
111205573Salc    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
112205573Salc
113233709Sjhbstatic STAILQ_HEAD(, mca_internal) mca_freelist;
114233709Sjhbstatic int mca_freecount;
115192050Sjhbstatic STAILQ_HEAD(, mca_internal) mca_records;
116192050Sjhbstatic struct callout mca_timer;
117192050Sjhbstatic int mca_ticks = 3600;	/* Check hourly by default. */
118233793Sjhbstatic struct taskqueue *mca_tq;
119233793Sjhbstatic struct task mca_refill_task, mca_scan_task;
120192050Sjhbstatic struct mtx mca_lock;
121208556Sjhb
122208556Sjhb#ifdef DEV_APIC
123208507Sjhbstatic struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
124208507Sjhbstatic int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
125208556Sjhb#endif
126192050Sjhb
127192050Sjhbstatic int
128208507Sjhbsysctl_positive_int(SYSCTL_HANDLER_ARGS)
129192050Sjhb{
130192050Sjhb	int error, value;
131192050Sjhb
132208507Sjhb	value = *(int *)arg1;
133192050Sjhb	error = sysctl_handle_int(oidp, &value, 0, req);
134192050Sjhb	if (error || req->newptr == NULL)
135192050Sjhb		return (error);
136192050Sjhb	if (value <= 0)
137192050Sjhb		return (EINVAL);
138208507Sjhb	*(int *)arg1 = value;
139192050Sjhb	return (0);
140192050Sjhb}
141192050Sjhb
/*
 * Sysctl handler for hw.mca.records.N: copies out the Nth record on
 * the global event list.  The record index arrives as the single
 * trailing name component.
 */
static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	/* Cheap unlocked bounds check first to fail fast. */
	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	/*
	 * Recheck under the lock.  mca_count only ever grows, so once
	 * this passes the walk below is guaranteed to find entry N and
	 * initialize 'record' before the break.
	 */
	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			/* Copy so SYSCTL_OUT can run without the spin lock. */
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}
173192050Sjhb
174192050Sjhbstatic const char *
175192050Sjhbmca_error_ttype(uint16_t mca_error)
176192050Sjhb{
177192050Sjhb
178192050Sjhb	switch ((mca_error & 0x000c) >> 2) {
179192050Sjhb	case 0:
180192050Sjhb		return ("I");
181192050Sjhb	case 1:
182192050Sjhb		return ("D");
183192050Sjhb	case 2:
184192050Sjhb		return ("G");
185192050Sjhb	}
186192050Sjhb	return ("?");
187192050Sjhb}
188192050Sjhb
/*
 * Decode the memory hierarchy level field (bits 1:0) of a compound
 * MCA error code: L0, L1, L2, or generic (LG).
 */
static const char *
mca_error_level(uint16_t mca_error)
{
	static const char * const levels[] = { "L0", "L1", "L2", "LG" };

	return (levels[mca_error & 0x0003]);
}
205192050Sjhb
/*
 * Decode the request type field (bits 7:4) of a compound MCA error
 * code for cache and bus errors.
 */
static const char *
mca_error_request(uint16_t mca_error)
{
	static const char * const reqs[] = {
		"ERR", "RD", "WR", "DRD", "DWR", "IRD", "PREFETCH", "EVICT",
		"SNOOP"
	};
	unsigned int req;

	req = (mca_error & 0x00f0) >> 4;
	if (req >= sizeof(reqs) / sizeof(reqs[0]))
		return ("???");
	return (reqs[req]);
}
232192050Sjhb
/*
 * Decode the memory transaction type field (bits 6:4) of a compound
 * MCA error code for memory controller errors.
 */
static const char *
mca_error_mmtype(uint16_t mca_error)
{
	static const char * const mmtypes[] = { "GEN", "RD", "WR", "AC", "MS" };
	unsigned int mmtype;

	mmtype = (mca_error & 0x70) >> 4;
	if (mmtype >= sizeof(mmtypes) / sizeof(mmtypes[0]))
		return ("???");
	return (mmtypes[mmtype]);
}
251205214Sjhb
252269592Smariusstatic int __nonnull(1)
253269592Smariusmca_mute(const struct mca_record *rec)
254269592Smarius{
255269592Smarius
256269592Smarius	/*
257285174Smarius	 * Skip spurious corrected parity errors generated by Intel Haswell-
258285174Smarius	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
259285174Smarius	 * erratum respectively), unless reporting is enabled.
260285174Smarius	 * Note that these errors also have been observed with the D0-stepping
261285174Smarius	 * of Haswell, while at least initially the CPU specification updates
262285174Smarius	 * suggested only the C0-stepping to be affected.  Similarly, Celeron
263285174Smarius	 * 2955U with a CPU ID of 0x45 apparently are also concerned with the
264285174Smarius	 * same problem, with HSM142 only referring to 0x3c and 0x46.
265269592Smarius	 */
266285174Smarius	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
267285174Smarius	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
268285174Smarius	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
269285174Smarius	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
270285174Smarius	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
271285174Smarius	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
272285174Smarius	    rec->mr_bank == 0 &&
273285174Smarius	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
274285174Smarius	    !intel6h_HSD131)
275269592Smarius	    	return (1);
276269592Smarius
277269592Smarius	return (0);
278269592Smarius}
279269592Smarius
/*
 * Dump details about a single machine check to the console.  Callers
 * hold mca_lock so that output from multiple CPUs does not interleave.
 * The MCA error code field is decoded into the simple and compound
 * error classes defined by the x86 machine check architecture.
 */
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	/* Optionally suppress known-spurious corrected errors. */
	if (mca_mute(rec))
		return;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		/* Corrected-error count is only meaningful with CMCI. */
		if (rec->mr_mcg_cap & MCG_CAP_CMCI_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		/* Other model-specific internal errors. */
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			/* Channel 0xf means "unspecified". */
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			/* Participation: this CPU's role in the transaction. */
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}
419192050Sjhb
/*
 * Read the status of machine check bank 'bank' on the current CPU.
 * If the bank holds a valid event, capture it into 'rec' and return 1;
 * otherwise return 0.  Corrected errors are acknowledged by clearing
 * the bank's status MSR; uncorrected errors are left in place so the
 * BIOS can observe them.
 */
static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];	/* CPUID scratch output; value unused. */

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	/* ADDR/MISC are only defined when the matching valid bit is set. */
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		/* CPUID serializes execution after the status clear. */
		do_cpuid(0, p);
	}
	return (1);
}
457200033Savg
/*
 * Pre-allocate spare record structures so that consumers running in
 * exception/interrupt context (MC# and CMCI handlers) never need to
 * call malloc().  May sleep (M_WAITOK); call only from a sleepable
 * context.
 */
static void
mca_fill_freelist(void)
{
	struct mca_internal *rec;
	int desired;

	/*
	 * Ensure we have at least one record for each bank and one
	 * record per CPU.
	 */
	desired = imax(mp_ncpus, mca_banks);
	mtx_lock_spin(&mca_lock);
	while (mca_freecount < desired) {
		/* Drop the spin lock around the sleepable allocation. */
		mtx_unlock_spin(&mca_lock);
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
		mca_freecount++;
	}
	mtx_unlock_spin(&mca_lock);
}
479200033Savg
/*
 * Taskqueue handler: replenish the record free list from a sleepable
 * context after an interrupt-time consumer has drained entries.
 */
static void
mca_refill(void *context, int pending)
{

	mca_fill_freelist();
}
486233709Sjhb
/*
 * Append a copy of 'record' to the global event list.  In POLLED mode
 * we are sleepable and can malloc(); in MCE/CMCI context we must take
 * a pre-allocated entry from the free list instead.  If the free list
 * is empty, the event is logged to the console and dropped.
 */
static void __nonnull(2)
mca_record_entry(enum scan_mode mode, const struct mca_record *record)
{
	struct mca_internal *rec;

	if (mode == POLLED) {
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
	} else {
		mtx_lock_spin(&mca_lock);
		rec = STAILQ_FIRST(&mca_freelist);
		if (rec == NULL) {
			printf("MCA: Unable to allocate space for an event.\n");
			mca_log(record);
			mtx_unlock_spin(&mca_lock);
			return;
		}
		STAILQ_REMOVE_HEAD(&mca_freelist, link);
		mca_freecount--;
	}

	rec->rec = *record;
	rec->logged = 0;
	STAILQ_INSERT_TAIL(&mca_records, rec, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);
	/* Schedule a refill of the free list from a sleepable context. */
	if (mode == CMCI)
		taskqueue_enqueue_fast(mca_tq, &mca_refill_task);
}
516200033Savg
517208556Sjhb#ifdef DEV_APIC
518192050Sjhb/*
519208507Sjhb * Update the interrupt threshold for a CMCI.  The strategy is to use
520208507Sjhb * a low trigger that interrupts as soon as the first event occurs.
521208507Sjhb * However, if a steady stream of events arrive, the threshold is
522208507Sjhb * increased until the interrupts are throttled to once every
523208507Sjhb * cmc_throttle seconds or the periodic scan.  If a periodic scan
524208507Sjhb * finds that the threshold is too high, it is lowered.
525208507Sjhb */
526208507Sjhbstatic void
527208507Sjhbcmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
528208507Sjhb{
529208507Sjhb	struct cmc_state *cc;
530208507Sjhb	uint64_t ctl;
531208507Sjhb	u_int delta;
532208507Sjhb	int count, limit;
533208507Sjhb
534208507Sjhb	/* Fetch the current limit for this bank. */
535208507Sjhb	cc = &cmc_state[PCPU_GET(cpuid)][bank];
536208507Sjhb	ctl = rdmsr(MSR_MC_CTL2(bank));
537208507Sjhb	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
538208507Sjhb	delta = (u_int)(ticks - cc->last_intr);
539208507Sjhb
540208507Sjhb	/*
541208507Sjhb	 * If an interrupt was received less than cmc_throttle seconds
542208507Sjhb	 * since the previous interrupt and the count from the current
543208507Sjhb	 * event is greater than or equal to the current threshold,
544208507Sjhb	 * double the threshold up to the max.
545208507Sjhb	 */
546208507Sjhb	if (mode == CMCI && valid) {
547208507Sjhb		limit = ctl & MC_CTL2_THRESHOLD;
548208507Sjhb		if (delta < cmc_throttle && count >= limit &&
549208507Sjhb		    limit < cc->max_threshold) {
550208507Sjhb			limit = min(limit << 1, cc->max_threshold);
551208507Sjhb			ctl &= ~MC_CTL2_THRESHOLD;
552208507Sjhb			ctl |= limit;
553208507Sjhb			wrmsr(MSR_MC_CTL2(bank), limit);
554208507Sjhb		}
555208507Sjhb		cc->last_intr = ticks;
556208507Sjhb		return;
557208507Sjhb	}
558208507Sjhb
559208507Sjhb	/*
560208507Sjhb	 * When the banks are polled, check to see if the threshold
561208507Sjhb	 * should be lowered.
562208507Sjhb	 */
563208507Sjhb	if (mode != POLLED)
564208507Sjhb		return;
565208507Sjhb
566208507Sjhb	/* If a CMCI occured recently, do nothing for now. */
567208507Sjhb	if (delta < cmc_throttle)
568208507Sjhb		return;
569208507Sjhb
570208507Sjhb	/*
571208507Sjhb	 * Compute a new limit based on the average rate of events per
572208507Sjhb	 * cmc_throttle seconds since the last interrupt.
573208507Sjhb	 */
574208507Sjhb	if (valid) {
575208507Sjhb		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
576208507Sjhb		limit = count * cmc_throttle / delta;
577208507Sjhb		if (limit <= 0)
578208507Sjhb			limit = 1;
579208507Sjhb		else if (limit > cc->max_threshold)
580208507Sjhb			limit = cc->max_threshold;
581208507Sjhb	} else
582208507Sjhb		limit = 1;
583208507Sjhb	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
584208507Sjhb		ctl &= ~MC_CTL2_THRESHOLD;
585208507Sjhb		ctl |= limit;
586208507Sjhb		wrmsr(MSR_MC_CTL2(bank), limit);
587208507Sjhb	}
588208507Sjhb}
589208556Sjhb#endif
590208507Sjhb
/*
 * This scans all the machine check banks of the current CPU to see if
 * there are any machine checks.  Any non-recoverable errors are
 * reported immediately via mca_log().  The current thread must be
 * pinned when this is called.  The 'mode' parameter indicates if we
 * are being called from the MC exception handler, the CMCI handler,
 * or the periodic poller.  In the MC exception case this function
 * returns true if the system is restartable.  Otherwise, it returns a
 * count of the number of valid MC records found.
 */
static int
mca_scan(enum scan_mode mode)
{
	struct mca_record rec;
	uint64_t mcg_cap, ucmask;
	int count, i, recoverable, valid;

	count = 0;
	recoverable = 1;
	/* Uncorrected or processor-context-corrupt events are fatal. */
	ucmask = MC_STATUS_UC | MC_STATUS_PCC;

	/* When handling a MCE#, treat the OVER flag as non-restartable. */
	if (mode == MCE)
		ucmask |= MC_STATUS_OVER;
	mcg_cap = rdmsr(MSR_MCG_CAP);
	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
#ifdef DEV_APIC
		/*
		 * For a CMCI, only check banks this CPU is
		 * responsible for.
		 */
		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
			continue;
#endif

		valid = mca_check_status(i, &rec);
		if (valid) {
			count++;
			/* Report non-recoverable errors immediately. */
			if (rec.mr_status & ucmask) {
				recoverable = 0;
				mtx_lock_spin(&mca_lock);
				mca_log(&rec);
				mtx_unlock_spin(&mca_lock);
			}
			mca_record_entry(mode, &rec);
		}

#ifdef DEV_APIC
		/*
		 * If this is a bank this CPU monitors via CMCI,
		 * update the threshold.
		 */
		if (PCPU_GET(cmci_mask) & 1 << i)
			cmci_update(mode, i, valid, &rec);
#endif
	}
	/* Polling runs in a sleepable context; top up the free list. */
	if (mode == POLLED)
		mca_fill_freelist();
	return (mode == MCE ? recoverable : count);
}
651192050Sjhb
/*
 * Scan the machine check banks on all CPUs by binding to each CPU in
 * turn.  If any of the CPUs contained new machine check records, log
 * them to the console.  Runs as a taskqueue handler, so it may sleep.
 */
static void
mca_scan_cpus(void *context, int pending)
{
	struct mca_internal *mca;
	struct thread *td;
	int count, cpu;

	/* Sleepable context: replenish the free list up front. */
	mca_fill_freelist();
	td = curthread;
	count = 0;
	thread_lock(td);
	CPU_FOREACH(cpu) {
		/* Pin to 'cpu' so the per-CPU MSR reads target it. */
		sched_bind(td, cpu);
		thread_unlock(td);
		count += mca_scan(POLLED);
		thread_lock(td);
		sched_unbind(td);
	}
	thread_unlock(td);
	/* Print any records that have not been logged yet. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
687192050Sjhb
/*
 * Callout handler: defer the actual scan to the taskqueue (the scan
 * binds to each CPU and may sleep, which is not permitted in callout
 * context) and reschedule for mca_ticks seconds later.
 */
static void
mca_periodic_scan(void *arg)
{

	taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
695192050Sjhb
696192050Sjhbstatic int
697192050Sjhbsysctl_mca_scan(SYSCTL_HANDLER_ARGS)
698192050Sjhb{
699192050Sjhb	int error, i;
700192050Sjhb
701192050Sjhb	i = 0;
702192050Sjhb	error = sysctl_handle_int(oidp, &i, 0, req);
703192050Sjhb	if (error)
704192050Sjhb		return (error);
705192050Sjhb	if (i)
706233793Sjhb		taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
707192050Sjhb	return (0);
708192050Sjhb}
709192050Sjhb
/*
 * Create the fast taskqueue used for free-list refills and deferred
 * scans.  Skipped entirely when no MC banks were found at setup time.
 */
static void
mca_createtq(void *dummy)
{
	if (mca_banks <= 0)
		return;

	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
	    taskqueue_thread_enqueue, &mca_tq);
	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
}
SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
721233793Sjhb
/*
 * Kick off the periodic scan once all CPUs are up.  The callout
 * itself was initialized in mca_setup().
 */
static void
mca_startup(void *dummy)
{

	if (mca_banks <= 0)
		return;

	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
732192050Sjhb
733208556Sjhb#ifdef DEV_APIC
/*
 * Allocate the per-CPU arrays of per-bank CMCI throttling state and
 * export the throttle-interval sysctl.  Called from mca_setup() only
 * when MCG_CAP advertises CMCI support.
 */
static void
cmci_setup(void)
{
	int i;

	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
	    M_WAITOK);
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
		    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
749208556Sjhb#endif
750208507Sjhb
/*
 * One-time global MCA initialization, run on the BSP during boot:
 * record the bank count, initialize the record lists, spin lock,
 * poll timer and task handlers, pre-fill the record free list, and
 * create the hw.mca sysctl tree.
 */
static void
mca_setup(uint64_t mcg_cap)
{

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	mca_banks = mcg_cap & MCG_CAP_COUNT;
	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
	callout_init(&mca_timer, CALLOUT_MPSAFE);
	STAILQ_INIT(&mca_freelist);
	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
	mca_fill_freelist();
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
	    "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup();
#endif
}
789192050Sjhb
790208556Sjhb#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/*
	 * Determine the maximum threshold: write the largest possible
	 * value and read back what the hardware actually latched.
	 */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
837209212Sjhb
/*
 * For resume, reset the threshold for any banks we monitor back to
 * one and throw away the timestamp of the last interrupt.
 *
 * Called for each bank index 'i' from _mca_init() when boot == 0
 * (i.e. on resume from suspend).
 */
static void
cmci_resume(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];
	/*
	 * Setting last_intr to -ticks makes the last interrupt appear
	 * far in the past so threshold management starts fresh.
	 */
	cc->last_intr = -ticks;
	/* Re-enable CMCI with a threshold of one event. */
	ctl = rdmsr(MSR_MC_CTL2(i));
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
}
861208556Sjhb#endif
862208507Sjhb
/*
 * Initializes per-CPU machine check registers and enables corrected
 * machine check interrupts.
 *
 * 'boot' is non-zero on the initial boot-time pass and zero on resume;
 * one-time setup (mca_setup(), cmci_monitor(), lapic_enable_cmc()) is
 * only performed when boot is set.  Runs on every CPU.
 */
static void
_mca_init(int boot)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int i, skip;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	if (cpu_feature & CPUID_MCA) {
		if (boot)
			PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		/* Global (not per-CPU) setup happens once, on the BSP. */
		if (PCPU_GET(cpuid) == 0 && boot)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}
		/* Low bits of MCG_CAP hold the number of MC banks. */
		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			/* Claim (boot) or re-arm (resume) CMCI per bank. */
			if (mcg_cap & MCG_CAP_CMCI_P) {
				if (boot)
					cmci_monitor(i);
				else
					cmci_resume(i);
			}
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}

#ifdef DEV_APIC
		/* Enable the CMCI LAPIC vector if we own any bank. */
		if (PCPU_GET(cmci_mask) != 0 && boot)
			lapic_enable_cmc();
#endif
	}

	/* Finally, turn on machine-check exceptions for this CPU. */
	load_cr4(rcr4() | CR4_MCE);
}
945192050Sjhb
/*
 * Boot-time entry point: run the full machine-check initialization
 * (including one-time setup) on the calling CPU.  Must be executed on
 * each CPU during boot.
 */
void
mca_init(void)
{
	_mca_init(1);
}
953209212Sjhb
/*
 * Resume entry point: re-program the machine-check registers on the
 * calling CPU without redoing one-time boot setup.  Must be executed
 * on each CPU during resume.
 */
void
mca_resume(void)
{
	_mca_init(0);
}
961209212Sjhb
/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
/* Run after the local APIC is up (SI_SUB_CPU, SI_ORDER_ANY > SECOND). */
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
974208621Sjhb
/*
 * Called when a machine check exception fires.  Scans the MC banks,
 * panics if the error is not recoverable, and otherwise clears MCIP so
 * further machine checks can be delivered.
 */
void
mca_intr(void)
{
	uint64_t mcg_status;
	int old_count, recoverable;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx  Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		panic("Machine check");
	}

	/* Scan the banks and check for any non-recoverable errors. */
	old_count = mca_count;
	recoverable = mca_scan(MCE);
	mcg_status = rdmsr(MSR_MCG_STATUS);
	/* Without RIPV the interrupted context cannot be resumed. */
	if (!(mcg_status & MCG_STATUS_RIPV))
		recoverable = 0;

	if (!recoverable) {
		/*
		 * Wait for at least one error to be logged before
		 * panic'ing.  Some errors will assert a machine check
		 * on all CPUs, but only certain CPUs will find a valid
		 * bank to log.
		 */
		while (mca_count == old_count)
			cpu_spinwait();

		panic("Unrecoverable machine check exception");
	}

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
}
1016208507Sjhb
1017208556Sjhb#ifdef DEV_APIC
/*
 * Called for a CMCI (correctable machine check interrupt).  Scans the
 * banks this CPU monitors and logs any newly found records.
 */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 *
	 * NOTE(review): no lock is taken here around mca_scan();
	 * presumably the serialization happens inside mca_scan() or via
	 * the per-bank CMCI_EN ownership set up in cmci_monitor() —
	 * confirm before relying on this comment.
	 */
	count = mca_scan(CMCI);

	/* If we found anything, log them to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		/* Log every record not yet reported, under mca_lock. */
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
1043208556Sjhb#endif
1044