/*-
 * Copyright (c) 2009 Hudson River Trading LLC
 * Written by: John H. Baldwin <jhb@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for x86 machine check architecture.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mca.c 333159 2018-05-02 07:38:38Z kib $");

#ifdef __amd64__
#define	DEV_APIC
#else
#include "opt_apic.h"
#endif

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/* Modes for mca_scan() */
enum scan_mode {
	POLLED,
	MCE,
	CMCI,
};

#ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;
	time_t	last_intr;
};

struct amd_et_state {
	int	cur_threshold;
	time_t	last_intr;
};
#endif

struct mca_internal {
	struct mca_record rec;
	int		logged;
	STAILQ_ENTRY(mca_internal) link;
};

static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static volatile int mca_count;	/* Number of records stored. */
static int mca_banks;		/* Number of per-CPU register banks. */

static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
    "Machine Check Architecture");

static int mca_enabled = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

static int amd10h_L1TP = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

static int intel6h_HSD131;
SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
    "Administrative toggle for logging of spurious corrected errors");

int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
    &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

static STAILQ_HEAD(, mca_internal) mca_freelist;
static int mca_freecount;
static STAILQ_HEAD(, mca_internal) mca_records;
static struct callout mca_timer;
static int mca_ticks = 3600;	/* Check hourly by default. */
static struct taskqueue *mca_tq;
static struct task mca_refill_task, mca_scan_task;
static struct mtx mca_lock;

#ifdef DEV_APIC
static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
static struct amd_et_state *amd_et_state;	/* Indexed by cpuid. */
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */

static int amd_elvt = -1;

static inline bool
amd_thresholding_supported(void)
{
	return (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
}
#endif

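/* Sysctl handler that accepts only strictly positive integer values. */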
static int
sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
	int error, value;

	value = *(int *)arg1;
	error = sysctl_handle_int(oidp, &value, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if (value <= 0)
		return (EINVAL);
	*(int *)arg1 = value;
	return (0);
}

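/* Copy the Nth machine check record out to userland. */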
static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	u_int namelen = arg2;
	struct mca_record record;
	struct mca_internal *rec;
	int i;

	if (namelen != 1)
		return (EINVAL);

	if (name[0] < 0 || name[0] >= mca_count)
		return (EINVAL);

	mtx_lock_spin(&mca_lock);
	if (name[0] >= mca_count) {
		mtx_unlock_spin(&mca_lock);
		return (EINVAL);
	}
	i = 0;
	STAILQ_FOREACH(rec, &mca_records, link) {
		if (i == name[0]) {
			record = rec->rec;
			break;
		}
		i++;
	}
	mtx_unlock_spin(&mca_lock);
	return (SYSCTL_OUT(req, &record, sizeof(record)));
}

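/* Decode the transaction type (TT) field of a compound MCA error code. */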
static const char *
mca_error_ttype(uint16_t mca_error)
{

	switch ((mca_error & 0x000c) >> 2) {
	case 0:
		return ("I");
	case 1:
		return ("D");
	case 2:
		return ("G");
	}
	return ("?");
}

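/* Decode the memory hierarchy level (LL) field of a compound MCA error code. */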
static const char *
mca_error_level(uint16_t mca_error)
{

	switch (mca_error & 0x0003) {
	case 0:
		return ("L0");
	case 1:
		return ("L1");
	case 2:
		return ("L2");
	case 3:
		return ("LG");
	}
	return ("L?");
}

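/* Decode the request type (RRRR) field of a compound MCA error code. */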
static const char *
mca_error_request(uint16_t mca_error)
{

	switch ((mca_error & 0x00f0) >> 4) {
	case 0x0:
		return ("ERR");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("DRD");
	case 0x4:
		return ("DWR");
	case 0x5:
		return ("IRD");
	case 0x6:
		return ("PREFETCH");
	case 0x7:
		return ("EVICT");
	case 0x8:
		return ("SNOOP");
	}
	return ("???");
}

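/* Decode the memory transaction type (MMM) field of a memory controller error. */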
static const char *
mca_error_mmtype(uint16_t mca_error)
{

	switch ((mca_error & 0x70) >> 4) {
	case 0x0:
		return ("GEN");
	case 0x1:
		return ("RD");
	case 0x2:
		return ("WR");
	case 0x3:
		return ("AC");
	case 0x4:
		return ("MS");
	}
	return ("???");
}

static int
mca_mute(const struct mca_record *rec)
{

	/*
	 * Skip spurious corrected parity errors generated by Intel Haswell-
	 * and Broadwell-based CPUs (see the HSD131, HSM142, HSW131 and BDM48
	 * errata, respectively), unless reporting is enabled.
	 * Note that these errors have also been observed with the D0-stepping
	 * of Haswell, while at least initially the CPU specification updates
	 * suggested only the C0-stepping to be affected.  Similarly, the
	 * Celeron 2955U with a CPU ID of 0x45 apparently is also affected by
	 * the same problem, even though HSM142 refers only to 0x3c and 0x46.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
	    rec->mr_bank == 0 &&
	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
	    !intel6h_HSD131)
	    	return (1);

	return (0);
}

/* Dump details about a single machine check. */
static void
mca_log(const struct mca_record *rec)
{
	uint16_t mca_error;

	if (mca_mute(rec))
	    	return;

	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
	    rec->mr_cpu_id, rec->mr_apic_id);
	printf("MCA: CPU %d ", rec->mr_cpu);
	if (rec->mr_status & MC_STATUS_UC)
		printf("UNCOR ");
	else {
		printf("COR ");
		if (rec->mr_mcg_cap & MCG_CAP_CMCI_P)
			printf("(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
	}
	if (rec->mr_status & MC_STATUS_PCC)
		printf("PCC ");
	if (rec->mr_status & MC_STATUS_OVER)
		printf("OVER ");
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		printf("no error");
		break;
	case 0x0001:
		printf("unclassified error");
		break;
	case 0x0002:
		printf("ucode ROM parity error");
		break;
	case 0x0003:
		printf("external error");
		break;
	case 0x0004:
		printf("FRC error");
		break;
	case 0x0005:
		printf("internal parity error");
		break;
	case 0x0400:
		printf("internal timer error");
		break;
	default:
		if ((mca_error & 0xfc00) == 0x0400) {
			printf("internal error %x", mca_error & 0x03ff);
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			printf("%s memory error", mca_error_level(mca_error));
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			printf("%sTLB %s error", mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			printf("%s channel ", mca_error_mmtype(mca_error));
			if ((mca_error & 0x000f) != 0x000f)
				printf("%d", mca_error & 0x000f);
			else
				printf("??");
			printf(" memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			printf("%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			printf("BUS%s ", mca_error_level(mca_error));
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				printf("Source");
				break;
			case 1:
				printf("Responder");
				break;
			case 2:
				printf("Observer");
				break;
			default:
				printf("???");
				break;
			}
			printf(" %s ", mca_error_request(mca_error));
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				printf("Memory");
				break;
			case 2:
				printf("I/O");
				break;
			case 3:
				printf("Other");
				break;
			default:
				printf("???");
				break;
			}
			if (mca_error & 0x0100)
				printf(" timed out");
			break;
		}

		printf("unknown error %x", mca_error);
		break;
	}
	printf("\n");
	if (rec->mr_status & MC_STATUS_ADDRV)
		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
	if (rec->mr_status & MC_STATUS_MISCV)
		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
}

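/*
 * Check a single machine check bank for a valid event.  If one is
 * present, save the relevant MSRs in *rec and clear the bank, unless
 * the error is uncorrectable.  Returns 1 if a valid event was found.
 */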
static int
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t status;
	u_int p[4];

	status = rdmsr(MSR_MC_STATUS(bank));
	if (!(status & MC_STATUS_VAL))
		return (0);

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		do_cpuid(0, p);
	}
	return (1);
}

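/*
 * Preallocate free records so that the machine check and CMCI handlers,
 * which cannot call malloc(), always have an entry available.
 */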
static void
mca_fill_freelist(void)
{
	struct mca_internal *rec;
	int desired;

	/*
	 * Ensure we have at least one record for each bank and one
	 * record per CPU.
	 */
	desired = imax(mp_ncpus, mca_banks);
	mtx_lock_spin(&mca_lock);
	while (mca_freecount < desired) {
		mtx_unlock_spin(&mca_lock);
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
		mca_freecount++;
	}
	mtx_unlock_spin(&mca_lock);
}

static void
mca_refill(void *context, int pending)
{

	mca_fill_freelist();
}

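/*
 * Store a copy of a machine check record on the global list.  In the
 * polled case new memory may be allocated; the MC exception and CMCI
 * handlers must instead take an entry from the preallocated free list.
 */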
static void
mca_record_entry(enum scan_mode mode, const struct mca_record *record)
{
	struct mca_internal *rec;

	if (mode == POLLED) {
		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
		mtx_lock_spin(&mca_lock);
	} else {
		mtx_lock_spin(&mca_lock);
		rec = STAILQ_FIRST(&mca_freelist);
		if (rec == NULL) {
			printf("MCA: Unable to allocate space for an event.\n");
			mca_log(record);
			mtx_unlock_spin(&mca_lock);
			return;
		}
		STAILQ_REMOVE_HEAD(&mca_freelist, link);
		mca_freecount--;
	}

	rec->rec = *record;
	rec->logged = 0;
	STAILQ_INSERT_TAIL(&mca_records, rec, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);
	if (mode == CMCI && !cold)
		taskqueue_enqueue(mca_tq, &mca_refill_task);
}

#ifdef DEV_APIC
/*
 * Update the interrupt threshold for a CMCI.  The strategy is to use
 * a low trigger that interrupts as soon as the first event occurs.
 * However, if a steady stream of events arrive, the threshold is
 * increased until the interrupts are throttled to once every
 * cmc_throttle seconds or the periodic scan.  If a periodic scan
 * finds that the threshold is too high, it is lowered.
 */
static int
update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
    int cur_threshold, int max_threshold)
{
	u_int delta;
	int limit;

	delta = (u_int)(time_uptime - last_intr);
	limit = cur_threshold;

	/*
	 * If an interrupt was received less than cmc_throttle seconds
	 * since the previous interrupt and the count from the current
	 * event is greater than or equal to the current threshold,
	 * double the threshold up to the max.
	 */
	if (mode == CMCI && valid) {
		if (delta < cmc_throttle && count >= limit &&
		    limit < max_threshold) {
			limit = min(limit << 1, max_threshold);
		}
		return (limit);
	}

	/*
	 * When the banks are polled, check to see if the threshold
	 * should be lowered.
	 */
	if (mode != POLLED)
		return (limit);

	/* If a CMCI occurred recently, do nothing for now. */
	if (delta < cmc_throttle)
		return (limit);

	/*
	 * Compute a new limit based on the average rate of events per
	 * cmc_throttle seconds since the last interrupt.
	 */
	if (valid) {
		limit = count * cmc_throttle / delta;
		if (limit <= 0)
			limit = 1;
		else if (limit > max_threshold)
			limit = max_threshold;
	} else {
		limit = 1;
	}
	return (limit);
}

static void
cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
{
	struct cmc_state *cc;
	uint64_t ctl;
	int cur_threshold, new_threshold;
	int count;

	/* Fetch the current limit for this bank. */
	cc = &cmc_state[PCPU_GET(cpuid)][bank];
	ctl = rdmsr(MSR_MC_CTL2(bank));
	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
	cur_threshold = ctl & MC_CTL2_THRESHOLD;

	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
	    cur_threshold, cc->max_threshold);

	if (mode == CMCI && valid)
		cc->last_intr = time_uptime;
	if (new_threshold != cur_threshold) {
		ctl &= ~MC_CTL2_THRESHOLD;
		ctl |= new_threshold;
		wrmsr(MSR_MC_CTL2(bank), ctl);
	}
}

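/*
 * Adjust the error counter threshold for the AMD northbridge bank,
 * mirroring what cmci_update() does for Intel CMCI-capable banks.
 */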
static void
amd_thresholding_update(enum scan_mode mode, int bank, int valid)
{
	struct amd_et_state *cc;
	uint64_t misc;
	int new_threshold;
	int count;

	KASSERT(bank == MC_AMDNB_BANK,
	    ("%s: unexpected bank %d", __func__, bank));
	cc = &amd_et_state[PCPU_GET(cpuid)];
	misc = rdmsr(MSR_MC_MISC(bank));
	count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
	count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);

	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
	    cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);

	cc->cur_threshold = new_threshold;
	misc &= ~MC_MISC_AMDNB_CNT_MASK;
	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
	    << MC_MISC_AMDNB_CNT_SHIFT;
	misc &= ~MC_MISC_AMDNB_OVERFLOW;
	wrmsr(MSR_MC_MISC(bank), misc);
	if (mode == CMCI && valid)
		cc->last_intr = time_uptime;
}
#endif

/*
 * This scans all the machine check banks of the current CPU to see if
 * there are any machine checks.  Any non-recoverable errors are
 * reported immediately via mca_log().  The current thread must be
 * pinned when this is called.  The 'mode' parameter indicates if we
 * are being called from the MC exception handler, the CMCI handler,
 * or the periodic poller.  It returns the number of valid MC records
 * found; if 'recoverablep' is not NULL, it is also used to report
 * whether the system is restartable.
 */
static int
mca_scan(enum scan_mode mode, int *recoverablep)
{
	struct mca_record rec;
	uint64_t mcg_cap, ucmask;
	int count, i, recoverable, valid;

	count = 0;
	recoverable = 1;
	ucmask = MC_STATUS_UC | MC_STATUS_PCC;

	/* When handling a MCE#, treat the OVER flag as non-restartable. */
	if (mode == MCE)
		ucmask |= MC_STATUS_OVER;
	mcg_cap = rdmsr(MSR_MCG_CAP);
	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
#ifdef DEV_APIC
		/*
		 * For a CMCI, only check banks this CPU is
		 * responsible for.
		 */
		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
			continue;
#endif

		valid = mca_check_status(i, &rec);
		if (valid) {
			count++;
			if (rec.mr_status & ucmask) {
				recoverable = 0;
				mtx_lock_spin(&mca_lock);
				mca_log(&rec);
				mtx_unlock_spin(&mca_lock);
			}
			mca_record_entry(mode, &rec);
		}

#ifdef DEV_APIC
		/*
		 * If this is a bank this CPU monitors via CMCI,
		 * update the threshold.
		 */
		if (PCPU_GET(cmci_mask) & 1 << i) {
			if (cmc_state != NULL)
				cmci_update(mode, i, valid, &rec);
			else
				amd_thresholding_update(mode, i, valid);
		}
#endif
	}
	if (mode == POLLED)
		mca_fill_freelist();
	if (recoverablep != NULL)
		*recoverablep = recoverable;
	return (count);
}

/*
 * Scan the machine check banks on all CPUs by binding to each CPU in
 * turn.  If any of the CPUs contained new machine check records, log
 * them to the console.
 */
static void
mca_scan_cpus(void *context, int pending)
{
	struct mca_internal *mca;
	struct thread *td;
	int count, cpu;

	mca_fill_freelist();
	td = curthread;
	count = 0;
	thread_lock(td);
	CPU_FOREACH(cpu) {
		sched_bind(td, cpu);
		thread_unlock(td);
		count += mca_scan(POLLED, NULL);
		thread_lock(td);
		sched_unbind(td);
	}
	thread_unlock(td);
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}

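/* Periodic timer hook: queue a scan of all banks and rearm the callout. */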
static void
mca_periodic_scan(void *arg)
{

	taskqueue_enqueue(mca_tq, &mca_scan_task);
	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}

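/* Sysctl handler to force an immediate scan for machine checks. */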
static int
sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		taskqueue_enqueue(mca_tq, &mca_scan_task);
	return (0);
}

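/* Create the taskqueue used for periodic scans and freelist refills. */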
static void
mca_createtq(void *dummy)
{
	if (mca_banks <= 0)
		return;

	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
	    taskqueue_thread_enqueue, &mca_tq);
	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");

	/* CMCIs during boot may have claimed items from the freelist. */
	mca_fill_freelist();
}
SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);

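/* Arm the periodic scan timer once the scheduler is up and running. */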
static void
mca_startup(void *dummy)
{

	if (mca_banks <= 0)
		return;

	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
#ifdef EARLY_AP_STARTUP
SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
#else
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
#endif

#ifdef DEV_APIC
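/* Allocate per-CPU CMCI state and export the throttling interval sysctl. */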
static void
cmci_setup(void)
{
	int i;

	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
	    M_WAITOK);
	for (i = 0; i <= mp_maxid; i++)
		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
		    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}

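/* Allocate per-CPU AMD thresholding state and export the throttling sysctl. */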
static void
amd_thresholding_setup(void)
{

	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
	    M_MCA, M_WAITOK | M_ZERO);
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &cmc_throttle, 0, sysctl_positive_int, "I",
	    "Interval in seconds to throttle corrected MC interrupts");
}
#endif

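/*
 * One-time initialization of global MCA state, performed on the BSP
 * during boot.
 */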
static void
mca_setup(uint64_t mcg_cap)
{

	/*
	 * On AMD Family 10h processors, unless logging of level one TLB
	 * parity (L1TP) errors is disabled, enable the recommended workaround
	 * for Erratum 383.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
		workaround_erratum383 = 1;

	mca_banks = mcg_cap & MCG_CAP_COUNT;
	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
	STAILQ_INIT(&mca_records);
	TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
	callout_init(&mca_timer, 1);
	STAILQ_INIT(&mca_freelist);
	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
	mca_fill_freelist();
	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
	    "Record count");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
	    0, sysctl_positive_int, "I",
	    "Periodic interval in seconds to scan for machine checks");
	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
#ifdef DEV_APIC
	if (mcg_cap & MCG_CAP_CMCI_P)
		cmci_setup();
	else if (amd_thresholding_supported())
		amd_thresholding_setup();
#endif
}

#ifdef DEV_APIC
/*
 * See if we should monitor CMCI for this bank.  If CMCI_EN is already
 * set in MC_CTL2, then another CPU is responsible for this bank, so
 * ignore it.  If CMCI_EN returns zero after being set, then this bank
 * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
 * now monitor this bank.
 */
static void
cmci_monitor(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	ctl = rdmsr(MSR_MC_CTL2(i));
	if (ctl & MC_CTL2_CMCI_EN)
		/* Already monitored by another CPU. */
		return;

	/* Set the threshold to one event for now. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	if (!(ctl & MC_CTL2_CMCI_EN))
		/* This bank does not support CMCI. */
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];

	/* Determine maximum threshold. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 0x7fff;
	wrmsr(MSR_MC_CTL2(i), ctl);
	ctl = rdmsr(MSR_MC_CTL2(i));
	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;

	/* Start off with a threshold of 1. */
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= 1;
	wrmsr(MSR_MC_CTL2(i), ctl);

	/* Mark this bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}

/*
 * For resume, reset the threshold for any banks we monitor back to
 * one and throw away the timestamp of the last interrupt.
 */
static void
cmci_resume(int i)
{
	struct cmc_state *cc;
	uint64_t ctl;

	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));

	/* Ignore banks not monitored by this CPU. */
	if (!(PCPU_GET(cmci_mask) & 1 << i))
		return;

	cc = &cmc_state[PCPU_GET(cpuid)][i];
	cc->last_intr = 0;
	ctl = rdmsr(MSR_MC_CTL2(i));
	ctl &= ~MC_CTL2_THRESHOLD;
	ctl |= MC_CTL2_CMCI_EN | 1;
	wrmsr(MSR_MC_CTL2(i), ctl);
}

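/*
 * Program the northbridge MISC register: route counter overflows to the
 * configured extended LVT entry and set the counter so that it overflows
 * after cur_threshold additional errors.
 */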
static void
amd_thresholding_start(struct amd_et_state *cc)
{
	uint64_t misc;

	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
	misc &= ~MC_MISC_AMDNB_INT_MASK;
	misc |= MC_MISC_AMDNB_INT_LVT;
	misc &= ~MC_MISC_AMDNB_LVT_MASK;
	misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
	misc &= ~MC_MISC_AMDNB_CNT_MASK;
	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
	    << MC_MISC_AMDNB_CNT_SHIFT;
	misc &= ~MC_MISC_AMDNB_OVERFLOW;
	misc |= MC_MISC_AMDNB_CNTEN;

	wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
}

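/*
 * Claim the northbridge threshold counter for this CPU if it is present,
 * unlocked, and not already in use, and route its overflow interrupt
 * through an extended LVT entry.
 */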
static void
amd_thresholding_init(void)
{
	struct amd_et_state *cc;
	uint64_t misc;

	/* The counter must be valid and present. */
	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
	if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
	    (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
		return;

	/* The register should not be locked. */
	if ((misc & MC_MISC_AMDNB_LOCK) != 0)
		return;

	/*
	 * If counter is enabled then either the firmware or another CPU
	 * has already claimed it.
	 */
	if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
		return;

	/*
	 * Configure an Extended Interrupt LVT register for reporting
	 * counter overflows if that feature is supported and the first
	 * extended register is available.
	 */
	amd_elvt = lapic_enable_mca_elvt();
	if (amd_elvt < 0)
		return;

	/* Re-use Intel CMC support infrastructure. */
	cc = &amd_et_state[PCPU_GET(cpuid)];
	cc->cur_threshold = 1;
	amd_thresholding_start(cc);

	/* Mark the NB bank as monitored. */
	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
}

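/*
 * For resume, reset the NB counter threshold back to one and discard the
 * timestamp of the last interrupt, as is done for CMCI-monitored banks.
 */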
static void
amd_thresholding_resume(void)
{
	struct amd_et_state *cc;

	/* Nothing to do if this CPU doesn't monitor the NB bank. */
	if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
		return;

	cc = &amd_et_state[PCPU_GET(cpuid)];
	cc->last_intr = 0;
	cc->cur_threshold = 1;
	amd_thresholding_start(cc);
}
#endif

/*
 * Initializes per-CPU machine check registers and enables corrected
 * machine check interrupts.
 */
static void
_mca_init(int boot)
{
	uint64_t mcg_cap;
	uint64_t ctl, mask;
	int i, skip;

	/* MCE is required. */
	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
		return;

	if (cpu_feature & CPUID_MCA) {
		if (boot)
			PCPU_SET(cmci_mask, 0);

		mcg_cap = rdmsr(MSR_MCG_CAP);
		if (mcg_cap & MCG_CAP_CTL_P)
			/* Enable MCA features. */
			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
		if (IS_BSP() && boot)
			mca_setup(mcg_cap);

		/*
		 * Disable logging of level one TLB parity (L1TP) errors by
		 * the data cache as an alternative workaround for AMD Family
		 * 10h Erratum 383.  Unlike the recommended workaround, there
		 * is no performance penalty to this workaround.  However,
		 * L1TP errors will go unreported.
		 */
		if (cpu_vendor_id == CPU_VENDOR_AMD &&
		    CPUID_TO_FAMILY(cpu_id) == 0x10 && !amd10h_L1TP) {
			mask = rdmsr(MSR_MC0_CTL_MASK);
			if ((mask & (1UL << 5)) == 0)
				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
		}

		/*
		 * cmci_monitor() must not be executed simultaneously
		 * by several CPUs.
		 */
		if (boot)
			mtx_lock_spin(&mca_lock);

		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
			/* By default enable logging of all errors. */
			ctl = 0xffffffffffffffffUL;
			skip = 0;

			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
				/*
				 * For P6 models before Nehalem MC0_CTL is
				 * always enabled and reserved.
				 */
				if (i == 0 && CPUID_TO_FAMILY(cpu_id) == 0x6
				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
					skip = 1;
			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
				/* BKDG for Family 10h: unset GartTblWkEn. */
				if (i == 4 && CPUID_TO_FAMILY(cpu_id) >= 0xf)
					ctl &= ~(1UL << 10);
			}

			if (!skip)
				wrmsr(MSR_MC_CTL(i), ctl);

#ifdef DEV_APIC
			if (mcg_cap & MCG_CAP_CMCI_P) {
				if (boot)
					cmci_monitor(i);
				else
					cmci_resume(i);
			}
#endif

			/* Clear all errors. */
			wrmsr(MSR_MC_STATUS(i), 0);
		}
		if (boot)
			mtx_unlock_spin(&mca_lock);

#ifdef DEV_APIC
		/*
		 * AMD processors from families 10h through 16h provide
		 * support for Machine Check Error Thresholding.  These
		 * processors maintain counters of MC errors that can be
		 * configured to generate an interrupt when a counter
		 * overflows.  The counters are all associated with bank 4,
		 * and each of them covers a group of errors reported via
		 * that bank.  At the moment only the DRAM Error Threshold
		 * Group is supported.
		 */
		if (amd_thresholding_supported() &&
		    (mcg_cap & MCG_CAP_COUNT) >= 4) {
			if (boot)
				amd_thresholding_init();
			else
				amd_thresholding_resume();
		} else if (PCPU_GET(cmci_mask) != 0 && boot) {
			lapic_enable_cmc();
		}
#endif
	}

	load_cr4(rcr4() | CR4_MCE);
}

/* Must be executed on each CPU during boot. */
void
mca_init(void)
{

	_mca_init(1);
}

/* Must be executed on each CPU during resume. */
void
mca_resume(void)
{

	_mca_init(0);
}

/*
 * The machine check registers for the BSP cannot be initialized until
 * the local APIC is initialized.  This happens at SI_SUB_CPU,
 * SI_ORDER_SECOND.
 */
static void
mca_init_bsp(void *arg __unused)
{

	mca_init();
}
SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);

/* Called when a machine check exception fires. */
void
mca_intr(void)
{
	uint64_t mcg_status;
	int recoverable, count;

	if (!(cpu_feature & CPUID_MCA)) {
		/*
		 * Just print the values of the old Pentium registers
		 * and panic.
		 */
		printf("MC Type: 0x%jx  Address: 0x%jx\n",
		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
		panic("Machine check");
	}

	/* Scan the banks and check for any non-recoverable errors. */
	count = mca_scan(MCE, &recoverable);
	mcg_status = rdmsr(MSR_MCG_STATUS);
	if (!(mcg_status & MCG_STATUS_RIPV))
		recoverable = 0;

	if (!recoverable) {
		/*
		 * Only panic if the error was detected local to this CPU.
		 * Some errors will assert a machine check on all CPUs, but
		 * only certain CPUs will find a valid bank to log.
		 */
		while (count == 0)
			cpu_spinwait();

		panic("Unrecoverable machine check exception");
	}

	/* Clear MCIP. */
	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
}

#ifdef DEV_APIC
/* Called for a CMCI (correctable machine check interrupt). */
void
cmc_intr(void)
{
	struct mca_internal *mca;
	int count;

	/*
	 * Serialize MCA bank scanning to prevent collisions from
	 * sibling threads.
	 */
	count = mca_scan(CMCI, NULL);

	/* If we found anything, log them to the console. */
	if (count != 0) {
		mtx_lock_spin(&mca_lock);
		STAILQ_FOREACH(mca, &mca_records, link) {
			if (!mca->logged) {
				mca->logged = 1;
				mca_log(&mca->rec);
			}
		}
		mtx_unlock_spin(&mca_lock);
	}
}
#endif
