1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#include <sys/cdefs.h>
40#include "opt_acpi.h"
41#include "opt_atpic.h"
42#include "opt_cpu.h"
43#include "opt_ddb.h"
44#include "opt_inet.h"
45#include "opt_isa.h"
46#include "opt_kdb.h"
47#include "opt_kstack_pages.h"
48#include "opt_maxmem.h"
49#include "opt_platform.h"
50#include "opt_sched.h"
51#ifdef __i386__
52#include "opt_apic.h"
53#endif
54
55#include <sys/param.h>
56#include <sys/proc.h>
57#include <sys/systm.h>
58#include <sys/bus.h>
59#include <sys/cpu.h>
60#include <sys/domainset.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/ktr.h>
64#include <sys/lock.h>
65#include <sys/malloc.h>
66#include <sys/mutex.h>
67#include <sys/pcpu.h>
68#include <sys/rwlock.h>
69#include <sys/sched.h>
70#include <sys/smp.h>
71#include <sys/sysctl.h>
72
73#include <machine/clock.h>
74#include <machine/cpu.h>
75#include <machine/cpufunc.h>
76#include <machine/cputypes.h>
77#include <machine/specialreg.h>
78#include <machine/md_var.h>
79#include <machine/tss.h>
80#ifdef SMP
81#include <machine/smp.h>
82#endif
83#ifdef CPU_ELAN
84#include <machine/elan_mmcr.h>
85#endif
86#include <x86/acpica_machdep.h>
87#include <x86/ifunc.h>
88
89#include <vm/vm.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_page.h>
93#include <vm/vm_map.h>
94#include <vm/vm_object.h>
95#include <vm/vm_pager.h>
96#include <vm/vm_param.h>
97
98#include <isa/isareg.h>
99
100#include <contrib/dev/acpica/include/acpi.h>
101
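/*
 * Per-CPU idle state, stored in the monitor buffer that the idle
 * loops arm with MONITOR/MWAIT: STATE_RUNNING means the CPU is not
 * idle, STATE_MWAIT means it is parked in mwait and a store to the
 * state word is enough to wake it, and STATE_SLEEPING means it is
 * halted (hlt/ACPI) and needs an interrupt to wake up.
 */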
102#define	STATE_RUNNING	0x0
103#define	STATE_MWAIT	0x1
104#define	STATE_SLEEPING	0x2
105
106#ifdef SMP
107static u_int	cpu_reset_proxyid;
108static volatile u_int	cpu_reset_proxy_active;
109#endif
110
111char bootmethod[16];
112SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
113    "System firmware boot method");
114
115struct msr_op_arg {
116	u_int msr;
117	int op;
118	uint64_t arg1;
119	uint64_t *res;
120};
121
122static void
123x86_msr_op_one(void *argp)
124{
125	struct msr_op_arg *a;
126	uint64_t v;
127
128	a = argp;
129	switch (a->op) {
130	case MSR_OP_ANDNOT:
131		v = rdmsr(a->msr);
132		v &= ~a->arg1;
133		wrmsr(a->msr, v);
134		break;
135	case MSR_OP_OR:
136		v = rdmsr(a->msr);
137		v |= a->arg1;
138		wrmsr(a->msr, v);
139		break;
140	case MSR_OP_WRITE:
141		wrmsr(a->msr, a->arg1);
142		break;
143	case MSR_OP_READ:
144		v = rdmsr(a->msr);
145		*a->res = v;
146		break;
147	}
148}
149
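/*
 * Layout of the 'op' word passed to x86_msr_op(): the low byte holds
 * the operation (MSR_OP_ANDNOT/OR/WRITE/READ), the top nibble holds
 * the execution mode (local, sched-bound or rendezvous, one CPU or
 * all), and the bits in between carry the target CPU id for the
 * single-CPU modes.
 */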
150#define	MSR_OP_EXMODE_MASK	0xf0000000
151#define	MSR_OP_OP_MASK		0x000000ff
152#define	MSR_OP_GET_CPUID(x)	(((x) & ~MSR_OP_EXMODE_MASK) >> 8)
153
154void
155x86_msr_op(u_int msr, u_int op, uint64_t arg1, uint64_t *res)
156{
157	struct thread *td;
158	struct msr_op_arg a;
159	cpuset_t set;
160	u_int exmode;
161	int bound_cpu, cpu, i, is_bound;
162
163	a.op = op & MSR_OP_OP_MASK;
164	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
165	    a.op == MSR_OP_WRITE || a.op == MSR_OP_READ);
166	exmode = op & MSR_OP_EXMODE_MASK;
167	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED_ALL ||
168	    exmode == MSR_OP_SCHED_ONE || exmode == MSR_OP_RENDEZVOUS_ALL ||
169	    exmode == MSR_OP_RENDEZVOUS_ONE);
170	a.msr = msr;
171	a.arg1 = arg1;
172	a.res = res;
173	switch (exmode) {
174	case MSR_OP_LOCAL:
175		x86_msr_op_one(&a);
176		break;
177	case MSR_OP_SCHED_ALL:
178		td = curthread;
179		thread_lock(td);
180		is_bound = sched_is_bound(td);
181		bound_cpu = td->td_oncpu;
182		CPU_FOREACH(i) {
183			sched_bind(td, i);
184			x86_msr_op_one(&a);
185		}
186		if (is_bound)
187			sched_bind(td, bound_cpu);
188		else
189			sched_unbind(td);
190		thread_unlock(td);
191		break;
192	case MSR_OP_SCHED_ONE:
193		td = curthread;
194		cpu = MSR_OP_GET_CPUID(op);
195		thread_lock(td);
196		is_bound = sched_is_bound(td);
197		bound_cpu = td->td_oncpu;
198		if (!is_bound || bound_cpu != cpu)
199			sched_bind(td, cpu);
200		x86_msr_op_one(&a);
201		if (is_bound) {
202			if (bound_cpu != cpu)
203				sched_bind(td, bound_cpu);
204		} else {
205			sched_unbind(td);
206		}
207		thread_unlock(td);
208		break;
209	case MSR_OP_RENDEZVOUS_ALL:
210		smp_rendezvous(smp_no_rendezvous_barrier, x86_msr_op_one,
211		    smp_no_rendezvous_barrier, &a);
212		break;
213	case MSR_OP_RENDEZVOUS_ONE:
214		cpu = MSR_OP_GET_CPUID(op);
215		CPU_SETOF(cpu, &set);
216		smp_rendezvous_cpus(set, smp_no_rendezvous_barrier,
217		    x86_msr_op_one, smp_no_rendezvous_barrier, &a);
218		break;
219	}
220}
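
/*
 * The mitigation knobs below use this helper to update MSRs either
 * locally or on other CPUs.  For instance, hw_ssb_set() sets the SSBD
 * bit on every CPU with:
 *
 *	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_OR | MSR_OP_SCHED_ALL,
 *	    IA32_SPEC_CTRL_SSBD, NULL);
 *
 * and a single-CPU read could look like the following sketch, assuming
 * an MSR_OP_CPUID() encoding macro that mirrors MSR_OP_GET_CPUID():
 *
 *	uint64_t v;
 *
 *	x86_msr_op(MSR_IA32_SPEC_CTRL,
 *	    MSR_OP_READ | MSR_OP_RENDEZVOUS_ONE | MSR_OP_CPUID(2), 0, &v);
 */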
221
/*
 * Automatically set based on per-CPU errata in cpu_idle_tun() below.
 */
225bool mwait_cpustop_broken = false;
226SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
227    &mwait_cpustop_broken, 0,
228    "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
229
230/*
231 * Flush the D-cache for non-DMA I/O so that the I-cache can
232 * be made coherent later.
233 */
234void
235cpu_flush_dcache(void *ptr, size_t len)
236{
237	/* Not applicable */
238}
239
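/*
 * Enter C1 by halting with interrupts enabled.  The "sti; hlt" pair
 * must stay back to back: interrupts are only recognized after the
 * instruction that follows sti, so an interrupt that became pending
 * while interrupts were disabled wakes the hlt instead of being lost
 * in the window between the two instructions.
 */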
240void
241acpi_cpu_c1(void)
242{
243
244	__asm __volatile("sti; hlt");
245}
246
/*
 * Use mwait to pause execution while waiting for an interrupt or
 * another thread to signal that there is more work.
 *
 * NOTE: Interrupts will cause a wakeup; however, this function does
 * not enable interrupt handling.  The caller is responsible for
 * enabling interrupts.
 */
255void
256acpi_cpu_idle_mwait(uint32_t mwait_hint)
257{
258	int *state;
259	uint64_t v;
260
261	/*
	 * A comment in a Linux patch claims that 'CPUs run faster with
263	 * speculation protection disabled. All CPU threads in a core
264	 * must disable speculation protection for it to be
265	 * disabled. Disable it while we are idle so the other
266	 * hyperthread can run fast.'
267	 *
268	 * XXXKIB.  Software coordination mode should be supported,
269	 * but all Intel CPUs provide hardware coordination.
270	 */
271
272	state = &PCPU_PTR(monitorbuf)->idle_state;
273	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
274	    ("cpu_mwait_cx: wrong monitorbuf state"));
275	atomic_store_int(state, STATE_MWAIT);
276	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
277		v = rdmsr(MSR_IA32_SPEC_CTRL);
278		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
279		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
280	} else {
281		v = 0;
282	}
283	cpu_monitor(state, 0, 0);
284	if (atomic_load_int(state) == STATE_MWAIT)
285		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
286
287	/*
288	 * SSB cannot be disabled while we sleep, or rather, if it was
289	 * disabled, the sysctl thread will bind to our cpu to tweak
	 * the MSR.
291	 */
292	if (v != 0)
293		wrmsr(MSR_IA32_SPEC_CTRL, v);
294
295	/*
296	 * We should exit on any event that interrupts mwait, because
297	 * that event might be a wanted interrupt.
298	 */
299	atomic_store_int(state, STATE_RUNNING);
300}
301
302/* Get current clock frequency for the given cpu id. */
303int
304cpu_est_clockrate(int cpu_id, uint64_t *rate)
305{
306	uint64_t tsc1, tsc2;
307	uint64_t acnt, mcnt, perf;
308	register_t reg;
309
310	if (pcpu_find(cpu_id) == NULL || rate == NULL)
311		return (EINVAL);
312#ifdef __i386__
313	if ((cpu_feature & CPUID_TSC) == 0)
314		return (EOPNOTSUPP);
315#endif
316
317	/*
318	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
319	 * DELAY(9) based logic fails.
320	 */
321	if (tsc_is_invariant && !tsc_perf_stat)
322		return (EOPNOTSUPP);
323
324#ifdef SMP
325	if (smp_cpus > 1) {
326		/* Schedule ourselves on the indicated cpu. */
327		thread_lock(curthread);
328		sched_bind(curthread, cpu_id);
329		thread_unlock(curthread);
330	}
331#endif
332
333	/* Calibrate by measuring a short delay. */
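	/*
	 * With an invariant TSC the raw TSC delta over the 1 ms DELAY()
	 * reflects the nominal frequency, so it is scaled by the
	 * APERF/MPERF ratio to get the effective clock rate:
	 *
	 *	rate = (tsc2 - tsc1) * 1000 * acnt / mcnt	[Hz]
	 *
	 * Without APERF/MPERF, the TSC delta over 1 ms is simply
	 * multiplied by 1000 to convert it to Hz.
	 */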
334	reg = intr_disable();
335	if (tsc_is_invariant) {
336		wrmsr(MSR_MPERF, 0);
337		wrmsr(MSR_APERF, 0);
338		tsc1 = rdtsc();
339		DELAY(1000);
340		mcnt = rdmsr(MSR_MPERF);
341		acnt = rdmsr(MSR_APERF);
342		tsc2 = rdtsc();
343		intr_restore(reg);
344		perf = 1000 * acnt / mcnt;
345		*rate = (tsc2 - tsc1) * perf;
346	} else {
347		tsc1 = rdtsc();
348		DELAY(1000);
349		tsc2 = rdtsc();
350		intr_restore(reg);
351		*rate = (tsc2 - tsc1) * 1000;
352	}
353
354#ifdef SMP
355	if (smp_cpus > 1) {
356		thread_lock(curthread);
357		sched_unbind(curthread);
358		thread_unlock(curthread);
359	}
360#endif
361
362	return (0);
363}
364
365/*
366 * Shutdown the CPU as much as possible
367 */
368void
369cpu_halt(void)
370{
371	for (;;)
372		halt();
373}
374
375static void
376cpu_reset_real(void)
377{
378	struct region_descriptor null_idt;
379	int b;
380
381	disable_intr();
382#ifdef CPU_ELAN
383	if (elan_mmcr != NULL)
384		elan_mmcr->RESCFG = 1;
385#endif
386#ifdef __i386__
387	if (cpu == CPU_GEODE1100) {
388		/* Attempt Geode's own reset */
389		outl(0xcf8, 0x80009044ul);
390		outl(0xcfc, 0xf);
391	}
392#endif
393#if !defined(BROKEN_KEYBOARD_RESET)
	/*
	 * Attempt to do a CPU reset via the keyboard controller; do
	 * not turn off GateA20, as any machine that fails to do the
	 * reset here would then end up in no man's land.
	 */
399	outb(IO_KBD + 4, 0xFE);
400	DELAY(500000);	/* wait 0.5 sec to see if that did it */
401#endif
402
403	/*
404	 * Attempt to force a reset via the Reset Control register at
405	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
406	 * transitions from 0 to 1.  Bit 1 selects the type of reset
407	 * to attempt: 0 selects a "soft" reset, and 1 selects a
408	 * "hard" reset.  We try a "hard" reset.  The first write sets
409	 * bit 1 to select a "hard" reset and clears bit 2.  The
410	 * second write forces a 0 -> 1 transition in bit 2 to trigger
411	 * a reset.
412	 */
413	outb(0xcf9, 0x2);
414	outb(0xcf9, 0x6);
415	DELAY(500000);  /* wait 0.5 sec to see if that did it */
416
417	/*
418	 * Attempt to force a reset via the Fast A20 and Init register
419	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
420	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
421	 * preserve bit 1 while setting bit 0.  We also must clear bit
422	 * 0 before setting it if it isn't already clear.
423	 */
424	b = inb(0x92);
425	if (b != 0xff) {
426		if ((b & 0x1) != 0)
427			outb(0x92, b & 0xfe);
428		outb(0x92, b | 0x1);
429		DELAY(500000);  /* wait 0.5 sec to see if that did it */
430	}
431
432	printf("No known reset method worked, attempting CPU shutdown\n");
433	DELAY(1000000); /* wait 1 sec for printf to complete */
434
435	/* Wipe the IDT. */
436	null_idt.rd_limit = 0;
437	null_idt.rd_base = 0;
438	lidt(&null_idt);
439
440	/* "good night, sweet prince .... <THUNK!>" */
441	breakpoint();
442
443	/* NOTREACHED */
444	while(1);
445}
446
447#ifdef SMP
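/*
 * When cpu_reset() is called on an AP, the actual reset is proxied to
 * the BSP: the AP stops the other CPUs, restarts CPU 0 with
 * cpustop_restartfunc pointing here, and then spins while the BSP
 * performs cpu_reset_real().
 */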
448static void
449cpu_reset_proxy(void)
450{
451
452	cpu_reset_proxy_active = 1;
453	while (cpu_reset_proxy_active == 1)
454		ia32_pause(); /* Wait for other cpu to see that we've started */
455
456	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
457	DELAY(1000000);
458	cpu_reset_real();
459}
460#endif
461
462void
463cpu_reset(void)
464{
465#ifdef SMP
466	struct monitorbuf *mb;
467	cpuset_t map;
468	u_int cnt;
469
470	if (smp_started) {
471		map = all_cpus;
472		CPU_CLR(PCPU_GET(cpuid), &map);
473		CPU_ANDNOT(&map, &map, &stopped_cpus);
474		if (!CPU_EMPTY(&map)) {
475			printf("cpu_reset: Stopping other CPUs\n");
476			stop_cpus(map);
477		}
478
479		if (PCPU_GET(cpuid) != 0) {
480			cpu_reset_proxyid = PCPU_GET(cpuid);
481			cpustop_restartfunc = cpu_reset_proxy;
482			cpu_reset_proxy_active = 0;
483			printf("cpu_reset: Restarting BSP\n");
484
485			/* Restart CPU #0. */
486			CPU_SETOF(0, &started_cpus);
487			mb = &pcpu_find(0)->pc_monitorbuf;
488			atomic_store_int(&mb->stop_state,
489			    MONITOR_STOPSTATE_RUNNING);
490
491			cnt = 0;
492			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
493				ia32_pause();
494				cnt++;	/* Wait for BSP to announce restart */
495			}
496			if (cpu_reset_proxy_active == 0) {
497				printf("cpu_reset: Failed to restart BSP\n");
498			} else {
499				cpu_reset_proxy_active = 2;
500				while (1)
501					ia32_pause();
502				/* NOTREACHED */
503			}
504		}
505	}
506#endif
507	cpu_reset_real();
508	/* NOTREACHED */
509}
510
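/*
 * MWAIT is usable for idling only if the CPU advertises the
 * MONITOR/MWAIT feature together with the CPUID leaf 5 extensions,
 * in particular the ability to treat interrupts as break events even
 * while interrupts are disabled (CPUID5_MWAIT_INTRBREAK).
 */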
511bool
512cpu_mwait_usable(void)
513{
514
515	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
516	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
517	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
518}
519
520void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
521
522int cpu_amdc1e_bug = 0;			/* AMD C1E APIC workaround required. */
523
524static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
525SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
526    0, "Use MONITOR/MWAIT for short idle");
527
528static bool
529cpu_idle_enter(int *statep, int newstate)
530{
531	KASSERT(atomic_load_int(statep) == STATE_RUNNING,
532	    ("%s: state %d", __func__, atomic_load_int(statep)));
533
534	/*
535	 * A fence is needed to prevent reordering of the load in
536	 * sched_runnable() with this store to the idle state word.  Without it,
537	 * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
538	 * added load to the queue, and elide an IPI.  Then, sched_runnable()
539	 * can observe tdq_load == 0, so the CPU ends up idling with pending
540	 * work.  tdq_notify() similarly ensures that a prior update to tdq_load
541	 * is visible before calling cpu_idle_wakeup().
542	 */
543	atomic_store_int(statep, newstate);
544#if defined(SCHED_ULE) && defined(SMP)
545	atomic_thread_fence_seq_cst();
546#endif
547
	/*
	 * Since we may be in a critical section from cpu_idle(), if
	 * an interrupt fires during that critical section we may have
	 * a pending preemption.  If the CPU halts, then that thread
	 * may not execute until a later interrupt awakens the CPU.
	 * To handle this race, check for a runnable thread after
	 * disabling interrupts and immediately return if one is
	 * found.  Also, we must absolutely guarantee that hlt is
	 * the next instruction after sti.  This ensures that any
	 * interrupt that fires after the call to disable_intr() will
	 * immediately awaken the CPU from hlt.  Finally, note that on
	 * x86 this works because interrupts are only recognized after
	 * the instruction following sti completes, while IF is set to
	 * 1 immediately, so the hlt instruction can still acknowledge
	 * the pending interrupt.
	 */
564	disable_intr();
565	if (sched_runnable()) {
566		enable_intr();
567		atomic_store_int(statep, STATE_RUNNING);
568		return (false);
569	} else {
570		return (true);
571	}
572}
573
574static void
575cpu_idle_exit(int *statep)
576{
577	atomic_store_int(statep, STATE_RUNNING);
578}
579
580static void
581cpu_idle_acpi(sbintime_t sbt)
582{
583	int *state;
584
585	state = &PCPU_PTR(monitorbuf)->idle_state;
586	if (cpu_idle_enter(state, STATE_SLEEPING)) {
587		if (cpu_idle_hook)
588			cpu_idle_hook(sbt);
589		else
590			acpi_cpu_c1();
591		cpu_idle_exit(state);
592	}
593}
594
595static void
596cpu_idle_hlt(sbintime_t sbt)
597{
598	int *state;
599
600	state = &PCPU_PTR(monitorbuf)->idle_state;
601	if (cpu_idle_enter(state, STATE_SLEEPING)) {
602		acpi_cpu_c1();
603		atomic_store_int(state, STATE_RUNNING);
604	}
605}
606
607static void
608cpu_idle_mwait(sbintime_t sbt)
609{
610	int *state;
611
612	state = &PCPU_PTR(monitorbuf)->idle_state;
613	if (cpu_idle_enter(state, STATE_MWAIT)) {
614		cpu_monitor(state, 0, 0);
615		if (atomic_load_int(state) == STATE_MWAIT)
616			__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
617		else
618			enable_intr();
619		cpu_idle_exit(state);
620	}
621}
622
623static void
624cpu_idle_spin(sbintime_t sbt)
625{
626	int *state;
627	int i;
628
629	state = &PCPU_PTR(monitorbuf)->idle_state;
630	atomic_store_int(state, STATE_RUNNING);
631
	/*
	 * The sched_runnable() call is racy, but since we are in a
	 * loop, missing it once has little impact, if any (and it is
	 * much better than not checking at all).
	 */
637	for (i = 0; i < 1000; i++) {
638		if (sched_runnable())
639			return;
640		cpu_spinwait();
641	}
642}
643
644void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
645
646void
647cpu_idle(int busy)
648{
649	uint64_t msr;
650	sbintime_t sbt = -1;
651
652	CTR1(KTR_SPARE2, "cpu_idle(%d)", busy);
653
654	/* If we are busy - try to use fast methods. */
655	if (busy) {
656		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
657			cpu_idle_mwait(busy);
658			goto out;
659		}
660	}
661
662	/* If we have time - switch timers into idle mode. */
663	if (!busy) {
664		critical_enter();
665		sbt = cpu_idleclock();
666	}
667
668	/* Apply AMD APIC timer C1E workaround. */
669	if (cpu_amdc1e_bug && cpu_disable_c3_sleep) {
670		msr = rdmsr(MSR_AMDK8_IPM);
671		if ((msr & (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)) != 0)
672			wrmsr(MSR_AMDK8_IPM, msr & ~(AMDK8_SMIONCMPHALT |
673			    AMDK8_C1EONCMPHALT));
674	}
675
676	/* Call main idle method. */
677	cpu_idle_fn(sbt);
678
679	/* Switch timers back into active mode. */
680	if (!busy) {
681		cpu_activeclock();
682		critical_exit();
683	}
684out:
685	CTR1(KTR_SPARE2, "cpu_idle(%d) done", busy);
686}
687
688static int cpu_idle_apl31_workaround;
689SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
690    &cpu_idle_apl31_workaround, 0,
691    "Apollo Lake APL31 MWAIT bug workaround");
692
693int
694cpu_idle_wakeup(int cpu)
695{
696	struct monitorbuf *mb;
697	int *state;
698
699	mb = &pcpu_find(cpu)->pc_monitorbuf;
700	state = &mb->idle_state;
701	switch (atomic_load_int(state)) {
702	case STATE_SLEEPING:
703		return (0);
704	case STATE_MWAIT:
705		atomic_store_int(state, STATE_RUNNING);
706		return (cpu_idle_apl31_workaround ? 0 : 1);
707	case STATE_RUNNING:
708		return (1);
709	default:
710		panic("bad monitor state");
711		return (1);
712	}
713}
714
715/*
716 * Ordered by speed/power consumption.
717 */
718static const struct {
719	void	*id_fn;
720	const char *id_name;
721	int	id_cpuid2_flag;
722} idle_tbl[] = {
723	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
724	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
725	    .id_cpuid2_flag = CPUID2_MON },
726	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
727	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
728};
729
730static int
731idle_sysctl_available(SYSCTL_HANDLER_ARGS)
732{
733	char *avail, *p;
734	int error;
735	int i;
736
737	avail = malloc(256, M_TEMP, M_WAITOK);
738	p = avail;
739	for (i = 0; i < nitems(idle_tbl); i++) {
740		if (idle_tbl[i].id_cpuid2_flag != 0 &&
741		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
742			continue;
743		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
744		    cpu_idle_hook == NULL)
745			continue;
746		p += sprintf(p, "%s%s", p != avail ? ", " : "",
747		    idle_tbl[i].id_name);
748	}
749	error = sysctl_handle_string(oidp, avail, 0, req);
750	free(avail, M_TEMP);
751	return (error);
752}
753
754SYSCTL_PROC(_machdep, OID_AUTO, idle_available,
755    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
756    0, 0, idle_sysctl_available, "A",
757    "list of available idle functions");
758
759static bool
760cpu_idle_selector(const char *new_idle_name)
761{
762	int i;
763
764	for (i = 0; i < nitems(idle_tbl); i++) {
765		if (idle_tbl[i].id_cpuid2_flag != 0 &&
766		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
767			continue;
768		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
769		    cpu_idle_hook == NULL)
770			continue;
771		if (strcmp(idle_tbl[i].id_name, new_idle_name))
772			continue;
773		cpu_idle_fn = idle_tbl[i].id_fn;
774		if (bootverbose)
775			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
776		return (true);
777	}
778	return (false);
779}
780
781static int
782cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
783{
784	char buf[16];
785	const char *p;
786	int error, i;
787
788	p = "unknown";
789	for (i = 0; i < nitems(idle_tbl); i++) {
790		if (idle_tbl[i].id_fn == cpu_idle_fn) {
791			p = idle_tbl[i].id_name;
792			break;
793		}
794	}
795	strncpy(buf, p, sizeof(buf));
796	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
797	if (error != 0 || req->newptr == NULL)
798		return (error);
799	return (cpu_idle_selector(buf) ? 0 : EINVAL);
800}
801
802SYSCTL_PROC(_machdep, OID_AUTO, idle,
803    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
804    0, 0, cpu_idle_sysctl, "A",
805    "currently selected idle function");
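
/*
 * The idle method is therefore selectable either as a loader tunable
 * (machdep.idle, fetched in cpu_idle_tun() below) or at runtime, for
 * example:
 *
 *	sysctl machdep.idle=hlt
 *
 * with machdep.idle_available above listing the methods usable on the
 * current CPU.
 */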
806
807static void
808cpu_idle_tun(void *unused __unused)
809{
810	char tunvar[16];
811
812	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
813		cpu_idle_selector(tunvar);
814	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
815	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
		/* Ryzen errata 1057 and 1109. */
817		cpu_idle_selector("hlt");
818		idle_mwait = 0;
819		mwait_cpustop_broken = true;
820	}
821
822	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
823	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x5c) {
		/*
		 * Apollo Lake erratum APL31 (public erratum APL30).
		 * Stores to the armed address range may not trigger
		 * MWAIT to resume execution.  The OS needs to use
		 * interrupts to wake processors from MWAIT-induced
		 * sleep states.
		 */
831		cpu_idle_apl31_workaround = 1;
832		mwait_cpustop_broken = true;
833	}
834	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
835}
836SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
837
838static int panic_on_nmi = 0xff;
839SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
840    &panic_on_nmi, 0,
841    "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all");
842int nmi_is_broadcast = 1;
843SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
844    &nmi_is_broadcast, 0,
845    "Chipset NMI is broadcast");
846int (*apei_nmi)(void);
847
848void
849nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
850{
851	bool claimed = false;
852
853#ifdef DEV_ISA
854	/* machine/parity/power fail/"kitchen sink" faults */
855	if (isa_nmi(frame->tf_err)) {
856		claimed = true;
857		if ((panic_on_nmi & 1) != 0)
858			panic("NMI indicates hardware failure");
859	}
860#endif /* DEV_ISA */
861
862	/* ACPI Platform Error Interfaces callback. */
863	if (apei_nmi != NULL && (*apei_nmi)())
864		claimed = true;
865
866	/*
867	 * NMIs can be useful for debugging.  They can be hooked up to a
868	 * pushbutton, usually on an ISA, PCI, or PCIe card.  They can also be
869	 * generated by an IPMI BMC, either manually or in response to a
870	 * watchdog timeout.  For example, see the "power diag" command in
871	 * ports/sysutils/ipmitool.  They can also be generated by a
872	 * hypervisor; see "bhyvectl --inject-nmi".
873	 */
874
875#ifdef KDB
876	if (!claimed && (panic_on_nmi & 2) != 0) {
877		if (debugger_on_panic) {
878			printf("NMI/cpu%d ... going to debugger\n", cpu);
879			claimed = kdb_trap(type, 0, frame);
880		}
881	}
882#endif /* KDB */
883
884	if (!claimed && panic_on_nmi != 0)
885		panic("NMI");
886}
887
888void
889nmi_handle_intr(u_int type, struct trapframe *frame)
890{
891
892#ifdef SMP
893	if (nmi_is_broadcast) {
894		nmi_call_kdb_smp(type, frame);
895		return;
896	}
897#endif
898	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
899}
900
901static int hw_ibrs_active;
902int hw_ibrs_ibpb_active;
903int hw_ibrs_disable = 1;
904
905SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
906    "Indirect Branch Restricted Speculation active");
907
908SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs,
909    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
910    "Indirect Branch Restricted Speculation active");
911
912SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD,
913    &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active");
914
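/*
 * If the CPU advertises IBRS_ALL ("enhanced IBRS"), enabling the
 * mitigation amounts to setting the IBRS bit of IA32_SPEC_CTRL once,
 * on the local CPU or on all CPUs as requested, and leaving it set.
 * Otherwise the mitigation relies on IBPB barriers, recorded in
 * hw_ibrs_ibpb_active, which the low-level entry/exit paths are
 * expected to consult.
 */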
915void
916hw_ibrs_recalculate(bool for_all_cpus)
917{
918	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
919		x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
920		    MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL) |
921		    (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
922		    IA32_SPEC_CTRL_IBRS, NULL);
923		hw_ibrs_active = hw_ibrs_disable == 0;
924		hw_ibrs_ibpb_active = 0;
925	} else {
926		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
927		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
928	}
929}
930
931static int
932hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
933{
934	int error, val;
935
936	val = hw_ibrs_disable;
937	error = sysctl_handle_int(oidp, &val, 0, req);
938	if (error != 0 || req->newptr == NULL)
939		return (error);
940	hw_ibrs_disable = val != 0;
941	hw_ibrs_recalculate(true);
942	return (0);
943}
944SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
945    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
946    "Disable Indirect Branch Restricted Speculation");
947
948SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT |
949    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
950    hw_ibrs_disable_handler, "I",
951    "Disable Indirect Branch Restricted Speculation");
952
953int hw_ssb_active;
954int hw_ssb_disable;
955
956SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
957    &hw_ssb_active, 0,
958    "Speculative Store Bypass Disable active");
959
960SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb,
961    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
962    "Speculative Store Bypass Disable active");
963
964SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD,
965    &hw_ssb_active, 0, "Speculative Store Bypass Disable active");
966
967static void
968hw_ssb_set(bool enable, bool for_all_cpus)
969{
970
971	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
972		hw_ssb_active = 0;
973		return;
974	}
975	hw_ssb_active = enable;
976	x86_msr_op(MSR_IA32_SPEC_CTRL,
977	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
978	    (for_all_cpus ? MSR_OP_SCHED_ALL : MSR_OP_LOCAL),
979	    IA32_SPEC_CTRL_SSBD, NULL);
980}
981
982void
983hw_ssb_recalculate(bool all_cpus)
984{
985
986	switch (hw_ssb_disable) {
987	default:
988		hw_ssb_disable = 0;
989		/* FALLTHROUGH */
990	case 0: /* off */
991		hw_ssb_set(false, all_cpus);
992		break;
993	case 1: /* on */
994		hw_ssb_set(true, all_cpus);
995		break;
996	case 2: /* auto */
997		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
998		    false : true, all_cpus);
999		break;
1000	}
1001}
1002
1003static int
1004hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
1005{
1006	int error, val;
1007
1008	val = hw_ssb_disable;
1009	error = sysctl_handle_int(oidp, &val, 0, req);
1010	if (error != 0 || req->newptr == NULL)
1011		return (error);
1012	hw_ssb_disable = val;
1013	hw_ssb_recalculate(true);
1014	return (0);
1015}
1016SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
1017    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1018    hw_ssb_disable_handler, "I",
1019    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
1020
1021SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT |
1022    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1023    hw_ssb_disable_handler, "I",
1024    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
1025
1026int hw_mds_disable;
1027
/*
 * Handler for Microarchitectural Data Sampling issues.  Really not a
 * pointer to a C function: on amd64 the code must not change any CPU
 * architectural state except possibly %rflags.  Also, it is always
 * called with interrupts disabled.
 */
1034void mds_handler_void(void);
1035void mds_handler_verw(void);
1036void mds_handler_ivb(void);
1037void mds_handler_bdw(void);
1038void mds_handler_skl_sse(void);
1039void mds_handler_skl_avx(void);
1040void mds_handler_skl_avx512(void);
1041void mds_handler_silvermont(void);
1042void (*mds_handler)(void) = mds_handler_void;
1043
1044static int
1045sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
1046{
1047	const char *state;
1048
1049	if (mds_handler == mds_handler_void)
1050		state = "inactive";
1051	else if (mds_handler == mds_handler_verw)
1052		state = "VERW";
1053	else if (mds_handler == mds_handler_ivb)
1054		state = "software IvyBridge";
1055	else if (mds_handler == mds_handler_bdw)
1056		state = "software Broadwell";
1057	else if (mds_handler == mds_handler_skl_sse)
1058		state = "software Skylake SSE";
1059	else if (mds_handler == mds_handler_skl_avx)
1060		state = "software Skylake AVX";
1061	else if (mds_handler == mds_handler_skl_avx512)
1062		state = "software Skylake AVX512";
1063	else if (mds_handler == mds_handler_silvermont)
1064		state = "software Silvermont";
1065	else
1066		state = "unknown";
1067	return (SYSCTL_OUT(req, state, strlen(state)));
1068}
1069
1070SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
1071    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1072    sysctl_hw_mds_disable_state_handler, "A",
1073    "Microarchitectural Data Sampling Mitigation state");
1074
1075SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds,
1076    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1077    "Microarchitectural Data Sampling Mitigation state");
1078
1079SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state,
1080    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1081    sysctl_hw_mds_disable_state_handler, "A",
1082    "Microarchitectural Data Sampling Mitigation state");
1083
1084_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
1085
1086void
1087hw_mds_recalculate(void)
1088{
1089	struct pcpu *pc;
1090	vm_offset_t b64;
1091	u_long xcr0;
1092	int i;
1093
	/*
	 * Allow the user to force the VERW variant even if MD_CLEAR is
	 * not reported.  For instance, a hypervisor might unknowingly
	 * filter the cap out.
	 * For similar reasons, and for testing, allow enabling the
	 * mitigation even when the MDS_NO cap is set.
	 */
1101	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
1102	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
1103	    hw_mds_disable == 3)) {
1104		mds_handler = mds_handler_void;
1105	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
1106	    hw_mds_disable == 3) || hw_mds_disable == 1) {
1107		mds_handler = mds_handler_verw;
1108	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1109	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
1110	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
1111	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
1112	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
1113	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
1114	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
1115	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1116		/*
1117		 * Nehalem, SandyBridge, IvyBridge
1118		 */
1119		CPU_FOREACH(i) {
1120			pc = pcpu_find(i);
1121			if (pc->pc_mds_buf == NULL) {
1122				pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
1123				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
1124				bzero(pc->pc_mds_buf, 16);
1125			}
1126		}
1127		mds_handler = mds_handler_ivb;
1128	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1129	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
1130	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
1131	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
1132	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
1133	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1134		/*
1135		 * Haswell, Broadwell
1136		 */
1137		CPU_FOREACH(i) {
1138			pc = pcpu_find(i);
1139			if (pc->pc_mds_buf == NULL) {
1140				pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
1141				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
1142				bzero(pc->pc_mds_buf, 16);
1143			}
1144		}
1145		mds_handler = mds_handler_bdw;
1146	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1147	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
1148	    CPUID_STEPPING) <= 5) ||
1149	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
1150	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
1151	    CPUID_STEPPING) <= 0xb) ||
1152	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
1153	    CPUID_STEPPING) <= 0xc)) &&
1154	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1155		/*
1156		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
1157		 * CascadeLake
1158		 */
1159		CPU_FOREACH(i) {
1160			pc = pcpu_find(i);
1161			if (pc->pc_mds_buf == NULL) {
1162				pc->pc_mds_buf = malloc_domainset(6 * 1024,
1163				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
1164				    M_WAITOK);
1165				b64 = (vm_offset_t)malloc_domainset(64 + 63,
1166				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
1167				    M_WAITOK);
1168				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
1169				bzero(pc->pc_mds_buf64, 64);
1170			}
1171		}
1172		xcr0 = rxcr(0);
1173		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
1174		    (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0)
1175			mds_handler = mds_handler_skl_avx512;
1176		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
1177		    (cpu_feature2 & CPUID2_AVX) != 0)
1178			mds_handler = mds_handler_skl_avx;
1179		else
1180			mds_handler = mds_handler_skl_sse;
1181	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1182	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
1183	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
1184	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
1185	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
1186	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
1187	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
1188	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
1189	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
1190	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
1191	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
1192	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
1193	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
1194	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
1195	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
1196	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
1197		/* Silvermont, Airmont */
1198		CPU_FOREACH(i) {
1199			pc = pcpu_find(i);
1200			if (pc->pc_mds_buf == NULL)
1201				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
1202		}
1203		mds_handler = mds_handler_silvermont;
1204	} else {
1205		hw_mds_disable = 0;
1206		mds_handler = mds_handler_void;
1207	}
1208}
1209
1210static void
1211hw_mds_recalculate_boot(void *arg __unused)
1212{
1213
1214	hw_mds_recalculate();
1215}
1216SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
1217
1218static int
1219sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
1220{
1221	int error, val;
1222
1223	val = hw_mds_disable;
1224	error = sysctl_handle_int(oidp, &val, 0, req);
1225	if (error != 0 || req->newptr == NULL)
1226		return (error);
1227	if (val < 0 || val > 3)
1228		return (EINVAL);
1229	hw_mds_disable = val;
1230	hw_mds_recalculate();
1231	return (0);
1232}
1233
1234SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
1235    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1236    sysctl_mds_disable_handler, "I",
1237    "Microarchitectural Data Sampling Mitigation "
1238    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
1239
1240SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT |
1241    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1242    sysctl_mds_disable_handler, "I",
1243    "Microarchitectural Data Sampling Mitigation "
1244    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
1245
1246/*
1247 * Intel Transactional Memory Asynchronous Abort Mitigation
1248 * CVE-2019-11135
1249 */
1250int x86_taa_enable;
1251int x86_taa_state;
1252enum {
1253	TAA_NONE	= 0,	/* No mitigation enabled */
1254	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
1255	TAA_VERW	= 2,	/* Use VERW mitigation */
1256	TAA_AUTO	= 3,	/* Automatically select the mitigation */
1257
1258	/* The states below are not selectable by the operator */
1259
1260	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
1261	TAA_NOT_PRESENT	= 5	/* TSX is not present */
1262};
1263
1264static void
1265taa_set(bool enable, bool all)
1266{
1267
1268	x86_msr_op(MSR_IA32_TSX_CTRL,
1269	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1270	    (all ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1271	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR,
1272	    NULL);
1273}
1274
1275void
1276x86_taa_recalculate(void)
1277{
1278	static int taa_saved_mds_disable = 0;
1279	int taa_need = 0, taa_state = 0;
1280	int mds_disable = 0, need_mds_recalc = 0;
1281
1282	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
1283	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
1284	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
1285		/* TSX is not present */
1286		x86_taa_state = TAA_NOT_PRESENT;
1287		return;
1288	}
1289
1290	/* Check to see what mitigation options the CPU gives us */
1291	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
		/* CPU is not susceptible to TAA */
1293		taa_need = TAA_TAA_UC;
1294	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
1295		/*
1296		 * CPU can turn off TSX.  This is the next best option
1297		 * if TAA_NO hardware mitigation isn't present
		 * if the TAA_NO hardware mitigation isn't present.
1299		taa_need = TAA_TSX_DISABLE;
1300	} else {
1301		/* No TSX/TAA specific remedies are available. */
1302		if (x86_taa_enable == TAA_TSX_DISABLE) {
1303			if (bootverbose)
1304				printf("TSX control not available\n");
1305			return;
1306		} else
1307			taa_need = TAA_VERW;
1308	}
1309
1310	/* Can we automatically take action, or are we being forced? */
1311	if (x86_taa_enable == TAA_AUTO)
1312		taa_state = taa_need;
1313	else
1314		taa_state = x86_taa_enable;
1315
1316	/* No state change, nothing to do */
1317	if (taa_state == x86_taa_state) {
1318		if (bootverbose)
1319			printf("No TSX change made\n");
1320		return;
1321	}
1322
1323	/* Does the MSR need to be turned on or off? */
1324	if (taa_state == TAA_TSX_DISABLE)
1325		taa_set(true, true);
1326	else if (x86_taa_state == TAA_TSX_DISABLE)
1327		taa_set(false, true);
1328
1329	/* Does MDS need to be set to turn on VERW? */
1330	if (taa_state == TAA_VERW) {
1331		taa_saved_mds_disable = hw_mds_disable;
1332		mds_disable = hw_mds_disable = 1;
1333		need_mds_recalc = 1;
1334	} else if (x86_taa_state == TAA_VERW) {
1335		mds_disable = hw_mds_disable = taa_saved_mds_disable;
1336		need_mds_recalc = 1;
1337	}
1338	if (need_mds_recalc) {
1339		hw_mds_recalculate();
1340		if (mds_disable != hw_mds_disable) {
1341			if (bootverbose)
1342				printf("Cannot change MDS state for TAA\n");
1343			/* Don't update our state */
1344			return;
1345		}
1346	}
1347
1348	x86_taa_state = taa_state;
1349	return;
1350}
1351
1352static void
1353taa_recalculate_boot(void * arg __unused)
1354{
1355
1356	x86_taa_recalculate();
1357}
1358SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
1359
1360SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa,
1361    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1362    "TSX Asynchronous Abort Mitigation");
1363
1364static int
1365sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
1366{
1367	int error, val;
1368
1369	val = x86_taa_enable;
1370	error = sysctl_handle_int(oidp, &val, 0, req);
1371	if (error != 0 || req->newptr == NULL)
1372		return (error);
1373	if (val < TAA_NONE || val > TAA_AUTO)
1374		return (EINVAL);
1375	x86_taa_enable = val;
1376	x86_taa_recalculate();
1377	return (0);
1378}
1379
1380SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
1381    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1382    sysctl_taa_handler, "I",
1383    "TAA Mitigation enablement control "
1384    "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
1385
1386static int
1387sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
1388{
1389	const char *state;
1390
1391	switch (x86_taa_state) {
1392	case TAA_NONE:
1393		state = "inactive";
1394		break;
1395	case TAA_TSX_DISABLE:
1396		state = "TSX disabled";
1397		break;
1398	case TAA_VERW:
1399		state = "VERW";
1400		break;
1401	case TAA_TAA_UC:
1402		state = "Mitigated in microcode";
1403		break;
1404	case TAA_NOT_PRESENT:
1405		state = "TSX not present";
1406		break;
1407	default:
1408		state = "unknown";
1409	}
1410
1411	return (SYSCTL_OUT(req, state, strlen(state)));
1412}
1413
1414SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
1415    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1416    sysctl_taa_state_handler, "A",
1417    "TAA Mitigation state");
1418
1419int __read_frequently cpu_flush_rsb_ctxsw;
1420SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
1421    CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
1422    "Flush Return Stack Buffer on context switch");
1423
1424SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
1425    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1426    "MCU Optimization, disable RDSEED mitigation");
1427
1428int x86_rngds_mitg_enable = 1;
1429void
1430x86_rngds_mitg_recalculate(bool all_cpus)
1431{
1432	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
1433		return;
1434	x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
1435	    (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1436	    (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1437	    IA32_RNGDS_MITG_DIS, NULL);
1438}
1439
1440static int
1441sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
1442{
1443	int error, val;
1444
1445	val = x86_rngds_mitg_enable;
1446	error = sysctl_handle_int(oidp, &val, 0, req);
1447	if (error != 0 || req->newptr == NULL)
1448		return (error);
1449	x86_rngds_mitg_enable = val;
1450	x86_rngds_mitg_recalculate(true);
1451	return (0);
1452}
1453SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
1454    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1455    sysctl_rngds_mitg_enable_handler, "I",
1456    "MCU Optimization, disabling RDSEED mitigation control "
1457    "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");
1458
1459static int
1460sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
1461{
1462	const char *state;
1463
1464	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
1465		state = "Not applicable";
1466	} else if (x86_rngds_mitg_enable == 0) {
1467		state = "RDSEED not serialized";
1468	} else {
1469		state = "Mitigated";
1470	}
1471	return (SYSCTL_OUT(req, state, strlen(state)));
1472}
1473SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
1474    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1475    sysctl_rngds_state_handler, "A",
1476    "MCU Optimization state");
1477
1478
1479/*
1480 * Zenbleed.
1481 *
 * No corresponding erratum is publicly listed.  AMD has issued a security
1483 * bulletin (AMD-SB-7008), entitled "Cross-Process Information Leak".  This
1484 * document lists (as of August 2023) platform firmware's availability target
1485 * dates, with most being November/December 2023.  It will then be up to
1486 * motherboard manufacturers to produce corresponding BIOS updates, which will
1487 * happen with an inevitable lag.  Additionally, for a variety of reasons,
 * operators might not be able to apply them everywhere.  On the side of
1489 * standalone CPU microcodes, no plans for availability have been published so
1490 * far.  However, a developer appearing to be an AMD employee has hardcoded in
1491 * Linux revision numbers of future microcodes that are presumed to fix the
1492 * vulnerability.
1493 *
1494 * Given the stability issues encountered with early microcode releases for Rome
1495 * (the only microcode publicly released so far) and the absence of official
1496 * communication on standalone CPU microcodes, we have opted instead for
1497 * matching by default all AMD Zen2 processors which, according to the
1498 * vulnerability's discoverer, are all affected (see
1499 * https://lock.cmpxchg8b.com/zenbleed.html).  This policy, also adopted by
 * OpenBSD, may be overridden using the tunable/sysctl
1501 * 'machdep.mitigations.zenbleed.enable'.  We might revise it later depending on
1502 * official statements, microcode updates' public availability and community
1503 * assessment that they actually fix the vulnerability without any instability
1504 * side effects.
1505 */
1506
1507SYSCTL_NODE(_machdep_mitigations, OID_AUTO, zenbleed,
1508    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1509    "Zenbleed OS-triggered prevention (via chicken bit)");
1510
1511/* 2 is auto, see below. */
1512int zenbleed_enable = 2;
1513
1514void
1515zenbleed_sanitize_enable(void)
1516{
1517	/* Default to auto (2). */
1518	if (zenbleed_enable < 0 || zenbleed_enable > 2)
1519		zenbleed_enable = 2;
1520}
1521
1522static bool
1523zenbleed_chicken_bit_applicable(void)
1524{
1525	/* Concerns only bare-metal AMD Zen2 processors. */
1526	return (cpu_vendor_id == CPU_VENDOR_AMD &&
1527	    CPUID_TO_FAMILY(cpu_id) == 0x17 &&
1528	    CPUID_TO_MODEL(cpu_id) >= 0x30 &&
1529	    vm_guest == VM_GUEST_NO);
1530}
1531
1532static bool
1533zenbleed_chicken_bit_should_enable(void)
1534{
1535	/*
1536	 * Obey tunable/sysctl.
1537	 *
1538	 * As explained above, currently, the automatic setting (2) and the "on"
1539	 * one (1) have the same effect.  In the future, we might additionally
1540	 * check for specific microcode revisions as part of the automatic
1541	 * determination.
1542	 */
1543	return (zenbleed_enable != 0);
1544}
1545
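/*
 * Apply the current policy by setting or clearing the Zen 2 FP backup
 * fix ("chicken") bit in the DE_CFG MSR, either on the local CPU or,
 * when requested (e.g. from the sysctl handler below), on all CPUs via
 * an SMP rendezvous.
 */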
1546void
1547zenbleed_check_and_apply(bool all_cpus)
1548{
1549	bool set;
1550
1551	if (!zenbleed_chicken_bit_applicable())
1552		return;
1553
1554	set = zenbleed_chicken_bit_should_enable();
1555
1556	x86_msr_op(MSR_DE_CFG,
1557	    (set ? MSR_OP_OR : MSR_OP_ANDNOT) |
1558	    (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
1559	    DE_CFG_ZEN2_FP_BACKUP_FIX_BIT, NULL);
1560}
1561
1562static int
1563sysctl_zenbleed_enable_handler(SYSCTL_HANDLER_ARGS)
1564{
1565	int error, val;
1566
1567	val = zenbleed_enable;
1568	error = sysctl_handle_int(oidp, &val, 0, req);
1569	if (error != 0 || req->newptr == NULL)
1570		return (error);
1571	zenbleed_enable = val;
1572	zenbleed_sanitize_enable();
1573	zenbleed_check_and_apply(true);
1574	return (0);
1575}
1576SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, enable, CTLTYPE_INT |
1577    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1578    sysctl_zenbleed_enable_handler, "I",
1579    "Enable Zenbleed OS-triggered mitigation (chicken bit) "
1580    "(0: Force disable, 1: Force enable, 2: Automatic determination)");
1581
1582static int
1583sysctl_zenbleed_state_handler(SYSCTL_HANDLER_ARGS)
1584{
1585	const char *state;
1586
1587	if (!zenbleed_chicken_bit_applicable())
1588		state = "Not applicable";
1589	else if (zenbleed_chicken_bit_should_enable())
1590		state = "Mitigation enabled";
1591	else
1592		state = "Mitigation disabled";
1593	return (SYSCTL_OUT(req, state, strlen(state)));
1594}
1595SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, state,
1596    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1597    sysctl_zenbleed_state_handler, "A",
1598    "Zenbleed OS-triggered mitigation (chicken bit) state");
1599
1600
/*
 * Enable and then restore kernel text write permissions.
 * Callers must ensure that disable_wp()/restore_wp() are executed
 * on the same core, without rescheduling in between.
 */
1606bool
1607disable_wp(void)
1608{
1609	u_int cr0;
1610
1611	cr0 = rcr0();
1612	if ((cr0 & CR0_WP) == 0)
1613		return (false);
1614	load_cr0(cr0 & ~CR0_WP);
1615	return (true);
1616}
1617
1618void
1619restore_wp(bool old_wp)
1620{
1621
1622	if (old_wp)
1623		load_cr0(rcr0() | CR0_WP);
1624}
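
/*
 * A typical caller brackets a kernel text patch with the pair, for
 * example (a sketch; callers may use other means of staying on the
 * same CPU):
 *
 *	bool wp;
 *
 *	critical_enter();
 *	wp = disable_wp();
 *	... patch read-only kernel text ...
 *	restore_wp(wp);
 *	critical_exit();
 */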
1625
1626bool
1627acpi_get_fadt_bootflags(uint16_t *flagsp)
1628{
1629#ifdef DEV_ACPI
1630	ACPI_TABLE_FADT *fadt;
1631	vm_paddr_t physaddr;
1632
1633	physaddr = acpi_find_table(ACPI_SIG_FADT);
1634	if (physaddr == 0)
1635		return (false);
1636	fadt = acpi_map_table(physaddr, ACPI_SIG_FADT);
1637	if (fadt == NULL)
1638		return (false);
1639	*flagsp = fadt->BootFlags;
1640	acpi_unmap_table(fadt);
1641	return (true);
1642#else
1643	return (false);
1644#endif
1645}
1646
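/*
 * rdtsc_ordered() is resolved once at boot via an ifunc: RDTSCP is
 * preferred when available since it is ordered by itself; otherwise
 * the TSC read is preceded by MFENCE on AMD/Hygon or LFENCE on other
 * vendors; and on CPUs without SSE2 (hence without those fences) a
 * plain RDTSC is all we can do.
 */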
1647DEFINE_IFUNC(, uint64_t, rdtsc_ordered, (void))
1648{
1649	bool cpu_is_amd = cpu_vendor_id == CPU_VENDOR_AMD ||
1650	    cpu_vendor_id == CPU_VENDOR_HYGON;
1651
1652	if ((amd_feature & AMDID_RDTSCP) != 0)
1653		return (rdtscp);
1654	else if ((cpu_feature & CPUID_SSE2) != 0)
1655		return (cpu_is_amd ? rdtsc_ordered_mfence :
1656		    rdtsc_ordered_lfence);
1657	else
1658		return (rdtsc);
1659}
1660