1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/proc.h>
37
38#include <machine/clock.h>
39#include <machine/cpufunc.h>
40#include <machine/md_var.h>
41#include <machine/pcb.h>
42#include <machine/specialreg.h>
43#include <machine/vmm.h>
44
45#include "vmx.h"
46#include "vmx_msr.h"
47#include "x86.h"
48
/*
 * Report whether the capability MSR value 'msr_val' permits control bit
 * 'bitpos' to be set to 1.  The permission for bit N is read from bit
 * N + 32 of 'msr_val'.
 */
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{
	uint64_t allowed1;

	allowed1 = 1UL << (bitpos + 32);
	if ((msr_val & allowed1) == 0)
		return (false);
	return (true);
}
55
/*
 * Report whether the capability MSR value 'msr_val' permits control bit
 * 'bitpos' to be set to 0.  A clear bit N in 'msr_val' means the control
 * may be zero.
 */
static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{
	uint64_t must_be_one;

	must_be_one = msr_val & (1UL << bitpos);
	if (must_be_one != 0)
		return (false);
	return (true);
}
62
63uint32_t
64vmx_revision(void)
65{
66
67	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
68}
69
70/*
71 * Generate a bitmask to be used for the VMCS execution control fields.
72 *
73 * The caller specifies what bits should be set to one in 'ones_mask'
74 * and what bits should be set to zero in 'zeros_mask'. The don't-care
75 * bits are set to the default value. The default values are obtained
76 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
77 * VMX Capabilities".
78 *
79 * Returns zero on success and non-zero on error.
80 */
81int
82vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
83	       uint32_t zeros_mask, uint32_t *retval)
84{
85	int i;
86	uint64_t val, trueval;
87	bool true_ctls_avail, one_allowed, zero_allowed;
88
89	/* We cannot ask the same bit to be set to both '1' and '0' */
90	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
91		return (EINVAL);
92
93	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;
94
95	val = rdmsr(ctl_reg);
96	if (true_ctls_avail)
97		trueval = rdmsr(true_ctl_reg);		/* step c */
98	else
99		trueval = val;				/* step a */
100
101	for (i = 0; i < 32; i++) {
102		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
103		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
104
105		KASSERT(one_allowed || zero_allowed,
106			("invalid zero/one setting for bit %d of ctl 0x%0x, "
107			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
108
109		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
110			if (ones_mask & (1 << i))
111				return (EINVAL);
112			*retval &= ~(1 << i);
113		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
114			if (zeros_mask & (1 << i))
115				return (EINVAL);
116			*retval |= 1 << i;
117		} else {
118			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
119				*retval &= ~(1 << i);
120			else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
121				*retval |= 1 << i;
122			else if (!true_ctls_avail)
123				*retval &= ~(1 << i);	/* b(iii) */
124			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
125				*retval &= ~(1 << i);
126			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
127				*retval |= 1 << i;
128			else {
129				panic("vmx_set_ctlreg: unable to determine "
130				      "correct value of ctl bit %d for msr "
131				      "0x%0x and true msr 0x%0x", i, ctl_reg,
132				      true_ctl_reg);
133			}
134		}
135	}
136
137	return (0);
138}
139
140void
141msr_bitmap_initialize(char *bitmap)
142{
143
144	memset(bitmap, 0xff, PAGE_SIZE);
145}
146
147int
148msr_bitmap_change_access(char *bitmap, u_int msr, int access)
149{
150	int byte, bit;
151
152	if (msr <= 0x00001FFF)
153		byte = msr / 8;
154	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
155		byte = 1024 + (msr - 0xC0000000) / 8;
156	else
157		return (EINVAL);
158
159	bit = msr & 0x7;
160
161	if (access & MSR_BITMAP_ACCESS_READ)
162		bitmap[byte] &= ~(1 << bit);
163	else
164		bitmap[byte] |= 1 << bit;
165
166	byte += 2048;
167	if (access & MSR_BITMAP_ACCESS_WRITE)
168		bitmap[byte] &= ~(1 << bit);
169	else
170		bitmap[byte] |= 1 << bit;
171
172	return (0);
173}
174
/* Value returned for guest reads of MSR_IA32_MISC_ENABLE; set in vmx_msr_init(). */
static uint64_t misc_enable;
/* Value returned for guest reads of MSR_PLATFORM_INFO; set in vmx_msr_init(). */
static uint64_t platform_info;
/* Value returned for guest reads of MSR_TURBO_RATIO_LIMIT and _LIMIT1. */
static uint64_t turbo_ratio_limit;
/* Host MSR values cached by vmx_msr_init() and written back on guest exit. */
static uint64_t host_msrs[GUEST_MSR_NUM];
179
180static bool
181nehalem_cpu(void)
182{
183	u_int family, model;
184
185	/*
186	 * The family:model numbers belonging to the Nehalem microarchitecture
187	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
188	 */
189	family = CPUID_TO_FAMILY(cpu_id);
190	model = CPUID_TO_MODEL(cpu_id);
191	if (family == 0x6) {
192		switch (model) {
193		case 0x1A:
194		case 0x1E:
195		case 0x1F:
196		case 0x2E:
197			return (true);
198		default:
199			break;
200		}
201	}
202	return (false);
203}
204
205static bool
206westmere_cpu(void)
207{
208	u_int family, model;
209
210	/*
211	 * The family:model numbers belonging to the Westmere microarchitecture
212	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
213	 */
214	family = CPUID_TO_FAMILY(cpu_id);
215	model = CPUID_TO_MODEL(cpu_id);
216	if (family == 0x6) {
217		switch (model) {
218		case 0x25:
219		case 0x2C:
220			return (true);
221		default:
222			break;
223		}
224	}
225	return (false);
226}
227
/*
 * Validate a value destined for the guest's PAT MSR.
 *
 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
 *
 * The value is treated as eight one-byte entries, PA0 through PA7.  An
 * entry is accepted only if it is 0, 1, 4, 5, 6 or 7; the encodings 2 and 3
 * and anything at or above 8 are rejected.
 *
 * Returns true only if all eight entries are acceptable.
 */
static bool
pat_valid(uint64_t val)
{
	int idx;

	for (idx = 0; idx < 8; idx++, val >>= 8) {
		switch (val & 0xff) {
		case 0:
		case 1:
		case 4:
		case 5:
		case 6:
		case 7:
			break;
		default:
			return (false);
		}
	}
	return (true);
}
246
247void
248vmx_msr_init(void)
249{
250	uint64_t bus_freq, ratio;
251	int i;
252
253	/*
254	 * It is safe to cache the values of the following MSRs because
255	 * they don't change based on curcpu, curproc or curthread.
256	 */
257	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
258	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
259	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
260	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
261
262	/*
263	 * Initialize emulated MSRs
264	 */
265	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
266	/*
267	 * Set mandatory bits
268	 *  11:   branch trace disabled
269	 *  12:   PEBS unavailable
270	 * Clear unsupported features
271	 *  16:   SpeedStep enable
272	 *  18:   enable MONITOR FSM
273	 */
274	misc_enable |= (1 << 12) | (1 << 11);
275	misc_enable &= ~((1 << 18) | (1 << 16));
276
277	if (nehalem_cpu() || westmere_cpu())
278		bus_freq = 133330000;		/* 133Mhz */
279	else
280		bus_freq = 100000000;		/* 100Mhz */
281
282	/*
283	 * XXXtime
284	 * The ratio should really be based on the virtual TSC frequency as
285	 * opposed to the host TSC.
286	 */
287	ratio = (tsc_freq / bus_freq) & 0xff;
288
289	/*
290	 * The register definition is based on the micro-architecture
291	 * but the following bits are always the same:
292	 * [15:8]  Maximum Non-Turbo Ratio
293	 * [28]    Programmable Ratio Limit for Turbo Mode
294	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
295	 * [47:40] Maximum Efficiency Ratio
296	 *
297	 * The other bits can be safely set to 0 on all
298	 * micro-architectures up to Haswell.
299	 */
300	platform_info = (ratio << 8) | (ratio << 40);
301
302	/*
303	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
304	 * dependent on the maximum cores per package supported by the micro-
305	 * architecture. For e.g., Westmere supports 6 cores per package and
306	 * uses the low 48 bits. Sandybridge support 8 cores per package and
307	 * uses up all 64 bits.
308	 *
309	 * However, the unused bits are reserved so we pretend that all bits
310	 * in this MSR are valid.
311	 */
312	for (i = 0; i < 8; i++)
313		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
314}
315
316void
317vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
318{
319	uint64_t *guest_msrs;
320
321	guest_msrs = vmx->guest_msrs[vcpuid];
322
323	/*
324	 * The permissions bitmap is shared between all vcpus so initialize it
325	 * once when initializing the vBSP.
326	 */
327	if (vcpuid == 0) {
328		guest_msr_rw(vmx, MSR_LSTAR);
329		guest_msr_rw(vmx, MSR_CSTAR);
330		guest_msr_rw(vmx, MSR_STAR);
331		guest_msr_rw(vmx, MSR_SF_MASK);
332		guest_msr_rw(vmx, MSR_KGSBASE);
333	}
334
335	/*
336	 * Initialize guest IA32_PAT MSR with default value after reset.
337	 */
338	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
339	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
340	    PAT_VALUE(2, PAT_UNCACHED)		|
341	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
342	    PAT_VALUE(4, PAT_WRITE_BACK)	|
343	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
344	    PAT_VALUE(6, PAT_UNCACHED)		|
345	    PAT_VALUE(7, PAT_UNCACHEABLE);
346
347	return;
348}
349
350void
351vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
352{
353	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
354
355	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
356	update_pcb_bases(curpcb);
357	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
358	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
359	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
360	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
361	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
362}
363
364void
365vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, int vcpuid)
366{
367	uint64_t guest_tsc_aux = vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX];
368	uint32_t host_aux = cpu_auxmsr();
369
370	if (vmx_have_msr_tsc_aux(vmx) && guest_tsc_aux != host_aux)
371		wrmsr(MSR_TSC_AUX, guest_tsc_aux);
372}
373
374void
375vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
376{
377	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
378
379	/* Save guest MSRs */
380	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
381	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
382	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
383	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
384	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
385
386	/* Restore host MSRs */
387	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
388	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
389	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
390	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
391
392	/* MSR_KGSBASE will be restored on the way back to userspace */
393}
394
395void
396vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, int vcpuid)
397{
398	uint64_t guest_tsc_aux = vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX];
399	uint32_t host_aux = cpu_auxmsr();
400
401	if (vmx_have_msr_tsc_aux(vmx) && guest_tsc_aux != host_aux)
402		/*
403		 * Note that it is not necessary to save the guest value
404		 * here; vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX] always
405		 * contains the current value since it is updated whenever
406		 * the guest writes to it (which is expected to be very
407		 * rare).
408		 */
409		wrmsr(MSR_TSC_AUX, host_aux);
410}
411
412int
413vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
414{
415	const uint64_t *guest_msrs;
416	int error;
417
418	guest_msrs = vmx->guest_msrs[vcpuid];
419	error = 0;
420
421	switch (num) {
422	case MSR_MCG_CAP:
423	case MSR_MCG_STATUS:
424		*val = 0;
425		break;
426	case MSR_MTRRcap:
427	case MSR_MTRRdefType:
428	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
429	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
430	case MSR_MTRR64kBase:
431		*val = 0;
432		break;
433	case MSR_IA32_MISC_ENABLE:
434		*val = misc_enable;
435		break;
436	case MSR_PLATFORM_INFO:
437		*val = platform_info;
438		break;
439	case MSR_TURBO_RATIO_LIMIT:
440	case MSR_TURBO_RATIO_LIMIT1:
441		*val = turbo_ratio_limit;
442		break;
443	case MSR_PAT:
444		*val = guest_msrs[IDX_MSR_PAT];
445		break;
446	default:
447		error = EINVAL;
448		break;
449	}
450	return (error);
451}
452
453int
454vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
455{
456	uint64_t *guest_msrs;
457	uint64_t changed;
458	int error;
459
460	guest_msrs = vmx->guest_msrs[vcpuid];
461	error = 0;
462
463	switch (num) {
464	case MSR_MCG_CAP:
465	case MSR_MCG_STATUS:
466		break;		/* ignore writes */
467	case MSR_MTRRcap:
468		vm_inject_gp(vmx->vm, vcpuid);
469		break;
470	case MSR_MTRRdefType:
471	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
472	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
473	case MSR_MTRR64kBase:
474		break;		/* Ignore writes */
475	case MSR_IA32_MISC_ENABLE:
476		changed = val ^ misc_enable;
477		/*
478		 * If the host has disabled the NX feature then the guest
479		 * also cannot use it. However, a Linux guest will try to
480		 * enable the NX feature by writing to the MISC_ENABLE MSR.
481		 *
482		 * This can be safely ignored because the memory management
483		 * code looks at CPUID.80000001H:EDX.NX to check if the
484		 * functionality is actually enabled.
485		 */
486		changed &= ~(1UL << 34);
487
488		/*
489		 * Punt to userspace if any other bits are being modified.
490		 */
491		if (changed)
492			error = EINVAL;
493
494		break;
495	case MSR_PAT:
496		if (pat_valid(val))
497			guest_msrs[IDX_MSR_PAT] = val;
498		else
499			vm_inject_gp(vmx->vm, vcpuid);
500		break;
501	case MSR_TSC:
502		error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc());
503		break;
504	case MSR_TSC_AUX:
505		if (vmx_have_msr_tsc_aux(vmx))
506			/*
507			 * vmx_msr_guest_enter_tsc_aux() will apply this
508			 * value when it is called immediately before guest
509			 * entry.
510			 */
511			guest_msrs[IDX_MSR_TSC_AUX] = val;
512		else
513			vm_inject_gp(vmx->vm, vcpuid);
514		break;
515	default:
516		error = EINVAL;
517		break;
518	}
519
520	return (error);
521}
522