ept.c revision 241147
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/param.h>
#include <machine/cpufunc.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
#include "vmx.h"
#include "ept.h"

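/*
 * Capability bits reported by the VMX EPT/VPID capability MSR
 * (MSR_VMX_EPT_VPID_CAP), tested by ept_init() before any EPT mappings
 * are created.
 */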
#define	EPT_PWL4(cap)			((cap) & (1UL << 6))
#define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
#define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
#define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
#define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
#define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))

#define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
#define	INVVPID_ALL_TYPES_SUPPORTED(cap)	\
	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)

#define	INVEPT_ALL_TYPES_MASK		0x6000000UL
#define	INVEPT_ALL_TYPES_SUPPORTED(cap)		\
	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)

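/*
 * Bits used when constructing EPT page table entries: read/write/execute
 * permissions, the memory type field, the 'ignore PAT' bit, and the bit
 * that marks a PDE/PDPTE as a superpage (leaf) mapping.
 */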
#define	EPT_PG_RD			(1 << 0)
#define	EPT_PG_WR			(1 << 1)
#define	EPT_PG_EX			(1 << 2)
#define	EPT_PG_MEMORY_TYPE(x)		((x) << 3)
#define	EPT_PG_IGNORE_PAT		(1 << 6)
#define	EPT_PG_SUPERPAGE		(1 << 7)

#define	EPT_ADDR_MASK			((uint64_t)-1 << 12)

MALLOC_DECLARE(M_VMX);

static uint64_t page_sizes_mask;

int
ept_init(void)
{
	int page_shift;
	uint64_t cap;

	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);

	/*
	 * Verify that:
	 * - page walk length is 4 steps
	 * - extended page tables can be laid out in write-back memory
	 * - invvpid instruction with all possible types is supported
	 * - invept instruction with all possible types is supported
	 */
	if (!EPT_PWL4(cap) ||
	    !EPT_MEMORY_TYPE_WB(cap) ||
	    !INVVPID_SUPPORTED(cap) ||
	    !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
	    !INVEPT_SUPPORTED(cap) ||
	    !INVEPT_ALL_TYPES_SUPPORTED(cap))
		return (EINVAL);

	/* Set bits in 'page_sizes_mask' for each valid page size */
	page_shift = PAGE_SHIFT;
	page_sizes_mask = 1UL << page_shift;		/* 4KB page */

	page_shift += 9;
	if (EPT_PDE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 2MB superpage */

	page_shift += 9;
	if (EPT_PDPTE_SUPERPAGE(cap))
		page_sizes_mask |= 1UL << page_shift;	/* 1GB superpage */

	return (0);
}
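/*
 * For example, on a processor that advertises both 2MB and 1GB EPT
 * superpages, ept_init() leaves page_sizes_mask with bits 12, 21 and 30
 * set, i.e. (1UL << 12) | (1UL << 21) | (1UL << 30) == 0x40201000.
 */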

#if 0
static void
ept_dump(uint64_t *ptp, int nlevels)
{
	int i, t, tabs;
	uint64_t *ptpnext, ptpval;

	if (--nlevels < 0)
		return;

	tabs = 3 - nlevels;
	for (t = 0; t < tabs; t++)
		printf("\t");
	printf("PTP = %p\n", ptp);

	for (i = 0; i < 512; i++) {
		ptpval = ptp[i];

		if (ptpval == 0)
			continue;

		for (t = 0; t < tabs; t++)
			printf("\t");
		printf("%3d 0x%016lx\n", i, ptpval);

		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
			ptpnext = (uint64_t *)
				  PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
			ept_dump(ptpnext, nlevels);
		}
	}
}
#endif

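/*
 * Create a leaf mapping for 'gpa' in the EPT hierarchy rooted at 'ptp', or
 * remove it when 'prot' is VM_PROT_NONE.  Returns the number of bytes
 * covered by the mapping; callers such as ept_vmmmap_set() invoke this
 * repeatedly until the entire requested range has been handled.
 */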
static size_t
ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
		   vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
{
	int spshift, ptpshift, ptpindex, nlevels;

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - super page sizes supported by the processor
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'length'
	 */
	spshift = PAGE_SHIFT;
	if (spok)
		spshift += (EPT_PWLEVELS - 1) * 9;
	while (spshift >= PAGE_SHIFT) {
		uint64_t spsize = 1UL << spshift;
		if ((page_sizes_mask & spsize) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    length >= spsize) {
			break;
		}
		spshift -= 9;
	}
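	/*
	 * For example, with 2MB superpages enabled in page_sizes_mask and
	 * 'spok' true, a call with gpa = hpa = 0x40200000 and length = 4MB
	 * leaves spshift at 21, so this invocation creates a single 2MB
	 * leaf mapping and returns 2MB to the caller.
	 */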

	if (spshift < PAGE_SHIFT) {
		panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
		      "length 0x%016lx, page_sizes_mask 0x%016lx",
		      gpa, hpa, length, page_sizes_mask);
	}

	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		/* We have reached the leaf mapping */
		if (spshift >= ptpshift)
			break;

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create the next level page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
			ptp[ptpindex] = vtophys(nlp);
			ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
		}

		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
		panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
		      "mismatch\n", gpa, ptpshift);
	}

	if (prot != VM_PROT_NONE) {
		/* Do the mapping */
		ptp[ptpindex] = hpa;

		/* Apply the access controls */
		if (prot & VM_PROT_READ)
			ptp[ptpindex] |= EPT_PG_RD;
		if (prot & VM_PROT_WRITE)
			ptp[ptpindex] |= EPT_PG_WR;
		if (prot & VM_PROT_EXECUTE)
			ptp[ptpindex] |= EPT_PG_EX;

		/*
		 * XXX should we enforce this memory type by setting the
		 * 'ignore PAT' bit to 1?
		 */
		ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);

		if (nlevels > 0)
			ptp[ptpindex] |= EPT_PG_SUPERPAGE;
	} else {
		/* Remove the mapping */
		ptp[ptpindex] = 0;
	}

	return (1UL << ptpshift);
}

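/*
 * Walk the EPT hierarchy rooted at 'ptp' and return the host physical
 * address that 'gpa' translates to, or (vm_paddr_t)-1 if no valid
 * mapping exists.
 */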
static vm_paddr_t
ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
{
	int nlevels, ptpshift, ptpindex;
	uint64_t ptpval, hpabase, pgmask;

	nlevels = EPT_PWLEVELS;
	while (--nlevels >= 0) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		ptpval = ptp[ptpindex];

		/* Cannot make progress beyond this point */
		if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
			break;

		if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
			pgmask = (1UL << ptpshift) - 1;
			hpabase = ptpval & ~pgmask;
			return (hpabase | (gpa & pgmask));
		}

		/* Work our way down to the next level page table page */
		ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
	}

	return ((vm_paddr_t)-1);
}

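/*
 * The ept_free_*() helpers below tear down the EPT paging hierarchy one
 * level at a time: a non-leaf entry has its lower-level page table page
 * freed recursively, while superpage (leaf) entries reference guest memory
 * that is not freed here.
 */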
static void
ept_free_pt_entry(pt_entry_t pte)
{
	if (pte == 0)
		return;

	/* sanity check */
	if ((pte & EPT_PG_SUPERPAGE) != 0)
		panic("ept_free_pt_entry: pte cannot have superpage bit");

	return;
}

static void
ept_free_pd_entry(pd_entry_t pde)
{
	pt_entry_t	*pt;
	int		i;

	if (pde == 0)
		return;

	if ((pde & EPT_PG_SUPERPAGE) == 0) {
		pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
		for (i = 0; i < NPTEPG; i++)
			ept_free_pt_entry(pt[i]);
		free(pt, M_VMX);	/* free the page table page */
	}
}

static void
ept_free_pdp_entry(pdp_entry_t pdpe)
{
	pd_entry_t	*pd;
	int		i;

	if (pdpe == 0)
		return;

	if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
		pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
		for (i = 0; i < NPDEPG; i++)
			ept_free_pd_entry(pd[i]);
		free(pd, M_VMX);	/* free the page directory page */
	}
}

static void
ept_free_pml4_entry(pml4_entry_t pml4e)
{
	pdp_entry_t	*pdp;
	int		i;

	if (pml4e == 0)
		return;

	if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
		for (i = 0; i < NPDPEPG; i++)
			ept_free_pdp_entry(pdp[i]);
		free(pdp, M_VMX);	/* free the page directory ptr page */
	}
}

void
ept_vmcleanup(struct vmx *vmx)
{
	int		i;

	for (i = 0; i < NPML4EPG; i++)
		ept_free_pml4_entry(vmx->pml4ept[i]);
}

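/*
 * Map (or unmap, when 'prot' is VM_PROT_NONE) the guest physical range
 * [gpa, gpa + len) to the host physical range starting at 'hpa'.  Each
 * iteration maps the largest (super)page that ept_create_mapping() can
 * accommodate and advances by the size it returns.
 */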
int
ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
		vm_memattr_t attr, int prot, boolean_t spok)
{
	size_t n;
	struct vmx *vmx = arg;

	while (len > 0) {
		n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
				       prot, spok);
		len -= n;
		gpa += n;
		hpa += n;
	}

	return (0);
}

vm_paddr_t
ept_vmmmap_get(void *arg, vm_paddr_t gpa)
{
	vm_paddr_t hpa;
	struct vmx *vmx;

	vmx = arg;
	hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
	return (hpa);
}

static void
invept_single_context(void *arg)
{
	struct invept_desc desc = *(struct invept_desc *)arg;

	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}

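/*
 * Invalidate cached EPT translations for the given PML4 table on every
 * host CPU by executing a single-context invept via smp_rendezvous().
 */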
void
ept_invalidate_mappings(u_long pml4ept)
{
	struct invept_desc invept_desc = { 0 };

	invept_desc.eptp = EPTP(pml4ept);

	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}