vmm.c (255469) → vmm.c (256072)
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright

--- 9 unchanged lines hidden ---

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright

--- 9 unchanged lines hidden ---

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/amd64/vmm/vmm.c 255469 2013-09-11 07:11:14Z neel $
26 * $FreeBSD: head/sys/amd64/vmm/vmm.c 256072 2013-10-05 21:22:35Z neel $
27 */
28
29#include <sys/cdefs.h>
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 255469 2013-09-11 07:11:14Z neel $");
30__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 256072 2013-10-05 21:22:35Z neel $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
42#include <sys/sched.h>
43#include <sys/smp.h>
44#include <sys/systm.h>
45
46#include <vm/vm.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
47
48#include <machine/vm.h>
49#include <machine/pcb.h>
50#include <machine/smp.h>
51#include <x86/apicreg.h>
54
55#include <machine/vm.h>
56#include <machine/pcb.h>
57#include <machine/smp.h>
58#include <x86/apicreg.h>
59#include <machine/pmap.h>
60#include <machine/vmparam.h>
52
53#include <machine/vmm.h>
61
62#include <machine/vmm.h>
63#include "vmm_ktr.h"
54#include "vmm_host.h"
55#include "vmm_mem.h"
56#include "vmm_util.h"
57#include <machine/vmm_dev.h>
58#include "vlapic.h"
59#include "vmm_msr.h"
60#include "vmm_ipi.h"
61#include "vmm_stat.h"

--- 17 unchanged lines hidden ---

79 struct vm_exit exitinfo;
80 enum x2apic_state x2apic_state;
81 int nmi_pending;
82};
83
84#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
85#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
86#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
64#include "vmm_host.h"
65#include "vmm_mem.h"
66#include "vmm_util.h"
67#include <machine/vmm_dev.h>
68#include "vlapic.h"
69#include "vmm_msr.h"
70#include "vmm_ipi.h"
71#include "vmm_stat.h"

--- 17 unchanged lines hidden ---

89 struct vm_exit exitinfo;
90 enum x2apic_state x2apic_state;
91 int nmi_pending;
92};
93
94#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
95#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
96#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
97#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
87
98
99struct mem_seg {
100 vm_paddr_t gpa;
101 size_t len;
102 boolean_t wired;
103 vm_object_t object;
104};
88#define VM_MAX_MEMORY_SEGMENTS 2
89
90struct vm {
91 void *cookie; /* processor-specific data */
92 void *iommu; /* iommu-specific data */
105#define VM_MAX_MEMORY_SEGMENTS 2
106
107struct vm {
108 void *cookie; /* processor-specific data */
109 void *iommu; /* iommu-specific data */
110 struct vmspace *vmspace; /* guest's address space */
93 struct vcpu vcpu[VM_MAXCPU];
94 int num_mem_segs;
111 struct vcpu vcpu[VM_MAXCPU];
112 int num_mem_segs;
95 struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
113 struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
96 char name[VM_MAX_NAMELEN];
97
98 /*
99 * Set of active vcpus.
100 * An active vcpu is one that has been started implicitly (BSP) or
101 * explicitly (AP) by sending it a startup ipi.
102 */
103 cpuset_t active_cpus;
104};
105
106static int vmm_initialized;
107
108static struct vmm_ops *ops;
109#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
110#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
111
114 char name[VM_MAX_NAMELEN];
115
116 /*
117 * Set of active vcpus.
118 * An active vcpu is one that has been started implicitly (BSP) or
119 * explicitly (AP) by sending it a startup ipi.
120 */
121 cpuset_t active_cpus;
122};
123
124static int vmm_initialized;
125
126static struct vmm_ops *ops;
127#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
128#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
129
112#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
113#define VMRUN(vmi, vcpu, rip) \
114 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
130#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
131#define VMRUN(vmi, vcpu, rip, pmap) \
132 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
115#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
133#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
116#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
117 (ops != NULL ? \
118 (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
119 ENXIO)
120#define VMMMAP_GET(vmi, gpa) \
121 (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
134#define VMSPACE_ALLOC(min, max) \
135 (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
136#define VMSPACE_FREE(vmspace) \
137 (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
122#define VMGETREG(vmi, vcpu, num, retval) \
123 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
124#define VMSETREG(vmi, vcpu, num, val) \
125 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
126#define VMGETDESC(vmi, vcpu, num, desc) \
127 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
128#define VMSETDESC(vmi, vcpu, num, desc) \
129 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)

--- 78 unchanged lines hidden ---

208static int
209vmm_handler(module_t mod, int what, void *arg)
210{
211 int error;
212
213 switch (what) {
214 case MOD_LOAD:
215 vmmdev_init();
138#define VMGETREG(vmi, vcpu, num, retval) \
139 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
140#define VMSETREG(vmi, vcpu, num, val) \
141 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
142#define VMGETDESC(vmi, vcpu, num, desc) \
143 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
144#define VMSETDESC(vmi, vcpu, num, desc) \
145 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)

--- 78 unchanged lines hidden ---

224static int
225vmm_handler(module_t mod, int what, void *arg)
226{
227 int error;
228
229 switch (what) {
230 case MOD_LOAD:
231 vmmdev_init();
216 if (ppt_num_devices() > 0)
217 iommu_init();
232 iommu_init();
218 error = vmm_init();
219 if (error == 0)
220 vmm_initialized = 1;
221 break;
222 case MOD_UNLOAD:
223 error = vmmdev_cleanup();
224 if (error == 0) {
225 iommu_cleanup();

--- 34 unchanged lines hidden ---

260
261SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
262
263int
264vm_create(const char *name, struct vm **retvm)
265{
266 int i;
267 struct vm *vm;
233 error = vmm_init();
234 if (error == 0)
235 vmm_initialized = 1;
236 break;
237 case MOD_UNLOAD:
238 error = vmmdev_cleanup();
239 if (error == 0) {
240 iommu_cleanup();

--- 34 unchanged lines hidden ---

275
276SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
277
278int
279vm_create(const char *name, struct vm **retvm)
280{
281 int i;
282 struct vm *vm;
268 vm_paddr_t maxaddr;
283 struct vmspace *vmspace;
269
270 const int BSP = 0;
271
272 /*
273 * If vmm.ko could not be successfully initialized then don't attempt
274 * to create the virtual machine.
275 */
276 if (!vmm_initialized)
277 return (ENXIO);
278
279 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
280 return (EINVAL);
281
284
285 const int BSP = 0;
286
287 /*
288 * If vmm.ko could not be successfully initialized then don't attempt
289 * to create the virtual machine.
290 */
291 if (!vmm_initialized)
292 return (ENXIO);
293
294 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
295 return (EINVAL);
296
297 vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
298 if (vmspace == NULL)
299 return (ENOMEM);
300
282 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
283 strcpy(vm->name, name);
301 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
302 strcpy(vm->name, name);
284 vm->cookie = VMINIT(vm);
303 vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
285
286 for (i = 0; i < VM_MAXCPU; i++) {
287 vcpu_init(vm, i);
288 guest_msrs_init(vm, i);
289 }
290
304
305 for (i = 0; i < VM_MAXCPU; i++) {
306 vcpu_init(vm, i);
307 guest_msrs_init(vm, i);
308 }
309
291 maxaddr = vmm_mem_maxaddr();
292 vm->iommu = iommu_create_domain(maxaddr);
293 vm_activate_cpu(vm, BSP);
310 vm_activate_cpu(vm, BSP);
311 vm->vmspace = vmspace;
294
295 *retvm = vm;
296 return (0);
297}
298
299static void
312
313 *retvm = vm;
314 return (0);
315}
316
317static void
300vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
318vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
301{
319{
302 size_t len;
303 vm_paddr_t hpa;
304 void *host_domain;
305
320
306 host_domain = iommu_host_domain();
321 if (seg->object != NULL)
322 vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
307
323
308 len = 0;
309 while (len < seg->len) {
310 hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
311 if (hpa == (vm_paddr_t)-1) {
312 panic("vm_free_mem_segs: cannot free hpa "
313 "associated with gpa 0x%016lx", seg->gpa + len);
314 }
315
316 /*
317 * Remove the 'gpa' to 'hpa' mapping in VMs domain.
318 * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
319 */
320 iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
321 iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
322
323 vmm_mem_free(hpa, PAGE_SIZE);
324
325 len += PAGE_SIZE;
326 }
327
328 /*
329 * Invalidate cached translations associated with 'vm->iommu' since
330 * we have now moved some pages from it.
331 */
332 iommu_invalidate_tlb(vm->iommu);
333
334 bzero(seg, sizeof(struct vm_memory_segment));
324 bzero(seg, sizeof(*seg));
335}
336
337void
338vm_destroy(struct vm *vm)
339{
340 int i;
341
342 ppt_unassign_all(vm);
343
325}
326
327void
328vm_destroy(struct vm *vm)
329{
330 int i;
331
332 ppt_unassign_all(vm);
333
334 if (vm->iommu != NULL)
335 iommu_destroy_domain(vm->iommu);
336
344 for (i = 0; i < vm->num_mem_segs; i++)
345 vm_free_mem_seg(vm, &vm->mem_segs[i]);
346
347 vm->num_mem_segs = 0;
348
349 for (i = 0; i < VM_MAXCPU; i++)
350 vcpu_cleanup(&vm->vcpu[i]);
351
337 for (i = 0; i < vm->num_mem_segs; i++)
338 vm_free_mem_seg(vm, &vm->mem_segs[i]);
339
340 vm->num_mem_segs = 0;
341
342 for (i = 0; i < VM_MAXCPU; i++)
343 vcpu_cleanup(&vm->vcpu[i]);
344
352 iommu_destroy_domain(vm->iommu);
345 VMSPACE_FREE(vm->vmspace);
353
354 VMCLEANUP(vm->cookie);
355
356 free(vm, M_VM);
357}
358
359const char *
360vm_name(struct vm *vm)
361{
362 return (vm->name);
363}
364
365int
366vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
367{
346
347 VMCLEANUP(vm->cookie);
348
349 free(vm, M_VM);
350}
351
352const char *
353vm_name(struct vm *vm)
354{
355 return (vm->name);
356}
357
358int
359vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
360{
368 const boolean_t spok = TRUE; /* superpage mappings are ok */
361 vm_object_t obj;
369
362
370 return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
371 VM_PROT_RW, spok));
363 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
364 return (ENOMEM);
365 else
366 return (0);
372}
373
374int
375vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
376{
367}
368
369int
370vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
371{
377 const boolean_t spok = TRUE; /* superpage mappings are ok */
378
372
379 return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
380 VM_PROT_NONE, spok));
373 vmm_mmio_free(vm->vmspace, gpa, len);
374 return (0);
381}
382
375}
376
383/*
384 * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
385 */
386static boolean_t
387vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
377boolean_t
378vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
388{
389 int i;
390 vm_paddr_t gpabase, gpalimit;
391
379{
380 int i;
381 vm_paddr_t gpabase, gpalimit;
382
392 if (gpa & PAGE_MASK)
393 panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
394
395 for (i = 0; i < vm->num_mem_segs; i++) {
396 gpabase = vm->mem_segs[i].gpa;
397 gpalimit = gpabase + vm->mem_segs[i].len;
398 if (gpa >= gpabase && gpa < gpalimit)
383 for (i = 0; i < vm->num_mem_segs; i++) {
384 gpabase = vm->mem_segs[i].gpa;
385 gpalimit = gpabase + vm->mem_segs[i].len;
386 if (gpa >= gpabase && gpa < gpalimit)
399 return (FALSE);
387 return (TRUE); /* 'gpa' is regular memory */
400 }
401
388 }
389
402 return (TRUE);
390 if (ppt_is_mmio(vm, gpa))
391 return (TRUE); /* 'gpa' is pci passthru mmio */
392
393 return (FALSE);
403}
404
405int
406vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
407{
394}
395
396int
397vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
398{
408 int error, available, allocated;
409 struct vm_memory_segment *seg;
410 vm_paddr_t g, hpa;
411 void *host_domain;
399 int available, allocated;
400 struct mem_seg *seg;
401 vm_object_t object;
402 vm_paddr_t g;
412
403
413 const boolean_t spok = TRUE; /* superpage mappings are ok */
414
415 if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
416 return (EINVAL);
417
418 available = allocated = 0;
419 g = gpa;
420 while (g < gpa + len) {
404 if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
405 return (EINVAL);
406
407 available = allocated = 0;
408 g = gpa;
409 while (g < gpa + len) {
421 if (vm_gpa_available(vm, g))
422 available++;
423 else
410 if (vm_mem_allocated(vm, g))
424 allocated++;
411 allocated++;
412 else
413 available++;
425
426 g += PAGE_SIZE;
427 }
428
429 /*
430 * If there are some allocated and some available pages in the address
431 * range then it is an error.
432 */

--- 5 unchanged lines hidden ---

438 * allocated then there isn't anything more to do.
439 */
440 if (allocated && available == 0)
441 return (0);
442
443 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
444 return (E2BIG);
445
414
415 g += PAGE_SIZE;
416 }
417
418 /*
419 * If there are some allocated and some available pages in the address
420 * range then it is an error.
421 */

--- 5 unchanged lines hidden ---

427 * allocated then there isn't anything more to do.
428 */
429 if (allocated && available == 0)
430 return (0);
431
432 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
433 return (E2BIG);
434
446 host_domain = iommu_host_domain();
447
448 seg = &vm->mem_segs[vm->num_mem_segs];
449
435 seg = &vm->mem_segs[vm->num_mem_segs];
436
450 error = 0;
437 if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
438 return (ENOMEM);
439
451 seg->gpa = gpa;
440 seg->gpa = gpa;
452 seg->len = 0;
453 while (seg->len < len) {
454 hpa = vmm_mem_alloc(PAGE_SIZE);
455 if (hpa == 0) {
456 error = ENOMEM;
457 break;
458 }
441 seg->len = len;
442 seg->object = object;
443 seg->wired = FALSE;
459
444
460 error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
461 VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
462 if (error)
445 vm->num_mem_segs++;
446
447 return (0);
448}
449
450static void
451vm_gpa_unwire(struct vm *vm)
452{
453 int i, rv;
454 struct mem_seg *seg;
455
456 for (i = 0; i < vm->num_mem_segs; i++) {
457 seg = &vm->mem_segs[i];
458 if (!seg->wired)
459 continue;
460
461 rv = vm_map_unwire(&vm->vmspace->vm_map,
462 seg->gpa, seg->gpa + seg->len,
463 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
464 KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
465 "%#lx/%ld could not be unwired: %d",
466 vm_name(vm), seg->gpa, seg->len, rv));
467
468 seg->wired = FALSE;
469 }
470}
471
472static int
473vm_gpa_wire(struct vm *vm)
474{
475 int i, rv;
476 struct mem_seg *seg;
477
478 for (i = 0; i < vm->num_mem_segs; i++) {
479 seg = &vm->mem_segs[i];
480 if (seg->wired)
481 continue;
482
483 /* XXX rlimits? */
484 rv = vm_map_wire(&vm->vmspace->vm_map,
485 seg->gpa, seg->gpa + seg->len,
486 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
487 if (rv != KERN_SUCCESS)
463 break;
464
488 break;
489
490 seg->wired = TRUE;
491 }
492
493 if (i < vm->num_mem_segs) {
465 /*
494 /*
466 * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
467 * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
495 * Undo the wiring before returning an error.
468 */
496 */
469 iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
470 iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
497 vm_gpa_unwire(vm);
498 return (EAGAIN);
499 }
471
500
472 seg->len += PAGE_SIZE;
501 return (0);
502}
503
504static void
505vm_iommu_modify(struct vm *vm, boolean_t map)
506{
507 int i, sz;
508 vm_paddr_t gpa, hpa;
509 struct mem_seg *seg;
510 void *vp, *cookie, *host_domain;
511
512 sz = PAGE_SIZE;
513 host_domain = iommu_host_domain();
514
515 for (i = 0; i < vm->num_mem_segs; i++) {
516 seg = &vm->mem_segs[i];
517 KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
518 vm_name(vm), seg->gpa, seg->len));
519
520 gpa = seg->gpa;
521 while (gpa < seg->gpa + seg->len) {
522 vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
523 &cookie);
524 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
525 vm_name(vm), gpa));
526
527 vm_gpa_release(cookie);
528
529 hpa = DMAP_TO_PHYS((uintptr_t)vp);
530 if (map) {
531 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
532 iommu_remove_mapping(host_domain, hpa, sz);
533 } else {
534 iommu_remove_mapping(vm->iommu, gpa, sz);
535 iommu_create_mapping(host_domain, hpa, hpa, sz);
536 }
537
538 gpa += PAGE_SIZE;
539 }
473 }
474
540 }
541
475 if (error) {
476 vm_free_mem_seg(vm, seg);
542 /*
543 * Invalidate the cached translations associated with the domain
544 * from which pages were removed.
545 */
546 if (map)
547 iommu_invalidate_tlb(host_domain);
548 else
549 iommu_invalidate_tlb(vm->iommu);
550}
551
552#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
553#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
554
555int
556vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
557{
558 int error;
559
560 error = ppt_unassign_device(vm, bus, slot, func);
561 if (error)
477 return (error);
562 return (error);
563
564 if (ppt_num_devices(vm) == 0) {
565 vm_iommu_unmap(vm);
566 vm_gpa_unwire(vm);
478 }
567 }
568 return (0);
569}
479
570
571int
572vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
573{
574 int error;
575 vm_paddr_t maxaddr;
576
480 /*
577 /*
481 * Invalidate cached translations associated with 'host_domain' since
482 * we have now moved some pages from it.
578 * Virtual machines with pci passthru devices get special treatment:
579 * - the guest physical memory is wired
580 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
581 *
582 * We need to do this before the first pci passthru device is attached.
483 */
583 */
484 iommu_invalidate_tlb(host_domain);
584 if (ppt_num_devices(vm) == 0) {
585 KASSERT(vm->iommu == NULL,
586 ("vm_assign_pptdev: iommu must be NULL"));
587 maxaddr = vmm_mem_maxaddr();
588 vm->iommu = iommu_create_domain(maxaddr);
485
589
486 vm->num_mem_segs++;
590 error = vm_gpa_wire(vm);
591 if (error)
592 return (error);
487
593
488 return (0);
594 vm_iommu_map(vm);
595 }
596
597 error = ppt_assign_device(vm, bus, slot, func);
598 return (error);
489}
490
599}
600
491vm_paddr_t
492vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
601void *
602vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
603 void **cookie)
493{
604{
494 vm_paddr_t nextpage;
605 int count, pageoff;
606 vm_page_t m;
495
607
496 nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
497 if (len > nextpage - gpa)
498 panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
608 pageoff = gpa & PAGE_MASK;
609 if (len > PAGE_SIZE - pageoff)
610 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
499
611
500 return (VMMMAP_GET(vm->cookie, gpa));
612 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
613 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
614
615 if (count == 1) {
616 *cookie = m;
617 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
618 } else {
619 *cookie = NULL;
620 return (NULL);
621 }
501}
502
622}
623
624void
625vm_gpa_release(void *cookie)
626{
627 vm_page_t m = cookie;
628
629 vm_page_lock(m);
630 vm_page_unhold(m);
631 vm_page_unlock(m);
632}
633
503int
504vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
505 struct vm_memory_segment *seg)
506{
507 int i;
508
509 for (i = 0; i < vm->num_mem_segs; i++) {
510 if (gpabase == vm->mem_segs[i].gpa) {
634int
635vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
636 struct vm_memory_segment *seg)
637{
638 int i;
639
640 for (i = 0; i < vm->num_mem_segs; i++) {
641 if (gpabase == vm->mem_segs[i].gpa) {
511 *seg = vm->mem_segs[i];
642 seg->gpa = vm->mem_segs[i].gpa;
643 seg->len = vm->mem_segs[i].len;
644 seg->wired = vm->mem_segs[i].wired;
512 return (0);
513 }
514 }
515 return (-1);
516}
517
518int
645 return (0);
646 }
647 }
648 return (-1);
649}
650
651int
652vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
653 vm_offset_t *offset, struct vm_object **object)
654{
655 int i;
656 size_t seg_len;
657 vm_paddr_t seg_gpa;
658 vm_object_t seg_obj;
659
660 for (i = 0; i < vm->num_mem_segs; i++) {
661 if ((seg_obj = vm->mem_segs[i].object) == NULL)
662 continue;
663
664 seg_gpa = vm->mem_segs[i].gpa;
665 seg_len = vm->mem_segs[i].len;
666
667 if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
668 *offset = gpa - seg_gpa;
669 *object = seg_obj;
670 vm_object_reference(seg_obj);
671 return (0);
672 }
673 }
674
675 return (EINVAL);
676}
677
678int
519vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
520{
521
522 if (vcpu < 0 || vcpu >= VM_MAXCPU)
523 return (EINVAL);
524
525 if (reg >= VM_REG_LAST)
526 return (EINVAL);

--- 101 unchanged lines hidden ---

628 /* save guest FPU state */
629 fpu_stop_emulating();
630 fpusave(vcpu->guestfpu);
631 fpu_start_emulating();
632}
633
634static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
635
679vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
680{
681
682 if (vcpu < 0 || vcpu >= VM_MAXCPU)
683 return (EINVAL);
684
685 if (reg >= VM_REG_LAST)
686 return (EINVAL);

--- 101 unchanged lines hidden ---

788 /* save guest FPU state */
789 fpu_stop_emulating();
790 fpusave(vcpu->guestfpu);
791 fpu_start_emulating();
792}
793
794static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
795
796static int
797vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
798{
799 int error;
800
801 vcpu_assert_locked(vcpu);
802
803 /*
804 * The following state transitions are allowed:
805 * IDLE -> FROZEN -> IDLE
806 * FROZEN -> RUNNING -> FROZEN
807 * FROZEN -> SLEEPING -> FROZEN
808 */
809 switch (vcpu->state) {
810 case VCPU_IDLE:
811 case VCPU_RUNNING:
812 case VCPU_SLEEPING:
813 error = (newstate != VCPU_FROZEN);
814 break;
815 case VCPU_FROZEN:
816 error = (newstate == VCPU_FROZEN);
817 break;
818 default:
819 error = 1;
820 break;
821 }
822
823 if (error == 0)
824 vcpu->state = newstate;
825 else
826 error = EBUSY;
827
828 return (error);
829}
830
831static void
832vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
833{
834 int error;
835
836 if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
837 panic("Error %d setting state to %d\n", error, newstate);
838}
839
840static void
841vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
842{
843 int error;
844
845 if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
846 panic("Error %d setting state to %d", error, newstate);
847}
848
849/*
850 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
851 */
852static int
853vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
854{
855 struct vcpu *vcpu;
856 int sleepticks, t;
857
858 vcpu = &vm->vcpu[vcpuid];
859
860 vcpu_lock(vcpu);
861
862 /*
863 * Figure out the number of host ticks until the next apic
864 * timer interrupt in the guest.
865 */
866 sleepticks = lapic_timer_tick(vm, vcpuid);
867
868 /*
869 * If the guest local apic timer is disabled then sleep for
870 * a long time but not forever.
871 */
872 if (sleepticks < 0)
873 sleepticks = hz;
874
875 /*
876 * Do a final check for pending NMI or interrupts before
877 * really putting this thread to sleep.
878 *
879 * These interrupts could have happened any time after we
880 * returned from VMRUN() and before we grabbed the vcpu lock.
881 */
882 if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
883 if (sleepticks <= 0)
884 panic("invalid sleepticks %d", sleepticks);
885 t = ticks;
886 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
887 msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
888 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
889 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
890 }
891 vcpu_unlock(vcpu);
892
893 return (0);
894}
895
896static int
897vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
898{
899 int rv, ftype;
900 struct vm_map *map;
901 struct vcpu *vcpu;
902 struct vm_exit *vme;
903
904 vcpu = &vm->vcpu[vcpuid];
905 vme = &vcpu->exitinfo;
906
907 ftype = vme->u.paging.fault_type;
908 KASSERT(ftype == VM_PROT_READ ||
909 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
910 ("vm_handle_paging: invalid fault_type %d", ftype));
911
912 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
913 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
914 vme->u.paging.gpa, ftype);
915 if (rv == 0)
916 goto done;
917 }
918
919 map = &vm->vmspace->vm_map;
920 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
921
922 VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
923 rv, vme->u.paging.gpa, ftype);
924
925 if (rv != KERN_SUCCESS)
926 return (EFAULT);
927done:
928 /* restart execution at the faulting instruction */
929 vme->inst_length = 0;
930
931 return (0);
932}
933
934static int
935vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
936{
937 struct vie *vie;
938 struct vcpu *vcpu;
939 struct vm_exit *vme;
940 int error, inst_length;
941 uint64_t rip, gla, gpa, cr3;
942
943 vcpu = &vm->vcpu[vcpuid];
944 vme = &vcpu->exitinfo;
945
946 rip = vme->rip;
947 inst_length = vme->inst_length;
948
949 gla = vme->u.inst_emul.gla;
950 gpa = vme->u.inst_emul.gpa;
951 cr3 = vme->u.inst_emul.cr3;
952 vie = &vme->u.inst_emul.vie;
953
954 vie_init(vie);
955
956 /* Fetch, decode and emulate the faulting instruction */
957 if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
958 return (EFAULT);
959
960 if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
961 return (EFAULT);
962
963 /* return to userland unless this is a local apic access */
964 if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
965 *retu = TRUE;
966 return (0);
967 }
968
969 error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
970 lapic_mmio_read, lapic_mmio_write, 0);
971
972 /* return to userland to spin up the AP */
973 if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
974 *retu = TRUE;
975
976 return (error);
977}
978
636int
637vm_run(struct vm *vm, struct vm_run *vmrun)
638{
979int
980vm_run(struct vm *vm, struct vm_run *vmrun)
981{
639 int error, vcpuid, sleepticks, t;
982 int error, vcpuid;
640 struct vcpu *vcpu;
641 struct pcb *pcb;
642 uint64_t tscval, rip;
643 struct vm_exit *vme;
983 struct vcpu *vcpu;
984 struct pcb *pcb;
985 uint64_t tscval, rip;
986 struct vm_exit *vme;
987 boolean_t retu;
988 pmap_t pmap;
644
645 vcpuid = vmrun->cpuid;
646
647 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
648 return (EINVAL);
649
989
990 vcpuid = vmrun->cpuid;
991
992 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
993 return (EINVAL);
994
995 pmap = vmspace_pmap(vm->vmspace);
650 vcpu = &vm->vcpu[vcpuid];
996 vcpu = &vm->vcpu[vcpuid];
651 vme = &vmrun->vm_exit;
997 vme = &vcpu->exitinfo;
652 rip = vmrun->rip;
653restart:
654 critical_enter();
655
998 rip = vmrun->rip;
999restart:
1000 critical_enter();
1001
1002 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1003 ("vm_run: absurd pm_active"));
1004
656 tscval = rdtsc();
657
658 pcb = PCPU_GET(curpcb);
659 set_pcb_flags(pcb, PCB_FULL_IRET);
660
661 restore_guest_msrs(vm, vcpuid);
662 restore_guest_fpustate(vcpu);
663
1005 tscval = rdtsc();
1006
1007 pcb = PCPU_GET(curpcb);
1008 set_pcb_flags(pcb, PCB_FULL_IRET);
1009
1010 restore_guest_msrs(vm, vcpuid);
1011 restore_guest_fpustate(vcpu);
1012
1013 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
664 vcpu->hostcpu = curcpu;
1014 vcpu->hostcpu = curcpu;
665 error = VMRUN(vm->cookie, vcpuid, rip);
1015 error = VMRUN(vm->cookie, vcpuid, rip, pmap);
666 vcpu->hostcpu = NOCPU;
1016 vcpu->hostcpu = NOCPU;
1017 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
667
668 save_guest_fpustate(vcpu);
669 restore_host_msrs(vm, vcpuid);
670
671 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
672
1018
1019 save_guest_fpustate(vcpu);
1020 restore_host_msrs(vm, vcpuid);
1021
1022 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1023
673 /* copy the exit information */
674 bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
675
676 critical_exit();
677
1024 critical_exit();
1025
678 /*
679 * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
680 * is ready to run.
681 */
682 if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
683 vcpu_lock(vcpu);
684
685 /*
686 * Figure out the number of host ticks until the next apic
687 * timer interrupt in the guest.
688 */
689 sleepticks = lapic_timer_tick(vm, vcpuid);
690
691 /*
692 * If the guest local apic timer is disabled then sleep for
693 * a long time but not forever.
694 */
695 if (sleepticks < 0)
696 sleepticks = hz;
697
698 /*
699 * Do a final check for pending NMI or interrupts before
700 * really putting this thread to sleep.
701 *
702 * These interrupts could have happened any time after we
703 * returned from VMRUN() and before we grabbed the vcpu lock.
704 */
705 if (!vm_nmi_pending(vm, vcpuid) &&
706 lapic_pending_intr(vm, vcpuid) < 0) {
707 if (sleepticks <= 0)
708 panic("invalid sleepticks %d", sleepticks);
709 t = ticks;
710 msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
711 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1026 if (error == 0) {
1027 retu = FALSE;
1028 switch (vme->exitcode) {
1029 case VM_EXITCODE_HLT:
1030 error = vm_handle_hlt(vm, vcpuid, &retu);
1031 break;
1032 case VM_EXITCODE_PAGING:
1033 error = vm_handle_paging(vm, vcpuid, &retu);
1034 break;
1035 case VM_EXITCODE_INST_EMUL:
1036 error = vm_handle_inst_emul(vm, vcpuid, &retu);
1037 break;
1038 default:
1039 retu = TRUE; /* handled in userland */
1040 break;
712 }
1041 }
1042 }
713
1043
714 vcpu_unlock(vcpu);
715
1044 if (error == 0 && retu == FALSE) {
716 rip = vme->rip + vme->inst_length;
717 goto restart;
718 }
719
1045 rip = vme->rip + vme->inst_length;
1046 goto restart;
1047 }
1048
1049 /* copy the exit information */
1050 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
720 return (error);
721}
722
723int
724vm_inject_event(struct vm *vm, int vcpuid, int type,
725 int vector, uint32_t code, int code_valid)
726{
727 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

--- 136 unchanged lines hidden ---

864void *
865vm_iommu_domain(struct vm *vm)
866{
867
868 return (vm->iommu);
869}
870
871int
1051 return (error);
1052}
1053
1054int
1055vm_inject_event(struct vm *vm, int vcpuid, int type,
1056 int vector, uint32_t code, int code_valid)
1057{
1058 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

--- 136 unchanged lines hidden ---

1195void *
1196vm_iommu_domain(struct vm *vm)
1197{
1198
1199 return (vm->iommu);
1200}
1201
1202int
872vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
1203vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
873{
874 int error;
875 struct vcpu *vcpu;
876
877 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
878 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
879
880 vcpu = &vm->vcpu[vcpuid];
881
882 vcpu_lock(vcpu);
1204{
1205 int error;
1206 struct vcpu *vcpu;
1207
1208 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1209 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1210
1211 vcpu = &vm->vcpu[vcpuid];
1212
1213 vcpu_lock(vcpu);
883
884 /*
885 * The following state transitions are allowed:
886 * IDLE -> RUNNING -> IDLE
887 * IDLE -> CANNOT_RUN -> IDLE
888 */
889 if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
890 (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
891 error = 0;
892 vcpu->state = state;
893 } else {
894 error = EBUSY;
895 }
896
1214 error = vcpu_set_state_locked(vcpu, newstate);
897 vcpu_unlock(vcpu);
898
899 return (error);
900}
901
902enum vcpu_state
903vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
904{

--- 69 unchanged lines hidden ---

974 int hostcpu;
975 struct vcpu *vcpu;
976
977 vcpu = &vm->vcpu[vcpuid];
978
979 vcpu_lock(vcpu);
980 hostcpu = vcpu->hostcpu;
981 if (hostcpu == NOCPU) {
1215 vcpu_unlock(vcpu);
1216
1217 return (error);
1218}
1219
1220enum vcpu_state
1221vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1222{

--- 69 unchanged lines hidden ---

1292 int hostcpu;
1293 struct vcpu *vcpu;
1294
1295 vcpu = &vm->vcpu[vcpuid];
1296
1297 vcpu_lock(vcpu);
1298 hostcpu = vcpu->hostcpu;
1299 if (hostcpu == NOCPU) {
982 /*
983 * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
984 * the host thread must be sleeping waiting for an event to
985 * kick the vcpu out of 'hlt'.
986 *
987 * XXX this is racy because the condition exists right before
988 * and after calling VMRUN() in vm_run(). The wakeup() is
989 * benign in this case.
990 */
991 if (vcpu->state == VCPU_RUNNING)
1300 if (vcpu->state == VCPU_SLEEPING)
992 wakeup_one(vcpu);
993 } else {
994 if (vcpu->state != VCPU_RUNNING)
995 panic("invalid vcpu state %d", vcpu->state);
996 if (hostcpu != curcpu)
997 ipi_cpu(hostcpu, vmm_ipinum);
998 }
999 vcpu_unlock(vcpu);
1000}
1301 wakeup_one(vcpu);
1302 } else {
1303 if (vcpu->state != VCPU_RUNNING)
1304 panic("invalid vcpu state %d", vcpu->state);
1305 if (hostcpu != curcpu)
1306 ipi_cpu(hostcpu, vmm_ipinum);
1307 }
1308 vcpu_unlock(vcpu);
1309}
1310
1311struct vmspace *
1312vm_get_vmspace(struct vm *vm)
1313{
1314
1315 return (vm->vmspace);
1316}