/*-
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/11.0/usr.sbin/bhyve/task_switch.c 302373 2016-07-06 16:02:15Z ngie $");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
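
/*
 * An illustrative sketch of the hazard: with the signed fields of
 * 'struct i386tss', widening a value with bit 31 set sign-extends it,
 * e.g. an 'int' holding 0x80001000 becomes 0xffffffff80001000 when
 * assigned to a uint64_t register value. The unsigned fields above
 * zero-extend instead, yielding 0x0000000080001000.
 */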

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
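
/*
 * Example: for sel = 0x2b (index 5, TI = 0, RPL = 3) the descriptor
 * occupies bytes 0x28-0x2f of the GDT, so SEL_START(sel) = 0x28 and
 * SEL_LIMIT(sel) = 0x2f. TSS_BUSY() tests bit 1 of the system descriptor
 * type, which distinguishes a busy 32-bit TSS (SDT_SYS386BSY = 11) from
 * an available one (SDT_SYS386TSS = 9).
 */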

static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
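
/*
 * For example, the canonical flat 32-bit code segment descriptor (base 0,
 * limit 0xfffff, type 0x1b, dpl 0, present, def32 and gran set) translates
 * to base = 0, limit = 0xffffffff and access = 0xc09b.
 */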

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
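
/*
 * For example, a #TS raised for GDT selector 0x2b while delivering an
 * external interrupt (ext = 1) carries the error code
 * (0x2b & ~0x3) | 0x1 = 0x29.
 */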

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read or write the segment descriptor 'desc' at the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success, with '*faultptr' set if an exception was injected
 * into the guest, and EFAULT otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}
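
/*
 * Note that two iovecs suffice in desc_table_rw() because the 8-byte
 * descriptor can straddle at most one page boundary: for example, a
 * descriptor at guest linear address 0x1ffc is split by vm_copy_setup()
 * into 4 bytes mapped at 0x1ffc and 4 bytes mapped at 0x2000.
 */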

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success, with '*faultptr' set if an exception was injected
 * into the guest, and EFAULT otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
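
/*
 * In terms of the raw 5-bit type values tested above: 0x13 (accessed
 * read/write data) satisfies both stack_desc() and data_desc(); 0x1b
 * (accessed execute/read code) satisfies code_desc() and, being readable,
 * data_desc() as well, but never stack_desc(); execute-only code (0x18,
 * 0x19) satisfies only code_desc().
 */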

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 *
 * Returns 0 on success, with '*faultptr' set if an exception was injected
 * into the guest, and EFAULT otherwise.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	*faultptr = 0;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			*faultptr = 1;
			return (0);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		*faultptr = 1;
		return (0);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			*faultptr = 1;
			return (0);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			*faultptr = 1;
			return (0);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
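
/*
 * A worked example of the privilege checks above, for a new task whose
 * CS selector has RPL 3 (so CPL becomes 3): SS must have RPL == DPL == 3;
 * a non-conforming code segment is legal only when its DPL equals the
 * CPL, while a conforming one with DPL 0 is legal at any CPL; and a data
 * segment with DPL 1 faults whenever RPL or CPL exceeds 1.
 */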

static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
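				/*
				 * With the 36-bit MAXPHYADDR assumed above
				 * the mask works out to 0xfffffff0000001e6:
				 * bits 2:1 and 8:5 contribute 0x1e6 and bits
				 * 63:36 contribute the high part.
				 */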
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					*faultptr = 1;
					return (0);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}
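
/*
 * For example, a #PF delivered through a task gate to a busy 32-bit TSS
 * (SDT_SYS386BSY) pushes a 4-byte error code: with ESP = 0x1000 in the
 * new task, ESP is decremented to 0xffc and the error code is written
 * there through the iovecs set up for that guest linear address.
 */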

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error, fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
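
/*
 * For example, if desc_table_read() takes a page fault, vm_copy_setup()
 * injects #PF into the guest and reports fault = 1 with error = 0, so
 * CHKERR returns VMEXIT_CONTINUE and the guest handles the exception.
 * A hard failure (EFAULT) aborts the virtual machine instead.
 */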

int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}