/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
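/*
 * For example, with sel = 0x1b (index 3, TI = 0, RPL = 3) the descriptor
 * occupies bytes 0x18 through 0x1f of the GDT: SEL_START() yields the
 * offset of its first byte and SEL_LIMIT() the offset of its last byte,
 * which must not exceed the descriptor table limit.
 */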

static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

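/*
 * Convert a 'struct user_segment_descriptor' into the 'struct seg_desc'
 * form used by vmm(4). The 'access' field follows the guest segment
 * access-rights layout: type/S in bits 0:4, DPL in bits 5:6, P in bit 7
 * and AVL, D/B and G in bits 12, 14 and 15; bit 16 marks an unusable
 * segment (see the NULL selector handling in validate_seg_desc()).
 */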
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read or write the segment descriptor 'desc' from or to the GDT/LDT slot
 * referenced by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

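/*
 * Descriptor type predicates. For user (S = 1) descriptors, bit 3 of
 * 'sd_type' distinguishes code from data, bit 1 is writable (data) or
 * readable (code) and bit 2 is expand-down (data) or conforming (code).
 * See "Segment Descriptor Types", Intel SDM, Vol 3.
 */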
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

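/*
 * Save the outgoing task's register state into the old TSS image 'tss'
 * and copy it back out to the guest memory described by 'iov'.
 */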
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

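/*
 * Handle a task switch VM exit: save the outgoing task's state into the
 * old TSS, load the incoming task's state from the new TSS, update the
 * task register and the busy bits in the TSS descriptors, and push an
 * error code on the new task's stack if the switch was caused by an
 * exception that provides one.
 */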
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from the NMI handler
	 * task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation",
	 * Intel SDM, Vol 3: if the "virtual NMIs" VM-execution control is 1,
	 * IRET removes any virtual-NMI blocking. This unblocking occurs even
	 * if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}