task_switch.c revision 336189
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/usr.sbin/bhyve/task_switch.c 336189 2018-07-11 07:16:13Z araujo $");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
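
/*
 * Example: a selector is "index << 3 | TI | RPL", so for sel = 0x1b
 * (index 3, TI = 0, RPL = 3) SEL_START() yields 0x18, the byte offset of
 * the descriptor slot in the GDT, and SEL_LIMIT() yields 0x1f, the last
 * byte of that 8-byte slot that the descriptor-table limit must cover.
 * TSS_BUSY() tests bit 1 of a system-descriptor type, which is what
 * distinguishes a busy TSS (e.g. SDT_SYS386BSY) from an available one
 * (SDT_SYS386TSS).
 */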

static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
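
/*
 * For example, a flat 4 GiB descriptor has sd_gran set and a 20-bit limit
 * of 0xfffff; the conversion above yields a byte-granular limit of
 * 0xffffffff, since a page-granular limit covers (limit + 1) * 4 KiB - 1
 * bytes.  The 'access' word is assembled in the layout that is later
 * handed to vm_set_desc() via update_seg_desc(): type in the low bits,
 * then DPL and present, with the D/B and granularity flags in bits 14
 * and 15.
 */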

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
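
/*
 * For example, injecting #TS for selector 0x2b (GDT index 5, RPL 3) with
 * 'ext' set produces the error code 0x29: the index and TI bits are kept,
 * while the RPL bits are replaced by IDT=0 and EXT=1.
 */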

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}
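
/*
 * For instance, with a GDT limit of 0x7f (16 descriptor slots) the check
 * above rejects selector 0x80: SEL_LIMIT(0x80) is 0x87, which lies beyond
 * the last byte covered by the table.
 */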

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

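/*
 * The sd_type field in 'struct user_segment_descriptor' is 5 bits wide and
 * combines the descriptor's S flag (bit 4, set for code and data segments)
 * with the 4-bit architectural type: bit 3 is executable, bit 2 is
 * conforming (code) or expand-down (data), bit 1 is readable (code) or
 * writable (data), and bit 0 is accessed.  The predicates below rely on
 * that encoding: 0x18 matches any code segment, 0x12 a writable data
 * segment and 0x1a a readable code segment.
 */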
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}
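
/*
 * tss32_save() only updates the fields that a hardware task switch would
 * write: general purpose registers, segment selectors, eflags and eip.
 * The caller reads the entire old TSS into 'tss' beforehand, so the
 * vm_copyout() of the full structure writes the remaining fields back
 * unchanged.
 */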

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
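
/*
 * In other words: a non-zero 'error' (EFAULT from the copy helpers) means
 * the emulation itself failed and the VM is aborted, whereas 'fault' means
 * a fault or exception was injected into the guest, so we simply resume
 * the guest and let it handle the event.
 */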

int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the "virtual NMIs" VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * if the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking.  This unblocking of virtual-NMI occurs even if IRET
	 * causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}
