task_switch.c revision 270159
/*-
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/task_switch.c 270159 2014-08-19 01:20:24Z grehan $");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>

#include <vmmapi.h>

#include "bhyverun.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
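/*
 * For illustration: with 'int' fields, an assignment such as
 * 'uint64_t esp = tss.tss_esp0' sign-extends whenever bit 31 of the
 * saved value is set, corrupting the upper half. The explicitly
 * unsigned fields below avoid that entire class of bug.
 */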
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
CTASSERT(sizeof(struct tss32) == 104);

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
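/*
 * Selector layout refresher (illustrative): bits 1:0 hold the RPL,
 * bit 2 is the table indicator (1 = LDT) and bits 15:3 the index.
 * E.g. selector 0x000f has RPL 3, references the LDT and selects
 * descriptor index 1, so SEL_START(0x000f) is 0x8 and
 * SEL_LIMIT(0x000f) is 0xf.
 */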
/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
	if (error == 0) {
		if (doread)
			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
		else
			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	}
	return (error);
}

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
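/*
 * Worked example of the type predicates above (illustrative):
 * sd_type 0x1a is a readable non-conforming code segment, so both
 * code_desc() and data_desc() accept it; sd_type 0x12 is read/write
 * data, so stack_desc() and data_desc() accept it but code_desc()
 * does not.
 */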
/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
	if (error)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
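	/*
	 * Illustrative example of the check above: a conforming code
	 * segment with DPL 0 may be entered from CPL 3 (cpl >= dpl is
	 * allowed), whereas a non-conforming one is accepted only when
	 * cpl == dpl.
	 */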
	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
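				/*
				 * Derivation of the constant below, for
				 * clarity: bits 8:5 are 0x1E0 and bits 2:1
				 * are 0x006, so the low reserved mask is
				 * 0x1E6.
				 */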
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
	if (error)
		return (error);
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}
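/*
 * Illustrative call site (mirrors the actual usage in vmexit_task_switch()
 * below): the helpers in this file share the 0/1/-1 return convention, so
 * callers typically do
 *
 *	error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
 *	    task_switch->errcode);
 *	CHKERR(error);
 */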
/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		return (1);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		return (1);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov));
	if (error)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 *  0: success
 * +1: an exception was injected into the guest vcpu
 * -1: unrecoverable/programming error
 */
#define	CHKERR(x)							\
	do {								\
		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\
		if ((x) == -1)						\
			return (VMEXIT_ABORT);				\
		else if ((x) == 1)					\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);
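	/*
	 * Task switch emulation is only reachable from protected mode:
	 * real mode has no TSS machinery and IA-32e mode does not allow
	 * hardware task switches at all (attempts raise #GP), hence the
	 * assertion above.
	 */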
	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
	CHKERR(error);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
	CHKERR(error);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
	CHKERR(error);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
	CHKERR(error);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
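	/*
	 * For reference: "busy" is bit 1 of the system-segment type,
	 * e.g. SDT_SYS386TSS (9, available) vs. SDT_SYS386BSY (11, busy);
	 * TSS_BUSY() and the '&= ~0x2' / '|= 0x2' updates below operate
	 * on exactly that bit.
	 */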
	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc);
		CHKERR(error);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	eip = vmexit->rip + vmexit->inst_length;
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc);
		CHKERR(error);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	vmexit->rip = newtss.tss_eip;
	vmexit->inst_length = 0;

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
	CHKERR(error);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode);
		CHKERR(error);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation",
	 * Intel SDM, Vol 3: if the "virtual NMIs" control is 1, IRET removes
	 * any virtual-NMI blocking. This unblocking occurs even if the IRET
	 * causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}