/*	$OpenBSD: locore.S,v 1.147 2024/03/17 05:49:41 guenther Exp $	*/
/*	$NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $	*/

/*
 * Copyright-o-rama!
 */

/*
 * Copyright (c) 2001 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*-
 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)locore.s	7.3 (Berkeley) 5/13/91
 */

#include "assym.h"
#include "efi.h"
#include "lapic.h"
#include "ksyms.h"
#include "xen.h"
#include "hyperv.h"

#include <sys/syscall.h>

#include <machine/param.h>
#include <machine/codepatch.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/trap.h>			/* T_PROTFLT */
#include <machine/frameasm.h>

#if NLAPIC > 0
#include <machine/i82489reg.h>
#endif

/*
 * override user-land alignment before including asm.h
 */
#define	ALIGN_DATA	.align	8,0xcc

#include <machine/asm.h>

#define SET_CURPROC(proc,cpu)			\
	movq	CPUVAR(SELF),cpu	;	\
	movq	proc,CPUVAR(CURPROC)	;	\
	movq	cpu,P_CPU(proc)

#define GET_CURPCB(reg)			movq	CPUVAR(CURPCB),reg
#define SET_CURPCB(reg)			movq	reg,CPUVAR(CURPCB)


/*
 * Initialization
 */
	.data

#if NLAPIC > 0
	.align	NBPG, 0xcc
	.globl	local_apic, lapic_id, lapic_tpr
local_apic:
	.space	LAPIC_ID
lapic_id:
	.long	0x00000000
	.space	LAPIC_TPRI-(LAPIC_ID+4)
lapic_tpr:
	.space	LAPIC_PPRI-LAPIC_TPRI
lapic_ppr:
	.space	LAPIC_ISR-LAPIC_PPRI
lapic_isr:
	.space	NBPG-LAPIC_ISR
#endif

/*****************************************************************************/

/*
 * Signal trampoline; copied to a page mapped into userspace.
 * gdb's backtrace logic matches against the instructions in this.
 */
	.section .rodata
	.globl	sigcode
sigcode:
	endbr64
	call	1f
	movq	%rsp,%rdi
	pushq	%rdi			/* fake return address */
	movq	$SYS_sigreturn,%rax
	.globl	sigcodecall
sigcodecall:
	syscall
	.globl	sigcoderet
sigcoderet:
	int3
1:	JMP_RETPOLINE(rax)
	.globl	esigcode
esigcode:
	.globl	sigfill
sigfill:
	int3
esigfill:
	.globl	sigfillsiz
sigfillsiz:
	.long	esigfill - sigfill

	.text
/*
 * void lgdt(struct region_descriptor *rdp);
 * Change the global descriptor table.
 */
NENTRY(lgdt)
	RETGUARD_SETUP(lgdt, r11)
	/* Reload the descriptor table. */
	movq	%rdi,%rax
	lgdt	(%rax)
	/* Flush the prefetch queue. */
	jmp	1f
	nop
1:	/* Reload "stale" selectors. */
	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
	movl	%eax,%ds
	movl	%eax,%es
	movl	%eax,%ss
	/* Reload code selector by doing intersegment return. */
	popq	%rax
	pushq	$GSEL(GCODE_SEL, SEL_KPL)
	pushq	%rax
	RETGUARD_CHECK(lgdt, r11)
	lretq
END(lgdt)

#if defined(DDB) || NEFI > 0
ENTRY(setjmp)
	RETGUARD_SETUP(setjmp, r11)
	/*
	 * Only save registers that must be preserved across function
	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
	 * and %rip.
	 */
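	/*
	 * For reference, the 64-byte buffer layout this implies, as a
	 * rough C sketch (illustrative only; the struct and field names
	 * are hypothetical, the real definition lives in the MD headers):
	 *
	 *	struct md_jmpbuf {
	 *		long mj_rbx;		// offset  0
	 *		long mj_rsp;		// offset  8
	 *		long mj_rbp;		// offset 16
	 *		long mj_r12_r15[4];	// offsets 24-48
	 *		long mj_rip;		// offset 56
	 *	};
	 */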
	movq	%rdi,%rax
	movq	%rbx,(%rax)
	movq	%rsp,8(%rax)
	movq	%rbp,16(%rax)
	movq	%r12,24(%rax)
	movq	%r13,32(%rax)
	movq	%r14,40(%rax)
	movq	%r15,48(%rax)
	movq	(%rsp),%rdx
	movq	%rdx,56(%rax)
	xorl	%eax,%eax
	RETGUARD_CHECK(setjmp, r11)
	ret
	lfence
END(setjmp)

ENTRY(longjmp)
	movq	%rdi,%rax
	movq	8(%rax),%rsp
	movq	56(%rax),%rdx
	movq	%rdx,(%rsp)
	RETGUARD_SETUP(longjmp, r11)
	movq	(%rax),%rbx
	movq	16(%rax),%rbp
	movq	24(%rax),%r12
	movq	32(%rax),%r13
	movq	40(%rax),%r14
	movq	48(%rax),%r15
	xorl	%eax,%eax
	incl	%eax
	RETGUARD_CHECK(longjmp, r11)
	ret
	lfence
END(longjmp)
#endif /* DDB || NEFI > 0 */

/*****************************************************************************/

/*
 * int cpu_switchto(struct proc *old, struct proc *new)
 * Switch from "old" proc to "new".
 */
ENTRY(cpu_switchto)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movq	%rdi, %r13
	movq	%rsi, %r12

	/* Record new proc. */
	movb	$SONPROC,P_STAT(%r12)	# p->p_stat = SONPROC
	SET_CURPROC(%r12,%rcx)

	movl	CPUVAR(CPUID),%r9d

	/* for the FPU/"extended CPU state" handling below */
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx

	/* If old proc exited, don't bother. */
	xorl	%ecx,%ecx
	testq	%r13,%r13
	jz	switch_exited

	/*
	 * Save old context.
	 *
	 * Registers:
	 *   %rax - scratch
	 *   %r13 - old proc, then old pcb
	 *   %rcx - old pmap if not P_SYSTEM
	 *   %r12 - new proc
	 *   %r9d - cpuid
	 */

	/* remember the pmap if not P_SYSTEM */
	testl	$P_SYSTEM,P_FLAG(%r13)
	movq	P_ADDR(%r13),%r13
	jnz	0f
	movq	PCB_PMAP(%r13),%rcx
0:

	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%r13)
	movq	%rbp,PCB_RBP(%r13)

	/*
	 * If the old proc ran in userspace then save the
	 * floating-point/"extended state" registers.
	 */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lxstate_reset

	movq	%r13, %rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)

switch_exited:
	/* now clear the xstate */
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	andl	$~CPUPF_USERXSTATE,CPUVAR(PFLAGS)

.Lxstate_reset:
	/*
	 * If the segment registers haven't been reset since the old proc
	 * ran in userspace then reset them now.
	 */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	restore_saved
	andl	$~CPUPF_USERSEGS,CPUVAR(PFLAGS)

	/* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
	movw	$(GSEL(GUDATA_SEL, SEL_UPL)),%ax
	movw	%ax,%ds
	movw	%ax,%es
	movw	%ax,%fs
	cli			/* block interrupts when on user GS.base */
	swapgs			/* switch from kernel to user GS.base */
	movw	%ax,%gs		/* set %gs to UDATA and GS.base to 0 */
	swapgs			/* back to kernel GS.base */
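	/*
	 * In C terms, the lazy-state handling above is roughly this
	 * (a sketch; ci_pflags and pcb_savefpu are the cpu_info/pcb
	 * fields the PFLAGS/PCB_SAVEFPU offsets refer to):
	 *
	 *	if (ci->ci_pflags & CPUPF_USERXSTATE)	// FPU regs live?
	 *		xsave(&oldpcb->pcb_savefpu);
	 *	if (ci->ci_pflags & CPUPF_USERSEGS)	// user segs live?
	 *		reset_user_segment_regs();	// %ds/%es/%fs/%gs
	 */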

restore_saved:
	/*
	 * Restore saved context.
	 *
	 * Registers:
	 *   %rax, %rdx - scratch
	 *   %rcx - old pmap if not P_SYSTEM
	 *   %r12 - new process
	 *   %r13 - new pcb
	 *   %rbx - new pmap if not P_SYSTEM
	 */

	movq	P_ADDR(%r12),%r13

	/* remember the pmap if not P_SYSTEM */
	xorl	%ebx,%ebx
	testl	$P_SYSTEM,P_FLAG(%r12)
	jnz	1f
	movq	PCB_PMAP(%r13),%rbx
1:

	/* No interrupts while loading new state. */
	cli

	/* Restore stack pointers. */
	movq	PCB_RSP(%r13),%rsp
	movq	PCB_RBP(%r13),%rbp

	/* Stack pivot done, setup RETGUARD */
	RETGUARD_SETUP_OFF(cpu_switchto, r11, 6*8)

	/* don't switch cr3 to the same thing it already was */
	movq	PCB_CR3(%r13),%rax
	movq	%cr3,%rdi
	xorq	%rax,%rdi
	btrq	$63,%rdi	/* ignore CR3_REUSE_PCID */
	testq	%rdi,%rdi
	jz	.Lsame_cr3

#ifdef DIAGNOSTIC
	/* verify ci_proc_pmap had been updated properly */
	cmpq	%rcx,CPUVAR(PROC_PMAP)
	jnz	.Lbogus_proc_pmap
#endif
	/* record which pmap this CPU should get IPIs for */
	movq	%rbx,CPUVAR(PROC_PMAP)

.Lset_cr3:
	movq	%rax,%cr3			/* %rax used below too */

.Lsame_cr3:
	/*
	 * If we switched from a userland thread with a shallow call stack
	 * (e.g. interrupt->ast->mi_ast->preempt->mi_switch->cpu_switchto)
	 * then the RSB may have attacker-controlled entries when we switch
	 * to a deeper call stack in the new thread.  Refill the RSB with
	 * entries safe to speculate into/through.
	 */
	RET_STACK_REFILL_WITH_RCX

	/* Don't bother with the rest if switching to a system process. */
	testq	%rbx,%rbx
	jz	switch_restored

	/* record the bits needed for future U-->K transition */
	movq	PCB_KSTACK(%r13),%rdx
	subq	$FRAMESIZE,%rdx
	movq	%rdx,CPUVAR(KERN_RSP)

	CODEPATCH_START
	/*
	 * Meltdown: iff we're doing separate U+K and U-K page tables,
	 * then record them in cpu_info for easy access in syscall and
	 * interrupt trampolines.
	 */
	movq	PM_PDIRPA_INTEL(%rbx),%rdx
	orq	cr3_reuse_pcid,%rax
	orq	cr3_pcid_proc_intel,%rdx
	movq	%rax,CPUVAR(KERN_CR3)
	movq	%rdx,CPUVAR(USER_CR3)
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)

switch_restored:
	SET_CURPCB(%r13)

	/* Interrupts are okay again. */
	sti
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	RETGUARD_CHECK(cpu_switchto, r11)
	ret
	lfence

#ifdef DIAGNOSTIC
.Lbogus_proc_pmap:
	leaq	bogus_proc_pmap,%rdi
	call	panic
	int3	/* NOTREACHED */
	.pushsection .rodata
bogus_proc_pmap:
	.asciz	"curcpu->ci_proc_pmap didn't point to previous pmap"
	.popsection
#endif /* DIAGNOSTIC */
END(cpu_switchto)

NENTRY(retpoline_rax)
	CODEPATCH_START
	JMP_RETPOLINE(rax)
	CODEPATCH_END(CPTAG_RETPOLINE_RAX)
END(retpoline_rax)

NENTRY(__x86_indirect_thunk_r11)
	CODEPATCH_START
	JMP_RETPOLINE(r11)
	CODEPATCH_END(CPTAG_RETPOLINE_R11)
END(__x86_indirect_thunk_r11)

ENTRY(cpu_idle_cycle_hlt)
	RETGUARD_SETUP(cpu_idle_cycle_hlt, r11)
	sti
	hlt
	RETGUARD_CHECK(cpu_idle_cycle_hlt, r11)
	ret
	lfence
END(cpu_idle_cycle_hlt)

/*
 * savectx(struct pcb *pcb);
 * Update pcb, saving current processor state.
 */
ENTRY(savectx)
	RETGUARD_SETUP(savectx, r11)
	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%rdi)
	movq	%rbp,PCB_RBP(%rdi)
	RETGUARD_CHECK(savectx, r11)
	ret
	lfence
END(savectx)
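/*
 * For orientation, the register state the syscall instruction hands us
 * (the standard amd64 syscall convention, summarized here rather than
 * defined by this file):
 *	%rax - syscall number			%rcx - user %rip
 *	%r11 - user rflags			%rsp - still the user stack
 *	%rdi,%rsi,%rdx,%r10,%r8,%r9 - syscall arguments
 */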
/*
 * syscall insn entry.
 * Enter here with interrupts blocked; %rcx contains the caller's
 * %rip and the original rflags has been copied to %r11.  %cs and
 * %ss have been updated to the kernel segments, but %rsp is still
 * the user-space value.
 * First order of business is to swap to the kernel GS.base so that
 * we can access our struct cpu_info.  After possibly mucking with
 * pagetables, we switch to our kernel stack.  Once that's in place
 * we can save the rest of the syscall frame and unblock interrupts.
 */
KUTEXT_PAGE_START
	.align	NBPG, 0xcc
XUsyscall_meltdown:
	/*
	 * This is the real Xsyscall_meltdown page, which is mapped into
	 * the U-K page tables at the same location as Xsyscall_meltdown
	 * below.  For this, the Meltdown case, we use the scratch space
	 * in cpu_info so we can switch to the kernel page tables
	 * (thank you, Intel), at which point we'll continue at the
	 * "SYSCALL_ENTRY" after Xsyscall below.
	 * In case the CPU speculates past the mov to cr3, we put a
	 * retpoline-style pause-lfence-jmp-to-pause loop.
	 */
	endbr64
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
0:	pause
	lfence
	jmp	0b
KUTEXT_PAGE_END

KTEXT_PAGE_START
	.align	NBPG, 0xcc
GENTRY(Xsyscall_meltdown)
	/* pad to match real Xsyscall_meltdown positioning above */
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
GENTRY(Xsyscall)
	endbr64
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	SYSCALL_ENTRY			/* create trapframe */
	sti

	movq	CPUVAR(CURPROC),%r14
	movq	%rsp,P_MD_REGS(%r14)	# save pointer to frame
	andl	$~MDP_IRET,P_MD_FLAGS(%r14)
	movq	%rsp,%rdi
	call	syscall

.Lsyscall_check_asts:
	/* Check for ASTs on exit to user mode. */
	cli
	CHECK_ASTPENDING(%r11)
	je	2f
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	ast
	jmp	.Lsyscall_check_asts

2:
#ifdef DIAGNOSTIC
	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
	jne	.Lsyscall_spl_not_lowered
#endif /* DIAGNOSTIC */

	/* Could registers have been changed that require an iretq? */
	testl	$MDP_IRET, P_MD_FLAGS(%r14)
	jne	intr_user_exit_post_ast

	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lsyscall_restore_xstate

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	.Lsyscall_restore_fsbase

.Lsyscall_restore_registers:
	/*
	 * If the pmap we're now on isn't the same as the one we
	 * were on last time we were in userspace, then use IBPB
	 * to prevent cross-process branch-target injection.
	 */
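	/*
	 * In C terms, the patchable block below is roughly this sketch
	 * (ci_proc_pmap is named by the DIAGNOSTIC panic string above;
	 * ci_user_pmap is inferred from the CPUVAR(USER_PMAP) offset):
	 *
	 *	if (ci->ci_proc_pmap != ci->ci_user_pmap) {
	 *		wrmsr(MSR_PRED_CMD, PRED_CMD_IBPB);
	 *		ci->ci_user_pmap = ci->ci_proc_pmap;
	 *	}
	 */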
	CODEPATCH_START
	movq	CPUVAR(PROC_PMAP),%rbx
	cmpq	CPUVAR(USER_PMAP),%rbx
	je	1f
	xorl	%edx,%edx
	movl	$PRED_CMD_IBPB,%eax
	movl	$MSR_PRED_CMD,%ecx
	wrmsr
	movq	%rbx,CPUVAR(USER_PMAP)
1:
	CODEPATCH_END(CPTAG_IBPB_NOP)
	call	pku_xonly
	RET_STACK_REFILL_WITH_RCX

	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBX(%rsp),%rbx
	movq	TF_RDX(%rsp),%rdx

	CODEPATCH_START
	xorl	%edi,%edi
	xorl	%esi,%esi
	xorl	%r11d,%r11d
	xorl	%eax,%eax
	xorl	%ecx,%ecx
	movw	%ds,TF_R8(%rsp)
	verw	TF_R8(%rsp)
	CODEPATCH_END(CPTAG_MDS)

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RBP(%rsp),%rbp

	/*
	 * We need to finish reading from the trapframe, then switch
	 * to the user page tables, swapgs, and return.  We need
	 * to get the final value for the register that was used
	 * for the mov to %cr3 from somewhere accessible on the
	 * user page tables, so save it in CPUVAR(SCRATCH) across
	 * the switch.
	 */
	movq	TF_RAX(%rsp),%rax
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RSP(%rsp),%rsp
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xsyscall_trampback:
0:	pause
	lfence
	jmp	0b
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs
	sysretq
END(Xsyscall)
END(Xsyscall_meltdown)
KTEXT_PAGE_END

KUTEXT_PAGE_START
	.space	(Xsyscall_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	%rax,%cr3
	movq	CPUVAR(SCRATCH),%rax
	swapgs
	sysretq
KUTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
	/* in this case, need FS.base but not xstate, rarely happens */
.Lsyscall_restore_fsbase:	/* CPU doesn't have curproc's FS.base */
	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
	jmp	.Lsyscall_restore_fsbase_real

	_ALIGN_TRAPS
.Lsyscall_restore_xstate:	/* CPU doesn't have curproc's xstate */
	orl	$(CPUPF_USERXSTATE|CPUPF_USERSEGS),CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	/* untouched state so can't fault */
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
#if PCB_SAVEFPU != 0
	subq	$PCB_SAVEFPU,%rdi
#endif
.Lsyscall_restore_fsbase_real:
	movq	PCB_FSBASE(%rdi),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lsyscall_restore_registers

#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
	leaq	spl_lowered(%rip), %rdi
	movl	TF_ERR(%rsp),%esi	/* syscall # stashed above */
	movl	TF_RDI(%rsp),%edx
	movl	%ebx,%ecx
	movl	CPUVAR(ILEVEL),%r8d
	xorq	%rax,%rax
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$IPL_NONE,CPUVAR(ILEVEL)
	jmp	.Lsyscall_check_asts

	.section .rodata
spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
	.text
#endif

NENTRY(proc_trampoline)
	call	proc_trampoline_mi
	movq	%r13,%rdi	/* argument for the entry function */
	movq	%r12,%rax	/* the entry function itself */
	call	retpoline_rax
	movq	CPUVAR(CURPROC),%r14
	jmp	.Lsyscall_check_asts
END(proc_trampoline)
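/*
 * How the two return-to-user paths fit together, in rough C terms
 * (a sketch; the real decision is the MDP_IRET test in the syscall
 * path above):
 *
 *	if (p->p_md.md_flags & MDP_IRET)
 *		intr_user_exit();	// full frame restore, iretq
 *	else
 *		sysretq_path();		// fast path above
 */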
/*
 * Returning to userspace via iretq.  We do things in this order:
 *  - check for ASTs
 *  - restore FPU/"extended CPU state" if it's not already in the CPU
 *  - DIAGNOSTIC: no more C calls after this, so check the SPL
 *  - restore FS.base if it's not already in the CPU
 *  - restore most registers
 *  - update the iret frame from the trapframe
 *  - finish reading from the trapframe
 *  - switch to the trampoline stack	\
 *  - jump to the .kutext segment	|-- Meltdown workaround
 *  - switch to the user page tables	/
 *  - swapgs
 *  - iretq
 */
KTEXT_PAGE_START
	_ALIGN_TRAPS
GENTRY(intr_user_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_user_exit_not_blocked
#endif /* DIAGNOSTIC */

	/* Check for ASTs */
	CHECK_ASTPENDING(%r11)
	je	intr_user_exit_post_ast
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	ast
	cli
	jmp	intr_user_exit

intr_user_exit_post_ast:
	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lintr_restore_xstate

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	.Lintr_restore_fsbase

.Lintr_restore_registers:
#ifdef DIAGNOSTIC
	/* no more C calls after this, so check the SPL */
	cmpl	$0,CPUVAR(ILEVEL)
	jne	.Luser_spl_not_lowered
#endif /* DIAGNOSTIC */

	/*
	 * If the pmap we're now on isn't the same as the one we
	 * were on last time we were in userspace, then use IBPB
	 * to prevent cross-process branch-target injection.
	 */
	CODEPATCH_START
	movq	CPUVAR(PROC_PMAP),%rbx
	cmpq	CPUVAR(USER_PMAP),%rbx
	je	1f
	xorl	%edx,%edx
	movl	$PRED_CMD_IBPB,%eax
	movl	$MSR_PRED_CMD,%ecx
	wrmsr
	movq	%rbx,CPUVAR(USER_PMAP)
1:
	CODEPATCH_END(CPTAG_IBPB_NOP)
	call	pku_xonly
	RET_STACK_REFILL_WITH_RCX

	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBX(%rsp),%rbx

	CODEPATCH_START
	xorl	%edi,%edi
	xorl	%esi,%esi
	xorl	%r11d,%r11d
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ecx,%ecx
	movw	%ds,TF_R8(%rsp)
	verw	TF_R8(%rsp)
	CODEPATCH_END(CPTAG_MDS)

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RBP(%rsp),%rbp

	/*
	 * To get the final value for the register that was used
	 * for the mov to %cr3, we need somewhere accessible on the
	 * user page tables, so we save it in CPUVAR(SCRATCH) across
	 * the switch.
	 */
	/* update iret frame */
	movq	CPUVAR(INTR_RSP),%rdx
	movq	$(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,IRETQ_RIP(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,IRETQ_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,IRETQ_RSP(%rdx)
	movq	$(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
	/* finish with the trap frame */
	movq	TF_RAX(%rsp),%rax
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	/* switch to the trampoline stack */
	xchgq	%rdx,%rsp
	movq	TF_RDX(%rdx),%rdx
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xiretq_trampback:
KTEXT_PAGE_END
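/*
 * A note on the trampoline trick (summarizing the comments above and
 * below; the mappings themselves are set up in the pmap code): the
 * "KTEXT" page is mapped only in the kernel page tables, while the
 * "KUTEXT" page is mapped at the same virtual address in the user
 * (U-K) page tables.  A mov to %cr3 doesn't change %rip, so execution
 * continues at the next virtual address, now translated through the
 * other page -- which is why the .space directives must make the two
 * pages line up byte-for-byte.
 */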
/* the movq %cr3 switches to this "KUTEXT" page */
KUTEXT_PAGE_START
	.space	(Xiretq_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	CPUVAR(SCRATCH),%rax
.Liretq_swapgs:
	swapgs
doreti_iret_meltdown:
	iretq
KUTEXT_PAGE_END
/*
 * Back to the "KTEXT" page to fill in the speculation trap and the
 * swapgs+iretq used for non-Meltdown kernels.  This switching back
 * and forth between segments is so that we can do the .space
 * calculation below to guarantee the iretq's above and below line
 * up, so the 'doreti_iret' label lines up with the iretq whether
 * the CPU is affected by Meltdown or not.
 */
KTEXT_PAGE_START
0:	pause
	lfence
	jmp	0b
	.space	(.Liretq_swapgs - XUsyscall_meltdown) - \
		(. - Xsyscall_meltdown), 0xcc
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs

	.globl	doreti_iret
doreti_iret:
	iretq
KTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
.Lintr_restore_xstate:		/* CPU doesn't have curproc's xstate */
	orl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32, %rdx
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	//testl	%eax,%eax
	//jnz	.Lintr_xrstor_faulted
.Lintr_restore_fsbase:		/* CPU doesn't have curproc's FS.base */
	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdx
	movq	PCB_FSBASE(%rdx),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lintr_restore_registers

.Lintr_xrstor_faulted:
	/*
	 * xrstor faulted; we need to reset the FPU state and call trap()
	 * to post a signal, which requires interrupts be enabled.
	 */
	sti
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	jmp	recall_trap

#ifdef DIAGNOSTIC
.Lintr_user_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_user_exit

.Luser_spl_not_lowered:
	sti
	leaq	intr_spl_lowered(%rip),%rdi
	movl	CPUVAR(ILEVEL),%esi
	xorl	%edx,%edx	/* always SPL zero for userspace */
	xorl	%eax,%eax
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$0,CPUVAR(ILEVEL)
	cli
	jmp	intr_user_exit

	.section .rodata
intr_spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n"
	.text
#endif /* DIAGNOSTIC */
END(intr_user_exit)
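/*
 * intr_fast_exit below is the kernel-to-kernel return path: we never
 * left the kernel, so there is no swapgs, no page-table switch, and no
 * IBPB/MDS scrubbing -- just restore the interrupted register state
 * from the trapframe and iretq.
 */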

/*
 * Return to supervisor mode from trap or interrupt
 */
NENTRY(intr_fast_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_exit_not_blocked
#endif /* DIAGNOSTIC */
	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBP(%rsp),%rbp
	movq	TF_RBX(%rsp),%rbx
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	movq	TF_RAX(%rsp),%rax
	addq	$TF_RIP,%rsp
	iretq

#ifdef DIAGNOSTIC
.Lintr_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_fast_exit

	.data
	.global	warn_once
warn_once:
	.long	0
	.section .rodata
.Lnot_blocked:
	.asciz	"WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
	.text
#endif
END(intr_fast_exit)

/*
 * FPU/"extended CPU state" handling
 *	void xrstor_kern(sfp, mask)
 *		using first of xrstors/xrstor/fxrstor, load given state
 *		which is assumed to be trusted: i.e., unaltered from
 *		xsaves/xsaveopt/xsave/fxsave by kernel
 *	int xrstor_user(sfp, mask)
 *		using first of xrstor/fxrstor, load given state which might
 *		not be trustworthy: #GP faults will be caught; returns 0/1 if
 *		okay/it trapped.
 *	void fpusave(sfp)
 *		save current state, but retain it in the FPU
 *	void fpusavereset(sfp)
 *		save current state and reset FPU to initial/kernel state
 *	int xsetbv_user(reg, mask)
 *		load specified %xcr# register, returns 0/1 if okay/it trapped
 */

ENTRY(xrstor_kern)
	RETGUARD_SETUP(xrstor_kern, r11)
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	RETGUARD_CHECK(xrstor_kern, r11)
	ret
	lfence
END(xrstor_kern)

ENTRY(xrstor_user)
	RETGUARD_SETUP(xrstor_user, r11)
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xrstor_fault
xrstor_fault:
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTOR)
	xorl	%eax, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
	lfence
NENTRY(xrstor_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
	lfence
END(xrstor_user)

ENTRY(fpusave)
	RETGUARD_SETUP(fpusave, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)
	RETGUARD_CHECK(fpusave, r11)
	ret
	lfence
END(fpusave)

ENTRY(fpusavereset)
	RETGUARD_SETUP(fpusavereset, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	RETGUARD_CHECK(fpusavereset, r11)
	ret
	lfence
END(fpusavereset)

ENTRY(xsetbv_user)
	RETGUARD_SETUP(xsetbv_user, r11)
	movl	%edi, %ecx
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xsetbv_fault
xsetbv_fault:
	xsetbv
	xorl	%eax, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
	lfence
NENTRY(xsetbv_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
	lfence
END(xsetbv_user)

CODEPATCH_CODE(_xrstor,		xrstor64 (%rdi))
CODEPATCH_CODE(_xrstors,	xrstors64 (%rdi))
CODEPATCH_CODE(_xsave,		xsave64 (%rdi))
CODEPATCH_CODE(_xsaves,		xsaves64 (%rdi))
CODEPATCH_CODE(_xsaveopt,	xsaveopt64 (%rdi))
CODEPATCH_CODE(_pcid_set_reuse,
	orl	$(CR3_REUSE_PCID >> 32),CPUVAR(USER_CR3 + 4))
CODEPATCH_CODE_LEN(_jmprax,	jmp *%rax; int3)
CODEPATCH_CODE_LEN(_jmpr11,	jmp *%r11; int3)
CODEPATCH_CODE_LEN(_jmpr13,	jmp *%r13; int3)
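/*
 * pagezero below in rough C terms (a sketch): fill the page with
 * non-temporal stores so we don't pull a page of zeroes through the
 * cache, indexing up from -PAGE_SIZE so a single addq/jne pair both
 * steps and terminates the loop; the sfence then orders the
 * weakly-ordered movnti stores before returning.
 *
 *	for (off = -PAGE_SIZE; off != 0; off += 32)
 *		nontemporal_store64(page + PAGE_SIZE + off, 0);	// x4
 */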

ENTRY(pagezero)
	RETGUARD_SETUP(pagezero, r11)
	movq	$-PAGE_SIZE,%rdx
	subq	%rdx,%rdi	/* %rdi now points just past the page */
	xorq	%rax,%rax
1:
	movnti	%rax,(%rdi,%rdx)
	movnti	%rax,8(%rdi,%rdx)
	movnti	%rax,16(%rdi,%rdx)
	movnti	%rax,24(%rdi,%rdx)
	addq	$32,%rdx
	jne	1b
	sfence
	RETGUARD_CHECK(pagezero, r11)
	ret
	lfence
END(pagezero)

/* void pku_xonly(void) */
ENTRY(pku_xonly)
	movq	pg_xo,%rax	/* have PKU support? */
	cmpq	$0,%rax
	je	1f
	movl	$0,%ecx		/* force PKRU for xonly restriction */
	movl	$0,%edx
	movl	$PGK_VALUE,%eax	/* key0 normal, key1 is exec without read */
	wrpkru
1:	ret
	lfence
END(pku_xonly)

/* int rdmsr_safe(u_int msr, uint64_t *data) */
ENTRY(rdmsr_safe)
	RETGUARD_SETUP(rdmsr_safe, r10)

	movl	%edi, %ecx	/* u_int msr */
	.globl	rdmsr_safe_fault
rdmsr_safe_fault:
	rdmsr
	salq	$32, %rdx
	movl	%eax, %eax	/* zero-extend the low half */
	orq	%rdx, %rax
	movq	%rax, (%rsi)	/* *data */
	xorq	%rax, %rax

	RETGUARD_CHECK(rdmsr_safe, r10)
	ret
	lfence

NENTRY(rdmsr_resume)
	movl	$0x1, %eax
	RETGUARD_CHECK(rdmsr_safe, r10)
	ret
	lfence
END(rdmsr_safe)

#if NHYPERV > 0
/* uint64_t hv_hypercall_trampoline(uint64_t control, paddr_t input, paddr_t output) */
NENTRY(hv_hypercall_trampoline)
	endbr64
	mov	%rdx, %r8
	mov	%rsi, %rdx
	mov	%rdi, %rcx
	jmp	hv_hypercall_page
END(hv_hypercall_trampoline)
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	hv_hypercall_page
hv_hypercall_page:
	.skip	0x1000, 0xcc
#endif /* NHYPERV > 0 */

#if NXEN > 0
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	xen_hypercall_page
xen_hypercall_page:
	.skip	0x1000, 0xcc
#endif /* NXEN > 0 */
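/*
 * Both hypercall pages start life as int3 padding (0xcc).  The usual
 * convention -- described here for orientation, the actual setup lives
 * in the hyperv(4)/Xen attachment code, not in this file -- is that
 * the guest hands the page's physical address to the hypervisor at
 * attach time, and the hypervisor fills in the real hypercall entry
 * sequence.
 */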