/*	$OpenBSD: locore.S,v 1.147 2024/03/17 05:49:41 guenther Exp $	*/
/*	$NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $	*/

/*
 * Copyright-o-rama!
 */

/*
 * Copyright (c) 2001 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*-
 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)locore.s	7.3 (Berkeley) 5/13/91
 */

#include "assym.h"
#include "efi.h"
#include "lapic.h"
#include "ksyms.h"
#include "xen.h"
#include "hyperv.h"

#include <sys/syscall.h>

#include <machine/param.h>
#include <machine/codepatch.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/trap.h>			/* T_PROTFLT */
#include <machine/frameasm.h>

#if NLAPIC > 0
#include <machine/i82489reg.h>
#endif

/*
 * override user-land alignment before including asm.h
 */
#define	ALIGN_DATA	.align	8,0xcc

#include <machine/asm.h>

#define SET_CURPROC(proc,cpu)			\
	movq	CPUVAR(SELF),cpu	;	\
	movq	proc,CPUVAR(CURPROC)      ;	\
	movq	cpu,P_CPU(proc)

#define GET_CURPCB(reg)			movq	CPUVAR(CURPCB),reg
#define SET_CURPCB(reg)			movq	reg,CPUVAR(CURPCB)


/*
 * Initialization
 */
	.data

#if NLAPIC > 0
	.align	NBPG, 0xcc
	.globl	local_apic, lapic_id, lapic_tpr
local_apic:
	.space	LAPIC_ID
lapic_id:
	.long	0x00000000
	.space	LAPIC_TPRI-(LAPIC_ID+4)
lapic_tpr:
	.space	LAPIC_PPRI-LAPIC_TPRI
lapic_ppr:
	.space	LAPIC_ISR-LAPIC_PPRI
lapic_isr:
	.space	NBPG-LAPIC_ISR
#endif
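/*
 * Note: in memory-mapped (non-x2APIC) mode the page above is expected to
 * be remapped by the lapic code so that these symbols overlay the local
 * APIC's ID/TPR/PPR/ISR registers and can be accessed by name.
 */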

/*****************************************************************************/

/*
 * Signal trampoline; copied to a page mapped into userspace.
 * gdb's backtrace logic matches against the instructions in this.
 */
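/*
 * The kernel points userland %rip here with the handler address in %rax
 * (see sendsig()): "call 1f" reaches the handler through the retpoline
 * below, and when the handler returns we push a fake return address and
 * invoke SYS_sigreturn on the sigcontext that %rsp points at.
 */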
	.section .rodata
	.globl	sigcode
sigcode:
	endbr64
	call	1f
	movq	%rsp,%rdi
	pushq	%rdi			/* fake return address */
	movq	$SYS_sigreturn,%rax
	.globl sigcodecall
sigcodecall:
	syscall
	.globl	sigcoderet
sigcoderet:
	int3
1:	JMP_RETPOLINE(rax)
	.globl	esigcode
esigcode:
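/*
 * int3 filler pattern: the MD code is expected to replicate this over the
 * unused remainder of the signal trampoline page so stray jumps trap
 * instead of executing.
 */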
	.globl	sigfill
sigfill:
	int3
esigfill:
	.globl	sigfillsiz
sigfillsiz:
	.long	esigfill - sigfill

	.text
/*
 * void lgdt(struct region_descriptor *rdp);
 * Change the global descriptor table.
 */
NENTRY(lgdt)
	RETGUARD_SETUP(lgdt, r11)
	/* Reload the descriptor table. */
	movq	%rdi,%rax
	lgdt	(%rax)
	/* Flush the prefetch q. */
	jmp	1f
	nop
1:	/* Reload "stale" selectors. */
	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
	movl	%eax,%ds
	movl	%eax,%es
	movl	%eax,%ss
	/* Reload code selector by doing intersegment return. */
	popq	%rax
	pushq	$GSEL(GCODE_SEL, SEL_KPL)
	pushq	%rax
	RETGUARD_CHECK(lgdt, r11)
	lretq
END(lgdt)

#if defined(DDB) || NEFI > 0
ENTRY(setjmp)
	RETGUARD_SETUP(setjmp, r11)
	/*
	 * Only save registers that must be preserved across function
	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
	 * and %rip.
	 */
	movq	%rdi,%rax
	movq	%rbx,(%rax)
	movq	%rsp,8(%rax)
	movq	%rbp,16(%rax)
	movq	%r12,24(%rax)
	movq	%r13,32(%rax)
	movq	%r14,40(%rax)
	movq	%r15,48(%rax)
	movq	(%rsp),%rdx
	movq	%rdx,56(%rax)
	xorl	%eax,%eax
	RETGUARD_CHECK(setjmp, r11)
	ret
	lfence
END(setjmp)

ENTRY(longjmp)
	movq	%rdi,%rax
	movq	8(%rax),%rsp
	movq	56(%rax),%rdx
	movq	%rdx,(%rsp)
	RETGUARD_SETUP(longjmp, r11)
	movq	(%rax),%rbx
	movq	16(%rax),%rbp
	movq	24(%rax),%r12
	movq	32(%rax),%r13
	movq	40(%rax),%r14
	movq	48(%rax),%r15
	xorl	%eax,%eax
	incl	%eax
	RETGUARD_CHECK(longjmp, r11)
	ret
	lfence
END(longjmp)
#endif /* DDB || NEFI > 0 */

/*****************************************************************************/

/*
 * int cpu_switchto(struct proc *old, struct proc *new)
 * Switch from "old" proc to "new".
 */
ENTRY(cpu_switchto)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movq	%rdi, %r13
	movq	%rsi, %r12

	/* Record new proc. */
	movb	$SONPROC,P_STAT(%r12)	# p->p_stat = SONPROC
	SET_CURPROC(%r12,%rcx)

	movl	CPUVAR(CPUID),%r9d

	/* for the FPU/"extended CPU state" handling below */
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx

	/* If old proc exited, don't bother. */
	xorl	%ecx,%ecx
	testq	%r13,%r13
	jz	switch_exited

	/*
	 * Save old context.
	 *
	 * Registers:
	 *   %rax - scratch
	 *   %r13 - old proc, then old pcb
	 *   %rcx - old pmap if not P_SYSTEM
	 *   %r12 - new proc
	 *   %r9d - cpuid
	 */

	/* remember the pmap if not P_SYSTEM */
	testl	$P_SYSTEM,P_FLAG(%r13)
	movq	P_ADDR(%r13),%r13
	jnz	0f
	movq	PCB_PMAP(%r13),%rcx
0:

	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%r13)
	movq	%rbp,PCB_RBP(%r13)

	/*
	 * If the old proc ran in userspace then save the
	 * floating-point/"extended state" registers
	 */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lxstate_reset

	movq	%r13, %rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)

switch_exited:
	/* now clear the xstate */
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	andl	$~CPUPF_USERXSTATE,CPUVAR(PFLAGS)

.Lxstate_reset:
	/*
	 * If the segment registers haven't been reset since the old proc
	 * ran in userspace then reset them now
	 */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	restore_saved
	andl	$~CPUPF_USERSEGS,CPUVAR(PFLAGS)

	/* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
	movw	$(GSEL(GUDATA_SEL, SEL_UPL)),%ax
	movw	%ax,%ds
	movw	%ax,%es
	movw	%ax,%fs
	cli			/* block interrupts when on user GS.base */
	swapgs			/* switch from kernel to user GS.base */
	movw	%ax,%gs		/* set %gs to UDATA and GS.base to 0 */
	swapgs			/* back to kernel GS.base */

restore_saved:
	/*
	 * Restore saved context.
	 *
	 * Registers:
	 *   %rax, %rdx - scratch
	 *   %rcx - old pmap if not P_SYSTEM
	 *   %r12 - new process
	 *   %r13 - new pcb
	 *   %rbx - new pmap if not P_SYSTEM
	 */

	movq	P_ADDR(%r12),%r13

	/* remember the pmap if not P_SYSTEM */
	xorl	%ebx,%ebx
	testl	$P_SYSTEM,P_FLAG(%r12)
	jnz	1f
	movq	PCB_PMAP(%r13),%rbx
1:

	/* No interrupts while loading new state. */
	cli

	/* Restore stack pointers. */
	movq	PCB_RSP(%r13),%rsp
	movq	PCB_RBP(%r13),%rbp

	/* Stack pivot done, setup RETGUARD */
	RETGUARD_SETUP_OFF(cpu_switchto, r11, 6*8)

	/* don't switch cr3 to the same thing it already was */
	movq	PCB_CR3(%r13),%rax
	movq	%cr3,%rdi
	xorq	%rax,%rdi
	btrq	$63,%rdi	/* ignore CR3_REUSE_PCID */
	testq	%rdi,%rdi
	jz	.Lsame_cr3

#ifdef DIAGNOSTIC
	/* verify ci_proc_pmap had been updated properly */
	cmpq	%rcx,CPUVAR(PROC_PMAP)
	jnz	.Lbogus_proc_pmap
#endif
	/* record which pmap this CPU should get IPIs for */
	movq	%rbx,CPUVAR(PROC_PMAP)

.Lset_cr3:
	movq	%rax,%cr3			/* %rax used below too */

.Lsame_cr3:
	/*
	 * If we switched from a userland thread with a shallow call stack
	 * (e.g. interrupt->ast->mi_ast->preempt->mi_switch->cpu_switchto)
	 * then the RSB may have attacker controlled entries when we switch
	 * to a deeper call stack in the new thread.  Refill the RSB with
	 * entries safe to speculate into/through.
	 */
	RET_STACK_REFILL_WITH_RCX

	/* Don't bother with the rest if switching to a system process. */
	testq	%rbx,%rbx
	jz	switch_restored

	/* record the bits needed for future U-->K transition */
	movq	PCB_KSTACK(%r13),%rdx
	subq	$FRAMESIZE,%rdx
	movq	%rdx,CPUVAR(KERN_RSP)

	CODEPATCH_START
	/*
	 * Meltdown: iff we're doing separate U+K and U-K page tables,
	 * then record them in cpu_info for easy access in syscall and
	 * interrupt trampolines.
	 */
	movq	PM_PDIRPA_INTEL(%rbx),%rdx
	orq	cr3_reuse_pcid,%rax
	orq	cr3_pcid_proc_intel,%rdx
	movq	%rax,CPUVAR(KERN_CR3)
	movq	%rdx,CPUVAR(USER_CR3)
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)

switch_restored:
	SET_CURPCB(%r13)

	/* Interrupts are okay again. */
	sti
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	RETGUARD_CHECK(cpu_switchto, r11)
	ret
	lfence

#ifdef DIAGNOSTIC
.Lbogus_proc_pmap:
	leaq	bogus_proc_pmap,%rdi
	call	panic
	int3	/* NOTREACHED */
	.pushsection .rodata
bogus_proc_pmap:
	.asciz	"curcpu->ci_proc_pmap didn't point to previous pmap"
	.popsection
#endif /* DIAGNOSTIC */
END(cpu_switchto)

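/*
 * Indirect-branch thunks: these default to retpolines, but on CPUs that
 * don't need them they are codepatched (CPTAG_RETPOLINE_RAX/_R11) into
 * plain "jmp *%rax" / "jmp *%r11" using the CODEPATCH_CODE_LEN() snippets
 * near the end of this file.
 */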
NENTRY(retpoline_rax)
	CODEPATCH_START
	JMP_RETPOLINE(rax)
	CODEPATCH_END(CPTAG_RETPOLINE_RAX)
END(retpoline_rax)

NENTRY(__x86_indirect_thunk_r11)
	CODEPATCH_START
	JMP_RETPOLINE(r11)
	CODEPATCH_END(CPTAG_RETPOLINE_R11)
END(__x86_indirect_thunk_r11)

ENTRY(cpu_idle_cycle_hlt)
	RETGUARD_SETUP(cpu_idle_cycle_hlt, r11)
	sti
	hlt
	RETGUARD_CHECK(cpu_idle_cycle_hlt, r11)
	ret
	lfence
END(cpu_idle_cycle_hlt)

/*
 * savectx(struct pcb *pcb);
 * Update pcb, saving current processor state.
 */
ENTRY(savectx)
	RETGUARD_SETUP(savectx, r11)
	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%rdi)
	movq	%rbp,PCB_RBP(%rdi)
	RETGUARD_CHECK(savectx, r11)
	ret
	lfence
END(savectx)

/*
 * syscall insn entry.
 * Enter here with interrupts blocked; %rcx contains the caller's
 * %rip and the original rflags has been copied to %r11.  %cs and
 * %ss have been updated to the kernel segments, but %rsp is still
 * the user-space value.
 * First order of business is to swap to the kernel GS.base so that
 * we can access our struct cpu_info.  After possibly mucking with
 * pagetables, we switch to our kernel stack.  Once that's in place
 * we can save the rest of the syscall frame and unblock interrupts.
 */
KUTEXT_PAGE_START
	.align	NBPG, 0xcc
XUsyscall_meltdown:
	/*
	 * This is the real Xsyscall_meltdown page, which is mapped into
	 * the U-K page tables at the same location as Xsyscall_meltdown
	 * below.  For this, the Meltdown case, we use the scratch space
	 * in cpu_info so we can switch to the kernel page tables
	 * (thank you, Intel), at which point we'll continue at the
	 * "SYSCALL_ENTRY" after Xsyscall below.
	 * In case the CPU speculates past the mov to cr3, we put a
	 * retpoline-style pause-lfence-jmp-to-pause loop.
	 */
	endbr64
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
0:	pause
	lfence
	jmp	0b
KUTEXT_PAGE_END

KTEXT_PAGE_START
	.align	NBPG, 0xcc
GENTRY(Xsyscall_meltdown)
	/* pad to match real Xsyscall_meltdown positioning above */
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
GENTRY(Xsyscall)
	endbr64
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	SYSCALL_ENTRY			/* create trapframe */
	sti

	movq	CPUVAR(CURPROC),%r14
	movq	%rsp,P_MD_REGS(%r14)	# save pointer to frame
	andl	$~MDP_IRET,P_MD_FLAGS(%r14)
	movq	%rsp,%rdi
	call	syscall

.Lsyscall_check_asts:
	/* Check for ASTs on exit to user mode. */
	cli
	CHECK_ASTPENDING(%r11)
	je	2f
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	ast
	jmp	.Lsyscall_check_asts

2:
#ifdef DIAGNOSTIC
	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
	jne	.Lsyscall_spl_not_lowered
#endif /* DIAGNOSTIC */

	/* Could registers have been changed that require an iretq? */
	testl	$MDP_IRET, P_MD_FLAGS(%r14)
	jne	intr_user_exit_post_ast

	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lsyscall_restore_xstate

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	.Lsyscall_restore_fsbase

.Lsyscall_restore_registers:
	/*
	 * If the pmap we're now on isn't the same as the one we
	 * were on last time we were in userspace, then use IBPB
	 * to prevent cross-process branch-target injection.
	 */
	CODEPATCH_START
	movq	CPUVAR(PROC_PMAP),%rbx
	cmpq	CPUVAR(USER_PMAP),%rbx
	je	1f
	xorl	%edx,%edx
	movl	$PRED_CMD_IBPB,%eax
	movl	$MSR_PRED_CMD,%ecx
	wrmsr
	movq	%rbx,CPUVAR(USER_PMAP)
1:
	CODEPATCH_END(CPTAG_IBPB_NOP)
	call	pku_xonly
	RET_STACK_REFILL_WITH_RCX

	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBX(%rsp),%rbx
	movq	TF_RDX(%rsp),%rdx

	CODEPATCH_START
	xorl	%edi,%edi
	xorl	%esi,%esi
	xorl	%r11d,%r11d
	xorl	%eax,%eax
	xorl	%ecx,%ecx
	movw	%ds,TF_R8(%rsp)
	verw	TF_R8(%rsp)
	CODEPATCH_END(CPTAG_MDS)

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RBP(%rsp),%rbp

	/*
	 * We need to finish reading from the trapframe, then switch
	 * to the user page tables, swapgs, and return.  We need
	 * to get the final value for the register that was used
	 * for the mov to %cr3 from somewhere accessible on the
	 * user page tables, so save it in CPUVAR(SCRATCH) across
	 * the switch.
	 */
	movq	TF_RAX(%rsp),%rax
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RSP(%rsp),%rsp
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xsyscall_trampback:
0:	pause
	lfence
	jmp	0b
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs
	sysretq
END(Xsyscall)
END(Xsyscall_meltdown)
KTEXT_PAGE_END

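/*
 * U-K alias of the sysretq tail above: the .space pads to the offset of
 * Xsyscall_trampback so that, once the mov to %cr3 has switched to the
 * user page tables, execution continues here on a page that is still
 * mapped, restores %rax from CPUVAR(SCRATCH), and returns with sysretq.
 */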
KUTEXT_PAGE_START
	.space	(Xsyscall_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	%rax,%cr3
	movq	CPUVAR(SCRATCH),%rax
	swapgs
	sysretq
KUTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
	/* in this case, need FS.base but not xstate, rarely happens */
.Lsyscall_restore_fsbase:	/* CPU doesn't have curproc's FS.base */
	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
	jmp	.Lsyscall_restore_fsbase_real

	_ALIGN_TRAPS
.Lsyscall_restore_xstate:	/* CPU doesn't have curproc's xstate */
	orl	$(CPUPF_USERXSTATE|CPUPF_USERSEGS),CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	/* untouched state so can't fault */
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
#if PCB_SAVEFPU != 0
	subq	$PCB_SAVEFPU,%rdi
#endif
.Lsyscall_restore_fsbase_real:
	movq	PCB_FSBASE(%rdi),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lsyscall_restore_registers

#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
	leaq	spl_lowered(%rip), %rdi
	movl	TF_ERR(%rsp),%esi	/* syscall # stashed above */
	movl	TF_RDI(%rsp),%edx
	movl	%ebx,%ecx
	movl	CPUVAR(ILEVEL),%r8d
	xorq	%rax,%rax
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$IPL_NONE,CPUVAR(ILEVEL)
	jmp	.Lsyscall_check_asts

	.section .rodata
spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
	.text
#endif

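/*
 * First entry point of a newly forked thread: cpu_fork() is expected to
 * park the function to call in %r12 and its argument in %r13, so after
 * the MI bookkeeping we invoke it through the retpoline and then take
 * the normal return-to-userspace AST path.
 */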
NENTRY(proc_trampoline)
	call	proc_trampoline_mi
	movq	%r13,%rdi
	movq	%r12,%rax
	call	retpoline_rax
	movq	CPUVAR(CURPROC),%r14
	jmp	.Lsyscall_check_asts
END(proc_trampoline)


/*
 * Returning to userspace via iretq.  We do things in this order:
 *  - check for ASTs
 *  - restore FPU/"extended CPU state" if it's not already in the CPU
 *  - DIAGNOSTIC: no more C calls after this, so check the SPL
 *  - restore FS.base if it's not already in the CPU
 *  - restore most registers
 *  - update the iret frame from the trapframe
 *  - finish reading from the trapframe
 *  - switch to the trampoline stack	\
 *  - jump to the .kutext segment	|-- Meltdown workaround
 *  - switch to the user page tables	/
 *  - swapgs
 *  - iretq
 */
KTEXT_PAGE_START
	_ALIGN_TRAPS
GENTRY(intr_user_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_user_exit_not_blocked
#endif /* DIAGNOSTIC */

	/* Check for ASTs */
	CHECK_ASTPENDING(%r11)
	je	intr_user_exit_post_ast
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	ast
	cli
	jmp	intr_user_exit

intr_user_exit_post_ast:
	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	jz	.Lintr_restore_xstate

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	jz	.Lintr_restore_fsbase

.Lintr_restore_registers:
#ifdef DIAGNOSTIC
	/* no more C calls after this, so check the SPL */
	cmpl	$0,CPUVAR(ILEVEL)
	jne	.Luser_spl_not_lowered
#endif /* DIAGNOSTIC */

	/*
	 * If the pmap we're now on isn't the same as the one we
	 * were on last time we were in userspace, then use IBPB
	 * to prevent cross-process branch-target injection.
	 */
	CODEPATCH_START
	movq	CPUVAR(PROC_PMAP),%rbx
	cmpq	CPUVAR(USER_PMAP),%rbx
	je	1f
	xorl	%edx,%edx
	movl	$PRED_CMD_IBPB,%eax
	movl	$MSR_PRED_CMD,%ecx
	wrmsr
	movq	%rbx,CPUVAR(USER_PMAP)
1:
	CODEPATCH_END(CPTAG_IBPB_NOP)
	call	pku_xonly
	RET_STACK_REFILL_WITH_RCX

	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBX(%rsp),%rbx

	CODEPATCH_START
	xorl	%edi,%edi
	xorl	%esi,%esi
	xorl	%r11d,%r11d
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ecx,%ecx
	movw	%ds,TF_R8(%rsp)
	verw	TF_R8(%rsp)
	CODEPATCH_END(CPTAG_MDS)

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RBP(%rsp),%rbp

	/*
	 * To get the final value for the register that was used
	 * for the mov to %cr3, we need access to somewhere accessible
	 * on the user page tables, so we save it in CPUVAR(SCRATCH)
	 * across the switch.
	 */
	/* update iret frame */
	movq	CPUVAR(INTR_RSP),%rdx
	movq	$(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,IRETQ_RIP(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,IRETQ_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,IRETQ_RSP(%rdx)
	movq	$(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
	/* finish with the trap frame */
	movq	TF_RAX(%rsp),%rax
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	/* switch to the trampoline stack */
	xchgq	%rdx,%rsp
	movq	TF_RDX(%rdx),%rdx
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xiretq_trampback:
KTEXT_PAGE_END
/* the movq %cr3 switches to this "KUTEXT" page */
KUTEXT_PAGE_START
	.space	(Xiretq_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	CPUVAR(SCRATCH),%rax
.Liretq_swapgs:
	swapgs
doreti_iret_meltdown:
	iretq
KUTEXT_PAGE_END
/*
 * Back to the "KTEXT" page to fill in the speculation trap and the
 * swapgs+iretq used for non-Meltdown kernels.  This switching back
 * and forth between segments is so that we can do the .space
 * calculation below to guarantee the iretq's above and below line
 * up, so the 'doreti_iret' label lines up with the iretq whether
 * the CPU is affected by Meltdown or not.
 */
KTEXT_PAGE_START
0:	pause
	lfence
	jmp	0b
	.space	(.Liretq_swapgs - XUsyscall_meltdown) - \
		(. - Xsyscall_meltdown), 0xcc
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs

	.globl	doreti_iret
doreti_iret:
	iretq
KTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
.Lintr_restore_xstate:		/* CPU doesn't have curproc's xstate */
	orl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32, %rdx
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	//testl	%eax,%eax
	//jnz	.Lintr_xrstor_faulted
.Lintr_restore_fsbase:		/* CPU doesn't have curproc's FS.base */
	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
	movq	CPUVAR(CURPCB),%rdx
	movq	PCB_FSBASE(%rdx),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lintr_restore_registers

.Lintr_xrstor_faulted:
	/*
	 * xrstor faulted; we need to reset the FPU state and call trap()
	 * to post a signal, which requires interrupts be enabled.
	 */
	sti
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	jmp	recall_trap

#ifdef DIAGNOSTIC
.Lintr_user_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_user_exit

.Luser_spl_not_lowered:
	sti
	leaq	intr_spl_lowered(%rip),%rdi
	movl	CPUVAR(ILEVEL),%esi
	xorl	%edx,%edx		/* always SPL zero for userspace */
	xorl	%eax,%eax
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$0,CPUVAR(ILEVEL)
	cli
	jmp	intr_user_exit

	.section .rodata
intr_spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n"
	.text
#endif /* DIAGNOSTIC */
END(intr_user_exit)


/*
 * Return to supervisor mode from trap or interrupt
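 * (kernel-to-kernel return).  Registers are reloaded straight from the
 * trapframe; no swapgs, segment reload, or page-table switch is needed
 * because we never left the kernel's GS.base or address space.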
 */
NENTRY(intr_fast_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_exit_not_blocked
#endif /* DIAGNOSTIC */
	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBP(%rsp),%rbp
	movq	TF_RBX(%rsp),%rbx
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	movq	TF_RAX(%rsp),%rax
	addq	$TF_RIP,%rsp
	iretq

#ifdef DIAGNOSTIC
.Lintr_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	printf
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_fast_exit

	.data
.global warn_once
warn_once:
	.long	0
	.section .rodata
.Lnot_blocked:
	.asciz	"WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
	.text
#endif
END(intr_fast_exit)

/*
 * FPU/"extended CPU state" handling
 *	void xrstor_kern(sfp, mask)
 *		using first of xrstors/xrstor/fxrstor, load given state
 *		which is assumed to be trusted: i.e., unaltered from
 *		xsaves/xsaveopt/xsave/fxsave by kernel
 * 	int xrstor_user(sfp, mask)
 *		using first of xrstor/fxrstor, load given state which might
 *		not be trustable: #GP faults will be caught; returns 0/1 if
 *		okay/it trapped.
 *	void fpusave(sfp)
 *		save current state, but retain it in the FPU
 *	void fpusavereset(sfp)
 *		save current state and reset FPU to initial/kernel state
 *	int xsetbv_user(reg, mask)
 *		load specified %xcr# register, returns 0/1 if okay/it trapped
 */
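/*
 * The fxsave64/fxrstor64 instructions inside CODEPATCH_START/CODEPATCH_END
 * regions in this file are defaults: at boot they may be codepatched
 * (CPTAG_XSAVE, CPTAG_XRSTOR, CPTAG_XRSTORS) into the xsave/xsaveopt/xsaves
 * or xrstor/xrstors variants the CPU supports, using the CODEPATCH_CODE()
 * snippets defined near the end of this file.
 */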

ENTRY(xrstor_kern)
	RETGUARD_SETUP(xrstor_kern, r11)
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	RETGUARD_CHECK(xrstor_kern, r11)
	ret
	lfence
END(xrstor_kern)

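/*
 * xrstor_fault/xrstor_resume (like xsetbv_fault/xsetbv_resume and
 * rdmsr_safe_fault/rdmsr_resume below) bracket an instruction that may
 * fault on bad input: the trap handler recognizes the faulting %rip and
 * resumes at the matching *_resume label, making the function return 1
 * instead of panicking.
 */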
ENTRY(xrstor_user)
	RETGUARD_SETUP(xrstor_user, r11)
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xrstor_fault
xrstor_fault:
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTOR)
	xorl	%eax, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
	lfence
NENTRY(xrstor_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
	lfence
END(xrstor_user)

ENTRY(fpusave)
	RETGUARD_SETUP(fpusave, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)
	RETGUARD_CHECK(fpusave, r11)
	ret
	lfence
END(fpusave)

ENTRY(fpusavereset)
	RETGUARD_SETUP(fpusavereset, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	fxsave64	(%rdi)
	CODEPATCH_END(CPTAG_XSAVE)
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	fxrstor64	(%rdi)
	CODEPATCH_END(CPTAG_XRSTORS)
	RETGUARD_CHECK(fpusavereset, r11)
	ret
	lfence
END(fpusavereset)

ENTRY(xsetbv_user)
	RETGUARD_SETUP(xsetbv_user, r11)
	movl	%edi, %ecx
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xsetbv_fault
xsetbv_fault:
	xsetbv
	xorl	%eax, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
	lfence
NENTRY(xsetbv_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
	lfence
END(xsetbv_user)

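/*
 * Replacement instruction sequences that can be patched over the
 * CODEPATCH_START/CODEPATCH_END regions tagged above.
 */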
CODEPATCH_CODE(_xrstor,		xrstor64 (%rdi))
CODEPATCH_CODE(_xrstors,	xrstors64 (%rdi))
CODEPATCH_CODE(_xsave,		xsave64 (%rdi))
CODEPATCH_CODE(_xsaves,		xsaves64 (%rdi))
CODEPATCH_CODE(_xsaveopt,	xsaveopt64 (%rdi))
CODEPATCH_CODE(_pcid_set_reuse,
		orl	$(CR3_REUSE_PCID >> 32),CPUVAR(USER_CR3 + 4))
CODEPATCH_CODE_LEN(_jmprax,	jmp *%rax; int3)
CODEPATCH_CODE_LEN(_jmpr11,	jmp *%r11; int3)
CODEPATCH_CODE_LEN(_jmpr13,	jmp *%r13; int3)

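/*
 * void pagezero(vaddr_t va): zero one page with non-temporal (movnti)
 * stores so the zeroes don't displace useful cache lines; the sfence
 * orders the stores before return.
 */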
ENTRY(pagezero)
	RETGUARD_SETUP(pagezero, r11)
	movq    $-PAGE_SIZE,%rdx
	subq    %rdx,%rdi
	xorq    %rax,%rax
1:
	movnti  %rax,(%rdi,%rdx)
	movnti  %rax,8(%rdi,%rdx)
	movnti  %rax,16(%rdi,%rdx)
	movnti  %rax,24(%rdi,%rdx)
	addq    $32,%rdx
	jne     1b
	sfence
	RETGUARD_CHECK(pagezero, r11)
	ret
	lfence
END(pagezero)

/* void pku_xonly(void) */
ENTRY(pku_xonly)
	movq	pg_xo,%rax	/* have PKU support? */
	cmpq	$0,%rax
	je	1f
	movl	$0,%ecx		/* force PKRU for xonly restriction */
	movl	$0,%edx
	movl	$PGK_VALUE,%eax	/* key0 normal, key1 is exec without read */
	wrpkru
1:	ret
	lfence
END(pku_xonly)

/* int rdmsr_safe(u_int msr, uint64_t *data) */
ENTRY(rdmsr_safe)
	RETGUARD_SETUP(rdmsr_safe, r10)

	movl	%edi,	%ecx	/* u_int msr */
	.globl	rdmsr_safe_fault
rdmsr_safe_fault:
	rdmsr
	salq	$32, %rdx
	movl	%eax, %eax
	orq	%rdx, %rax
	movq	%rax, (%rsi)	/* *data */
	xorq	%rax, %rax

	RETGUARD_CHECK(rdmsr_safe, r10)
	ret
	lfence

NENTRY(rdmsr_resume)
	movl	$0x1, %eax
	RETGUARD_CHECK(rdmsr_safe, r10)
	ret
	lfence
END(rdmsr_safe)

#if NHYPERV > 0
/* uint64_t hv_hypercall_trampoline(uint64_t control, paddr_t input, paddr_t output) */
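/*
 * Shuffle the SysV AMD64 arguments (%rdi/%rsi/%rdx) into the Hyper-V
 * hypercall register convention (%rcx = control, %rdx = input PA,
 * %r8 = output PA) and tail-jump into the hypercall page that the
 * hypervisor fills in at runtime.
 */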
NENTRY(hv_hypercall_trampoline)
	endbr64
	mov	%rdx, %r8
	mov	%rsi, %rdx
	mov	%rdi, %rcx
	jmp	hv_hypercall_page
END(hv_hypercall_trampoline)
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	hv_hypercall_page
hv_hypercall_page:
	.skip	0x1000, 0xcc
#endif /* NHYPERV > 0 */

#if NXEN > 0
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	xen_hypercall_page
xen_hypercall_page:
	.skip	0x1000, 0xcc
#endif /* NXEN > 0 */
