1/*-
2 * Copyright (c) 1989, 1990 William F. Jolitz.
3 * Copyright (c) 1990 The Regents of the University of California.
4 * Copyright (c) 2007-2018 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by A. Joseph Koshy under
8 * sponsorship from the FreeBSD Foundation and Google, Inc.
9 *
10 * Portions of this software were developed by
11 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
12 * the FreeBSD Foundation.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#include "opt_atpic.h"
40#include "opt_hwpmc_hooks.h"
41
42#include "assym.inc"
43
44#include <machine/psl.h>
45#include <machine/asmacros.h>
46#include <machine/trap.h>
47#include <machine/specialreg.h>
48#include <machine/pmap.h>
49
#ifdef KDTRACE_HOOKS
	/*
	 * Runtime-patched hook pointers for the DTrace invalid-opcode
	 * (fbt) handling.  Both are 8-byte, 8-aligned slots in .bss,
	 * zero until DTrace registers itself.
	 */
	.bss
	.globl	dtrace_invop_jump_addr
	.align	8
	.type	dtrace_invop_jump_addr,@object
	.size	dtrace_invop_jump_addr,8
dtrace_invop_jump_addr:		/* non-zero => target for kernel #BP traps */
	.zero	8
	.globl	dtrace_invop_calltrap_addr
	.align	8
	.type	dtrace_invop_calltrap_addr,@object
	.size	dtrace_invop_calltrap_addr,8
dtrace_invop_calltrap_addr:	/* where DTrace jumps back if #BP was not its own */
	.zero	8
#endif
	.text
#ifdef HWPMC_HOOKS
	/*
	 * Label marking the start of the exception/trap handlers, used by
	 * hwpmc to classify sampled PCs (see the profiling comment near
	 * the bottom of this file).
	 */
	ENTRY(start_exceptions)
#endif
69
70/*****************************************************************************/
71/* Trap handling                                                             */
72/*****************************************************************************/
73/*
74 * Trap and fault vector routines.
75 *
76 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
77 * state on the stack but also disables interrupts.  This is important for
78 * us for the use of the swapgs instruction.  We cannot be interrupted
79 * until the GS.base value is correct.  For most traps, we automatically
80 * then enable interrupts if the interrupted context had them enabled.
81 * This is equivalent to the i386 port's use of SDT_SYS386TGT.
82 *
83 * The cpu will push a certain amount of state onto the kernel stack for
84 * the current process.  See amd64/include/frame.h.
85 * This includes the current RFLAGS (status register, which includes
86 * the interrupt disable state prior to the trap), the code segment register,
87 * and the return instruction pointer are pushed by the cpu.  The cpu
88 * will also push an 'error' code for certain traps.  We push a dummy
89 * error code for those traps where the cpu doesn't in order to maintain
90 * a consistent frame.  We also push a contrived 'trap number'.
91 *
92 * The CPU does not push the general registers, so we must do that, and we
 * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
 * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base for
 * kernel mode operation shortly, without changes to the selector
 * loaded.  Since superuser long mode works with any selectors loaded into
 * segment registers other than %cs, which makes them mostly unused in long
 * mode, and the kernel does not reference %fs, leave them alone.  The segment
 * registers are reloaded on return to usermode.
100 */
101
/* Traps that we leave interrupts disabled for. */
	/*
	 * Generate entry points for a trap with no CPU-supplied error code
	 * that must keep interrupts disabled:
	 *   \l_pti_k / \l_pti_u - PTI trampoline targets (kernel/user mode,
	 *                         selected by the PTI_ENTRY stub);
	 *   X\l                 - non-PTI IDT entry, which inspects the RPL
	 *                         of the saved %cs itself.
	 */
	.macro	TRAP_NOEN	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp		/* extend hardware frame to full trapframe */
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)		/* fake the error code the CPU did not push */
	jmp	alltraps_noen_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)	/* trapped from userland? */
	jz	alltraps_noen_k
	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
	jmp	alltraps_noen_u
	.endm
131
	/* Interrupts stay disabled for breakpoint-class traps. */
	TRAP_NOEN	bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
#endif
136
/* Regular traps; The cpu does not supply tf_err for these. */
	/*
	 * Same layout as TRAP_NOEN, but the common code (alltraps_k/u)
	 * will re-enable interrupts when the interrupted context had
	 * them enabled.
	 */
	.macro	TRAP	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp		/* extend hardware frame to full trapframe */
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)		/* fake the error code the CPU did not push */
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)	/* trapped from userland? */
	jz	alltraps_k
	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
	jmp	alltraps_u
	.endm
166
	/* Ordinary exceptions without a CPU-pushed error code. */
	TRAP	div, T_DIVIDE
	TRAP	ofl, T_OFLOW
	TRAP	bnd, T_BOUND
	TRAP	ill, T_PRIVINFLT
	TRAP	dna, T_DNA
	TRAP	fpusegm, T_FPOPFLT
	TRAP	rsvd, T_RESERVED
	TRAP	fpu, T_ARITHTRAP
	TRAP	xmm, T_XMMFLT
176
/* This group of traps have tf_err already pushed by the cpu. */
	/*
	 * As TRAP, but the CPU supplied tf_err, so we only extend the
	 * frame down to TF_ERR and do not store a fake error code.
	 */
	.macro	TRAP_ERR	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u,has_err=1
\l\()_pti_k:
	subq	$TF_ERR,%rsp		/* tf_err already on the stack */
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_u
	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)	/* trapped from userland? */
	jz	alltraps_k
	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
	jmp	alltraps_u
	.endm

	TRAP_ERR	tss, T_TSSFLT
	TRAP_ERR	align, T_ALIGNFLT
205
206	/*
207	 * alltraps_u/k entry points.
208	 * SWAPGS must be already performed by prologue,
209	 * if this is the first time in the kernel from userland.
210	 * Re-enable interrupts if they were enabled before the trap.
211	 * This approximates SDT_SYS386TGT on the i386 port.
212	 */
213	SUPERALIGN_TEXT
214	.globl	alltraps_u
215	.type	alltraps_u,@function
216alltraps_u:
217	movq	%rdi,TF_RDI(%rsp)
218	movq	%rdx,TF_RDX(%rsp)
219	movq	%rax,TF_RAX(%rsp)
220	movq	%rcx,TF_RCX(%rsp)
221	movq	PCPU(CURPCB),%rdi
222	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
223	call	handle_ibrs_entry
224	jmp	alltraps_save_segs
225	SUPERALIGN_TEXT
226	.globl	alltraps_k
227	.type	alltraps_k,@function
228alltraps_k:
229	lfence
230	movq	%rdi,TF_RDI(%rsp)
231	movq	%rdx,TF_RDX(%rsp)
232	movq	%rax,TF_RAX(%rsp)
233	movq	%rcx,TF_RCX(%rsp)
234alltraps_save_segs:
235	SAVE_SEGS
236	testl	$PSL_I,TF_RFLAGS(%rsp)
237	jz	alltraps_pushregs_no_rax
238	sti
239alltraps_pushregs_no_rax:
240	movq	%rsi,TF_RSI(%rsp)
241	movq	%r8,TF_R8(%rsp)
242	movq	%r9,TF_R9(%rsp)
243	movq	%rbx,TF_RBX(%rsp)
244	movq	%rbp,TF_RBP(%rsp)
245	movq	%r10,TF_R10(%rsp)
246	movq	%r11,TF_R11(%rsp)
247	movq	%r12,TF_R12(%rsp)
248	movq	%r13,TF_R13(%rsp)
249	movq	%r14,TF_R14(%rsp)
250	movq	%r15,TF_R15(%rsp)
251	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
252	pushfq
253	andq	$~(PSL_D | PSL_AC),(%rsp)
254	popfq
255#ifdef KDTRACE_HOOKS
256	/*
257	 * DTrace Function Boundary Trace (fbt) probes are triggered
258	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
259	 * interrupt. For all other trap types, just handle them in
260	 * the usual way.
261	 */
262	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
263	jnz	calltrap		/* ignore userland traps */
264	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
265	jne	calltrap
266
267	/* Check if there is no DTrace hook registered. */
268	cmpq	$0,dtrace_invop_jump_addr
269	je	calltrap
270
271	/*
272	 * Set our jump address for the jump back in the event that
273	 * the breakpoint wasn't caused by DTrace at all.
274	 */
275	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)
276
277	/* Jump to the code hooked in by DTrace. */
278	jmpq	*dtrace_invop_jump_addr
279#endif
280	.globl	calltrap
281	.type	calltrap,@function
282calltrap:
283	KMSAN_ENTER
284	movq	%rsp, %rdi
285	call	trap_check
286	KMSAN_LEAVE
287	jmp	doreti			/* Handle any pending ASTs */
288
289	/*
290	 * alltraps_noen_u/k entry points.
291	 * Again, SWAPGS must be already performed by prologue, if needed.
292	 * Unlike alltraps above, we want to leave the interrupts disabled.
293	 * This corresponds to SDT_SYS386IGT on the i386 port.
294	 */
295	SUPERALIGN_TEXT
296	.globl	alltraps_noen_u
297	.type	alltraps_noen_u,@function
298alltraps_noen_u:
299	movq	%rdi,TF_RDI(%rsp)
300	movq	PCPU(CURPCB),%rdi
301	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
302	jmp	alltraps_noen_save_segs
303	SUPERALIGN_TEXT
304	.globl	alltraps_noen_k
305	.type	alltraps_noen_k,@function
306alltraps_noen_k:
307	lfence
308	movq	%rdi,TF_RDI(%rsp)
309alltraps_noen_save_segs:
310	SAVE_SEGS
311	movq	%rdx,TF_RDX(%rsp)
312	movq	%rax,TF_RAX(%rsp)
313	movq	%rcx,TF_RCX(%rsp)
314	testb	$SEL_RPL_MASK,TF_CS(%rsp)
315	jz	alltraps_pushregs_no_rax
316	call	handle_ibrs_entry
317	jmp	alltraps_pushregs_no_rax
318
	/*
	 * Double fault (#DF).  Runs on its own stack; the machine state is
	 * unreliable, so we rebuild GS.base and CR3 from scratch, call the
	 * C handler, and never return (halt loop).
	 */
IDTVEC(dblfault)
	subq	$TF_ERR,%rsp		/* tf_err pushed by the CPU */
	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	/* Clear the direction and alignment-check flags for kernel C code. */
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	/*
	 * Load the canonical GS.base stored just above the trapframe on
	 * this IST stack (low half in %eax, high half in %edx for wrmsr).
	 */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_GSBASE,%ecx
	wrmsr
	/* Switch to the kernel page table if PTI is active (KCR3 != ~0). */
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	2f
	movq	%rax,%cr3
2:	KMSAN_ENTER
	movq	%rsp,%rdi		/* arg: trapframe */
	call	dblfault_handler
	KMSAN_LEAVE
3:	hlt				/* no recovery from a double fault */
	jmp	3b
361
	ALIGN_TEXT
	/*
	 * Page fault (#PF), PTI entry.  For user-mode faults, preserve the
	 * user %cr3 before PTI_UUENTRY switches to the kernel page table.
	 */
IDTVEC(page_pti)
	testb	$SEL_RPL_MASK,PTI_CS-PTI_ERR(%rsp)	/* from userland? */
	jz	page_k
	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
	pushq	%rax
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)	/* remember faulting user %cr3 */
	cmpq	$~0,PCPU(UCR3)		/* PTI disabled for this pmap? */
	jne	1f
	popq	%rax
	jmp	page_u
1:	pushq	%rdx
	PTI_UUENTRY has_err=1		/* switch to kernel CR3 and stack */
	jmp	page_u
	ALIGN_TEXT
IDTVEC(page)
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) /* Did we come from kernel? */
	jnz	page_u_swapgs		/* from userland; must swap GS.base first */
page_k:
	lfence				/* serialize after the conditional branch */
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	jmp	page_cr2
	ALIGN_TEXT
page_u_swapgs:
	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
page_u:
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* default to the fast return path */
	movq	PCPU(SAVED_UCR3),%rax
	movq	%rax,PCB_SAVED_UCR3(%rdi)	/* hand the user %cr3 to the pcb */
	call	handle_ibrs_entry		/* speculation control on user->kernel */
page_cr2:
	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
	SAVE_SEGS
	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)	/* interrupts enabled before the fault? */
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax
414
415	/*
416	 * We have to special-case this one.  If we get a trap in doreti() at
417	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
418	 * to do a special the swapgs in this case even coming from the kernel.
419	 * XXX linux has a trap handler for their equivalent of load_gs().
420	 *
421	 * On the stack, we have the hardware interrupt frame to return
422	 * to usermode (faulted) and another frame with error code, for
423	 * fault.  For PTI, copy both frames to the main thread stack.
424	 * Handle the potential 16-byte alignment adjustment incurred
425	 * during the second fault by copying both frames independently
426	 * while unwinding the stack in between.
427	 */
428	.macro PROTF_ENTRY name,trapno
429\name\()_pti_doreti:
430	swapgs
431	lfence
432	cmpq	$~0,PCPU(UCR3)
433	je	1f
434	pushq	%rax
435	pushq	%rdx
436	movq	PCPU(KCR3),%rax
437	movq	%rax,%cr3
438	movq	PCPU(RSP0),%rax
439	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
440	MOVE_STACKS	(PTI_SIZE / 8)
441	addq	$PTI_SIZE,%rax
442	movq	PTI_RSP(%rsp),%rsp
443	MOVE_STACKS	(PTI_SIZE / 8 - 3)
444	subq	$PTI_SIZE,%rax
445	movq	%rax,%rsp
446	popq	%rdx
447	popq	%rax
4481:	swapgs
449	jmp	X\name
450IDTVEC(\name\()_pti)
451	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
452	je	\name\()_pti_doreti
453	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
454	jz	X\name		/* lfence is not needed until %gs: use */
455	PTI_UENTRY has_err=1
456	swapgs	/* fence provided by PTI_UENTRY */
457IDTVEC(\name)
458	subq	$TF_ERR,%rsp
459	movl	$\trapno,TF_TRAPNO(%rsp)
460	jmp	prot_addrf
461	.endm
462
	/* Protection-style faults that may hit the doreti iretq. */
	PROTF_ENTRY	missing, T_SEGNPFLT
	PROTF_ENTRY	stk, T_STKFLT
	PROTF_ENTRY	prot, T_PROTFLT

	/*
	 * Common tail for the PROTF_ENTRY vectors.  Distinguishes a fault
	 * at doreti_iret (kernel CPL with user GS.base), a plain kernel
	 * fault, and a user fault; preserves user fs/gs bases when the
	 * FSGSBASE instructions are available.
	 */
prot_addrf:
	movq	$0,TF_ADDR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	leaq	doreti_iret(%rip),%rdi
	cmpq	%rdi,TF_RIP(%rsp)
	je	5f			/* kernel but with user gsbase!! */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	6f			/* already running with kernel GS.base */
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	2f
	/* Read the user bases before swapgs while they are still active. */
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	1f
	rdfsbase %rax
1:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	2f
	rdgsbase %rdx
2:	swapgs				/* get kernel GS.base */
	lfence				/* stop speculation past the swapgs */
	movq	PCPU(CURPCB),%rdi
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	4f
	/* Record the user bases so the pcb stays coherent. */
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	3f
	movq	%rax,PCB_FSBASE(%rdi)
3:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	4f
	movq	%rdx,PCB_GSBASE(%rdi)
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* full iret from user #gp */
4:	call	handle_ibrs_entry
	movw	%es,TF_ES(%rsp)
	movw	%ds,TF_DS(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)	/* re-enable interrupts if they were on */
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

5:	swapgs				/* doreti_iret fault: restore kernel GS.base */
6:	lfence
	movq	PCPU(CURPCB),%rdi
	jmp	4b
512
513/*
514 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
515 * and the new privilege level.  We are still running on the old user stack
516 * pointer.  We have to juggle a few things around to find our stack etc.
517 * swapgs gives us access to our PCPU space only.
518 *
519 * We do not support invoking this from a custom segment registers,
520 * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
521 */
522	SUPERALIGN_TEXT
523IDTVEC(fast_syscall_pti)
524	swapgs
525	cmpq	$~0,PCPU(UCR3)
526	je	fast_syscall_common
527	movq	%rax,PCPU(SCRATCH_RAX)
528	movq	PCPU(KCR3),%rax
529	movq	%rax,%cr3
530	movq	PCPU(SCRATCH_RAX),%rax
531	jmp	fast_syscall_common
532	SUPERALIGN_TEXT
533IDTVEC(fast_syscall)
534	swapgs
535fast_syscall_common:
536	movq	%rsp,PCPU(SCRATCH_RSP)
537	movq	PCPU(RSP0),%rsp
538	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
539	subq	$TF_SIZE,%rsp
540	/* defer TF_RSP till we have a spare register */
541	movq	%r11,TF_RFLAGS(%rsp)
542	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
543	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
544	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
545	/*
546	 * Save a few arg registers early to free them for use in
547	 * handle_ibrs_entry().  %r10 is especially tricky.  It is not an
548	 * arg register, but it holds the arg register %rcx.  Profiling
549	 * preserves %rcx, but may clobber %r10.  Profiling may also
550	 * clobber %r11, but %r11 (original %eflags) has been saved.
551	 */
552	movq	%rax,TF_RAX(%rsp)	/* syscall number */
553	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
554	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
555	SAVE_SEGS
556	call	handle_ibrs_entry
557	movq	PCPU(CURPCB),%r11
558	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
559	sti
560	movq	$KUDSEL,TF_SS(%rsp)
561	movq	$KUCSEL,TF_CS(%rsp)
562	movq	$2,TF_ERR(%rsp)
563	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
564	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
565	movq	%r8,TF_R8(%rsp)		/* arg 5 */
566	movq	%r9,TF_R9(%rsp)		/* arg 6 */
567	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
568	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
569	movq	%r12,TF_R12(%rsp)	/* C preserved */
570	movq	%r13,TF_R13(%rsp)	/* C preserved */
571	movq	%r14,TF_R14(%rsp)	/* C preserved */
572	movq	%r15,TF_R15(%rsp)	/* C preserved */
573	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
574	movq	PCPU(CURTHREAD),%rdi
575	movq	%rsp,TD_FRAME(%rdi)
576	movl	TF_RFLAGS(%rsp),%esi
577	andl	$PSL_T,%esi
578	call	amd64_syscall
5791:	movq	PCPU(CURPCB),%rax
580	/* Disable interrupts before testing PCB_FULL_IRET. */
581	cli
582	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
583	jnz	4f
584	/* Check for and handle AST's on return to userland. */
585	movq	PCPU(CURTHREAD),%rax
586	cmpl	$0,TD_AST(%rax)
587	jne	3f
588	call	handle_ibrs_exit
589	callq	*mds_handler
590	/* Restore preserved registers. */
591	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
592	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
593	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
594	movq	TF_RAX(%rsp),%rax	/* return value 1 */
595	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
596	movq	TF_RIP(%rsp),%rcx	/* original %rip */
597	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
598	xorl	%r8d,%r8d		/* zero the rest of GPRs */
599	xorl	%r10d,%r10d
600	cmpq	$~0,PCPU(UCR3)
601	je	2f
602	movq	PCPU(UCR3),%r9
603	andq	PCPU(UCR3_LOAD_MASK),%r9
604	movq	%r9,%cr3
6052:	xorl	%r9d,%r9d
606	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
607	swapgs
608	sysretq
609
6103:	/* AST scheduled. */
611	sti
612	movq	%rsp,%rdi
613	call	ast
614	jmp	1b
615
6164:	/* Requested full context restore, use doreti for that. */
617	jmp	doreti
618
619/*
620 * Here for CYA insurance, in case a "syscall" instruction gets
621 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
622 * to *something* if EFER_SCE is enabled.
623 */
624IDTVEC(fast_syscall32)
625	sysret
626
627/*
628 * DB# handler is very similar to NM#, because 'mov/pop %ss' delay
629 * generation of exception until the next instruction is executed,
630 * which might be a kernel entry.  So we must execute the handler
631 * on IST stack and be ready for non-kernel GSBASE.
632 */
633IDTVEC(dbg)
634	subq	$TF_RIP,%rsp
635	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
636	movq	$0,TF_ADDR(%rsp)
637	movq	$0,TF_ERR(%rsp)
638	movq	%rdi,TF_RDI(%rsp)
639	movq	%rsi,TF_RSI(%rsp)
640	movq	%rdx,TF_RDX(%rsp)
641	movq	%rcx,TF_RCX(%rsp)
642	movq	%r8,TF_R8(%rsp)
643	movq	%r9,TF_R9(%rsp)
644	movq	%rax,TF_RAX(%rsp)
645	movq	%rbx,TF_RBX(%rsp)
646	movq	%rbp,TF_RBP(%rsp)
647	movq	%r10,TF_R10(%rsp)
648	movq	%r11,TF_R11(%rsp)
649	movq	%r12,TF_R12(%rsp)
650	movq	%r13,TF_R13(%rsp)
651	movq	%r14,TF_R14(%rsp)
652	movq	%r15,TF_R15(%rsp)
653	SAVE_SEGS
654	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
655	pushfq
656	andq	$~(PSL_D | PSL_AC),(%rsp)
657	popfq
658	testb	$SEL_RPL_MASK,TF_CS(%rsp)
659	jnz	dbg_fromuserspace
660	lfence
661	/*
662	 * We've interrupted the kernel.  See comment in NMI handler about
663	 * registers use.
664	 */
665	movq	%cr2,%r15
666	movl	$MSR_GSBASE,%ecx
667	rdmsr
668	movq	%rax,%r12
669	shlq	$32,%rdx
670	orq	%rdx,%r12
671	/* Retrieve and load the canonical value for GS.base. */
672	movq	TF_SIZE(%rsp),%rdx
673	movl	%edx,%eax
674	shrq	$32,%rdx
675	wrmsr
676	movq	%cr3,%r13
677	movq	PCPU(KCR3),%rax
678	cmpq	$~0,%rax
679	je	1f
680	movq	%rax,%cr3
6811:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
682	je	2f
683	movl	$MSR_IA32_SPEC_CTRL,%ecx
684	rdmsr
685	movl	%eax,%r14d
686	call	handle_ibrs_entry
6872:	movq	%rsp,%rdi
688	call	trap
689	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
690	je	3f
691	movl	%r14d,%eax
692	xorl	%edx,%edx
693	movl	$MSR_IA32_SPEC_CTRL,%ecx
694	wrmsr
695	/*
696	 * Put back the preserved MSR_GSBASE value.
697	 */
6983:	movl	$MSR_GSBASE,%ecx
699	movq	%r12,%rdx
700	movl	%edx,%eax
701	shrq	$32,%rdx
702	wrmsr
703	movq	%r13,%cr3
704	movq	%r15,%cr2
705	RESTORE_REGS
706	addq	$TF_RIP,%rsp
707	jmp	doreti_iret
708dbg_fromuserspace:
709	/*
710	 * Switch to kernel GSBASE and kernel page table, and copy frame
711	 * from the IST stack to the normal kernel stack, since trap()
712	 * re-enables interrupts, and since we might trap on DB# while
713	 * in trap().
714	 */
715	swapgs
716	lfence
717	movq	PCPU(KCR3),%rax
718	cmpq	$~0,%rax
719	je	1f
720	movq	%rax,%cr3
7211:	movq	PCPU(RSP0),%rax
722	movl	$TF_SIZE,%ecx
723	subq	%rcx,%rax
724	movq	%rax,%rdi
725	movq	%rsp,%rsi
726	rep;movsb
727	movq	%rax,%rsp
728	call	handle_ibrs_entry
729	movq	PCPU(CURPCB),%rdi
730	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
731	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
732	jz	3f
733	cmpw	$KUF32SEL,TF_FS(%rsp)
734	jne	2f
735	rdfsbase %rax
736	movq	%rax,PCB_FSBASE(%rdi)
7372:	cmpw	$KUG32SEL,TF_GS(%rsp)
738	jne	3f
739	movl	$MSR_KGSBASE,%ecx
740	rdmsr
741	shlq	$32,%rdx
742	orq	%rdx,%rax
743	movq	%rax,PCB_GSBASE(%rdi)
7443:	jmp	calltrap
745
746/*
747 * NMI handling is special.
748 *
749 * First, NMIs do not respect the state of the processor's RFLAGS.IF
750 * bit.  The NMI handler may be entered at any time, including when
751 * the processor is in a critical section with RFLAGS.IF == 0.
752 * The processor's GS.base value could be invalid on entry to the
753 * handler.
754 *
755 * Second, the processor treats NMIs specially, blocking further NMIs
756 * until an 'iretq' instruction is executed.  We thus need to execute
757 * the NMI handler with interrupts disabled, to prevent a nested interrupt
758 * from executing an 'iretq' instruction and inadvertently taking the
759 * processor out of NMI mode.
760 *
761 * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
762 * GS.base value for the processor is stored just above the bottom of its
763 * NMI stack.  For NMIs taken from kernel mode, the current value in
764 * the processor's GS.base is saved at entry to C-preserved register %r12,
765 * the canonical value for GS.base is then loaded into the processor, and
766 * the saved value is restored at exit time.  For NMIs taken from user mode,
767 * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
768 */
769
770IDTVEC(nmi)
771	subq	$TF_RIP,%rsp
772	movl	$(T_NMI),TF_TRAPNO(%rsp)
773	movq	$0,TF_ADDR(%rsp)
774	movq	$0,TF_ERR(%rsp)
775	movq	%rdi,TF_RDI(%rsp)
776	movq	%rsi,TF_RSI(%rsp)
777	movq	%rdx,TF_RDX(%rsp)
778	movq	%rcx,TF_RCX(%rsp)
779	movq	%r8,TF_R8(%rsp)
780	movq	%r9,TF_R9(%rsp)
781	movq	%rax,TF_RAX(%rsp)
782	movq	%rbx,TF_RBX(%rsp)
783	movq	%rbp,TF_RBP(%rsp)
784	movq	%r10,TF_R10(%rsp)
785	movq	%r11,TF_R11(%rsp)
786	movq	%r12,TF_R12(%rsp)
787	movq	%r13,TF_R13(%rsp)
788	movq	%r14,TF_R14(%rsp)
789	movq	%r15,TF_R15(%rsp)
790	SAVE_SEGS
791	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
792	pushfq
793	andq	$~(PSL_D | PSL_AC),(%rsp)
794	popfq
795	xorl	%ebx,%ebx
796	testb	$SEL_RPL_MASK,TF_CS(%rsp)
797	jnz	nmi_fromuserspace
798	/*
799	 * We've interrupted the kernel.  Preserve in callee-saved regs:
800	 * GS.base in %r12,
801	 * %cr3 in %r13,
802	 * possibly lower half of MSR_IA32_SPEC_CTL in %r14d,
803	 * %cr2 in %r15.
804	 */
805	lfence
806	movq	%cr2,%r15
807	movl	$MSR_GSBASE,%ecx
808	rdmsr
809	movq	%rax,%r12
810	shlq	$32,%rdx
811	orq	%rdx,%r12
812	/* Retrieve and load the canonical value for GS.base. */
813	movq	TF_SIZE(%rsp),%rdx
814	movl	%edx,%eax
815	shrq	$32,%rdx
816	wrmsr
817	movq	%cr3,%r13
818	movq	PCPU(KCR3),%rax
819	cmpq	$~0,%rax
820	je	1f
821	movq	%rax,%cr3
8221:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
823	je	nmi_calltrap
824	movl	$MSR_IA32_SPEC_CTRL,%ecx
825	rdmsr
826	movl	%eax,%r14d
827	call	handle_ibrs_entry
828	jmp	nmi_calltrap
829nmi_fromuserspace:
830	incl	%ebx
831	swapgs
832	lfence
833	movq	%cr3,%r13
834	movq	PCPU(KCR3),%rax
835	cmpq	$~0,%rax
836	je	1f
837	movq	%rax,%cr3
8381:	call	handle_ibrs_entry
839	movq	PCPU(CURPCB),%rdi
840	testq	%rdi,%rdi
841	jz	3f
842	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
843	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
844	jz	3f
845	cmpw	$KUF32SEL,TF_FS(%rsp)
846	jne	2f
847	rdfsbase %rax
848	movq	%rax,PCB_FSBASE(%rdi)
8492:	cmpw	$KUG32SEL,TF_GS(%rsp)
850	jne	3f
851	movl	$MSR_KGSBASE,%ecx
852	rdmsr
853	shlq	$32,%rdx
854	orq	%rdx,%rax
855	movq	%rax,PCB_GSBASE(%rdi)
8563:
857/* Note: this label is also used by ddb and gdb: */
858nmi_calltrap:
859	KMSAN_ENTER
860	movq	%rsp,%rdi
861	call	trap
862	KMSAN_LEAVE
863#ifdef HWPMC_HOOKS
864	/*
865	 * Capture a userspace callchain if needed.
866	 *
867	 * - Check if the current trap was from user mode.
868	 * - Check if the current thread is valid.
869	 * - Check if the thread requires a user call chain to be
870	 *   captured.
871	 *
872	 * We are still in NMI mode at this point.
873	 */
874	testl	%ebx,%ebx
875	jz	nocallchain	/* not from userspace */
876	movq	PCPU(CURTHREAD),%rax
877	orq	%rax,%rax	/* curthread present? */
878	jz	nocallchain
879	/*
880	 * Move execution to the regular kernel stack, because we
881	 * committed to return through doreti.
882	 */
883	movq	%rsp,%rsi	/* source stack pointer */
884	movq	$TF_SIZE,%rcx
885	movq	PCPU(RSP0),%rdx
886	subq	%rcx,%rdx
887	movq	%rdx,%rdi	/* destination stack pointer */
888	shrq	$3,%rcx		/* trap frame size in long words */
889	pushfq
890	andq	$~(PSL_D | PSL_AC),(%rsp)
891	popfq
892	rep
893	movsq			/* copy trapframe */
894	movq	%rdx,%rsp	/* we are on the regular kstack */
895
896	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
897	jz	nocallchain
898	/*
899	 * A user callchain is to be captured, so:
900	 * - Take the processor out of "NMI" mode by faking an "iret",
901	 *   to allow for nested NMI interrupts.
902	 * - Enable interrupts, so that copyin() can work.
903	 */
904	movl	%ss,%eax
905	pushq	%rax		/* tf_ss */
906	pushq	%rdx		/* tf_rsp (on kernel stack) */
907	pushfq			/* tf_rflags */
908	movl	%cs,%eax
909	pushq	%rax		/* tf_cs */
910	pushq	$outofnmi	/* tf_rip */
911	iretq
912outofnmi:
913	/*
914	 * At this point the processor has exited NMI mode and is running
915	 * with interrupts turned off on the normal kernel stack.
916	 *
917	 * If a pending NMI gets recognized at or after this point, it
918	 * will cause a kernel callchain to be traced.
919	 *
920	 * We turn interrupts back on, and call the user callchain capture hook.
921	 */
922	movq	pmc_hook,%rax
923	orq	%rax,%rax
924	jz	nocallchain
925	movq	PCPU(CURTHREAD),%rdi		/* thread */
926	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
927	movq	%rsp,%rdx			/* frame */
928	sti
929	call	*%rax
930	cli
931nocallchain:
932#endif
933	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
934	jnz	doreti_exit
935	/*
936	 * Restore speculation control MSR, if preserved.
937	 */
938	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
939	je	1f
940	movl	%r14d,%eax
941	xorl	%edx,%edx
942	movl	$MSR_IA32_SPEC_CTRL,%ecx
943	wrmsr
944	/*
945	 * Put back the preserved MSR_GSBASE value.
946	 */
9471:	movl	$MSR_GSBASE,%ecx
948	movq	%r12,%rdx
949	movl	%edx,%eax
950	shrq	$32,%rdx
951	wrmsr
952	cmpb	$0, nmi_flush_l1d_sw(%rip)
953	je	2f
954	call	flush_l1d_sw		/* bhyve L1TF assist */
9552:	movq	%r13,%cr3
956	movq	%r15,%cr2
957	RESTORE_REGS
958	addq	$TF_RIP,%rsp
959	jmp	doreti_iret
960
961/*
962 * MC# handling is similar to NMI.
963 *
964 * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
965 * can occur at any time with a GS.base value that does not correspond
966 * to the privilege level in CS.
967 *
968 * Machine checks are not unblocked by iretq, but it is best to run
969 * the handler with interrupts disabled since the exception may have
970 * interrupted a critical section.
971 *
972 * The MC# handler runs on its own stack (tss_ist3).  The canonical
973 * GS.base value for the processor is stored just above the bottom of
974 * its MC# stack.  For exceptions taken from kernel mode, the current
975 * value in the processor's GS.base is saved at entry to C-preserved
976 * register %r12, the canonical value for GS.base is then loaded into
977 * the processor, and the saved value is restored at exit time.  For
978 * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
979 * are used for swapping GS.base.
980 */
981
982IDTVEC(mchk)
983	subq	$TF_RIP,%rsp
984	movl	$(T_MCHK),TF_TRAPNO(%rsp)
985	movq	$0,TF_ADDR(%rsp)
986	movq	$0,TF_ERR(%rsp)
987	movq	%rdi,TF_RDI(%rsp)
988	movq	%rsi,TF_RSI(%rsp)
989	movq	%rdx,TF_RDX(%rsp)
990	movq	%rcx,TF_RCX(%rsp)
991	movq	%r8,TF_R8(%rsp)
992	movq	%r9,TF_R9(%rsp)
993	movq	%rax,TF_RAX(%rsp)
994	movq	%rbx,TF_RBX(%rsp)
995	movq	%rbp,TF_RBP(%rsp)
996	movq	%r10,TF_R10(%rsp)
997	movq	%r11,TF_R11(%rsp)
998	movq	%r12,TF_R12(%rsp)
999	movq	%r13,TF_R13(%rsp)
1000	movq	%r14,TF_R14(%rsp)
1001	movq	%r15,TF_R15(%rsp)
1002	SAVE_SEGS
1003	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
1004	pushfq
1005	andq	$~(PSL_D | PSL_AC),(%rsp)
1006	popfq
1007	xorl	%ebx,%ebx
1008	testb	$SEL_RPL_MASK,TF_CS(%rsp)
1009	jnz	mchk_fromuserspace
1010	/*
1011	 * We've interrupted the kernel.  See comment in NMI handler about
1012	 * registers use.
1013	 */
1014	movq	%cr2,%r15
1015	movl	$MSR_GSBASE,%ecx
1016	rdmsr
1017	movq	%rax,%r12
1018	shlq	$32,%rdx
1019	orq	%rdx,%r12
1020	/* Retrieve and load the canonical value for GS.base. */
1021	movq	TF_SIZE(%rsp),%rdx
1022	movl	%edx,%eax
1023	shrq	$32,%rdx
1024	wrmsr
1025	movq	%cr3,%r13
1026	movq	PCPU(KCR3),%rax
1027	cmpq	$~0,%rax
1028	je	1f
1029	movq	%rax,%cr3
10301:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
1031	je	mchk_calltrap
1032	movl	$MSR_IA32_SPEC_CTRL,%ecx
1033	rdmsr
1034	movl	%eax,%r14d
1035	call	handle_ibrs_entry
1036	jmp	mchk_calltrap
1037mchk_fromuserspace:
1038	incl	%ebx
1039	swapgs
1040	movq	%cr3,%r13
1041	movq	PCPU(KCR3),%rax
1042	cmpq	$~0,%rax
1043	je	1f
1044	movq	%rax,%cr3
10451:	call	handle_ibrs_entry
1046/* Note: this label is also used by ddb and gdb: */
1047mchk_calltrap:
1048	KMSAN_ENTER
1049	movq	%rsp,%rdi
1050	call	mca_intr
1051	KMSAN_LEAVE
1052	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
1053	jnz	doreti_exit
1054	/*
1055	 * Restore speculation control MSR, if preserved.
1056	 */
1057	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
1058	je	1f
1059	movl	%r14d,%eax
1060	xorl	%edx,%edx
1061	movl	$MSR_IA32_SPEC_CTRL,%ecx
1062	wrmsr
1063	/*
1064	 * Put back the preserved MSR_GSBASE value.
1065	 */
10661:	movl	$MSR_GSBASE,%ecx
1067	movq	%r12,%rdx
1068	movl	%edx,%eax
1069	shrq	$32,%rdx
1070	wrmsr
1071	movq	%r13,%cr3
1072	movq	%r15,%cr2
1073	RESTORE_REGS
1074	addq	$TF_RIP,%rsp
1075	jmp	doreti_iret
1076
/*
 * First instruction executed by a newly created thread: marshal the
 * values cpu_fork() left in callee-saved registers into the SysV
 * argument registers and hand off to fork_exit(), then return to
 * usermode via doreti.  The three moves are independent of each other.
 */
ENTRY(fork_trampoline)
	movq	%rsp,%rdx		/* arg 3: trapframe of the new thread */
	movq	%rbx,%rsi		/* arg 2: argument for the callout */
	movq	%r12,%rdi		/* arg 1: callout function */
	call	fork_exit
	jmp	doreti			/* handle any pending ASTs */
1083
1084/*
1085 * To efficiently implement classification of trap and interrupt handlers
1086 * for profiling, there must be only trap handlers between the labels btrap
1087 * and bintr, and only interrupt handlers between the labels bintr and
1088 * eintr.  This is implemented (partly) by including files that contain
1089 * some of the handlers.  Before including the files, set up a normal asm
1090 * environment so that the included files don't need to know that they are
1091 * included.
1092 */
1093
1094#ifdef COMPAT_FREEBSD32
1095	.data
1096	.p2align 4
1097	.text
1098	SUPERALIGN_TEXT
1099
1100#include <amd64/ia32/ia32_exception.S>
1101#endif
1102
1103	.data
1104	.p2align 4
1105	.text
1106	SUPERALIGN_TEXT
1107#include <amd64/amd64/apic_vector.S>
1108
1109#ifdef DEV_ATPIC
1110	.data
1111	.p2align 4
1112	.text
1113	SUPERALIGN_TEXT
1114
1115#include <amd64/amd64/atpic_vector.S>
1116#endif
1117
1118/*
1119 * void doreti(struct trapframe)
1120 *
1121 * Handle return from interrupts, traps and syscalls.
1122 */
1123	.text
1124	SUPERALIGN_TEXT
1125	.type	doreti,@function
1126	.globl	doreti
1127doreti:
1128	/*
1129	 * Check if ASTs can be handled now.
1130	 */
1131	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
1132	jz	doreti_exit		/* can't handle ASTs now if not */
1133
1134doreti_ast:
1135	/*
1136	 * Check for ASTs atomically with returning.  Disabling CPU
1137	 * interrupts provides sufficient locking even in the SMP case,
1138	 * since we will be informed of any new ASTs by an IPI.
1139	 */
1140	cli
1141	movq	PCPU(CURTHREAD),%rax
1142	cmpl	$0,TD_AST(%rax)		/* any AST pending on curthread? */
1143	je	doreti_exit
1144	sti
1145	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
1146	call	ast
1147	jmp	doreti_ast	/* re-check: ast() may have queued more work */
1148
1149	/*
1150	 * doreti_exit:	pop registers, iret.
1151	 *
1152	 *	The segment register pop is a special case, since it may
1153	 *	fault if (for example) a sigreturn specifies bad segment
1154	 *	registers.  The fault is handled in trap.c.
1155	 */
1156doreti_exit:
1157	movq	PCPU(CURPCB),%r8	/* %r8 = current pcb, used by the seg path */
1158
1159	/*
1160	 * Do not reload segment registers for kernel.
1161	 * Since we do not reload segments registers with sane
1162	 * values on kernel entry, descriptors referenced by
1163	 * segments registers might be not valid.  This is fatal
1164	 * for user mode, but is not a problem for the kernel.
1165	 */
1166	testb	$SEL_RPL_MASK,TF_CS(%rsp)
1167	jz	ld_regs			/* kernel return: GPRs only */
1168	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)	/* full seg/base restore requested? */
1169	jz	ld_regs
1170	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)	/* one-shot flag: consume it */
1171	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
1172	je	set_segs		/* frame lacks segs: synthesize defaults */
1173
	/*
	 * The exported ld_* labels below mark the instructions that can
	 * fault on bad user-supplied selectors/bases; trap.c recognizes
	 * them and resumes at the matching *_load_fault handler below.
	 */
1174do_segs:
1175	/* Restore %fs and fsbase */
1176	movw	TF_FS(%rsp),%ax
1177	.globl	ld_fs
1178ld_fs:
1179	movw	%ax,%fs
	/* Only reload fsbase from the PCB for the default 32-bit user %fs. */
1180	cmpw	$KUF32SEL,%ax
1181	jne	1f
1182	movl	$MSR_FSBASE,%ecx
1183	movl	PCB_FSBASE(%r8),%eax
1184	movl	PCB_FSBASE+4(%r8),%edx
1185	.globl	ld_fsbase
1186ld_fsbase:
1187	wrmsr
11881:
1189	/* Restore %gs and gsbase */
1190	movw	TF_GS(%rsp),%si
1191	pushfq				/* save IF state; restored by popfq below */
1192	cli				/* the GS.base juggling must not be interrupted */
1193	movl	$MSR_GSBASE,%ecx
1194	/* Save current kernel %gs base into %r12d:%r13d */
1195	rdmsr
1196	movl	%eax,%r12d
1197	movl	%edx,%r13d
1198	.globl	ld_gs
1199ld_gs:
1200	movw	%si,%gs
1201	/* Save user %gs base into %r14d:%r15d */
1202	rdmsr
1203	movl	%eax,%r14d
1204	movl	%edx,%r15d
1205	/* Restore kernel %gs base */
1206	movl	%r12d,%eax
1207	movl	%r13d,%edx
1208	wrmsr
1209	popfq
1210	/*
1211	 * Restore user %gs base, either from PCB if used for TLS, or
1212	 * from the previously saved msr read.
1213	 */
1214	movl	$MSR_KGSBASE,%ecx	/* becomes GS.base after the final swapgs */
1215	cmpw	$KUG32SEL,%si
1216	jne	1f
1217	movl	PCB_GSBASE(%r8),%eax
1218	movl	PCB_GSBASE+4(%r8),%edx
1219	jmp	ld_gsbase
12201:
1221	movl	%r14d,%eax
1222	movl	%r15d,%edx
1223	.globl	ld_gsbase
1224ld_gsbase:
1225	wrmsr	/* May trap if non-canonical, but only for TLS. */
1226	.globl	ld_es
1227ld_es:
1228	movw	TF_ES(%rsp),%es
1229	.globl	ld_ds
1230ld_ds:
1231	movw	TF_DS(%rsp),%ds
1232ld_regs:
1233	RESTORE_REGS
1234	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
1235	jz	2f			/* keep running with kernel GS.base */
1236	cli				/* no interrupts until the iret below */
1237	call	handle_ibrs_exit_rs
1238	callq	*mds_handler		/* MDS mitigation hook (per-CPU method) */
1239	cmpq	$~0,PCPU(UCR3)		/* ~0 sentinel => PTI not active */
1240	je	1f
	/*
	 * PTI return: copy the iret frame onto the per-CPU trampoline
	 * stack (PTI_RSP0), since the kernel stack is not mapped in the
	 * user page table we are about to load.
	 */
1241	pushq	%rdx
1242	movq	PCPU(PTI_RSP0),%rdx
1243	subq	$PTI_SIZE,%rdx
1244	movq	%rax,PTI_RAX(%rdx)
1245	popq	%rax			/* %rax = interrupted %rdx (pushed above) */
1246	movq	%rax,PTI_RDX(%rdx)
1247	movq	TF_RIP(%rsp),%rax
1248	movq	%rax,PTI_RIP(%rdx)
1249	movq	TF_CS(%rsp),%rax
1250	movq	%rax,PTI_CS(%rdx)
1251	movq	TF_RFLAGS(%rsp),%rax
1252	movq	%rax,PTI_RFLAGS(%rdx)
1253	movq	TF_RSP(%rsp),%rax
1254	movq	%rax,PTI_RSP(%rdx)
1255	movq	TF_SS(%rsp),%rax
1256	movq	%rax,PTI_SS(%rdx)
	/* Compose the user %cr3, applying then resetting the one-shot load mask. */
1257	movq	PCPU(UCR3),%rax
1258	andq	PCPU(UCR3_LOAD_MASK),%rax
1259	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
1260	swapgs				/* back to user GS.base */
1261	movq	%rdx,%rsp		/* run off the trampoline frame */
1262	movq	%rax,%cr3		/* user page table; trampoline stays mapped */
1263	popq	%rdx
1264	popq	%rax
1265	addq	$8,%rsp			/* skip one slot to reach the iret frame */
1266	jmp	doreti_iret
12671:	swapgs				/* non-PTI user return */
12682:	addq	$TF_RIP,%rsp		/* advance to the hardware iret frame */
1269	.globl	doreti_iret
1270doreti_iret:
1271	iretq
1272
/*
 * The trapframe carries no segment registers (TF_HASSEGS clear):
 * seed it with the default user selectors, then join do_segs to
 * load them into the CPU.  %ax is scratch; do_segs reloads it
 * immediately from TF_FS.
 */
set_segs:
	movw	$KUF32SEL,TF_FS(%rsp)
	movw	$KUG32SEL,TF_GS(%rsp)
	movw	$KUDSEL,%ax
	movw	%ax,TF_ES(%rsp)
	movw	%ax,TF_DS(%rsp)
	jmp	do_segs
1280
1281	/*
1282	 * doreti_iret_fault.  Alternative return code for
1283	 * the case where we get a fault in the doreti_exit code
1284	 * above.  trap() (amd64/amd64/trap.c) catches this specific
1285	 * case, sends the process a signal and continues in the
1286	 * corresponding place in the code below.
1287	 */
1288	ALIGN_TEXT
1289	.globl	doreti_iret_fault
1290doreti_iret_fault:
1291	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
	/* Save %rax/%rdx/%rcx before the call below; presumably they may be
	 * clobbered by handle_ibrs_entry -- the rest are saved afterwards. */
1292	movq	%rax,TF_RAX(%rsp)
1293	movq	%rdx,TF_RDX(%rsp)
1294	movq	%rcx,TF_RCX(%rsp)
1295	call	handle_ibrs_entry
	/* Re-enable interrupts only if the faulting return targeted user mode. */
1296	testb	$SEL_RPL_MASK,TF_CS(%rsp)
1297	jz	1f
1298	sti
12991:
1300	SAVE_SEGS
1301	movl	$TF_HASSEGS,TF_FLAGS(%rsp)	/* frame now carries segment regs */
1302	movq	%rdi,TF_RDI(%rsp)
1303	movq	%rsi,TF_RSI(%rsp)
1304	movq	%r8,TF_R8(%rsp)
1305	movq	%r9,TF_R9(%rsp)
1306	movq	%rbx,TF_RBX(%rsp)
1307	movq	%rbp,TF_RBP(%rsp)
1308	movq	%r10,TF_R10(%rsp)
1309	movq	%r11,TF_R11(%rsp)
1310	movq	%r12,TF_R12(%rsp)
1311	movq	%r13,TF_R13(%rsp)
1312	movq	%r14,TF_R14(%rsp)
1313	movq	%r15,TF_R15(%rsp)
1314	movl	$T_PROTFLT,TF_TRAPNO(%rsp)	/* present the fault as #GP */
1315	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
1316	movq	$0,TF_ADDR(%rsp)
1317	jmp	calltrap
1318
	/*
	 * Fault while loading %ds at ld_ds above (e.g. a bad selector
	 * from sigreturn).  Report it as a protection fault so trap()
	 * can post a signal, then retry the return with the default
	 * user data selector.
	 */
1319	ALIGN_TEXT
1320	.globl	ds_load_fault
1321ds_load_fault:
1322	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	/*
	 * NOTE(review): this gates sti on the frame's CS RPL, while the
	 * sibling es/fs/gs handlers below gate on PSL_I in TF_RFLAGS --
	 * confirm the divergence is intentional.
	 */
1323	testb	$SEL_RPL_MASK,TF_CS(%rsp)
1324	jz	1f
1325	sti
13261:
1327	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1328	call	trap
1329	movw	$KUDSEL,TF_DS(%rsp)	/* replace the bad selector */
1330	jmp	doreti
1331
	/*
	 * Fault while loading %es at ld_es above.  Report a protection
	 * fault to trap(), then retry the return with the default user
	 * data selector in the frame.
	 */
1332	ALIGN_TEXT
1333	.globl	es_load_fault
1334es_load_fault:
1335	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	/* Re-enable interrupts if the interrupted context had them enabled. */
1336	testl	$PSL_I,TF_RFLAGS(%rsp)
1337	jz	1f
1338	sti
13391:
1340	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1341	call	trap
1342	movw	$KUDSEL,TF_ES(%rsp)	/* replace the bad selector */
1343	jmp	doreti
1344
	/*
	 * Fault while loading %fs at ld_fs above.  Same recovery shape
	 * as es_load_fault, retrying with the default 32-bit user %fs
	 * selector.
	 */
1345	ALIGN_TEXT
1346	.globl	fs_load_fault
1347fs_load_fault:
	/* Re-enable interrupts if the interrupted context had them enabled. */
1348	testl	$PSL_I,TF_RFLAGS(%rsp)
1349	jz	1f
1350	sti
13511:
1352	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
1353	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1354	call	trap
1355	movw	$KUF32SEL,TF_FS(%rsp)	/* replace the bad selector */
1356	jmp	doreti
1357
	/*
	 * Fault while loading %gs at ld_gs above.  That instruction sits
	 * between doreti's pushfq/popfq pair, so first discard the saved
	 * flags word left on the stack, then recover like the other
	 * segment-load faults.
	 */
1358	ALIGN_TEXT
1359	.globl	gs_load_fault
1360gs_load_fault:
1361	popfq				/* balance doreti's pushfq */
1362	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	/* Re-enable interrupts if the interrupted context had them enabled. */
1363	testl	$PSL_I,TF_RFLAGS(%rsp)
1364	jz	1f
1365	sti
13661:
1367	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1368	call	trap
1369	movw	$KUG32SEL,TF_GS(%rsp)	/* replace the bad selector */
1370	jmp	doreti
1371
	/*
	 * Fault writing MSR_FSBASE at ld_fsbase above (e.g. non-canonical
	 * base).  Report a protection fault, then zero pcb_fsbase so the
	 * bad value is not reloaded on the retried return.
	 */
1372	ALIGN_TEXT
1373	.globl	fsbase_load_fault
1374fsbase_load_fault:
1375	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	/* Re-enable interrupts if the interrupted context had them enabled. */
1376	testl	$PSL_I,TF_RFLAGS(%rsp)
1377	jz	1f
1378	sti
13791:
1380	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1381	call	trap
1382	movq	PCPU(CURTHREAD),%r8
1383	movq	TD_PCB(%r8),%r8
1384	movq	$0,PCB_FSBASE(%r8)	/* discard the offending base */
1385	jmp	doreti
1386
	/*
	 * Fault writing the GS base MSR at ld_gsbase above.  Same
	 * recovery as fsbase_load_fault: report a protection fault and
	 * zero pcb_gsbase so the bad value is not reloaded.
	 */
1387	ALIGN_TEXT
1388	.globl	gsbase_load_fault
1389gsbase_load_fault:
1390	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	/* Re-enable interrupts if the interrupted context had them enabled. */
1391	testl	$PSL_I,TF_RFLAGS(%rsp)
1392	jz	1f
1393	sti
13941:
1395	movq	%rsp,%rdi		/* pass the trapframe to trap() */
1396	call	trap
1397	movq	PCPU(CURTHREAD),%r8
1398	movq	TD_PCB(%r8),%r8
1399	movq	$0,PCB_GSBASE(%r8)	/* discard the offending base */
1400	jmp	doreti
1401
1402#ifdef HWPMC_HOOKS
	/*
	 * Marker label: end of the exception-handler text range, used by
	 * hwpmc to classify sampled PCs (presumably paired with a begin
	 * marker outside this excerpt -- cf. the btrap/bintr comment above).
	 */
1403	ENTRY(end_exceptions)
1404#endif
1405