/*	$OpenBSD: locore.S,v 1.113 2019/01/24 00:00:50 deraadt Exp $	*/
/*	$NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $	*/

/*
 * Copyright-o-rama!
 */

/*
 * Copyright (c) 2001 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*-
 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)locore.s	7.3 (Berkeley) 5/13/91
 */

#include "assym.h"
#include "lapic.h"
#include "ksyms.h"
#include "xen.h"
#include "hyperv.h"

#include <sys/syscall.h>

#include <machine/param.h>
#include <machine/codepatch.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/trap.h>			/* T_PROTFLT */
#include <machine/frameasm.h>

#if NLAPIC > 0
#include <machine/i82489reg.h>
#endif

/*
 * override user-land alignment before including asm.h
 */
#define	ALIGN_DATA	.align	8,0xcc

#include <machine/asm.h>

#define SET_CURPROC(proc,cpu)			\
	movq	CPUVAR(SELF),cpu	;	\
	movq	proc,CPUVAR(CURPROC)      ;	\
	movq	cpu,P_CPU(proc)

#define GET_CURPCB(reg)			movq	CPUVAR(CURPCB),reg
#define SET_CURPCB(reg)			movq	reg,CPUVAR(CURPCB)
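/*
 * Roughly the C-level effect of the macros above (illustrative sketch
 * only; the cpu_info/proc member names are the ones the assym.h offsets
 * used here correspond to):
 *
 *	SET_CURPROC(p, ci):	ci = curcpu(); ci->ci_curproc = p;
 *				p->p_cpu = ci;
 *	GET_CURPCB(r):		r = curcpu()->ci_curpcb;
 *	SET_CURPCB(r):		curcpu()->ci_curpcb = r;
 */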


/*
 * Initialization
 */
	.data

#if NLAPIC > 0
	.align  NBPG, 0xcc
	.globl _C_LABEL(local_apic), _C_LABEL(lapic_id), _C_LABEL(lapic_tpr)
_C_LABEL(local_apic):
	.space  LAPIC_ID
_C_LABEL(lapic_id):
	.long   0x00000000
	.space  LAPIC_TPRI-(LAPIC_ID+4)
_C_LABEL(lapic_tpr):
	.space  LAPIC_PPRI-LAPIC_TPRI
_C_LABEL(lapic_ppr):
	.space  LAPIC_ISR-LAPIC_PPRI
_C_LABEL(lapic_isr):
	.space  NBPG-LAPIC_ISR
#endif

	.globl	_C_LABEL(cpu_id),_C_LABEL(cpu_vendor)
	.globl	_C_LABEL(cpuid_level),_C_LABEL(cpu_feature)
	.globl	_C_LABEL(cpu_ebxfeature)
	.globl	_C_LABEL(cpu_ecxfeature),_C_LABEL(ecpu_ecxfeature)
	.globl	_C_LABEL(cpu_perf_eax)
	.globl	_C_LABEL(cpu_perf_ebx)
	.globl	_C_LABEL(cpu_perf_edx)
	.globl	_C_LABEL(cpu_apmi_edx)
	.globl	_C_LABEL(ssym),_C_LABEL(esym),_C_LABEL(boothowto)
	.globl	_C_LABEL(bootdev)
	.globl	_C_LABEL(bootinfo), _C_LABEL(bootinfo_size), _C_LABEL(atdevbase)
	.globl	_C_LABEL(proc0paddr),_C_LABEL(PTDpaddr)
	.globl	_C_LABEL(biosbasemem)
	.globl	_C_LABEL(bootapiver)
	.globl	_C_LABEL(pg_nx)
	.globl	_C_LABEL(pg_g_kern)
	.globl	_C_LABEL(cpu_meltdown)
_C_LABEL(cpu_id):	.long	0	# saved from `cpuid' instruction
_C_LABEL(cpu_feature):	.long	0	# feature flags from 'cpuid'
					#   instruction
_C_LABEL(cpu_ebxfeature):.long	0	# ext. ebx feature flags from 'cpuid'
_C_LABEL(cpu_ecxfeature):.long	0	# ext. ecx feature flags from 'cpuid'
_C_LABEL(ecpu_ecxfeature):.long	0	# extended ecx feature flags
_C_LABEL(cpu_perf_eax):	.long	0	# arch. perf. mon. flags from 'cpuid'
_C_LABEL(cpu_perf_ebx):	.long	0	# arch. perf. mon. flags from 'cpuid'
_C_LABEL(cpu_perf_edx):	.long	0	# arch. perf. mon. flags from 'cpuid'
_C_LABEL(cpu_apmi_edx):	.long	0	# adv. power mgmt. info. from 'cpuid'
_C_LABEL(cpuid_level):	.long	-1	# max. level accepted by 'cpuid'
					#   instruction
_C_LABEL(cpu_vendor):	.space	16	# vendor string returned by `cpuid'
					#   instruction
_C_LABEL(ssym):		.quad	0	# ptr to start of syms
_C_LABEL(esym):		.quad	0	# ptr to end of syms
_C_LABEL(atdevbase):	.quad	0	# location of start of iomem in virtual
_C_LABEL(bootapiver):	.long	0	# /boot API version
_C_LABEL(bootdev):	.long	0	# device we booted from
_C_LABEL(proc0paddr):	.quad	0
_C_LABEL(PTDpaddr):	.quad	0	# paddr of PTD, for libkvm
#ifndef REALBASEMEM
_C_LABEL(biosbasemem):	.long	0	# base memory reported by BIOS
#else
_C_LABEL(biosbasemem):	.long	REALBASEMEM
#endif
#ifndef REALEXTMEM
_C_LABEL(biosextmem):	.long	0	# extended memory reported by BIOS
#else
_C_LABEL(biosextmem):	.long	REALEXTMEM
#endif
_C_LABEL(pg_nx):	.quad	0	# NX PTE bit (if CPU supports)
_C_LABEL(pg_g_kern):	.quad	0	# 0x100 if global pages should be used
					# in kernel mappings, 0 otherwise (for
					# insecure CPUs)
_C_LABEL(cpu_meltdown):	.long	0	# 1 if this CPU has Meltdown

/*****************************************************************************/

/*
 * Signal trampoline; copied to a page mapped into userspace.
 * gdb's backtrace logic matches against the instructions in this.
 */
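/*
 * Flow (as implemented below): the trampoline calls the signal handler
 * through the retpoline at 1:, and when the handler returns it invokes
 * sigreturn(2) with the context the kernel pushed on the stack; if
 * sigreturn ever returns, it falls through to exit(2).
 */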
	.section .rodata
	.globl	_C_LABEL(sigcode)
_C_LABEL(sigcode):
	call	1f
	movq	%rsp,%rdi
	pushq	%rdi			/* fake return address */
	movq	$SYS_sigreturn,%rax
	syscall
	.globl	_C_LABEL(sigcoderet)
_C_LABEL(sigcoderet):
	movq	$SYS_exit,%rax
	syscall
	_ALIGN_TRAPS
1:	JMP_RETPOLINE(rax)
	.globl	_C_LABEL(esigcode)
_C_LABEL(esigcode):

	.globl	_C_LABEL(sigfill)
_C_LABEL(sigfill):
	int3
_C_LABEL(esigfill):
	.globl	_C_LABEL(sigfillsiz)
_C_LABEL(sigfillsiz):
	.long	_C_LABEL(esigfill) - _C_LABEL(sigfill)

	.text
/*
 * void lgdt(struct region_descriptor *rdp);
 * Change the global descriptor table.
 */
NENTRY(lgdt)
	RETGUARD_SETUP(lgdt, r11)
	/* Reload the descriptor table. */
	movq	%rdi,%rax
	lgdt	(%rax)
	/* Flush the prefetch q. */
	jmp	1f
	nop
1:	/* Reload "stale" selectors. */
	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
	movl	%eax,%ds
	movl	%eax,%es
	movl	%eax,%ss
	/* Reload code selector by doing intersegment return. */
	popq	%rax
	pushq	$GSEL(GCODE_SEL, SEL_KPL)
	pushq	%rax
	RETGUARD_CHECK(lgdt, r11)
	lretq

ENTRY(setjmp)
	/*
	 * Only save registers that must be preserved across function
	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
	 * and %rip.
	 */
	movq	%rdi,%rax
	movq	%rbx,(%rax)
	movq	%rsp,8(%rax)
	movq	%rbp,16(%rax)
	movq	%r12,24(%rax)
	movq	%r13,32(%rax)
	movq	%r14,40(%rax)
	movq	%r15,48(%rax)
	movq	(%rsp),%rdx
	movq	%rdx,56(%rax)
	xorl	%eax,%eax
	ret

ENTRY(longjmp)
	movq	%rdi,%rax
	movq	(%rax),%rbx
	movq	8(%rax),%rsp
	movq	16(%rax),%rbp
	movq	24(%rax),%r12
	movq	32(%rax),%r13
	movq	40(%rax),%r14
	movq	48(%rax),%r15
	movq	56(%rax),%rdx
	movq	%rdx,(%rsp)
	xorl	%eax,%eax
	incl	%eax
	ret
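/*
 * Usage sketch (illustrative only): as with the userland versions,
 * setjmp() returns 0 when called directly and a non-zero value (1 here)
 * when control comes back via longjmp():
 *
 *	if (setjmp(&jmpbuf) == 0) {
 *		... normal path, something may longjmp(&jmpbuf) ...
 *	} else {
 *		... resumed here after a longjmp() ...
 *	}
 */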

/*****************************************************************************/

/*
 * int cpu_switchto(struct proc *old, struct proc *new)
 * Switch from "old" proc to "new".
 */
ENTRY(cpu_switchto)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movq	%rdi, %r13
	movq	%rsi, %r12

	/* Record new proc. */
	movb	$SONPROC,P_STAT(%r12)	# p->p_stat = SONPROC
	SET_CURPROC(%r12,%rcx)

	movl	CPUVAR(CPUID),%r9d

	/* for the FPU/"extended CPU state" handling below */
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
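	/*
	 * (The 64-bit xsave_mask is split into %edx:%eax here because
	 * that is the operand format the xsave/xrstor family expects.)
	 */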

	/* If old proc exited, don't bother. */
	testq	%r13,%r13
	jz	switch_exited

	/*
	 * Save old context.
	 *
	 * Registers:
	 *   %rax, %rcx - scratch
	 *   %r13 - old proc, then old pcb
	 *   %r12 - new proc
	 *   %r9d - cpuid
	 */

	movq	P_ADDR(%r13),%r13

	/* clear the old pmap's bit for the cpu */
	movq	PCB_PMAP(%r13),%rcx
	lock
	btrq	%r9,PM_CPUS(%rcx)

	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%r13)
	movq	%rbp,PCB_RBP(%r13)

	/*
	 * If the old proc ran in userspace then save the
	 * floating-point/"extended state" registers
	 */
	testl	$CPUF_USERXSTATE,CPUVAR(FLAGS)
	jz	.Lxstate_reset

	movq	%r13, %rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	.byte 0x48; fxsave	(%rdi)		/* really fxsave64 */
	CODEPATCH_END(CPTAG_XSAVE)
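	/*
	 * (Assumption worth noting: the fxsave64 byte sequence above is a
	 * codepatch site tagged CPTAG_XSAVE; on CPUs with XSAVE support it
	 * is expected to be patched to the xsave64/xsaveopt64 templates
	 * found near the end of this file.)
	 */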

switch_exited:
	/* now clear the xstate */
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	.byte 0x48; fxrstor	(%rdi)		/* really fxrstor64 */
	CODEPATCH_END(CPTAG_XRSTOR)
	andl	$~CPUF_USERXSTATE,CPUVAR(FLAGS)

.Lxstate_reset:
	/*
	 * If the segment registers haven't been reset since the old proc
	 * ran in userspace then reset them now
	 */
	testl	$CPUF_USERSEGS,CPUVAR(FLAGS)
	jz	restore_saved
	andl	$~CPUF_USERSEGS,CPUVAR(FLAGS)

	/* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
	movw	$(GSEL(GUDATA_SEL, SEL_UPL)),%ax
	movw	%ax,%ds
	movw	%ax,%es
	movw	%ax,%fs
	cli			/* block interrupts when on user GS.base */
	swapgs			/* switch from kernel to user GS.base */
	movw	%ax,%gs		/* set %gs to UDATA and GS.base to 0 */
	swapgs			/* back to kernel GS.base */

restore_saved:
	/*
	 * Restore saved context.
	 *
	 * Registers:
	 *   %rax, %rcx, %rdx - scratch
	 *   %r13 - new pcb
	 *   %r12 - new process
	 */

	/* No interrupts while loading new state. */
	cli
	movq	P_ADDR(%r12),%r13

	/* Restore stack pointers. */
	movq	PCB_RSP(%r13),%rsp
	movq	PCB_RBP(%r13),%rbp

	/* Stack pivot done, setup RETGUARD */
	RETGUARD_SETUP_OFF(cpu_switchto, r11, 6*8)

	/* don't switch cr3 to the same thing it already was */
	movq	%cr3,%rax
	cmpq	PCB_CR3(%r13),%rax
	movq	PCB_CR3(%r13),%rax	/* flags from cmpq unchanged */
	jz	.Lsame_cr3

	movq	%rax,%cr3			/* %rax used below too */

.Lsame_cr3:
	/*
	 * If we switched from a userland thread with a shallow call stack
	 * (e.g. interrupt->ast->mi_ast->preempt->mi_switch->cpu_switchto)
	 * then the RSB may have attacker controlled entries when we switch
	 * to a deeper call stack in the new thread.  Refill the RSB with
	 * entries safe to speculate into/through.
	 */
	RET_STACK_REFILL_WITH_RCX

	/* Don't bother with the rest if switching to a system process. */
	testl	$P_SYSTEM,P_FLAG(%r12)
	jnz	switch_restored

	/* record the bits needed for future U-->K transition */
	movq	PCB_KSTACK(%r13),%rdx
	subq	$FRAMESIZE,%rdx
	movq	%rdx,CPUVAR(KERN_RSP)
	movq	PCB_PMAP(%r13),%rcx

	CODEPATCH_START
	/*
	 * Meltdown: iff we're doing separate U+K and U-K page tables,
	 * then record them in cpu_info for easy access in syscall and
	 * interrupt trampolines.
	 */
	movq	PM_PDIRPA_INTEL(%rcx),%rdx
	orq	cr3_reuse_pcid,%rax
	orq	cr3_pcid_proc_intel,%rdx
	movq	%rax,CPUVAR(KERN_CR3)
	movq	%rdx,CPUVAR(USER_CR3)
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)

	/* set the new pmap's bit for the cpu */
	lock
	btsq	%r9,PM_CPUS(%rcx)
#ifdef DIAGNOSTIC
	jc	_C_LABEL(switch_pmcpu_set)
#endif

switch_restored:
	SET_CURPCB(%r13)

	/* Interrupts are okay again. */
	sti
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	RETGUARD_CHECK(cpu_switchto, r11)
	ret

ENTRY(cpu_idle_enter)
	movq	_C_LABEL(cpu_idle_enter_fcn),%rax
	cmpq	$0,%rax
	jne	retpoline_rax
	ret

ENTRY(cpu_idle_leave)
	movq	_C_LABEL(cpu_idle_leave_fcn),%rax
	cmpq	$0,%rax
	jne	retpoline_rax
	ret

/* placed here for correct static branch prediction in cpu_idle_* */
NENTRY(retpoline_rax)
	JMP_RETPOLINE(rax)
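/*
 * (JMP_RETPOLINE(rax) performs the indirect jump through %rax via a
 * retpoline sequence, so the CPU cannot speculate through the indirect
 * branch; see the macro definition in the machine headers included above.)
 */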

ENTRY(cpu_idle_cycle)
	movq	_C_LABEL(cpu_idle_cycle_fcn),%rax
	cmpq	$0,%rax
	jne	retpoline_rax
	sti
	hlt
	ret

	.globl	_C_LABEL(panic)

#ifdef DIAGNOSTIC
NENTRY(switch_pmcpu_set)
	leaq	switch_active(%rip),%rdi
	call	_C_LABEL(panic)
	/* NOTREACHED */

	.section .rodata
switch_active:
	.asciz	"activate already active pmap"
	.text
#endif /* DIAGNOSTIC */
/*
 * savectx(struct pcb *pcb);
 * Update pcb, saving current processor state.
 */
ENTRY(savectx)
	RETGUARD_SETUP(savectx, r11)
	/* Save stack pointers. */
	movq	%rsp,PCB_RSP(%rdi)
	movq	%rbp,PCB_RBP(%rdi)
	RETGUARD_CHECK(savectx, r11)
	ret

IDTVEC(syscall32)
	sysret		/* go away please */

/*
 * syscall insn entry.
 * Enter here with interrupts blocked; %rcx contains the caller's
 * %rip and the original rflags has been copied to %r11.  %cs and
 * %ss have been updated to the kernel segments, but %rsp is still
 * the user-space value.
 * First order of business is to swap to the kernel GS.base so that
 * we can access our struct cpu_info.  After possibly mucking with
 * pagetables, we switch to our kernel stack.  Once that's in place
 * we can unblock interrupts and save the rest of the syscall frame.
 */
KUTEXT_PAGE_START
 	.align	NBPG, 0xcc
XUsyscall_meltdown:
	/*
	 * This is the real Xsyscall_meltdown page, which is mapped into
	 * the U-K page tables at the same location as Xsyscall_meltdown
	 * below.  For this, the Meltdown case, we use the scratch space
	 * in cpu_info so we can switch to the kernel page tables
	 * (thank you, Intel), at which point we'll continue at the
	 * "movq CPUVAR(KERN_RSP),%rax" after Xsyscall below.
	 * In case the CPU speculates past the mov to cr3, we put a
	 * retpoline-style pause-jmp-to-pause loop.
	 */
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
0:	pause
	lfence
	jmp	0b
KUTEXT_PAGE_END

KTEXT_PAGE_START
	.align	NBPG, 0xcc
IDTVEC_NOALIGN(syscall_meltdown)
	/* pad to match real Xsyscall_meltdown positioning above */
	movq	CPUVAR(KERN_CR3),%rax
	movq	%rax,%cr3
IDTVEC_NOALIGN(syscall)
	swapgs
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(KERN_RSP),%rax
	xchgq	%rax,%rsp
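	/* %rsp is now the kernel stack; %rax holds the user %rsp */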
	movq	%rcx,TF_RCX(%rsp)
	movq	%rcx,TF_RIP(%rsp)
	RET_STACK_REFILL_WITH_RCX
	sti

	/*
	 * XXX We don't need this whole frame; a split of the
	 * syscall frame and the trapframe is needed.
	 * First, leave some room for the trapno, error,
	 * ss:rsp, etc, so that all GP registers can be
	 * saved. Then, fill in the rest.
	 */
	movq	$(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp)
	movq	%rax,TF_RSP(%rsp)
	movq	CPUVAR(SCRATCH),%rax
	INTR_SAVE_MOST_GPRS_NO_ADJ
	movq	%r11, TF_RFLAGS(%rsp)	/* old rflags from syscall insn */
	movq	$(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp)
	movq	%rax,TF_ERR(%rsp)	/* stash syscall # for SPL check */
	INTR_CLEAR_GPRS

	movq	CPUVAR(CURPROC),%r14
	movq	%rsp,P_MD_REGS(%r14)	# save pointer to frame
	andl	$~MDP_IRET,P_MD_FLAGS(%r14)
	movq	%rsp,%rdi
	call	_C_LABEL(syscall)

.Lsyscall_check_asts:
	/* Check for ASTs on exit to user mode. */
	cli
	CHECK_ASTPENDING(%r11)
	je	2f
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	_C_LABEL(ast)
	jmp	.Lsyscall_check_asts

2:
#ifdef DIAGNOSTIC
	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
	jne	.Lsyscall_spl_not_lowered
#endif /* DIAGNOSTIC */

	/* Could registers have been changed that require an iretq? */
	testl	$MDP_IRET, P_MD_FLAGS(%r14)
	jne	intr_user_exit_post_ast

	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUF_USERXSTATE,CPUVAR(FLAGS)
	jz	.Lsyscall_restore_xstate

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUF_USERSEGS,CPUVAR(FLAGS)
	jz	.Lsyscall_restore_fsbase

.Lsyscall_restore_registers:
	RET_STACK_REFILL_WITH_RCX

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBP(%rsp),%rbp
	movq	TF_RBX(%rsp),%rbx

	/*
	 * We need to finish reading from the trapframe, then switch
	 * to the user page tables, swapgs, and return.  We need
	 * to get the final value for the register that was used
	 * for the mov to %cr3 from somewhere accessible on the
	 * user page tables, so save it in CPUVAR(SCRATCH) across
	 * the switch.
	 */
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RAX(%rsp),%rax
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RSP(%rsp),%rsp
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xsyscall_trampback:
0:	pause
	lfence
	jmp	0b
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs
	sysretq
KTEXT_PAGE_END

KUTEXT_PAGE_START
	.space	(Xsyscall_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	%rax,%cr3
	movq	CPUVAR(SCRATCH),%rax
	swapgs
	sysretq
KUTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
	/* in this case, need FS.base but not xstate, rarely happens */
.Lsyscall_restore_fsbase:	/* CPU doesn't have curproc's FS.base */
	orl	$CPUF_USERSEGS,CPUVAR(FLAGS)
	movq	CPUVAR(CURPCB),%rdi
	jmp	.Lsyscall_restore_fsbase_real

	_ALIGN_TRAPS
.Lsyscall_restore_xstate:	/* CPU doesn't have curproc's xstate */
	orl	$(CPUF_USERXSTATE|CPUF_USERSEGS),CPUVAR(FLAGS)
	movq	CPUVAR(CURPCB),%rdi
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	/* untouched state so can't fault */
	CODEPATCH_START
	.byte 0x48; fxrstor	(%rdi)		/* really fxrstor64 */
	CODEPATCH_END(CPTAG_XRSTOR)
#if PCB_SAVEFPU != 0
	subq	$PCB_SAVEFPU,%rdi
#endif
.Lsyscall_restore_fsbase_real:
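	/* wrmsr takes the MSR number in %ecx and the value in %edx:%eax */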
	movq	PCB_FSBASE(%rdi),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lsyscall_restore_registers

#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
	leaq	spl_lowered(%rip), %rdi
	movl	TF_ERR(%rsp),%esi	/* syscall # stashed above */
	movl	TF_RDI(%rsp),%edx
	movl	%ebx,%ecx
	movl	CPUVAR(ILEVEL),%r8d
	xorq	%rax,%rax
	call	_C_LABEL(printf)
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$IPL_NONE,CPUVAR(ILEVEL)
	jmp	.Lsyscall_check_asts

	.section .rodata
spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
	.text
#endif

NENTRY(proc_trampoline)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(proc_trampoline_mp)
#endif
	movl	$IPL_NONE,CPUVAR(ILEVEL)
	movq	%r13,%rdi
	movq	%r12,%rax
	call	retpoline_rax
	movq	CPUVAR(CURPROC),%r14
	jmp	.Lsyscall_check_asts


/*
 * Returning to userspace via iretq.  We do things in this order:
 *  - check for ASTs
 *  - restore FPU/"extended CPU state" if it's not already in the CPU
 *  - DIAGNOSTIC: no more C calls after this, so check the SPL
 *  - restore FS.base if it's not already in the CPU
 *  - restore most registers
 *  - update the iret frame from the trapframe
 *  - finish reading from the trapframe
 *  - switch to the trampoline stack	\
 *  - jump to the .kutext segment	|-- Meltdown workaround
 *  - switch to the user page tables	/
 *  - swapgs
 *  - iretq
 */
KTEXT_PAGE_START
        _ALIGN_TRAPS
GENTRY(intr_user_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_user_exit_not_blocked
#endif /* DIAGNOSTIC */

	/* Check for ASTs */
	CHECK_ASTPENDING(%r11)
	je	intr_user_exit_post_ast
	CLEAR_ASTPENDING(%r11)
	sti
	movq	%rsp,%rdi
	call	_C_LABEL(ast)
	cli
	jmp	intr_user_exit

intr_user_exit_post_ast:
	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
	testl	$CPUF_USERXSTATE,CPUVAR(FLAGS)
	jz	.Lintr_restore_xstate

#ifdef DIAGNOSTIC
	/* no more C calls after this, so check the SPL */
	cmpl	$0,CPUVAR(ILEVEL)
	jne	.Luser_spl_not_lowered
#endif /* DIAGNOSTIC */

	/* Restore FS.base if it's not already in the CPU */
	testl	$CPUF_USERSEGS,CPUVAR(FLAGS)
	jz	.Lintr_restore_fsbase

.Lintr_restore_registers:
	RET_STACK_REFILL_WITH_RCX

	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBP(%rsp),%rbp
	movq	TF_RBX(%rsp),%rbx

	/*
	 * To get the final value for the register that was used
	 * for the mov to %cr3, we need access to somewhere accessible
	 * on the user page tables, so we save it in CPUVAR(SCRATCH)
	 * across the switch.
	 */
	/* update iret frame */
	movq	CPUVAR(INTR_RSP),%rdx
	movq	$(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,IRETQ_RIP(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,IRETQ_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,IRETQ_RSP(%rdx)
	movq	$(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
	/* finish with the trap frame */
	movq	TF_RAX(%rsp),%rax
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	/* switch to the trampoline stack */
	xchgq	%rdx,%rsp
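	/*
	 * %rsp now points at the iret frame on the trampoline stack;
	 * %rdx points at the old trapframe.
	 */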
	movq	TF_RDX(%rdx),%rdx
	CODEPATCH_START
	movq	%rax,CPUVAR(SCRATCH)
	movq	CPUVAR(USER_CR3),%rax
	PCID_SET_REUSE_NOP
	movq	%rax,%cr3
Xiretq_trampback:
KTEXT_PAGE_END
/* the movq %cr3 switches to this "KUTEXT" page */
KUTEXT_PAGE_START
	.space	(Xiretq_trampback - Xsyscall_meltdown) - \
		(. - XUsyscall_meltdown), 0xcc
	movq	CPUVAR(SCRATCH),%rax
.Liretq_swapgs:
	swapgs
doreti_iret_meltdown:
	iretq
KUTEXT_PAGE_END
/*
 * Back to the "KTEXT" page to fill in the speculation trap and the
 * swapgs+iretq used for non-Meltdown kernels.  This switching back
 * and forth between segments is so that we can do the .space
 * calculation below to guarantee the iretq's above and below line
 * up, so the 'doreti_iret' label lines up with the iretq whether
 * the CPU is affected by Meltdown or not.
 */
KTEXT_PAGE_START
0:	pause
	lfence
	jmp	0b
	.space	(.Liretq_swapgs - XUsyscall_meltdown) - \
		(. - Xsyscall_meltdown), 0xcc
	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
	swapgs

	.globl	_C_LABEL(doreti_iret)
_C_LABEL(doreti_iret):
	iretq
KTEXT_PAGE_END

	.text
	_ALIGN_TRAPS
.Lintr_restore_xstate:		/* CPU doesn't have curproc's xstate */
	orl	$CPUF_USERXSTATE,CPUVAR(FLAGS)
	movq	CPUVAR(CURPCB),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	movq	xsave_mask(%rip),%rsi
	call	xrstor_user
	testl	%eax,%eax
	jnz	.Lintr_xrstor_faulted
.Lintr_restore_fsbase:		/* CPU doesn't have curproc's FS.base */
	orl	$CPUF_USERSEGS,CPUVAR(FLAGS)
	movq	CPUVAR(CURPCB),%rdx
	movq	PCB_FSBASE(%rdx),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_FSBASE,%ecx
	wrmsr
	jmp	.Lintr_restore_registers

.Lintr_xrstor_faulted:
	/*
	 * xrstor faulted; we need to reset the FPU state and call trap()
	 * to post a signal, which requires interrupts be enabled.
	 */
	sti
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	.byte 0x48; fxrstor	(%rdi)		/* really fxrstor64 */
	CODEPATCH_END(CPTAG_XRSTOR)
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	jmp	recall_trap

#ifdef DIAGNOSTIC
.Lintr_user_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	_C_LABEL(printf)
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_user_exit

.Luser_spl_not_lowered:
	sti
	leaq	intr_spl_lowered(%rip),%rdi
	movl	CPUVAR(ILEVEL),%esi
	xorl	%edx,%edx		/* always SPL zero for userspace */
	xorl	%eax,%eax
	call	_C_LABEL(printf)
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	$0,CPUVAR(ILEVEL)
	cli
	jmp	intr_user_exit

	.section .rodata
intr_spl_lowered:
	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n"
	.text
#endif /* DIAGNOSTIC */


/*
 * Return to supervisor mode from trap or interrupt
 */
NENTRY(intr_fast_exit)
#ifdef DIAGNOSTIC
	pushfq
	popq	%rdx
	testq	$PSL_I,%rdx
	jnz	.Lintr_exit_not_blocked
#endif /* DIAGNOSTIC */
	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	movq	TF_R10(%rsp),%r10
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15
	movq	TF_RBP(%rsp),%rbp
	movq	TF_RBX(%rsp),%rbx
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RCX(%rsp),%rcx
	movq	TF_R11(%rsp),%r11
	movq	TF_RAX(%rsp),%rax
	addq	$TF_RIP,%rsp
	iretq

#ifdef DIAGNOSTIC
.Lintr_exit_not_blocked:
	movl	warn_once(%rip),%edi
	testl	%edi,%edi
	jnz	1f
	incl	%edi
	movl	%edi,warn_once(%rip)
	leaq	.Lnot_blocked(%rip),%rdi
	call	_C_LABEL(printf)
#ifdef DDB
	int	$3
#endif /* DDB */
1:	cli
	jmp	intr_fast_exit

	.data
.global warn_once
warn_once:
	.long	0
	.section .rodata
.Lnot_blocked:
	.asciz	"WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
	.text
#endif

/*
 * FPU/"extended CPU state" handling
 * 	int xrstor_user(sfp, mask)
 *		load given state, returns 0/1 if okay/it trapped
 *	void fpusave(sfp)
 *		save current state, but retain it in the FPU
 *	void fpusavereset(sfp)
 *		save current state and reset FPU to initial/kernel state
 *	int xsetbv_user(reg, mask)
 *		load specified %xcr# register, returns 0/1 if okay/it trapped
 */
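/*
 * Illustrative caller pattern for xrstor_user() (sketch only, not the
 * exact MD code): a non-zero return means the user-supplied state was
 * rejected by the CPU and must be handled, e.g.
 *
 *	if (xrstor_user(sfp, xsave_mask))
 *		... reset the FPU state and deliver a signal ...
 */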

ENTRY(xrstor_user)
	RETGUARD_SETUP(xrstor_user, r11)
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xrstor_fault
xrstor_fault:
	CODEPATCH_START
	.byte 0x48; fxrstor	(%rdi)		/* really fxrstor64 */
	CODEPATCH_END(CPTAG_XRSTOR)
	xorl	%eax, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
NENTRY(xrstor_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xrstor_user, r11)
	ret
END(xrstor_user)

ENTRY(fpusave)
	RETGUARD_SETUP(fpusave, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	.byte 0x48; fxsave	(%rdi)		/* really fxsave64 */
	CODEPATCH_END(CPTAG_XSAVE)
	RETGUARD_CHECK(fpusave, r11)
	ret
END(fpusave)

ENTRY(fpusavereset)
	RETGUARD_SETUP(fpusavereset, r11)
	movq	xsave_mask(%rip),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	CODEPATCH_START
	.byte 0x48; fxsave	(%rdi)		/* really fxsave64 */
	CODEPATCH_END(CPTAG_XSAVE)
	movq	proc0paddr(%rip),%rdi
#if PCB_SAVEFPU != 0
	addq	$PCB_SAVEFPU,%rdi
#endif
	CODEPATCH_START
	.byte 0x48; fxrstor	(%rdi)		/* really fxrstor64 */
	CODEPATCH_END(CPTAG_XRSTOR)
	RETGUARD_CHECK(fpusavereset, r11)
	ret
END(fpusavereset)

ENTRY(xsetbv_user)
	RETGUARD_SETUP(xsetbv_user, r11)
	movl	%edi, %ecx
	movq	%rsi, %rdx
	movl	%esi, %eax
	shrq	$32, %rdx
	.globl	xsetbv_fault
xsetbv_fault:
	xsetbv
	xorl	%eax, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
NENTRY(xsetbv_resume)
	movl	$1, %eax
	RETGUARD_CHECK(xsetbv_user, r11)
	ret
END(xsetbv_user)

	.section .rodata
	.globl	_C_LABEL(_xrstor)
_C_LABEL(_xrstor):
	.byte 0x48; xrstor	(%rdi)		/* really xrstor64 */

	.globl	_C_LABEL(_xsave)
_C_LABEL(_xsave):
	.byte 0x48; xsave	(%rdi)		/* really xsave64 */

	.globl	_C_LABEL(_xsaveopt)
_C_LABEL(_xsaveopt):
	.byte 0x48; xsaveopt	(%rdi)		/* really xsaveopt64 */

	.globl	_C_LABEL(_pcid_set_reuse)
_C_LABEL(_pcid_set_reuse):
	orl	$(CR3_REUSE_PCID >> 32),CPUVAR(USER_CR3 + 4)

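/*
 * pagezero(va): zero one page using non-temporal (movnti) stores so the
 * zeroes do not displace useful cache lines; the final sfence makes the
 * weakly-ordered stores globally visible before returning.
 */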
ENTRY(pagezero)
	RETGUARD_SETUP(pagezero, r11)
	movq    $-PAGE_SIZE,%rdx
	subq    %rdx,%rdi
	xorq    %rax,%rax
1:
	movnti  %rax,(%rdi,%rdx)
	movnti  %rax,8(%rdi,%rdx)
	movnti  %rax,16(%rdi,%rdx)
	movnti  %rax,24(%rdi,%rdx)
	addq    $32,%rdx
	jne     1b
	sfence
	RETGUARD_CHECK(pagezero, r11)
	ret

/* int rdmsr_safe(u_int msr, uint64_t *data) */
ENTRY(rdmsr_safe)
	RETGUARD_SETUP(rdmsr_safe_return, r10)

	movl	%edi,	%ecx	/* u_int msr */
	.globl	rdmsr_safe_fault
rdmsr_safe_fault:
	rdmsr
	salq	$32, %rdx
	movl	%eax, %eax
	orq	%rdx, %rax
	movq	%rax, (%rsi)	/* *data */
	xorq	%rax, %rax

	RETGUARD_CHECK(rdmsr_safe_return, r10)
	ret

NENTRY(rdmsr_resume)
	movl	$0x1, %eax
	RETGUARD_CHECK(rdmsr_safe_return, r10)
	ret
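/*
 * Usage sketch for rdmsr_safe() (illustrative only): returns 0 and
 * stores the MSR value through *data on success, non-zero if the
 * rdmsr faulted:
 *
 *	uint64_t val;
 *	if (rdmsr_safe(msr, &val) == 0)
 *		... val is valid ...
 */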

#if NXEN > 0
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	_C_LABEL(xen_hypercall_page)
_C_LABEL(xen_hypercall_page):
	.skip	0x1000, 0xcc
#endif /* NXEN > 0 */

#if NHYPERV > 0
	/* Hypercall page needs to be page aligned */
	.text
	.align	NBPG, 0xcc
	.globl	_C_LABEL(hv_hypercall_page)
_C_LABEL(hv_hypercall_page):
	.skip	0x1000, 0xcc
#endif /* NHYPERV > 0 */