syscall_asm_amd64.s revision 2712:f74a135872bc
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/asm_linkage.h>
29#include <sys/asm_misc.h>
30#include <sys/regset.h>
31#include <sys/psw.h>
32#include <sys/machbrand.h>
33
34#if defined(__lint)
35
36#include <sys/types.h>
37#include <sys/thread.h>
38#include <sys/systm.h>
39
40#else	/* __lint */
41
42#include <sys/segments.h>
43#include <sys/pcb.h>
44#include <sys/trap.h>
45#include <sys/ftrace.h>
46#include <sys/traptrace.h>
47#include <sys/clock.h>
48#include <sys/model.h>
49#include <sys/panic.h>
50#include "assym.h"
51
52#endif	/* __lint */
53
54/*
55 * We implement five flavours of system call entry points
56 *
57 * -	syscall/sysretq		(amd64 generic)
58 * -	syscall/sysretl		(i386 plus SYSC bit)
59 * -	sysenter/sysexit	(i386 plus SEP bit)
60 * -	int/iret		(i386 generic)
61 * -	lcall/iret		(i386 generic)
62 *
63 * The current libc included in Solaris uses int/iret as the base unoptimized
64 * kernel entry method. Older libc implementations and legacy binaries may use
65 * the lcall call gate, so it must continue to be supported.
66 *
67 * System calls that use an lcall call gate are processed in trap() via a
68 * segment-not-present trap, i.e. lcalls are extremely slow(!).
69 *
70 * The basic pattern used in the 32-bit SYSC handler at this point in time is
71 * to have the bare minimum of assembler, and get to the C handlers as
72 * quickly as possible.
73 *
74 * The 64-bit handler is much closer to the sparcv9 handler; that's
75 * because of passing arguments in registers.  The 32-bit world still
76 * passes arguments on the stack -- that makes that handler substantially
77 * more complex.
78 *
79 * The two handlers share a few code fragments which are broken
80 * out into preprocessor macros below.
81 *
82 * XX64	come back and speed all this up later.  The 32-bit stuff looks
83 * especially easy to speed up the argument copying part ..
84 *
85 *
86 * Notes about segment register usage (c.f. the 32-bit kernel)
87 *
88 * In the 32-bit kernel, segment registers are dutifully saved and
89 * restored on all mode transitions because the kernel uses them directly.
90 * When the processor is running in 64-bit mode, segment registers are
91 * largely ignored.
92 *
93 * %cs and %ss
94 *	controlled by the hardware mechanisms that make mode transitions
95 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
98 *
99 * %ds and %es
100 *	always ignored
101 *
102 * %fs and %gs
103 *	fsbase and gsbase are used to control the place they really point at.
104 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
105 *
106 * Note that loading segment registers is still costly because the GDT
107 * lookup still happens (this is because the hardware can't know that we're
108 * not setting up these segment registers for a 32-bit program).  Thus we
109 * avoid doing this in the syscall path, and defer them to lwp context switch
110 * handlers, so the register values remain virtualized to the lwp.
111 */
112
/*
 * ORL_SYSCALLTRACE(r32)
 *
 * When the kernel is built with SYSCALLTRACE, OR the global `syscalltrace'
 * word into the 32-bit register r32; otherwise expand to nothing at all.
 * Callers therefore must not rely on the condition codes being either
 * preserved or set by this macro.
 */
#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif
119
/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * When the callback is invoked, we will be on the user's %gs and
 * the stack will look like this:
 *
 * stack:  --------------------------------------
 *         | callback pointer			|
 *    |    | user stack pointer			|
 *    |    | lwp brand data			|
 *    |    | proc brand data			|
 *    v    | userland return address		|
 *         | callback wrapper return addr	|
 *         --------------------------------------
 *
 * If the machops slot selected by callback_id is NULL, the call is skipped
 * and we branch straight to local label 1.  At label 1 (reached either by
 * that branch or after the callback returns and we swapgs back to the
 * kernel's %gs) %r15 and %rsp are reloaded from the per-CPU scratch slots,
 * so the code following the macro sees the registers it had on entry.
 *
 * Uses numeric local label 1; smashes the condition codes.
 */
#define	BRAND_CALLBACK(callback_id)					    \
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
	subq	$16, %rsp		/* save space for two pointers	*/ ;\
	pushq	%r14			/* save %r14			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
	popq	%r14			/* restore %r14			*/ ;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
	pushq	LWP_BRAND(%r15)		/* push the lwp's brand data	*/ ;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
	pushq	P_BRAND_DATA(%r15)	/* push the proc's brand data	*/ ;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
	cmpq	$0, %r15						   ;\
	je	1f							   ;\
	movq	%r15, 24(%rsp)		/* save the callback pointer	*/ ;\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the user stack pointer	*/ ;\
	pushq	(%r15)			/* push the return address	*/ ;\
	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	swapgs								   ;\
	call	*32(%rsp)		/* call callback		*/ ;\
	swapgs								   ;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
167
/*
 * MSTATE_TRANSITION(from, to)
 *
 * Call syscall_mstate(from, to) with the two microstate constants passed
 * in %edi and %esi.  Because this is an ordinary C call, every
 * caller-saved register (including %rax) must be assumed smashed on
 * return; callers in this file reload %eax from the saved regs afterwards.
 */
#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate
172
/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace |
 *	    (lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING)) != 0)
 *		do full version	;
 *
 * Arguments:
 * -	t	register holding the thread pointer
 * -	ltmp	64-bit scratch register; receives t's lwp pointer
 * -	rtmp	32-bit scratch register; accumulates the OR of the flags
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed (it is left holding the lwp pointer)
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movl	PCB_FLAGS(ltmp), rtmp;			\
	andl	$RUPDATE_PENDING, rtmp;			\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp
194
/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Marks the lwp as LWP_USER, clears t_sysnum, and clears the carry bit
 * (PS_C) in the low byte of the saved flags image at REGOFF_RFL(%rsp).
 *
 * The mask is built from 0xff rather than 0xffff: the operation is a
 * byte-wide andb, so only the low eight bits of the immediate can be
 * encoded.  PS_C lives in that low byte, so the resulting mask byte is
 * the same one a truncated 16-bit constant would have produced.
 *
 * Preconditions:
 * -	zwreg contains zero
 * -	%rsp points at the base of the struct regs
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xff - PS_C), REGOFF_RFL(%rsp)
205
/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * NOTE(review): only the message string (__codesel_msg) for this
 * assertion is present below; no macro in this part of the file
 * implements the check itself.
 *
 * ASSERT((lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0);
 *
 * If this is false, it meant that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting the RUPDATE_PENDING bit again.
 *
 * Both macros expand to nothing unless DEBUG is defined.  When they are
 * active they use numeric local labels 7 (ASSERT_LWPTOREGS) and
 * 8 (ASSERT_NO_RUPDATE_PENDING), and on failure they call panic() with
 * %eax zeroed (no vector-register varargs).
 */
#if defined(DEBUG)

#if !defined(__lint)

__lwptoregs_msg:
	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_flags & RUPDATE_PENDING != 0"

#endif	/* !__lint */

#define	ASSERT_LWPTOREGS(lwp, rp)			\
	movq	LWP_REGS(lwp), %r11;			\
	cmpq	rp, %r11;				\
	je	7f;					\
	leaq	__lwptoregs_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	movq	lwp, %rdx;				\
	movq	%r11, %rcx;				\
	movq	rp, %r8;				\
	xorl	%eax, %eax;				\
	call	panic;					\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
	testl	$RUPDATE_PENDING, PCB_FLAGS(lwp);	\
	je	8f;					\
	movq	lwp, %rdx;				\
	leaq	__no_rupdate_msg(%rip), %rdi;		\
	movl	$__LINE__, %esi;			\
	xorl	%eax, %eax;				\
	call	panic;					\
8:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#endif
280
/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 *
 * SYSCALL_TRAPTRACE(ttype) uses %rdi, %rbx, %rcx (and, via the timestamp,
 * %rax and %rdx) as scratch, records the syscall number from the saved
 * %rax in the trace record, and then reloads all five registers from the
 * struct regs.
 *
 * SYSCALL_TRAPTRACE32(ttype) does the same and additionally zero-extends
 * the 32-bit halves of those five registers.
 *
 * Both expand to nothing unless TRAPTRACE is defined.
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */
310
/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 *
 * Register roles once the struct regs has been filled in:
 *
 *	%r15	curthread
 *	%r14	curthread's lwp
 *	%rbp	base of the struct regs (same as %rsp)
 *	%rbx	sysent entry for the call, once it has been looked up
 *	%r12	rval1 (handler's %rax), %r13 rval2 (handler's %rdx)
 *
 * brand_sys_syscall is the entry used for branded processes; it runs the
 * brand's callback (if any) and then falls into sys_syscall.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall()
{}

void
_allsyscalls()
{}

size_t _allsyscalls_size;

#else	/* __lint */

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	swapgs
	BRAND_CALLBACK(BRAND_CB_SYSCALL)
	swapgs				/* back to the user's %gs */

	ALTENTRY(sys_syscall)
	swapgs
	movq	%rsp, %gs:CPU_RTMP_RSP
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11		/* user %rsp stashed above */
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10		/* user %r15 stashed above */
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REGOFF_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre

_syscall_invoke:
	/* the args may have been smashed by the calls above; reload them */
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax	/* sysc# -> sysent offset */
	leaq	sysent(%rax), %rbx

	call	*SY_CALLC(%rbx)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)	/* %bx is zero here */

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d
	movq	REGOFF_RSP(%rsp), %rsp	/* must be last use of the regs */
	swapgs
	sysretq

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

#endif	/* __lint */
564
/*
 * System call handler for 32-bit processes using the syscall instruction.
 *
 * As in the 64-bit handler, the syscall instruction leaves the return
 * %rip in %rcx and the flags in %r11.  The user's %esp is parked in
 * %r10d while we switch to the thread's kernel stack.  Arguments live on
 * the user stack; syscall_entry() is handed a SYS_DROP-byte area on the
 * kernel stack for them and returns the handler's sysent entry in %rax.
 *
 * _syscall32_save is also the join point for sys_syscall_int.
 * _full_syscall_postsys32 is also the slow-path exit for sys_sysenter.
 *
 * brand_sys_syscall32 is the entry used for branded processes; it runs
 * the brand's callback (if any) and then falls into sys_syscall32.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall32()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall32)
	swapgs
	BRAND_CALLBACK(BRAND_CB_SYSCALL32)
	swapgs				/* back to the user's %gs */

	ALTENTRY(sys_syscall32)
	swapgs
	movl	%esp, %r10d			/* park the user's %esp */
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REGOFF_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64			/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)	/* %bx is zero here */

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	movl	REGOFF_RSP(%rsp), %esp		/* must be last use of regs */

	swapgs
	sysretl

_full_syscall_postsys32:
	sti
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	sys_rtt_syscall32
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

#endif	/* __lint */
757
/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * - %rip is pointing to sys_sysenter (below).
 * - %cs and %ss are set to kernel text and stack (data) selectors.
 * - %rsp is pointing at the lwp's stack
 * - interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find him/herself
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 * through the kernel with kmdb, we will eventually hit the instruction at
 * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 * and the undesirable one mentioned above.  To avoid this situation, we
 * simply add a jump over the instruction at sys_sysenter to make it
 * impossible to single-step to it.
 */
#if defined(__lint)

void
sys_sysenter()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_sysenter)
	swapgs

	ALTENTRY(_brand_sys_sysenter_post_swapgs)
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	swapgs

	ALTENTRY(_sys_sysenter_post_swapgs)
	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	pushfq					/* current flags -> %r10 */
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REGOFF_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSENTER)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	/*
	 * Catch 64-bit process trying to issue sysenter instruction
	 * on Nocona based systems.
	 */
	movq	LWP_PROCP(%r14), %rax
	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
	je	7f

	/*
	 * For a non-32-bit process, simulate a #ud, since that's what
	 * native hardware does.  The traptrace entry (above) will
	 * let you know what really happened.
	 */
	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
	movq	REGOFF_CS(%rsp), %rdi
	movq	%rdi, REGOFF_ERR(%rsp)
	movq	%rsp, %rdi
	movq	REGOFF_RIP(%rsp), %rsi
	movl	%gs:CPU_ID, %edx
	call	trap
	jmp	_sys_rtt
7:

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
	 * placed into 64-bit (long) arg slots, maintaining 16 byte
	 * alignment.  SYS_DROP is the same 64-byte drop that the
	 * sys_syscall32 handler uses.
	 */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 */
	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	call	*SY_CALLC(%rbx)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	cli
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)	/* %bx is zero here */

	/*
	 * To get back to userland, load up the 32-bit registers and
	 * sysexit back where we came from.
	 */

	/*
	 * Interrupts will be turned on by the 'sti' executed just before
	 * sysexit.  The following ensures that restoring the user's rflags
	 * doesn't enable interrupts too soon.
	 */
	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)

	/*
	 * (There's no point in loading up %edx because the sysexit
	 * mechanism smashes it.)
	 */
	movl	%r12d, %eax
	movl	REGOFF_RBX(%rsp), %ebx
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
	pushq	REGOFF_RFL(%rsp)
	popfq
	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
	swapgs
	sti
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(_sys_sysenter_post_swapgs)
	SET_SIZE(brand_sys_sysenter)

#endif	/* __lint */
1005
#if defined(__lint)
/*
 * System call via an int80.  This entry point is only used by the Linux
 * application environment.  Unlike the other entry points, there is no
 * default action to take if no callback is registered for this process.
 */
void
sys_int80()
{}

#else	/* __lint */

	/*
	 * brand_sys_int80 runs the brand's int80 callback (if one is
	 * registered) and otherwise falls through into sys_int80.
	 */
	ENTRY_NP(brand_sys_int80)
	swapgs
	BRAND_CALLBACK(BRAND_CB_INT80)
	swapgs				/* back to the user's %gs */

	ENTRY_NP(sys_int80)
	/*
	 * We hit an int80, but this process isn't of a brand with an int80
	 * handler.  Bad process!  Make it look as if the INT failed.
	 * Modify %eip to point before the INT, push the expected error
	 * code and fake a GP fault.
	 *
	 */
	swapgs
	subq	$2, (%rsp)	/* int insn 2-bytes */
	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
	jmp	gptrap			/* GP fault */
	SET_SIZE(sys_int80)
	SET_SIZE(brand_sys_int80)
#endif	/* __lint */
1038
1039
/*
 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
 * the generic i386 libc to do system calls. We do a small amount of setup
 * before jumping into the existing sys_syscall32 path.
 *
 * The setup is: load curthread into %r15, point %rsp at the thread's
 * T_STACK, zero-extend the syscall number in %eax, and set t_post_sys so
 * that the shared code leaves via its slow (non-sysret) exit.
 *
 * brand_sys_syscall_int is the entry used for branded processes; it runs
 * the brand's callback (if any) and then falls into sys_syscall_int.
 */
#if defined(__lint)

/*ARGSUSED*/
void
sys_syscall_int()
{}

#else	/* __lint */

	ENTRY_NP(brand_sys_syscall_int)
	swapgs
	BRAND_CALLBACK(BRAND_CB_INT91)
	swapgs				/* back to the user's %gs */

	ALTENTRY(sys_syscall_int)
	swapgs
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax
	/*
	 * Set t_post_sys on this thread to force ourselves out via the slow
	 * path. It might be possible at some later date to optimize this out
	 * and use a faster return mechanism.
	 */
	movb	$1, T_POST_SYS(%r15)
	jmp	_syscall32_save
	SET_SIZE(sys_syscall_int)
	SET_SIZE(brand_sys_syscall_int)

#endif	/* __lint */
1075
/*
 * Legacy 32-bit applications and old libc implementations do lcalls;
 * we should never get here because the LDT entry containing the syscall
 * segment descriptor has the "segment present" bit cleared, which means
 * we end up processing those system calls in trap() via a not-present trap.
 *
 * We do it this way because a call gate unhelpfully does -nothing- to the
 * interrupt flag bit, so an interrupt can run us just after the lcall
 * completes, but just before the swapgs takes effect.  Thus the INTR_PUSH and
 * INTR_POP paths would have to be slightly more complex to dance around
 * this problem, and end up depending explicitly on the first
 * instruction of this handler being either swapgs or cli.
 *
 * If we do somehow arrive here, build a minimal stack frame and panic.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sys_lcall32()
{}

#else	/* __lint */

	ENTRY_NP(sys_lcall32)
	swapgs
	pushq	$0			/* zero slot above the saved %rbp */
	pushq	%rbp
	movq	%rsp, %rbp
	leaq	__lcall_panic_str(%rip), %rdi
	xorl	%eax, %eax		/* no vector-register varargs */
	call	panic
	SET_SIZE(sys_lcall32)

__lcall_panic_str:
	.string	"sys_lcall32: shouldn't be here!"

/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */
1124
/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 *
 * void sep_save(void *ksp)
 *	Writes zero to the MSR_INTC_SEP_ESP msr.  The argument is unused.
 *
 * void sep_restore(void *ksp)
 *	Writes ksp (passed in %rdi) to the MSR_INTC_SEP_ESP msr.
 *
 * Both routines smash %eax, %ecx and %edx, which wrmsr takes as its
 * operands (%ecx = msr number, %edx:%eax = 64-bit value).
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx		/* high 32 bits of the value: 0 */
	xorl	%eax, %eax		/* low 32 bits of the value: 0 */
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)

	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movq	%rdi, %rdx
	shrq	$32, %rdx		/* %edx = high 32 bits of ksp */
	movl	%edi, %eax		/* %eax = low 32 bits of ksp */
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */
1170