syscall.c revision 2712:f74a135872bc
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/vmparam.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/stack.h>
#include <sys/cred.h>
#include <sys/cmn_err.h>
#include <sys/user.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/modctl.h>
#include <sys/var.h>
#include <sys/inline.h>
#include <sys/syscall.h>
#include <sys/ucontext.h>
#include <sys/cpuvar.h>
#include <sys/siginfo.h>
#include <sys/trap.h>
#include <sys/vtrace.h>
#include <sys/sysinfo.h>
#include <sys/procfs.h>
#include <c2/audit.h>
#include <sys/aio_impl.h>
#include <sys/tnf.h>
#include <sys/tnf_probe.h>
#include <sys/copyops.h>
#include <sys/priv.h>
#include <sys/msacct.h>

int syscalltrace = 0;
#ifdef SYSCALLTRACE
static kmutex_t systrace_lock;		/* syscall tracing lock */
#else
#define	syscalltrace 0
#endif /* SYSCALLTRACE */

typedef	int64_t (*llfcn_t)();	/* function returning long long */

int pre_syscall(void);
void post_syscall(long rval1, long rval2);
static krwlock_t *lock_syscall(struct sysent *, uint_t);
void deferred_singlestep_trap(caddr_t);

#ifdef _SYSCALL32_IMPL
#define	LWP_GETSYSENT(lwp)	\
	(lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? sysent : sysent32)
#else
#define	LWP_GETSYSENT(lwp)	(sysent)
#endif

/*
 * Arrange for the real-time profiling signal to be dispatched.
 */
void
realsigprof(int sysnum, int error)
{
	proc_t *p;
	klwp_t *lwp;

	if (curthread->t_rprof->rp_anystate == 0)
		return;
	p = ttoproc(curthread);
	lwp = ttolwp(curthread);
	mutex_enter(&p->p_lock);
	if (sigismember(&p->p_ignore, SIGPROF) ||
	    signal_is_blocked(curthread, SIGPROF)) {
		mutex_exit(&p->p_lock);
		return;
	}
	lwp->lwp_siginfo.si_signo = SIGPROF;
	lwp->lwp_siginfo.si_code = PROF_SIG;
	lwp->lwp_siginfo.si_errno = error;
	hrt2ts(gethrtime(), &lwp->lwp_siginfo.si_tstamp);
	lwp->lwp_siginfo.si_syscall = sysnum;
	lwp->lwp_siginfo.si_nsysarg = (sysnum > 0 && sysnum < NSYSCALL) ?
	    LWP_GETSYSENT(lwp)[sysnum].sy_narg : 0;
	lwp->lwp_siginfo.si_fault = lwp->lwp_lastfault;
	lwp->lwp_siginfo.si_faddr = lwp->lwp_lastfaddr;
	lwp->lwp_lastfault = 0;
	lwp->lwp_lastfaddr = NULL;
	sigtoproc(p, curthread, SIGPROF);
	mutex_exit(&p->p_lock);
	ASSERT(lwp->lwp_cursig == 0);
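	/* deliver the SIGPROF posted above, then clear the profiling state */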
	if (issig(FORREAL))
		psig();
	mutex_enter(&p->p_lock);
	lwp->lwp_siginfo.si_signo = 0;
	bzero(curthread->t_rprof, sizeof (*curthread->t_rprof));
	mutex_exit(&p->p_lock);
}

/*
 * If watchpoints are active, don't make copying in of
 * system call arguments take a read watchpoint trap.
 */
static int
copyin_args(struct regs *rp, long *ap, uint_t nargs)
{
	greg_t *sp = 1 + (greg_t *)rp->r_sp;		/* skip ret addr */

	ASSERT(nargs <= MAXSYSARGS);

	return (copyin_nowatch(sp, ap, nargs * sizeof (*sp)));
}

#if defined(_SYSCALL32_IMPL)
static int
copyin_args32(struct regs *rp, long *ap, uint_t nargs)
{
	greg32_t *sp = 1 + (greg32_t *)rp->r_sp;	/* skip ret addr */
	uint32_t a32[MAXSYSARGS];
	int rc;

	ASSERT(nargs <= MAXSYSARGS);

	if ((rc = copyin_nowatch(sp, a32, nargs * sizeof (*sp))) == 0) {
		uint32_t *a32p = &a32[0];

		while (nargs--)
			*ap++ = (ulong_t)*a32p++;
	}
	return (rc);
}
#define	COPYIN_ARGS32	copyin_args32
#else
#define	COPYIN_ARGS32	copyin_args
#endif

/*
 * Error handler for system calls where the argument copy-in faults.
 */
static longlong_t
syscall_err()
{
	return (0);
}

/*
 * Corresponding sysent entry to allow syscall_entry caller
 * to invoke syscall_err.
 */
static struct sysent sysent_err = {
	0, SE_32RVAL1, NULL, NULL, (llfcn_t)syscall_err
};

/*
 * Called from syscall() when a non-trivial 32-bit system call occurs.
 * Sets up the args and returns a pointer to the handler.
 */
struct sysent *
syscall_entry(kthread_t *t, long *argp)
{
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	unsigned int code;
	struct sysent *callp;
	struct sysent *se = LWP_GETSYSENT(lwp);
	int error = 0;
	uint_t nargs;

	ASSERT(t == curthread && curthread->t_schedflag & TS_DONT_SWAP);

	lwp->lwp_ru.sysc++;
	lwp->lwp_eosys = NORMALRETURN;	/* assume this will be normal */

	/*
	 * Set lwp_ap to point to the args, even if none are needed for this
	 * system call.  This is for the loadable-syscall case where the
	 * number of args won't be known until the system call is loaded, and
	 * also maintains a non-NULL lwp_ap setup for get_syscall_args(). Note
	 * that lwp_ap MUST be set to a non-NULL value _BEFORE_ t_sysnum is
	 * set to non-zero; otherwise get_syscall_args(), seeing a non-zero
	 * t_sysnum for this thread, will charge ahead and dereference lwp_ap.
	 */
	lwp->lwp_ap = argp;		/* for get_syscall_args */

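	/* the syscall number arrives in the saved %eax/%rax (r_r0) */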
	code = rp->r_r0;
	t->t_sysnum = (short)code;
	callp = code >= NSYSCALL ? &nosys_ent : se + code;

	if ((t->t_pre_sys | syscalltrace) != 0) {
		error = pre_syscall();

		/*
		 * pre_syscall() has taken care so that lwp_ap is current;
		 * it either points to syscall-entry-saved amd64 regs,
		 * or it points to lwp_arg[], which has been re-copied from
		 * the ia32 ustack, but either way, it's a current copy after
		 * /proc has possibly mucked with the syscall args.
		 */

		if (error)
			return (&sysent_err);	/* use dummy handler */
	}

	/*
	 * Fetch the system call arguments to the kernel stack copy used
	 * for syscall handling.
	 * Note: for loadable system calls the number of arguments required
	 * may not be known at this point, and will be zero if the system call
	 * was never loaded.  Once the system call has been loaded, the number
	 * of args is not allowed to be changed.
	 */
	if ((nargs = (uint_t)callp->sy_narg) != 0 &&
	    COPYIN_ARGS32(rp, argp, nargs)) {
		(void) set_errno(EFAULT);
		return (&sysent_err);	/* use dummy handler */
	}

	return (callp);		/* return sysent entry for caller */
}

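/*
 * Return from a system call: on the fast path (no post-syscall work
 * pending), clear the carry bit and store the return values directly in
 * the saved registers; otherwise let post_syscall() do the heavy lifting.
 */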
void
syscall_exit(kthread_t *t, long rval1, long rval2)
{
	/*
	 * Handle signals and other post-call events if necessary.
	 */
	if ((t->t_post_sys_ast | syscalltrace) == 0) {
		klwp_t *lwp = ttolwp(t);
		struct regs *rp = lwptoregs(lwp);

		/*
		 * Normal return.
		 * Clear error indication and set return values.
		 */
		rp->r_ps &= ~PS_C;	/* reset carry bit */
		rp->r_r0 = rval1;
		rp->r_r1 = rval2;
		lwp->lwp_state = LWP_USER;
	} else
		post_syscall(rval1, rval2);
	t->t_sysnum = 0;		/* invalidate args */
}

/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, etc.
 *
 * This routine is called only if the t_pre_sys flag is set. Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag. If the
 * condition is persistent, this routine will repost t_pre_sys.
 */
int
pre_syscall()
{
	kthread_t *t = curthread;
	unsigned code = t->t_sysnum;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

#if defined(DEBUG)
	/*
	 * On the i386 kernel, lwp_ap points at the piece of the thread
	 * stack that we copy the user's arguments into.
	 *
	 * On the amd64 kernel, the syscall arguments in the rdi..r9
	 * registers should be pointed at by lwp_ap.  If the args need to
	 * be copied so that those registers can be changed without losing
	 * the ability to get the args for /proc, they can be saved by
	 * save_syscall_args(), and lwp_ap will be restored by post_syscall().
	 */
	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi);
	} else {
#endif
		ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase &&
		    (caddr_t)lwp->lwp_ap < t->t_stk);
	}
#endif	/* DEBUG */

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			mutex_enter(&p->p_lock);
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);

				/*
				 * /proc may have modified syscall args,
				 * either in regs for amd64 or on ustack
				 * for ia32.  Either way, arrange to
				 * copy them again, both for the syscall
				 * handler and for other consumers in
				 * post_syscall (like audit).  Here, we
				 * only do amd64, and just set lwp_ap
				 * back to the kernel-entry stack copy;
				 * the syscall ml code redoes
				 * move-from-regs to set up for the
				 * syscall handler after we return.  For
				 * ia32, save_syscall_args() below makes
				 * an lwp_ap-accessible copy.
				 */
#if defined(_LP64)
				if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
					lwp->lwp_argsaved = 0;
					lwp->lwp_ap =
					    (long *)&lwptoregs(lwp)->r_rdi;
				}
#endif
			}
			mutex_exit(&p->p_lock);
		}
		repost = 1;
	}

	/*
	 * ia32 kernel, or ia32 proc on amd64 kernel: keep args in
	 * lwp_arg for post-syscall processing, regardless of whether
	 * they might have been changed in /proc above.
	 */
#if defined(_LP64)
	if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE)
#endif
		(void) save_syscall_args();

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR);	/* forces post_sys */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

#ifdef C2_AUDIT
	if (audit_active) {	/* begin auditing for this syscall */
		int error;
		if (error = audit_start(T_SYSCALL, code, 0, lwp)) {
			t->t_pre_sys = 1;	/* repost anyway */
			(void) set_errno(error);
			return (1);
		}
		repost = 1;
	}
#endif /* C2_AUDIT */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" :
			    sysname, code, callp->sy_callc);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;
	return (0);
}


/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	uint_t	code = t->t_sysnum;
	int	repost = 0;
	int	proc_stop = 0;		/* non-zero if stopping */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
	 * other than the LWP which matches the one in the parent that called
	 * forkall().  In these LWPs, skip most of the post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;
	/*
	 * If the trace flag is set, mark the lwp to take a single-step trap
	 * on return to user level (below).  The x86 lcall and sysenter
	 * interfaces have already done this and turned off the flag, but
	 * the amd64 syscall interface has not.
	 */
	if (rp->r_ps & PS_T) {
		lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
		rp->r_ps &= ~PS_T;
		aston(curthread);
	}
#ifdef C2_AUDIT
	if (audit_active) {	/* put out audit record for this syscall */
		rval_t	rval;

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}
#endif /* C2_AUDIT */

	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
				(void) save_syscall_args();
			proc_stop = 1;
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
			(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			rp->r_ps &= ~PS_C;
			rp->r_r0 = rval1;
			rp->r_r1 = rval2;
		} else {
			int sig;
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			rp->r_r0 = error;
			rp->r_ps |= PS_C;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (code == SYS_vfork && rp->r_r1 == 0 && error == 0)
		vfwait((pid_t)rval1);

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		/*
		 * If a single-step trap occurred on a syscall (see trap())
		 * recognize it now.  Do this before checking for signals
		 * because deferred_singlestep_trap() may generate a SIGTRAP to
		 * the LWP or may otherwise mark the LWP to call issig(FORREAL).
		 */
		if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
			deferred_singlestep_trap((caddr_t)rp->r_pc);

		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking is ourselves, so there is
		 *	   no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 *	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * For kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);
		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			realsigprof(code, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * /proc can't enable/disable the trace bit itself
		 * because that could race with the call gate used by
		 * system calls via "lcall".  If that happened, an
		 * invalid EFLAGS would result.  prstep()/prnostep()
		 * therefore schedule an AST for the purpose.
		 */
		if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
			rp->r_ps |= PS_T;
		}
		if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
			rp->r_ps &= ~PS_T;
		}
	}

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
			tnf_long,	rval1,		rval1,
			tnf_long,	rval2,		rval2,
			tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * XXX Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
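	/* give up the CPU if a preemption was posted while in the kernel */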
	if (CPU->cpu_runrun)
		preempt();

	lwp->lwp_errno = 0;		/* clear error for next time */

	/*
	 * The thread lock must be held in order to clear sysnum and reset
	 * lwp_ap atomically with respect to other threads in the system that
	 * may be looking at the args via lwp_ap from get_syscall_args().
	 */

	thread_lock(t);
	t->t_sysnum = 0;		/* no longer in a system call */

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		/*
		 * In case the args were copied to the lwp, reset the
		 * pointer so the next syscall will have the right
		 * lwp_ap pointer.
		 */
		lwp->lwp_ap = (long *)&rp->r_rdi;
	} else {
#endif
		lwp->lwp_ap = NULL;	/* reset on every syscall entry */
	}
	thread_unlock(t);

	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack.  If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit.  If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		caddr_t top;
		stack_t stk;
		struct rlimit64 rl;

		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (copyin((stack_t *)lwp->lwp_ustack, &stk,
		    sizeof (stack_t)) == 0 &&
		    (stk.ss_size == lwp->lwp_old_stk_ctl ||
		    stk.ss_size == 0) &&
		    stk.ss_sp == top - stk.ss_size) {
			stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
			    stk.ss_size - (uintptr_t)new_size);
			stk.ss_size = new_size;

			(void) copyout(&stk, (stack_t *)lwp->lwp_ustack,
			    sizeof (stack_t));
		}

		lwp->lwp_old_stk_ctl = 0;
	}
}

/*
 * Called from post_syscall() when a deferred singlestep is to be taken.
 */
void
deferred_singlestep_trap(caddr_t pc)
{
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	pcb_t *pcb = &lwp->lwp_pcb;
	uint_t fault = 0;
	k_siginfo_t siginfo;

	bzero(&siginfo, sizeof (siginfo));

	/*
	 * If both NORMAL_STEP and WATCH_STEP are in
	 * effect, give precedence to WATCH_STEP.
	 * If neither is set, user must have set the
	 * PS_T bit in %efl; treat this as NORMAL_STEP.
	 */
	if ((fault = undo_watch_step(&siginfo)) == 0 &&
	    ((pcb->pcb_flags & NORMAL_STEP) ||
	    !(pcb->pcb_flags & WATCH_STEP))) {
		siginfo.si_signo = SIGTRAP;
		siginfo.si_code = TRAP_TRACE;
		siginfo.si_addr  = pc;
		fault = FLTTRACE;
	}
	pcb->pcb_flags &= ~(DEBUG_PENDING|NORMAL_STEP|WATCH_STEP);

	if (fault) {
		/*
		 * Remember the fault and fault address
		 * for real-time (SIGPROF) profiling.
		 */
		lwp->lwp_lastfault = fault;
		lwp->lwp_lastfaddr = siginfo.si_addr;
		/*
		 * If a debugger has declared this fault to be an
		 * event of interest, stop the lwp.  Otherwise just
		 * deliver the associated signal.
		 */
		if (prismember(&p->p_fltmask, fault) &&
		    stop_on_fault(fault, &siginfo) == 0)
			siginfo.si_signo = 0;
	}

	if (siginfo.si_signo)
		trapsig(&siginfo, 1);
}

/*
 * Nonexistent system call -- signal lwp (may want to handle it).
 * Flag error if lwp won't see signal immediately.
 */
int64_t
nosys()
{
	tsignal(curthread, SIGSYS);
	return (set_errno(ENOSYS));
}

/*
 * Execute a 32-bit system call on behalf of the current thread.
 */
void
dosyscall(void)
{
	/*
	 * Need space on the stack to store syscall arguments.
	 */
	long		syscall_args[MAXSYSARGS];
	struct sysent	*se;
	int64_t		ret;

	syscall_mstate(LMS_TRAP, LMS_SYSTEM);

	ASSERT(curproc->p_model == DATAMODEL_ILP32);

	CPU_STATS_ENTER_K();
	CPU_STATS_ADDQ(CPU, sys, syscall, 1);
	CPU_STATS_EXIT_K();

	se = syscall_entry(curthread, syscall_args);

	/*
	 * syscall_entry() copied all 8 arguments into syscall_args.
	 */
	ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2],
	    syscall_args[3], syscall_args[4], syscall_args[5], syscall_args[6],
	    syscall_args[7]);

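	/* the 64-bit return value packs rval1 in the low 32 bits, rval2 high */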
	syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
	syscall_mstate(LMS_SYSTEM, LMS_TRAP);
}

/*
 * Get the arguments to the current system call.  See comment atop
 * save_syscall_args() regarding lwp_ap usage.
 */
uint_t
get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
{
	kthread_t	*t = lwptot(lwp);
	ulong_t	mask = 0xfffffffful;
	uint_t	code;
	long	*ap;
	int	nargs;

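	/* 32-bit callers pass only 32 significant bits per argument */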
#if defined(_LP64)
	if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
		mask = 0xfffffffffffffffful;
#endif

	/*
	 * The thread lock must be held while looking at the arguments to
	 * ensure they don't go away via post_syscall().
	 * get_syscall_args() is the only routine to read them which is
	 * callable outside the LWP in question and hence the only one that
	 * must be synchronized in this manner.
	 */
	thread_lock(t);

	code = t->t_sysnum;
	ap = lwp->lwp_ap;

	thread_unlock(t);

	if (code != 0 && code < NSYSCALL) {
		nargs = LWP_GETSYSENT(lwp)[code].sy_narg;

		ASSERT(nargs <= MAXSYSARGS);

		*nargsp = nargs;
		while (nargs-- > 0)
			*argp++ = *ap++ & mask;
	} else {
		*nargsp = 0;
	}

	return (code);
}

#ifdef _SYSCALL32_IMPL
/*
 * Get the arguments to the current 32-bit system call.
 */
uint_t
get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
{
	long args[MAXSYSARGS];
	uint_t i, code;

	code = get_syscall_args(lwp, args, nargsp);

	for (i = 0; i != *nargsp; i++)
		*argp++ = (int)args[i];
	return (code);
}
#endif

/*
 * Save the system call arguments in a safe place.
 *
 * On the i386 kernel:
 *
 *	Copy the user's args prior to changing the stack or stack pointer.
 *	This is so /proc will be able to get a valid copy of the
 *	args from the user stack even after the user stack has been changed.
 *	Note that the kernel stack copy of the args may also have been
 *	changed by a system call handler which takes C-style arguments.
 *
 *	Note that this may be called by stop() from trap().  In that case
 *	t_sysnum will be zero (syscall_exit clears it), so no args will be
 *	copied.
 *
 * On the amd64 kernel:
 *
 *	For 64-bit applications, lwp->lwp_ap normally points to %rdi..%r9
 *	in the reg structure.  If the user is going to change the argument
 *	registers, rax, or the stack and might want to get the args (for
 *	/proc tracing), it must copy the args elsewhere via save_syscall_args().
 *
 *	For 32-bit applications, lwp->lwp_ap normally points to a copy of
 *	the system call arguments on the kernel stack made from the user
 *	stack.  Copy the args prior to changing the stack or stack pointer.
 *	This is so /proc will be able to get a valid copy of the args
 *	from the user stack even after that stack has been changed.
 *
 *	This may be called from stop() even when we're not in a system call.
 *	Since there's no easy way to tell, this must be safe (not panic).
 *	If the copyins get data faults, return non-zero.
 */
int
save_syscall_args()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(t);
	uint_t		code = t->t_sysnum;
	uint_t		nargs;

	if (lwp->lwp_argsaved || code == 0)
		return (0);		/* args already saved or not needed */

	if (code >= NSYSCALL) {
		nargs = 0;		/* illegal syscall */
	} else {
		struct sysent *se = LWP_GETSYSENT(lwp);
		struct sysent *callp = se + code;

		nargs = callp->sy_narg;
		if (LOADABLE_SYSCALL(callp) && nargs == 0) {
			krwlock_t	*module_lock;

			/*
			 * Find out how many arguments the system
			 * call uses.
			 *
			 * We have the property that loaded syscalls
			 * never change the number of arguments they
			 * use after they've been loaded once.  This
			 * allows us to stop for /proc tracing without
			 * holding the module lock.
			 * /proc is assured that sy_narg is valid.
			 */
			module_lock = lock_syscall(se, code);
			nargs = callp->sy_narg;
			rw_exit(module_lock);
		}
	}

	/*
	 * Fetch the system call arguments.
	 */
	if (nargs == 0)
		goto out;

	ASSERT(nargs <= MAXSYSARGS);

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		struct regs *rp = lwptoregs(lwp);

		lwp->lwp_arg[0] = rp->r_rdi;
		lwp->lwp_arg[1] = rp->r_rsi;
		lwp->lwp_arg[2] = rp->r_rdx;
		lwp->lwp_arg[3] = rp->r_rcx;
		lwp->lwp_arg[4] = rp->r_r8;
		lwp->lwp_arg[5] = rp->r_r9;
		if (nargs > 6 && copyin_args(rp, &lwp->lwp_arg[6], nargs - 6))
			return (-1);
	} else {
#endif
		if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_arg, nargs))
			return (-1);
	}
out:
	lwp->lwp_ap = lwp->lwp_arg;
	lwp->lwp_argsaved = 1;
	t->t_post_sys = 1;	/* so lwp_ap will be reset */
	return (0);
}

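/*
 * Discard any saved copy of the syscall arguments so that the next
 * save_syscall_args() re-fetches them (e.g. after /proc has modified
 * the arguments).
 */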
void
reset_syscall_args(void)
{
	ttolwp(curthread)->lwp_argsaved = 0;
}

/*
 * Call a system call which takes a pointer to the user args struct and
 * a pointer to the return values.  This is a bit slower than the standard
 * C arg-passing method in some cases.
 */
int64_t
syscall_ap(void)
{
	uint_t	error;
	struct sysent *callp;
	rval_t	rval;
	kthread_t *t = curthread;
	klwp_t	*lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);

	callp = LWP_GETSYSENT(lwp) + t->t_sysnum;

#if defined(__amd64)
	/*
	 * If the arguments don't fit in registers %rdi-%r9, make sure they
	 * have been copied to the lwp_arg array.
	 */
	if (callp->sy_narg > 6 && save_syscall_args())
		return ((int64_t)set_errno(EFAULT));
#endif

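	/* seed rval2 from the saved register so handlers that don't set it
	 * leave it unchanged */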
	rval.r_val1 = 0;
	rval.r_val2 = rp->r_r1;
	lwp->lwp_error = 0;	/* for old drivers */
	error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
	if (error)
		return ((longlong_t)set_errno(error));
	return (rval.r_vals);
}

/*
 * Load system call module.
 *	Returns with pointer to held read lock for module.
 */
static krwlock_t *
lock_syscall(struct sysent *table, uint_t code)
{
	krwlock_t	*module_lock;
	struct modctl	*modp;
	int		id;
	struct sysent	*callp;

	callp = table + code;
	module_lock = callp->sy_lock;

	/*
	 * Optimization to only call modload if we don't have a loaded
	 * syscall.
	 */
	rw_enter(module_lock, RW_READER);
	if (LOADED_SYSCALL(callp))
		return (module_lock);
	rw_exit(module_lock);

	for (;;) {
		if ((id = modload("sys", syscallnames[code])) == -1)
			break;

		/*
		 * If we loaded successfully at least once, the modctl
		 * will still be valid, so we try to grab it by filename.
		 * If this call fails, it's because the mod_filename
		 * was changed after the call to modload() (mod_hold_by_name()
		 * is the likely culprit).  We can safely just take
		 * another lap if this is the case;  the modload() will
		 * change the mod_filename back to one by which we can
		 * find the modctl.
		 */
		modp = mod_find_by_filename("sys", syscallnames[code]);

		if (modp == NULL)
			continue;

		mutex_enter(&mod_lock);

		if (!modp->mod_installed) {
			mutex_exit(&mod_lock);
			continue;
		}
		break;
	}
	rw_enter(module_lock, RW_READER);

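	/* mod_lock is still held here unless modload() failed above */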
	if (id != -1)
		mutex_exit(&mod_lock);

	return (module_lock);
}

/*
 * Loadable syscall support.
 *	If needed, load the module, then reserve it by holding a read
 *	lock for the duration of the call.
 *	Later, if the syscall is not unloadable, it could patch the vector.
 */
/*ARGSUSED*/
int64_t
loadable_syscall(
    long a0, long a1, long a2, long a3,
    long a4, long a5, long a6, long a7)
{
	klwp_t *lwp = ttolwp(curthread);
	int64_t	rval;
	struct sysent *callp;
	struct sysent *se = LWP_GETSYSENT(lwp);
	krwlock_t *module_lock;
	int code, error = 0;
	int64_t (*sy_call)();

	code = curthread->t_sysnum;
	callp = se + code;

	/*
	 * Try to autoload the system call if necessary.
	 */
	module_lock = lock_syscall(se, code);
	THREAD_KPRI_RELEASE();	/* drop priority given by rw_enter */

	/*
	 * We've locked either the loaded syscall or nosys.
	 */

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		if (callp->sy_flags & SE_ARGC) {
			sy_call = (int64_t (*)())callp->sy_call;
			rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
		} else
			rval = syscall_ap();
	} else {
#endif
		/*
		 * Now that it's loaded, make sure enough args were copied.
		 */
		if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_ap, callp->sy_narg))
			error = EFAULT;
		if (error) {
			rval = set_errno(error);
		} else if (callp->sy_flags & SE_ARGC) {
			sy_call = (int64_t (*)())callp->sy_call;
			rval = (*sy_call)(lwp->lwp_ap[0], lwp->lwp_ap[1],
			    lwp->lwp_ap[2], lwp->lwp_ap[3], lwp->lwp_ap[4],
			    lwp->lwp_ap[5]);
		} else
			rval = syscall_ap();
	}

	THREAD_KPRI_REQUEST();	/* regain priority from read lock */
	rw_exit(module_lock);
	return (rval);
}

/*
 * Indirect syscall handled in libc on x86 architectures.
 */
int64_t
indir()
{
	return (nosys());
}

/*
 * set_errno - set an error return from the current system call.
 *	This could be a macro.
 *	This returns the value it is passed, so that the caller can
 *	use tail-recursion-elimination and do return (set_errno(ERRNO));
 */
uint_t
set_errno(uint_t error)
{
	ASSERT(error != 0);		/* must not be used to clear errno */

	curthread->t_post_sys = 1;	/* have post_syscall do error return */
	return (ttolwp(curthread)->lwp_errno = error);
}

/*
 * set_proc_pre_sys - Set pre-syscall processing for entire process.
 */
void
set_proc_pre_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_pre_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_proc_post_sys - Set post-syscall processing for entire process.
 */
void
set_proc_post_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_proc_sys - Set pre- and post-syscall processing for entire process.
 */
void
set_proc_sys(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		t->t_pre_sys = 1;
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != first);
}

/*
 * set_all_proc_sys - set pre- and post-syscall processing flags for all
 * user processes.
 *
 * This is needed when auditing, tracing, or other facilities which affect
 * all processes are turned on.
 */
void
set_all_proc_sys()
{
	kthread_t	*t;
	kthread_t	*first;

	mutex_enter(&pidlock);
	t = first = curthread;
	do {
		t->t_pre_sys = 1;
		t->t_post_sys = 1;
	} while ((t = t->t_next) != first);
	mutex_exit(&pidlock);
}

/*
 * set_proc_ast - Set asynchronous service trap (AST) flag for all
 * threads in process.
 */
void
set_proc_ast(proc_t *p)
{
	kthread_t	*t;
	kthread_t	*first;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = first = p->p_tlist;
	do {
		aston(t);
	} while ((t = t->t_forw) != first);
}
1379