exec.c revision 2712:f74a135872bc
175584Sru/*
275584Sru * CDDL HEADER START
375584Sru *
475584Sru * The contents of this file are subject to the terms of the
575584Sru * Common Development and Distribution License (the "License").
675584Sru * You may not use this file except in compliance with the License.
775584Sru *
875584Sru * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
975584Sru * or http://www.opensolaris.org/os/licensing.
1075584Sru * See the License for the specific language governing permissions
1175584Sru * and limitations under the License.
1275584Sru *
1375584Sru * When distributing Covered Code, include this CDDL HEADER in each
1475584Sru * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1575584Sru * If applicable, add the following below this CDDL HEADER, with the
1675584Sru * fields enclosed by brackets "[]" replaced with your own identifying
1775584Sru * information: Portions Copyright [yyyy] [name of copyright owner]
1875584Sru *
1975584Sru * CDDL HEADER END
2075584Sru */
2175584Sru/*
2275584Sru * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
2375584Sru * Use is subject to license terms.
2475584Sru */
2575584Sru
2675584Sru#pragma ident	"%Z%%M%	%I%	%E% SMI"
2775584Sru
2875584Sru/*	Copyright (c) 1988 AT&T	*/
2975584Sru/*	  All Rights Reserved  	*/
3075584Sru
3175584Sru
3275584Sru#include <sys/types.h>
3375584Sru#include <sys/param.h>
3475584Sru#include <sys/sysmacros.h>
3575584Sru#include <sys/systm.h>
3675584Sru#include <sys/signal.h>
3775584Sru#include <sys/cred_impl.h>
3875584Sru#include <sys/policy.h>
3975584Sru#include <sys/user.h>
4075584Sru#include <sys/errno.h>
4175584Sru#include <sys/file.h>
4275584Sru#include <sys/vfs.h>
4375584Sru#include <sys/vnode.h>
4475584Sru#include <sys/mman.h>
4575584Sru#include <sys/acct.h>
4675584Sru#include <sys/cpuvar.h>
4775584Sru#include <sys/proc.h>
4875584Sru#include <sys/cmn_err.h>
4975584Sru#include <sys/debug.h>
5075584Sru#include <sys/pathname.h>
5175584Sru#include <sys/vm.h>
5275584Sru#include <sys/vtrace.h>
5375584Sru#include <sys/exec.h>
5475584Sru#include <sys/exechdr.h>
5575584Sru#include <sys/kmem.h>
5675584Sru#include <sys/prsystm.h>
5775584Sru#include <sys/modctl.h>
5875584Sru#include <sys/vmparam.h>
5975584Sru#include <sys/schedctl.h>
6075584Sru#include <sys/utrap.h>
6175584Sru#include <sys/systeminfo.h>
6275584Sru#include <sys/stack.h>
6375584Sru#include <sys/rctl.h>
6475584Sru#include <sys/dtrace.h>
6575584Sru#include <sys/lwpchan_impl.h>
6675584Sru#include <sys/pool.h>
6775584Sru#include <sys/sdt.h>
6875584Sru#include <sys/brand.h>
6975584Sru
7075584Sru#include <c2/audit.h>
7175584Sru
7275584Sru#include <vm/hat.h>
7375584Sru#include <vm/anon.h>
7475584Sru#include <vm/as.h>
7575584Sru#include <vm/seg.h>
7675584Sru#include <vm/seg_vn.h>
7775584Sru
7875584Sru#define	PRIV_RESET		0x01	/* needs to reset privs */
7975584Sru#define	PRIV_SETID		0x02	/* needs to change uids */
8075584Sru#define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
8175584Sru#define	PRIV_INCREASE		0x08	/* child runs with more privs */
8275584Sru#define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
8375584Sru
8475584Srustatic int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
8575584Srustatic int hold_execsw(struct execsw *);
8675584Sru
8775584Sruuint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
8875584Sru#if defined(_SYSCALL32_IMPL)
8975584Sruuint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
9075584Sru#endif
9175584Sru
9275584Sruint exec_lpg_disable = 0;
9375584Sru#define	PSUIDFLAGS		(SNOCD|SUGID)
9475584Sru
9575584Sru/*
9675584Sru * exec() - wrapper around exece providing NULL environment pointer
9775584Sru */
9875584Sruint
9975584Sruexec(const char *fname, const char **argp)
10075584Sru{
10175584Sru	return (exece(fname, argp, NULL));
10275584Sru}
10375584Sru
10475584Sru/*
10575584Sru * exece() - system call wrapper around exec_common()
10675584Sru */
10775584Sruint
10875584Sruexece(const char *fname, const char **argp, const char **envp)
10975584Sru{
11075584Sru	int error;
11175584Sru
11275584Sru	error = exec_common(fname, argp, envp, EBA_NONE);
11375584Sru	return (error ? (set_errno(error)) : 0);
11475584Sru}
11575584Sru
11675584Sruint
11775584Sruexec_common(const char *fname, const char **argp, const char **envp,
11875584Sru    int brand_action)
11975584Sru{
12075584Sru	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
12175584Sru	proc_t *p = ttoproc(curthread);
12275584Sru	klwp_t *lwp = ttolwp(curthread);
12375584Sru	struct user *up = PTOU(p);
12475584Sru	long execsz;		/* temporary count of exec size */
12575584Sru	int i;
12675584Sru	int error;
12775584Sru	char exec_file[MAXCOMLEN+1];
12875584Sru	struct pathname pn;
12975584Sru	struct pathname resolvepn;
13075584Sru	struct uarg args;
13175584Sru	struct execa ua;
13275584Sru	k_sigset_t savedmask;
13375584Sru	lwpdir_t *lwpdir = NULL;
13475584Sru	lwpdir_t **tidhash;
13575584Sru	lwpdir_t *old_lwpdir = NULL;
13675584Sru	uint_t old_lwpdir_sz;
13775584Sru	lwpdir_t **old_tidhash;
13875584Sru	uint_t old_tidhash_sz;
13975584Sru	lwpent_t *lep;
14075584Sru	int brandme = 0;
14175584Sru
14275584Sru	/*
14375584Sru	 * exec() is not supported for the /proc agent lwp.
14475584Sru	 */
14575584Sru	if (curthread == p->p_agenttp)
14675584Sru		return (ENOTSUP);
147151497Sru
14875584Sru	if ((error = secpolicy_basic_exec(CRED())) != 0)
14975584Sru		return (error);
15075584Sru
15175584Sru	if (brand_action != EBA_NONE) {
15275584Sru		/*
15375584Sru		 * Brand actions are not supported for processes that are not
15475584Sru		 * running in a branded zone.
15575584Sru		 */
15675584Sru		if (!ZONE_IS_BRANDED(p->p_zone))
15775584Sru			return (ENOTSUP);
15875584Sru
15975584Sru		if (brand_action == EBA_NATIVE) {
16075584Sru			/* Only branded processes can be unbranded */
16175584Sru			if (!PROC_IS_BRANDED(p))
16275584Sru				return (ENOTSUP);
16375584Sru		} else {
16475584Sru			/* Only unbranded processes can be branded */
16575584Sru			if (PROC_IS_BRANDED(p))
16675584Sru				return (ENOTSUP);
16775584Sru			brandme = 1;
16875584Sru		}
16975584Sru	} else {
17075584Sru		/*
17175584Sru		 * If this is a native zone, or if the process is already
17275584Sru		 * branded, then we don't need to do anything.  If this is
17375584Sru		 * a native process in a branded zone, we need to brand the
17475584Sru		 * process as it exec()s the new binary.
17575584Sru		 */
17675584Sru		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
17775584Sru			brandme = 1;
17875584Sru	}
17975584Sru
18075584Sru	/*
18175584Sru	 * Inform /proc that an exec() has started.
18275584Sru	 * Hold signals that are ignored by default so that we will
18375584Sru	 * not be interrupted by a signal that will be ignored after
18475584Sru	 * successful completion of gexec().
18575584Sru	 */
18675584Sru	mutex_enter(&p->p_lock);
18775584Sru	prexecstart();
18875584Sru	schedctl_finish_sigblock(curthread);
18975584Sru	savedmask = curthread->t_hold;
19075584Sru	sigorset(&curthread->t_hold, &ignoredefault);
19175584Sru	mutex_exit(&p->p_lock);
19275584Sru
19375584Sru	/*
19475584Sru	 * Look up path name and remember last component for later.
19575584Sru	 * To help coreadm expand its %d token, we attempt to save
19675584Sru	 * the directory containing the executable in p_execdir. The
19775584Sru	 * first call to lookuppn() may fail and return EINVAL because
19875584Sru	 * dirvpp is non-NULL. In that case, we make a second call to
19975584Sru	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
20075584Sru	 * but coreadm is allowed to expand %d to the empty string and
20175584Sru	 * there are other cases in which that failure may occur.
20275584Sru	 */
20375584Sru	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
20475584Sru		goto out;
20575584Sru	pn_alloc(&resolvepn);
20675584Sru	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
20775584Sru		pn_free(&resolvepn);
20875584Sru		pn_free(&pn);
20975584Sru		if (error != EINVAL)
21075584Sru			goto out;
21175584Sru
212151497Sru		dir = NULL;
21375584Sru		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
21475584Sru			goto out;
21575584Sru		pn_alloc(&resolvepn);
216151497Sru		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
21775584Sru		    &vp)) != 0) {
21875584Sru			pn_free(&resolvepn);
21975584Sru			pn_free(&pn);
22075584Sru			goto out;
221151497Sru		}
222151497Sru	}
22375584Sru	if (vp == NULL) {
22475584Sru		if (dir != NULL)
22575584Sru			VN_RELE(dir);
22675584Sru		error = ENOENT;
22775584Sru		pn_free(&resolvepn);
22875584Sru		pn_free(&pn);
22975584Sru		goto out;
23075584Sru	}
23175584Sru
23275584Sru	/*
23375584Sru	 * We do not allow executing files in attribute directories.
23475584Sru	 * We test this by determining whether the resolved path
23575584Sru	 * contains a "/" when we're in an attribute directory;
23675584Sru	 * only if the pathname does not contain a "/" the resolved path
23775584Sru	 * points to a file in the current working (attribute) directory.
23875584Sru	 */
23975584Sru	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
24075584Sru	    strchr(resolvepn.pn_path, '/') == NULL) {
24175584Sru		if (dir != NULL)
24275584Sru			VN_RELE(dir);
24375584Sru		error = EACCES;
24475584Sru		pn_free(&resolvepn);
24575584Sru		pn_free(&pn);
24675584Sru		VN_RELE(vp);
24775584Sru		goto out;
24875584Sru	}
24975584Sru
25075584Sru	bzero(exec_file, MAXCOMLEN+1);
251151497Sru	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
252151497Sru	bzero(&args, sizeof (args));
253151497Sru	args.pathname = resolvepn.pn_path;
254151497Sru	/* don't free resolvepn until we are done with args */
255151497Sru	pn_free(&pn);
256151497Sru
257151497Sru	/*
258151497Sru	 * Specific exec handlers, or policies determined via
25975584Sru	 * /etc/system may override the historical default.
26075584Sru	 */
26175584Sru	args.stk_prot = PROT_ZFOD;
26275584Sru	args.dat_prot = PROT_ZFOD;
26375584Sru
26475584Sru	CPU_STATS_ADD_K(sys, sysexec, 1);
26575584Sru	DTRACE_PROC1(exec, char *, args.pathname);
26675584Sru
26775584Sru	ua.fname = fname;
26875584Sru	ua.argp = argp;
26975584Sru	ua.envp = envp;
27075584Sru
27175584Sru	/* If necessary, brand this process before we start the exec. */
27275584Sru	if (brandme != 0)
27375584Sru		brand_setbrand(p);
27475584Sru
27575584Sru	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
27675584Sru	    exec_file, p->p_cred, brand_action)) != 0) {
27775584Sru		if (brandme != 0)
278114402Sru			BROP(p)->b_proc_exit(p, lwp);
279114402Sru		VN_RELE(vp);
280114402Sru		if (dir != NULL)
281114402Sru			VN_RELE(dir);
282114402Sru		pn_free(&resolvepn);
283114402Sru		goto fail;
284114402Sru	}
285114402Sru
286114402Sru	/*
287114402Sru	 * Free floating point registers (sun4u only)
288114402Sru	 */
289114402Sru	ASSERT(lwp != NULL);
29075584Sru	lwp_freeregs(lwp, 1);
291114402Sru
29275584Sru	/*
293114402Sru	 * Free thread and process context ops.
29475584Sru	 */
295114402Sru	if (curthread->t_ctx)
296114402Sru		freectx(curthread, 1);
297114402Sru	if (p->p_pctx)
29875584Sru		freepctx(p, 1);
299114402Sru
300114402Sru	/*
301114402Sru	 * Remember file name for accounting; clear any cached DTrace predicate.
302114402Sru	 */
303114402Sru	up->u_acflag &= ~AFORK;
304114402Sru	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
30575584Sru	curthread->t_predcache = NULL;
306114402Sru
307114402Sru	/*
30875584Sru	 * Clear contract template state
309114402Sru	 */
310114402Sru	lwp_ctmpl_clear(lwp);
311114402Sru
312114402Sru	/*
313114402Sru	 * Save the directory in which we found the executable for expanding
314114402Sru	 * the %d token used in core file patterns.
315114402Sru	 */
316114402Sru	mutex_enter(&p->p_lock);
317114402Sru	tmpvp = p->p_execdir;
318114402Sru	p->p_execdir = dir;
319114402Sru	if (p->p_execdir != NULL)
320114402Sru		VN_HOLD(p->p_execdir);
321114402Sru	mutex_exit(&p->p_lock);
322114402Sru
323114402Sru	if (tmpvp != NULL)
324114402Sru		VN_RELE(tmpvp);
325114402Sru
32675584Sru	/*
327114402Sru	 * Reset stack state to the user stack, clear set of signals
328114402Sru	 * caught on the signal stack, and reset list of signals that
329114402Sru	 * restart system calls; the new program's environment should
330114402Sru	 * not be affected by detritus from the old program.  Any
331114402Sru	 * pending held signals remain held, so don't clear t_hold.
332114402Sru	 */
333114402Sru	mutex_enter(&p->p_lock);
334114402Sru	lwp->lwp_oldcontext = 0;
335114402Sru	lwp->lwp_ustack = 0;
336114402Sru	lwp->lwp_old_stk_ctl = 0;
337114402Sru	sigemptyset(&up->u_signodefer);
338114402Sru	sigemptyset(&up->u_sigonstack);
339114402Sru	sigemptyset(&up->u_sigresethand);
340114402Sru	lwp->lwp_sigaltstack.ss_sp = 0;
341114402Sru	lwp->lwp_sigaltstack.ss_size = 0;
342114402Sru	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
343114402Sru
344114402Sru	/*
345114402Sru	 * Make saved resource limit == current resource limit.
346114402Sru	 */
347114402Sru	for (i = 0; i < RLIM_NLIMITS; i++) {
348114402Sru		/*CONSTCOND*/
349114402Sru		if (RLIM_SAVED(i)) {
350114402Sru			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
351114402Sru			    &up->u_saved_rlimit[i]);
352114402Sru		}
353114402Sru	}
35475584Sru
355114402Sru	/*
35675584Sru	 * If the action was to catch the signal, then the action
357114402Sru	 * must be reset to SIG_DFL.
35875584Sru	 */
359114402Sru	sigdefault(p);
36075584Sru	p->p_flag &= ~(SNOWAIT|SJCTL);
361114402Sru	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
362114402Sru	up->u_signal[SIGCLD - 1] = SIG_DFL;
36375584Sru
364114402Sru	/*
365114402Sru	 * Delete the dot4 sigqueues/signotifies.
366114402Sru	 */
367114402Sru	sigqfree(p);
368114402Sru
369114402Sru	mutex_exit(&p->p_lock);
370114402Sru
371114402Sru	mutex_enter(&p->p_pflock);
372114402Sru	p->p_prof.pr_base = NULL;
373114402Sru	p->p_prof.pr_size = 0;
374114402Sru	p->p_prof.pr_off = 0;
375114402Sru	p->p_prof.pr_scale = 0;
376114402Sru	p->p_prof.pr_samples = 0;
377114402Sru	mutex_exit(&p->p_pflock);
378114402Sru
379114402Sru	ASSERT(curthread->t_schedctl == NULL);
380114402Sru
381114402Sru#if defined(__sparc)
382114402Sru	if (p->p_utraps != NULL)
383114402Sru		utrap_free(p);
384114402Sru#endif	/* __sparc */
385114402Sru
386114402Sru	/*
387114402Sru	 * Close all close-on-exec files.
388114402Sru	 */
389114402Sru	close_exec(P_FINFO(p));
390114402Sru	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
39175584Sru
392114402Sru	/* Unbrand ourself if requested. */
39375584Sru	if (brand_action == EBA_NATIVE)
394114402Sru		BROP(p)->b_proc_exit(p, lwp);
39575584Sru	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));
396114402Sru
39775584Sru	setregs(&args);
398114402Sru
399114402Sru	/* Mark this as an executable vnode */
400114402Sru	mutex_enter(&vp->v_lock);
401114402Sru	vp->v_flag |= VVMEXEC;
402114402Sru	mutex_exit(&vp->v_lock);
403114402Sru
404114402Sru	VN_RELE(vp);
405114402Sru	if (dir != NULL)
406114402Sru		VN_RELE(dir);
407114402Sru	pn_free(&resolvepn);
408114402Sru
409114402Sru	/*
410114402Sru	 * Allocate a new lwp directory and lwpid hash table if necessary.
411114402Sru	 */
412114402Sru	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
413114402Sru		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
414114402Sru		lwpdir->ld_next = lwpdir + 1;
415114402Sru		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
416114402Sru		if (p->p_lwpdir != NULL)
417114402Sru			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
418114402Sru		else
419114402Sru			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
420114402Sru	}
421114402Sru
422114402Sru	if (PROC_IS_BRANDED(p))
423114402Sru		BROP(p)->b_exec();
424114402Sru
425114402Sru	mutex_enter(&p->p_lock);
426114402Sru	prbarrier(p);
427114402Sru
428114402Sru	/*
429114402Sru	 * Reset lwp id to the default value of 1.
430114402Sru	 * This is a single-threaded process now
431114402Sru	 * and lwp #1 is lwp_wait()able by default.
432114402Sru	 * The t_unpark flag should not be inherited.
433114402Sru	 */
434114402Sru	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
435114402Sru	curthread->t_tid = 1;
436114402Sru	curthread->t_unpark = 0;
437114402Sru	curthread->t_proc_flag |= TP_TWAIT;
438114402Sru	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
439114402Sru	p->p_lwpdaemon = 0;			/* but oh well ... */
440114402Sru	p->p_lwpid = 1;
441114402Sru
442114402Sru	/*
443114402Sru	 * Install the newly-allocated lwp directory and lwpid hash table
444114402Sru	 * and insert the current thread into the new hash table.
445114402Sru	 */
446114402Sru	if (lwpdir != NULL) {
447114402Sru		old_lwpdir = p->p_lwpdir;
448114402Sru		old_lwpdir_sz = p->p_lwpdir_sz;
449114402Sru		old_tidhash = p->p_tidhash;
450114402Sru		old_tidhash_sz = p->p_tidhash_sz;
451114402Sru		p->p_lwpdir = p->p_lwpfree = lwpdir;
452114402Sru		p->p_lwpdir_sz = 2;
453114402Sru		p->p_tidhash = tidhash;
454114402Sru		p->p_tidhash_sz = 2;
455114402Sru		lep->le_thread = curthread;
456114402Sru		lep->le_lwpid = curthread->t_tid;
457114402Sru		lep->le_start = curthread->t_start;
458114402Sru		lwp_hash_in(p, lep);
459114402Sru	}
460114402Sru
461114402Sru	/*
462114402Sru	 * Restore the saved signal mask and
463114402Sru	 * inform /proc that the exec() has finished.
464114402Sru	 */
465114402Sru	curthread->t_hold = savedmask;
466114402Sru	prexecend();
467114402Sru	mutex_exit(&p->p_lock);
468114402Sru	if (old_lwpdir) {
469114402Sru		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
470114402Sru		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
471114402Sru	}
472114402Sru
473114402Sru	ASSERT(error == 0);
474114402Sru	DTRACE_PROC(exec__success);
475114402Sru	return (0);
476114402Sru
477114402Srufail:
478114402Sru	DTRACE_PROC1(exec__failure, int, error);
479114402Sruout:		/* error return */
480114402Sru	mutex_enter(&p->p_lock);
481114402Sru	curthread->t_hold = savedmask;
482114402Sru	prexecend();
483114402Sru	mutex_exit(&p->p_lock);
484114402Sru	ASSERT(error != 0);
485114402Sru	return (error);
486114402Sru}
487114402Sru
488114402Sru
489114402Sru/*
490114402Sru * Perform generic exec duties and switchout to object-file specific
491114402Sru * handler.
492114402Sru */
493114402Sruint
494114402Srugexec(
495114402Sru	struct vnode **vpp,
496114402Sru	struct execa *uap,
497114402Sru	struct uarg *args,
498114402Sru	struct intpdata *idatap,
499114402Sru	int level,
500114402Sru	long *execsz,
501114402Sru	caddr_t exec_file,
502114402Sru	struct cred *cred,
503114402Sru	int brand_action)
504114402Sru{
505114402Sru	struct vnode *vp;
506114402Sru	proc_t *pp = ttoproc(curthread);
507114402Sru	struct execsw *eswp;
508114402Sru	int error = 0;
509114402Sru	int suidflags = 0;
510114402Sru	ssize_t resid;
511114402Sru	uid_t uid, gid;
512114402Sru	struct vattr vattr;
513114402Sru	char magbuf[MAGIC_BYTES];
514114402Sru	int setid;
515114402Sru	cred_t *oldcred, *newcred = NULL;
516114402Sru	int privflags = 0;
517114402Sru	int setidfl;
518114402Sru
519114402Sru	/*
520114402Sru	 * If the SNOCD or SUGID flag is set, turn it off and remember the
521114402Sru	 * previous setting so we can restore it if we encounter an error.
522	 */
523	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
524		mutex_enter(&pp->p_lock);
525		suidflags = pp->p_flag & PSUIDFLAGS;
526		pp->p_flag &= ~PSUIDFLAGS;
527		mutex_exit(&pp->p_lock);
528	}
529
530	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
531		goto bad;
532
533	/* need to open vnode for stateful file systems like rfs */
534	if ((error = VOP_OPEN(vpp, FREAD, CRED())) != 0)
535		goto bad;
536	vp = *vpp;
537
538	/*
539	 * Note: to support binary compatibility with SunOS a.out
540	 * executables, we read in the first four bytes, as the
541	 * magic number is in bytes 2-3.
542	 */
543	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
544	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
545		goto bad;
546	if (resid != 0)
547		goto bad;
548
549	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
550		goto bad;
551
552	if (level == 0 &&
553	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
554
555		newcred = cred = crdup(cred);
556
557		/* If we can, drop the PA bit */
558		if ((privflags & PRIV_RESET) != 0)
559			priv_adjust_PA(cred);
560
561		if (privflags & PRIV_SETID) {
562			cred->cr_uid = uid;
563			cred->cr_gid = gid;
564			cred->cr_suid = uid;
565			cred->cr_sgid = gid;
566		}
567
568		if (privflags & MAC_FLAGS) {
569			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
570				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
571			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
572		}
573
574		/*
575		 * Implement the privilege updates:
576		 *
577		 * Restrict with L:
578		 *
579		 *	I' = I & L
580		 *
581		 *	E' = P' = (I' + F) & A
582		 *
583		 * But if running under ptrace, we cap I with P.
584		 */
585		if ((privflags & PRIV_RESET) != 0) {
586			if ((privflags & PRIV_INCREASE) != 0 &&
587			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
588				priv_intersect(&CR_OPPRIV(cred),
589						    &CR_IPRIV(cred));
590			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
591			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
592			priv_adjust_PA(cred);
593		}
594	}
595
596	/* SunOS 4.x buy-back */
597	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
598	    (vattr.va_mode & (VSUID|VSGID))) {
599		cmn_err(CE_NOTE,
600		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
601		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
602	}
603
604	/*
605	 * execsetid() told us whether or not we had to change the
606	 * credentials of the process.  In privflags, it told us
607	 * whether we gained any privileges or executed a set-uid executable.
608	 */
609	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
610
611	/*
612	 * Use /etc/system variable to determine if the stack
613	 * should be marked as executable by default.
614	 */
615	if (noexec_user_stack)
616		args->stk_prot &= ~PROT_EXEC;
617
618	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
619
620	/*
621	 * Traditionally, the setid flags told the sub processes whether
622	 * the file just executed was set-uid or set-gid; this caused
623	 * some confusion as the 'setid' flag did not match the SUGID
624	 * process flag which is only set when the uids/gids do not match.
625	 * A script set-gid/set-uid to the real uid/gid would start with
626	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
627	 * Now we flag those cases where the calling process cannot
628	 * be trusted to influence the newly exec'ed process, either
629	 * because it runs with more privileges or when the uids/gids
630	 * do in fact not match.
631	 * This also makes the runtime linker agree with the on exec
632	 * values of SNOCD and SUGID.
633	 */
634	setidfl = 0;
635	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
636	    !supgroupmember(cred->cr_gid, cred))) {
637		setidfl |= EXECSETID_UGIDS;
638	}
639	if (setid & PRIV_SETUGID)
640		setidfl |= EXECSETID_SETID;
641	if (setid & PRIV_INCREASE)
642		setidfl |= EXECSETID_PRIVS;
643
644	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
645		setidfl, exec_file, cred, brand_action);
646	rw_exit(eswp->exec_lock);
647	if (error != 0) {
648		if (newcred != NULL)
649			crfree(newcred);
650		goto bad;
651	}
652
653	if (level == 0) {
654		mutex_enter(&pp->p_crlock);
655		if (newcred != NULL) {
656			/*
657			 * Free the old credentials, and set the new ones.
658			 * Do this for both the process and the (single) thread.
659			 */
660			crfree(pp->p_cred);
661			pp->p_cred = cred;	/* cred already held for proc */
662			crhold(cred);		/* hold new cred for thread */
663			/*
664			 * DTrace accesses t_cred in probe context.  t_cred
665			 * must always be either NULL, or point to a valid,
666			 * allocated cred structure.
667			 */
668			oldcred = curthread->t_cred;
669			curthread->t_cred = cred;
670			crfree(oldcred);
671		}
672		/*
673		 * On emerging from a successful exec(), the saved
674		 * uid and gid equal the effective uid and gid.
675		 */
676		cred->cr_suid = cred->cr_uid;
677		cred->cr_sgid = cred->cr_gid;
678
679		/*
680		 * If the real and effective ids do not match, this
681		 * is a setuid process that should not dump core.
682		 * The group comparison is tricky; we prevent the code
683		 * from flagging SNOCD when executing with an effective gid
684		 * which is a supplementary group.
685		 */
686		if (cred->cr_ruid != cred->cr_uid ||
687		    (cred->cr_rgid != cred->cr_gid &&
688		    !supgroupmember(cred->cr_gid, cred)) ||
689		    (privflags & PRIV_INCREASE) != 0)
690			suidflags = PSUIDFLAGS;
691		else
692			suidflags = 0;
693
694		mutex_exit(&pp->p_crlock);
695		if (suidflags) {
696			mutex_enter(&pp->p_lock);
697			pp->p_flag |= suidflags;
698			mutex_exit(&pp->p_lock);
699		}
700		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
701			/*
702			 * If process is traced via /proc, arrange to
703			 * invalidate the associated /proc vnode.
704			 */
705			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
706				args->traceinval = 1;
707		}
708		if (pp->p_proc_flag & P_PR_PTRACE)
709			psignal(pp, SIGTRAP);
710		if (args->traceinval)
711			prinvalidate(&pp->p_user);
712	}
713
714	return (0);
715bad:
716	if (error == 0)
717		error = ENOEXEC;
718
719	if (suidflags) {
720		mutex_enter(&pp->p_lock);
721		pp->p_flag |= suidflags;
722		mutex_exit(&pp->p_lock);
723	}
724	return (error);
725}
726
727extern char *execswnames[];
728
729struct execsw *
730allocate_execsw(char *name, char *magic, size_t magic_size)
731{
732	int i, j;
733	char *ename;
734	char *magicp;
735
736	mutex_enter(&execsw_lock);
737	for (i = 0; i < nexectype; i++) {
738		if (execswnames[i] == NULL) {
739			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
740			(void) strcpy(ename, name);
741			execswnames[i] = ename;
742			/*
743			 * Set the magic number last so that we
744			 * don't need to hold the execsw_lock in
745			 * findexectype().
746			 */
747			magicp = kmem_alloc(magic_size, KM_SLEEP);
748			for (j = 0; j < magic_size; j++)
749				magicp[j] = magic[j];
750			execsw[i].exec_magic = magicp;
751			mutex_exit(&execsw_lock);
752			return (&execsw[i]);
753		}
754	}
755	mutex_exit(&execsw_lock);
756	return (NULL);
757}
758
759/*
760 * Find the exec switch table entry with the corresponding magic string.
761 */
762struct execsw *
763findexecsw(char *magic)
764{
765	struct execsw *eswp;
766
767	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
768		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
769		if (magic && eswp->exec_maglen != 0 &&
770		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
771			return (eswp);
772	}
773	return (NULL);
774}
775
776/*
777 * Find the execsw[] index for the given exec header string by looking for the
778 * magic string at a specified offset and length for each kind of executable
779 * file format until one matches.  If no execsw[] entry is found, try to
780 * autoload a module for this magic string.
781 */
782struct execsw *
783findexec_by_hdr(char *header)
784{
785	struct execsw *eswp;
786
787	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
788		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
789		if (header && eswp->exec_maglen != 0 &&
790		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
791			    eswp->exec_maglen) == 0) {
792			if (hold_execsw(eswp) != 0)
793				return (NULL);
794			return (eswp);
795		}
796	}
797	return (NULL);	/* couldn't find the type */
798}
799
800/*
801 * Find the execsw[] index for the given magic string.  If no execsw[] entry
802 * is found, try to autoload a module for this magic string.
803 */
804struct execsw *
805findexec_by_magic(char *magic)
806{
807	struct execsw *eswp;
808
809	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
810		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
811		if (magic && eswp->exec_maglen != 0 &&
812		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
813			if (hold_execsw(eswp) != 0)
814				return (NULL);
815			return (eswp);
816		}
817	}
818	return (NULL);	/* couldn't find the type */
819}
820
821static int
822hold_execsw(struct execsw *eswp)
823{
824	char *name;
825
826	rw_enter(eswp->exec_lock, RW_READER);
827	while (!LOADED_EXEC(eswp)) {
828		rw_exit(eswp->exec_lock);
829		name = execswnames[eswp-execsw];
830		ASSERT(name);
831		if (modload("exec", name) == -1)
832			return (-1);
833		rw_enter(eswp->exec_lock, RW_READER);
834	}
835	return (0);
836}
837
838static int
839execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
840{
841	proc_t *pp = ttoproc(curthread);
842	uid_t uid, gid;
843	cred_t *cr = pp->p_cred;
844	int privflags = 0;
845
846	/*
847	 * Remember credentials.
848	 */
849	uid = cr->cr_uid;
850	gid = cr->cr_gid;
851
852	/* Will try to reset the PRIV_AWARE bit later. */
853	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
854		privflags |= PRIV_RESET;
855
856	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
857		/*
858		 * Set-uid root execution only allowed if the limit set
859		 * holds all unsafe privileges.
860		 */
861		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
862		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
863			uid = vattrp->va_uid;
864			privflags |= PRIV_SETUGID;
865		}
866		if (vattrp->va_mode & VSGID) {
867			gid = vattrp->va_gid;
868			privflags |= PRIV_SETUGID;
869		}
870	}
871
872	/*
873	 * Do we need to change our credential anyway?
874	 * This is the case when E != I or P != I, as
875	 * we need to do the assignments (with F empty and A full)
876	 * Or when I is not a subset of L; in that case we need to
877	 * enforce L.
878	 *
879	 *		I' = L & I
880	 *
881	 *		E' = P' = (I' + F) & A
882	 * or
883	 *		E' = P' = I'
884	 */
885	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
886	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
887	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
888		privflags |= PRIV_RESET;
889
890	/* If MAC-aware flag(s) are on, need to update cred to remove. */
891	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
892	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
893		privflags |= MAC_FLAGS;
894
895	/*
896	 * When we introduce the "forced" set then we will need
897	 * to set PRIV_INCREASE here if I not a subset of P.
898	 * If the "allowed" set is introduced we will need to do
899	 * a similar thing; however, it seems more reasonable to
900	 * have the allowed set reduce "L": script language interpreters
901	 * would typically have an allowed set of "all".
902	 */
903
904	/*
905	 * Set setuid/setgid protections if no ptrace() compatibility.
906	 * For privileged processes, honor setuid/setgid even in
907	 * the presence of ptrace() compatibility.
908	 */
909	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
910	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
911	    (cr->cr_uid != uid ||
912	    cr->cr_gid != gid ||
913	    cr->cr_suid != uid ||
914	    cr->cr_sgid != gid)) {
915		*uidp = uid;
916		*gidp = gid;
917		privflags |= PRIV_SETID;
918	}
919	return (privflags);
920}
921
922int
923execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
924{
925	int error;
926	proc_t *p = ttoproc(curthread);
927
928	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
929	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred))
930		return (error);
931	/*
932	 * Check the access mode.
933	 * If VPROC, ask /proc if the file is an object file.
934	 */
935	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred)) != 0 ||
936	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
937	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
938	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
939		if (error == 0)
940			error = EACCES;
941		return (error);
942	}
943
944	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
945	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred))) {
946		/*
947		 * If process is under ptrace(2) compatibility,
948		 * fail the exec(2).
949		 */
950		if (p->p_proc_flag & P_PR_PTRACE)
951			goto bad;
952		/*
953		 * Process is traced via /proc.
954		 * Arrange to invalidate the /proc vnode.
955		 */
956		args->traceinval = 1;
957	}
958	return (0);
959bad:
960	if (error == 0)
961		error = ENOEXEC;
962	return (error);
963}
964
965/*
966 * Map a section of an executable file into the user's
967 * address space.
968 */
969int
970execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
971    off_t offset, int prot, int page, uint_t szc)
972{
973	int error = 0;
974	off_t oldoffset;
975	caddr_t zfodbase, oldaddr;
976	size_t end, oldlen;
977	size_t zfoddiff;
978	label_t ljb;
979	proc_t *p = ttoproc(curthread);
980
981	oldaddr = addr;
982	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
983	if (len) {
984		oldlen = len;
985		len += ((size_t)oldaddr - (size_t)addr);
986		oldoffset = offset;
987		offset = (off_t)((uintptr_t)offset & PAGEMASK);
988		if (page) {
989			spgcnt_t  prefltmem, availm, npages;
990			int preread;
991			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
992
993			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
994				mflag |= MAP_TEXT;
995			} else {
996				mflag |= MAP_INITDATA;
997			}
998
999			if (valid_usr_range(addr, len, prot, p->p_as,
1000			    p->p_as->a_userlimit) != RANGE_OKAY) {
1001				error = ENOMEM;
1002				goto bad;
1003			}
1004			if (error = VOP_MAP(vp, (offset_t)offset,
1005			    p->p_as, &addr, len, prot, PROT_ALL,
1006			    mflag, CRED()))
1007				goto bad;
1008
1009			/*
1010			 * If the segment can fit, then we prefault
1011			 * the entire segment in.  This is based on the
1012			 * model that says the best working set of a
1013			 * small program is all of its pages.
1014			 */
1015			npages = (spgcnt_t)btopr(len);
1016			prefltmem = freemem - desfree;
1017			preread =
1018			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1019
1020			/*
1021			 * If we aren't prefaulting the segment,
1022			 * increment "deficit", if necessary to ensure
1023			 * that pages will become available when this
1024			 * process starts executing.
1025			 */
1026			availm = freemem - lotsfree;
1027			if (preread == 0 && npages > availm &&
1028			    deficit < lotsfree) {
1029				deficit += MIN((pgcnt_t)(npages - availm),
1030				    lotsfree - deficit);
1031			}
1032
1033			if (preread) {
1034				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1035				    "execmap preread:freemem %d size %lu",
1036				    freemem, len);
1037				(void) as_fault(p->p_as->a_hat, p->p_as,
1038				    (caddr_t)addr, len, F_INVAL, S_READ);
1039			}
1040		} else {
1041			if (valid_usr_range(addr, len, prot, p->p_as,
1042			    p->p_as->a_userlimit) != RANGE_OKAY) {
1043				error = ENOMEM;
1044				goto bad;
1045			}
1046
1047			if (error = as_map(p->p_as, addr, len,
1048			    segvn_create, zfod_argsp))
1049				goto bad;
1050			/*
1051			 * Read in the segment in one big chunk.
1052			 */
1053			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1054			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1055			    (rlim64_t)0, CRED(), (ssize_t *)0))
1056				goto bad;
1057			/*
1058			 * Now set protections.
1059			 */
1060			if (prot != PROT_ZFOD) {
1061				(void) as_setprot(p->p_as, (caddr_t)addr,
1062				    len, prot);
1063			}
1064		}
1065	}
1066
1067	if (zfodlen) {
1068		struct as *as = curproc->p_as;
1069		struct seg *seg;
1070		uint_t zprot = 0;
1071
1072		end = (size_t)addr + len;
1073		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1074		zfoddiff = (uintptr_t)zfodbase - end;
1075		if (zfoddiff) {
1076			/*
1077			 * Before we go to zero the remaining space on the last
1078			 * page, make sure we have write permission.
1079			 */
1080
1081			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1082			seg = as_segat(curproc->p_as, (caddr_t)end);
1083			if (seg != NULL)
1084				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1085				    &zprot);
1086			AS_LOCK_EXIT(as, &as->a_lock);
1087
1088			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1089				(void) as_setprot(as, (caddr_t)end,
1090				    zfoddiff - 1, zprot | PROT_WRITE);
1091			}
1092
1093			if (on_fault(&ljb)) {
1094				no_fault();
1095				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1096					(void) as_setprot(as, (caddr_t)end,
1097					zfoddiff - 1, zprot);
1098				error = EFAULT;
1099				goto bad;
1100			}
1101			uzero((void *)end, zfoddiff);
1102			no_fault();
1103			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1104				(void) as_setprot(as, (caddr_t)end,
1105				    zfoddiff - 1, zprot);
1106		}
1107		if (zfodlen > zfoddiff) {
1108			struct segvn_crargs crargs =
1109			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1110
1111			zfodlen -= zfoddiff;
1112			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1113			    p->p_as->a_userlimit) != RANGE_OKAY) {
1114				error = ENOMEM;
1115				goto bad;
1116			}
1117			crargs.szc = szc;
1118			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1119			    zfodlen, segvn_create, &crargs))
1120				goto bad;
1121			if (prot != PROT_ZFOD) {
1122				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1123				    zfodlen, prot);
1124			}
1125		}
1126	}
1127	return (0);
1128bad:
1129	return (error);
1130}
1131
1132void
1133setexecenv(struct execenv *ep)
1134{
1135	proc_t *p = ttoproc(curthread);
1136	klwp_t *lwp = ttolwp(curthread);
1137	struct vnode *vp;
1138
1139	p->p_bssbase = ep->ex_bssbase;
1140	p->p_brkbase = ep->ex_brkbase;
1141	p->p_brksize = ep->ex_brksize;
1142	if (p->p_exec)
1143		VN_RELE(p->p_exec);	/* out with the old */
1144	vp = p->p_exec = ep->ex_vp;
1145	if (vp != NULL)
1146		VN_HOLD(vp);		/* in with the new */
1147
1148	lwp->lwp_sigaltstack.ss_sp = 0;
1149	lwp->lwp_sigaltstack.ss_size = 0;
1150	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1151}
1152
1153int
1154execopen(struct vnode **vpp, int *fdp)
1155{
1156	struct vnode *vp = *vpp;
1157	file_t *fp;
1158	int error = 0;
1159	int filemode = FREAD;
1160
1161	VN_HOLD(vp);		/* open reference */
1162	if (error = falloc(NULL, filemode, &fp, fdp)) {
1163		VN_RELE(vp);
1164		*fdp = -1;	/* just in case falloc changed value */
1165		return (error);
1166	}
1167	if (error = VOP_OPEN(&vp, filemode, CRED())) {
1168		VN_RELE(vp);
1169		setf(*fdp, NULL);
1170		unfalloc(fp);
1171		*fdp = -1;
1172		return (error);
1173	}
1174	*vpp = vp;		/* vnode should not have changed */
1175	fp->f_vnode = vp;
1176	mutex_exit(&fp->f_tlock);
1177	setf(*fdp, fp);
1178	return (0);
1179}
1180
1181int
1182execclose(int fd)
1183{
1184	return (closeandsetf(fd, NULL));
1185}
1186
1187
1188/*
1189 * noexec stub function.
1190 */
1191/*ARGSUSED*/
1192int
1193noexec(
1194    struct vnode *vp,
1195    struct execa *uap,
1196    struct uarg *args,
1197    struct intpdata *idatap,
1198    int level,
1199    long *execsz,
1200    int setid,
1201    caddr_t exec_file,
1202    struct cred *cred)
1203{
1204	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1205	return (ENOEXEC);
1206}
1207
1208/*
1209 * Support routines for building a user stack.
1210 *
1211 * execve(path, argv, envp) must construct a new stack with the specified
1212 * arguments and environment variables (see exec_args() for a description
1213 * of the user stack layout).  To do this, we copy the arguments and
1214 * environment variables from the old user address space into the kernel,
1215 * free the old as, create the new as, and copy our buffered information
1216 * to the new stack.  Our kernel buffer has the following structure:
1217 *
1218 *	+-----------------------+ <--- stk_base + stk_size
1219 *	| string offsets	|
1220 *	+-----------------------+ <--- stk_offp
1221 *	|			|
1222 *	| STK_AVAIL() space	|
1223 *	|			|
1224 *	+-----------------------+ <--- stk_strp
1225 *	| strings		|
1226 *	+-----------------------+ <--- stk_base
1227 *
1228 * When we add a string, we store the string's contents (including the null
1229 * terminator) at stk_strp, and we store the offset of the string relative to
 * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1231 * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1232 * the difference between these pointers.  If we run out of space, we return
1233 * an error and exec_args() starts all over again with a buffer twice as large.
1234 * When we're all done, the kernel buffer looks like this:
1235 *
1236 *	+-----------------------+ <--- stk_base + stk_size
1237 *	| argv[0] offset	|
1238 *	+-----------------------+
1239 *	| ...			|
1240 *	+-----------------------+
1241 *	| argv[argc-1] offset	|
1242 *	+-----------------------+
1243 *	| envp[0] offset	|
1244 *	+-----------------------+
1245 *	| ...			|
1246 *	+-----------------------+
1247 *	| envp[envc-1] offset	|
1248 *	+-----------------------+
1249 *	| AT_SUN_PLATFORM offset|
1250 *	+-----------------------+
1251 *	| AT_SUN_EXECNAME offset|
1252 *	+-----------------------+ <--- stk_offp
1253 *	|			|
1254 *	| STK_AVAIL() space	|
1255 *	|			|
 *	+-----------------------+ <--- stk_strp
 *	| AT_SUN_EXECNAME string|
 *	+-----------------------+
 *	| AT_SUN_PLATFORM string|
1260 *	+-----------------------+
1261 *	| envp[envc-1] string	|
1262 *	+-----------------------+
1263 *	| ...			|
1264 *	+-----------------------+
1265 *	| envp[0] string	|
1266 *	+-----------------------+
1267 *	| argv[argc-1] string	|
1268 *	+-----------------------+
1269 *	| ...			|
1270 *	+-----------------------+
1271 *	| argv[0] string	|
1272 *	+-----------------------+ <--- stk_base
1273 */
1274
/*
 * Bytes still unused in the kernel staging buffer: the distance from the
 * end of the strings (stk_strp) up to the lowest offset slot (stk_offp).
 */
#define	STK_AVAIL(args)	\
	((char *)(args)->stk_offp - (args)->stk_strp)
1276
1277/*
1278 * Add a string to the stack.
1279 */
1280static int
1281stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1282{
1283	int error;
1284	size_t len;
1285
1286	if (STK_AVAIL(args) < sizeof (int))
1287		return (E2BIG);
1288	*--args->stk_offp = args->stk_strp - args->stk_base;
1289
1290	if (segflg == UIO_USERSPACE) {
1291		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1292		if (error != 0)
1293			return (error);
1294	} else {
1295		len = strlen(sp) + 1;
1296		if (len > STK_AVAIL(args))
1297			return (E2BIG);
1298		bcopy(sp, args->stk_strp, len);
1299	}
1300
1301	args->stk_strp += len;
1302
1303	return (0);
1304}
1305
1306static int
1307stk_getptr(uarg_t *args, char *src, char **dst)
1308{
1309	int error;
1310
1311	if (args->from_model == DATAMODEL_NATIVE) {
1312		ulong_t ptr;
1313		error = fulword(src, &ptr);
1314		*dst = (caddr_t)ptr;
1315	} else {
1316		uint32_t ptr;
1317		error = fuword32(src, &ptr);
1318		*dst = (caddr_t)(uintptr_t)ptr;
1319	}
1320	return (error);
1321}
1322
1323static int
1324stk_putptr(uarg_t *args, char *addr, char *value)
1325{
1326	if (args->to_model == DATAMODEL_NATIVE)
1327		return (sulword(addr, (ulong_t)value));
1328	else
1329		return (suword32(addr, (uint32_t)(uintptr_t)value));
1330}
1331
1332static int
1333stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1334{
1335	char *sp;
1336	int argc, error;
1337	int argv_empty = 0;
1338	size_t ptrsize = args->from_ptrsize;
1339	size_t size, pad;
1340	char *argv = (char *)uap->argp;
1341	char *envp = (char *)uap->envp;
1342
1343	/*
1344	 * Copy interpreter's name and argument to argv[0] and argv[1].
1345	 */
1346	if (intp != NULL && intp->intp_name != NULL) {
1347		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1348			return (error);
1349		if (intp->intp_arg != NULL &&
1350		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1351			return (error);
1352		if (args->fname != NULL)
1353			error = stk_add(args, args->fname, UIO_SYSSPACE);
1354		else
1355			error = stk_add(args, uap->fname, UIO_USERSPACE);
1356		if (error)
1357			return (error);
1358
1359		/*
1360		 * Check for an empty argv[].
1361		 */
1362		if (stk_getptr(args, argv, &sp))
1363			return (EFAULT);
1364		if (sp == NULL)
1365			argv_empty = 1;
1366
1367		argv += ptrsize;		/* ignore original argv[0] */
1368	}
1369
1370	if (argv_empty == 0) {
1371		/*
1372		 * Add argv[] strings to the stack.
1373		 */
1374		for (;;) {
1375			if (stk_getptr(args, argv, &sp))
1376				return (EFAULT);
1377			if (sp == NULL)
1378				break;
1379			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1380				return (error);
1381			argv += ptrsize;
1382		}
1383	}
1384	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1385	args->arglen = args->stk_strp - args->stk_base;
1386
1387	/*
1388	 * Add environ[] strings to the stack.
1389	 */
1390	if (envp != NULL) {
1391		for (;;) {
1392			if (stk_getptr(args, envp, &sp))
1393				return (EFAULT);
1394			if (sp == NULL)
1395				break;
1396			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1397				return (error);
1398			envp += ptrsize;
1399		}
1400	}
1401	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1402	args->ne = args->na - argc;
1403
1404	/*
1405	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1406	 * AT_SUN_EMULATOR strings to the stack.
1407	 */
1408	if (auxvpp != NULL && *auxvpp != NULL) {
1409		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1410			return (error);
1411		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1412			return (error);
1413		if (args->brandname != NULL &&
1414		    (error = stk_add(args, args->brandname,
1415			UIO_SYSSPACE)) != 0)
1416			return (error);
1417		if (args->emulator != NULL &&
1418		    (error = stk_add(args, args->emulator,
1419			UIO_SYSSPACE)) != 0)
1420			return (error);
1421	}
1422
1423	/*
1424	 * Compute the size of the stack.  This includes all the pointers,
1425	 * the space reserved for the aux vector, and all the strings.
1426	 * The total number of pointers is args->na (which is argc + envc)
1427	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1428	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1429	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1430	 * all the strings, at the very top of the stack.
1431	 */
1432	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1433	    (args->stk_strp - args->stk_base);
1434
1435	/*
1436	 * Pad the string section with zeroes to align the stack size.
1437	 */
1438	pad = P2NPHASE(size, args->stk_align);
1439
1440	if (STK_AVAIL(args) < pad)
1441		return (E2BIG);
1442
1443	args->usrstack_size = size + pad;
1444
1445	while (pad-- != 0)
1446		*args->stk_strp++ = 0;
1447
1448	args->nc = args->stk_strp - args->stk_base;
1449
1450	return (0);
1451}
1452
1453static int
1454stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1455{
1456	size_t ptrsize = args->to_ptrsize;
1457	ssize_t pslen;
1458	char *kstrp = args->stk_base;
1459	char *ustrp = usrstack - args->nc - ptrsize;
1460	char *usp = usrstack - args->usrstack_size;
1461	int *offp = (int *)(args->stk_base + args->stk_size);
1462	int envc = args->ne;
1463	int argc = args->na - envc;
1464	int i;
1465
1466	/*
1467	 * Record argc for /proc.
1468	 */
1469	up->u_argc = argc;
1470
1471	/*
1472	 * Put argc on the stack.  Note that even though it's an int,
1473	 * it always consumes ptrsize bytes (for alignment).
1474	 */
1475	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1476		return (-1);
1477
1478	/*
1479	 * Add argc space (ptrsize) to usp and record argv for /proc.
1480	 */
1481	up->u_argv = (uintptr_t)(usp += ptrsize);
1482
1483	/*
1484	 * Put the argv[] pointers on the stack.
1485	 */
1486	for (i = 0; i < argc; i++, usp += ptrsize)
1487		if (stk_putptr(args, usp, &ustrp[*--offp]))
1488			return (-1);
1489
1490	/*
1491	 * Copy arguments to u_psargs.
1492	 */
1493	pslen = MIN(args->arglen, PSARGSZ) - 1;
1494	for (i = 0; i < pslen; i++)
1495		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1496	while (i < PSARGSZ)
1497		up->u_psargs[i++] = '\0';
1498
1499	/*
1500	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1501	 * record envp for /proc.
1502	 */
1503	up->u_envp = (uintptr_t)(usp += ptrsize);
1504
1505	/*
1506	 * Put the envp[] pointers on the stack.
1507	 */
1508	for (i = 0; i < envc; i++, usp += ptrsize)
1509		if (stk_putptr(args, usp, &ustrp[*--offp]))
1510			return (-1);
1511
1512	/*
1513	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1514	 * remember where the stack ends, which is also where auxv begins.
1515	 */
1516	args->stackend = usp += ptrsize;
1517
1518	/*
1519	 * Put all the argv[], envp[], and auxv strings on the stack.
1520	 */
1521	if (copyout(args->stk_base, ustrp, args->nc))
1522		return (-1);
1523
1524	/*
1525	 * Fill in the aux vector now that we know the user stack addresses
1526	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1527	 * AT_SUN_EMULATOR strings.
1528	 */
1529	if (auxvpp != NULL && *auxvpp != NULL) {
1530		if (args->to_model == DATAMODEL_NATIVE) {
1531			auxv_t **a = (auxv_t **)auxvpp;
1532			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1533			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1534			if (args->brandname != NULL)
1535				ADDAUX(*a,
1536				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1537			if (args->emulator != NULL)
1538				ADDAUX(*a,
1539				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1540		} else {
1541			auxv32_t **a = (auxv32_t **)auxvpp;
1542			ADDAUX(*a,
1543			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1544			ADDAUX(*a,
1545			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1546			if (args->brandname != NULL)
1547				ADDAUX(*a, AT_SUN_BRANDNAME,
1548				    (int)(uintptr_t)&ustrp[*--offp])
1549			if (args->emulator != NULL)
1550				ADDAUX(*a, AT_SUN_EMULATOR,
1551				    (int)(uintptr_t)&ustrp[*--offp])
1552		}
1553	}
1554
1555	return (0);
1556}
1557
#ifdef DEBUG
/*
 * DEBUG-only overrides consulted by exec_args() (when exec_lpg_disable is
 * zero) after it has chosen the heap (brk) and stack page size codes.
 * A value of 0 leaves the chosen size code alone, -1 replaces it with one
 * derived from gethrtime(), and any other value is used modulo the number
 * of page sizes.
 */
int mpss_brkpgszsel = 0;
int mpss_stkpgszsel = 0;
#endif
1562
1563/*
1564 * Initialize a new user stack with the specified arguments and environment.
1565 * The initial user stack layout is as follows:
1566 *
1567 *	User Stack
1568 *	+---------------+ <--- curproc->p_usrstack
1569 *	| NULL		|
1570 *	+---------------+
1571 *	|		|
1572 *	| auxv strings	|
1573 *	|		|
1574 *	+---------------+
1575 *	|		|
1576 *	| envp strings	|
1577 *	|		|
1578 *	+---------------+
1579 *	|		|
1580 *	| argv strings	|
1581 *	|		|
1582 *	+---------------+ <--- ustrp
1583 *	|		|
1584 *	| aux vector	|
1585 *	|		|
1586 *	+---------------+ <--- auxv
1587 *	| NULL		|
1588 *	+---------------+
1589 *	| envp[envc-1]	|
1590 *	+---------------+
1591 *	| ...		|
1592 *	+---------------+
1593 *	| envp[0]	|
1594 *	+---------------+ <--- envp[]
1595 *	| NULL		|
1596 *	+---------------+
1597 *	| argv[argc-1]	|
1598 *	+---------------+
1599 *	| ...		|
1600 *	+---------------+
1601 *	| argv[0]	|
1602 *	+---------------+ <--- argv[]
1603 *	| argc		|
1604 *	+---------------+ <--- stack base
1605 */
1606int
1607exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1608{
1609	size_t size;
1610	int error;
1611	proc_t *p = ttoproc(curthread);
1612	user_t *up = PTOU(p);
1613	char *usrstack;
1614	rctl_entity_p_t e;
1615
1616	struct as *as;
1617
1618	args->from_model = p->p_model;
1619	if (p->p_model == DATAMODEL_NATIVE) {
1620		args->from_ptrsize = sizeof (long);
1621	} else {
1622		args->from_ptrsize = sizeof (int32_t);
1623	}
1624
1625	if (args->to_model == DATAMODEL_NATIVE) {
1626		args->to_ptrsize = sizeof (long);
1627		args->ncargs = NCARGS;
1628		args->stk_align = STACK_ALIGN;
1629		usrstack = (char *)USRSTACK;
1630	} else {
1631		args->to_ptrsize = sizeof (int32_t);
1632		args->ncargs = NCARGS32;
1633		args->stk_align = STACK_ALIGN32;
1634		usrstack = (char *)USRSTACK32;
1635	}
1636
1637	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1638
1639#if defined(__sparc)
1640	/*
1641	 * Make sure user register windows are empty before
1642	 * attempting to make a new stack.
1643	 */
1644	(void) flush_user_windows_to_stack(NULL);
1645#endif
1646
1647	for (size = PAGESIZE; ; size *= 2) {
1648		args->stk_size = size;
1649		args->stk_base = kmem_alloc(size, KM_SLEEP);
1650		args->stk_strp = args->stk_base;
1651		args->stk_offp = (int *)(args->stk_base + size);
1652		error = stk_copyin(uap, args, intp, auxvpp);
1653		if (error == 0)
1654			break;
1655		kmem_free(args->stk_base, size);
1656		if (error != E2BIG && error != ENAMETOOLONG)
1657			return (error);
1658		if (size >= args->ncargs)
1659			return (E2BIG);
1660	}
1661
1662	size = args->usrstack_size;
1663
1664	ASSERT(error == 0);
1665	ASSERT(P2PHASE(size, args->stk_align) == 0);
1666	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1667
1668	if (size > args->ncargs) {
1669		kmem_free(args->stk_base, args->stk_size);
1670		return (E2BIG);
1671	}
1672
1673	/*
1674	 * Leave only the current lwp and force the other lwps to exit.
1675	 * If another lwp beat us to the punch by calling exit(), bail out.
1676	 */
1677	if ((error = exitlwps(0)) != 0) {
1678		kmem_free(args->stk_base, args->stk_size);
1679		return (error);
1680	}
1681
1682	/*
1683	 * Revoke any doors created by the process.
1684	 */
1685	if (p->p_door_list)
1686		door_exit();
1687
1688	/*
1689	 * Release schedctl data structures.
1690	 */
1691	if (p->p_pagep)
1692		schedctl_proc_cleanup();
1693
1694	/*
1695	 * Clean up any DTrace helpers for the process.
1696	 */
1697	if (p->p_dtrace_helpers != NULL) {
1698		ASSERT(dtrace_helpers_cleanup != NULL);
1699		(*dtrace_helpers_cleanup)();
1700	}
1701
1702	mutex_enter(&p->p_lock);
1703	/*
1704	 * Cleanup the DTrace provider associated with this process.
1705	 */
1706	if (p->p_dtrace_probes) {
1707		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1708		dtrace_fasttrap_exec_ptr(p);
1709	}
1710	mutex_exit(&p->p_lock);
1711
1712	/*
1713	 * discard the lwpchan cache.
1714	 */
1715	if (p->p_lcp != NULL)
1716		lwpchan_destroy_cache(1);
1717
1718	/*
1719	 * Delete the POSIX timers.
1720	 */
1721	if (p->p_itimer != NULL)
1722		timer_exit();
1723
1724#ifdef C2_AUDIT
1725	if (audit_active)
1726		audit_exec(args->stk_base, args->stk_base + args->arglen,
1727		    args->na - args->ne, args->ne);
1728#endif
1729
1730	/*
1731	 * Ensure that we don't change resource associations while we
1732	 * change address spaces.
1733	 */
1734	mutex_enter(&p->p_lock);
1735	pool_barrier_enter();
1736	mutex_exit(&p->p_lock);
1737
1738	/*
1739	 * Destroy the old address space and create a new one.
1740	 * From here on, any errors are fatal to the exec()ing process.
1741	 * On error we return -1, which means the caller must SIGKILL
1742	 * the process.
1743	 */
1744	relvm();
1745
1746	mutex_enter(&p->p_lock);
1747	pool_barrier_exit();
1748	mutex_exit(&p->p_lock);
1749
1750	up->u_execsw = args->execswp;
1751
1752	p->p_brkbase = NULL;
1753	p->p_brksize = 0;
1754	p->p_stksize = 0;
1755	p->p_model = args->to_model;
1756	p->p_usrstack = usrstack;
1757	p->p_stkprot = args->stk_prot;
1758	p->p_datprot = args->dat_prot;
1759
1760	/*
1761	 * Reset resource controls such that all controls are again active as
1762	 * well as appropriate to the potentially new address model for the
1763	 * process.
1764	 */
1765	e.rcep_p.proc = p;
1766	e.rcep_t = RCENTITY_PROCESS;
1767	rctl_set_reset(p->p_rctls, p, &e);
1768
1769	if (exec_lpg_disable == 0) {
1770#ifdef DEBUG
1771		uint_t pgsizes = page_num_pagesizes();
1772		uint_t szc;
1773#endif
1774		p->p_brkpageszc = args->brkpageszc;
1775		p->p_stkpageszc = args->stkpageszc;
1776
1777		if (p->p_brkpageszc == 0) {
1778			p->p_brkpageszc = page_szc(map_pgsz(MAPPGSZ_HEAP,
1779			    p, 0, 0, NULL));
1780		}
1781		if (p->p_stkpageszc == 0) {
1782			p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK,
1783			    p, 0, 0, NULL));
1784		}
1785
1786#ifdef DEBUG
1787		if (mpss_brkpgszsel != 0) {
1788			if (mpss_brkpgszsel == -1) {
1789				szc = ((uint_t)gethrtime() >> 8) % pgsizes;
1790			} else {
1791				szc = mpss_brkpgszsel % pgsizes;
1792			}
1793			p->p_brkpageszc = szc;
1794		}
1795
1796		if (mpss_stkpgszsel != 0) {
1797			if (mpss_stkpgszsel == -1) {
1798				szc = ((uint_t)gethrtime() >> 7) % pgsizes;
1799			} else {
1800				szc = mpss_stkpgszsel % pgsizes;
1801			}
1802			p->p_stkpageszc = szc;
1803		}
1804
1805#endif
1806		mutex_enter(&p->p_lock);
1807		p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1808		mutex_exit(&p->p_lock);
1809
1810	} else {
1811		p->p_brkpageszc = 0;
1812		p->p_stkpageszc = 0;
1813	}
1814
1815	exec_set_sp(size);
1816
1817	as = as_alloc();
1818	p->p_as = as;
1819	if (p->p_model == DATAMODEL_ILP32)
1820		as->a_userlimit = (caddr_t)USERLIMIT32;
1821	(void) hat_setup(as->a_hat, HAT_ALLOC);
1822
1823	/*
1824	 * Finally, write out the contents of the new stack.
1825	 */
1826	error = stk_copyout(args, usrstack, auxvpp, up);
1827	kmem_free(args->stk_base, args->stk_size);
1828	return (error);
1829}
1830