1/*	$NetBSD: kern_exec.c,v 1.339.2.6.2.1 2014/02/03 11:57:24 sborrill Exp $	*/
2
3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*-
30 * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
31 * Copyright (C) 1992 Wolfgang Solfrank.
32 * Copyright (C) 1992 TooLs GmbH.
33 * All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 *    must display the following acknowledgement:
45 *	This product includes software developed by TooLs GmbH.
46 * 4. The name of TooLs GmbH may not be used to endorse or promote products
47 *    derived from this software without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
53 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
54 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
55 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
56 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
57 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
58 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59 */
60
61#include <sys/cdefs.h>
62__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.339.2.6.2.1 2014/02/03 11:57:24 sborrill Exp $");
63
64#include "opt_exec.h"
65#include "opt_ktrace.h"
66#include "opt_modular.h"
67#include "opt_syscall_debug.h"
68#include "veriexec.h"
69#include "opt_pax.h"
70#include "opt_sa.h"
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/filedesc.h>
75#include <sys/kernel.h>
76#include <sys/proc.h>
77#include <sys/mount.h>
78#include <sys/malloc.h>
79#include <sys/kmem.h>
80#include <sys/namei.h>
81#include <sys/vnode.h>
82#include <sys/file.h>
83#include <sys/acct.h>
84#include <sys/atomic.h>
85#include <sys/exec.h>
86#include <sys/ktrace.h>
87#include <sys/uidinfo.h>
88#include <sys/wait.h>
89#include <sys/mman.h>
90#include <sys/ras.h>
91#include <sys/signalvar.h>
92#include <sys/stat.h>
93#include <sys/syscall.h>
94#include <sys/kauth.h>
95#include <sys/lwpctl.h>
96#include <sys/pax.h>
97#include <sys/cpu.h>
98#include <sys/module.h>
99#include <sys/sa.h>
100#include <sys/savar.h>
101#include <sys/syscallvar.h>
102#include <sys/syscallargs.h>
103#if NVERIEXEC > 0
104#include <sys/verified_exec.h>
105#endif /* NVERIEXEC > 0 */
106#include <sys/sdt.h>
107#include <sys/spawn.h>
108#include <sys/prot.h>
109#include <sys/cprng.h>
110
111#include <uvm/uvm_extern.h>
112
113#include <machine/reg.h>
114
115#include <compat/common/compat_util.h>
116
/* Forward declaration. */
static int exec_sigcode_map(struct proc *, const struct emul *);

/*
 * Debug tracing helpers.  Both expand to nothing unless DEBUG_EXEC is
 * defined.  DPRINTF takes a fully parenthesized printf argument list,
 * i.e. DPRINTF(("fmt", args)).
 */
#ifndef DEBUG_EXEC
#define DPRINTF(a)
#define COPYPRINTF(s, a, b)
#else
#define DPRINTF(a) printf a
#define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
    __LINE__, (s), (a), (b))
#endif /* !DEBUG_EXEC */
127
128/*
129 * DTrace SDT provider definitions
130 */
131SDT_PROBE_DEFINE(proc,,,exec,
132	    "char *", NULL,
133	    NULL, NULL, NULL, NULL,
134	    NULL, NULL, NULL, NULL);
135SDT_PROBE_DEFINE(proc,,,exec_success,
136	    "char *", NULL,
137	    NULL, NULL, NULL, NULL,
138	    NULL, NULL, NULL, NULL);
139SDT_PROBE_DEFINE(proc,,,exec_failure,
140	    "int", NULL,
141	    NULL, NULL, NULL, NULL,
142	    NULL, NULL, NULL, NULL);
143
144/*
145 * Exec function switch:
146 *
147 * Note that each makecmds function is responsible for loading the
148 * exec package with the necessary functions for any exec-type-specific
149 * handling.
150 *
151 * Functions for specific exec types should be defined in their own
152 * header file.
153 */
154static const struct execsw	**execsw = NULL;
155static int			nexecs;
156
157u_int	exec_maxhdrsz;	 /* must not be static - used by netbsd32 */
158
159/* list of dynamically loaded execsw entries */
160static LIST_HEAD(execlist_head, exec_entry) ex_head =
161    LIST_HEAD_INITIALIZER(ex_head);
162struct exec_entry {
163	LIST_ENTRY(exec_entry)	ex_list;
164	SLIST_ENTRY(exec_entry)	ex_slist;
165	const struct execsw	*ex_sw;
166};
167
#if !defined(__HAVE_SYSCALL_INTERN)
/* Native system call entry point, referenced by emul_netbsd. */
void	syscall(void);
#endif

#if defined(KERN_SA)
/*
 * Scheduler-activations emulation hooks for native binaries.
 * The initializers are positional and must stay in struct sa_emul
 * member order.
 */
static struct sa_emul saemul_netbsd = {
	sizeof(ucontext_t),
	sizeof(struct sa_t),
	sizeof(struct sa_t *),
	NULL,
	NULL,
	cpu_upcall,
	(void (*)(struct lwp *, void *))getucontext_sa,
	sa_ucsp
};
#endif /* KERN_SA */
184
185/* NetBSD emul struct */
186struct emul emul_netbsd = {
187	.e_name =		"netbsd",
188	.e_path =		NULL,
189#ifndef __HAVE_MINIMAL_EMUL
190	.e_flags =		EMUL_HAS_SYS___syscall,
191	.e_errno =		NULL,
192	.e_nosys =		SYS_syscall,
193	.e_nsysent =		SYS_NSYSENT,
194#endif
195	.e_sysent =		sysent,
196#ifdef SYSCALL_DEBUG
197	.e_syscallnames =	syscallnames,
198#else
199	.e_syscallnames =	NULL,
200#endif
201	.e_sendsig =		sendsig,
202	.e_trapsignal =		trapsignal,
203	.e_tracesig =		NULL,
204	.e_sigcode =		NULL,
205	.e_esigcode =		NULL,
206	.e_sigobject =		NULL,
207	.e_setregs =		setregs,
208	.e_proc_exec =		NULL,
209	.e_proc_fork =		NULL,
210	.e_proc_exit =		NULL,
211	.e_lwp_fork =		NULL,
212	.e_lwp_exit =		NULL,
213#ifdef __HAVE_SYSCALL_INTERN
214	.e_syscall_intern =	syscall_intern,
215#else
216	.e_syscall =		syscall,
217#endif
218	.e_sysctlovly =		NULL,
219	.e_fault =		NULL,
220	.e_vm_default_addr =	uvm_default_mapaddr,
221	.e_usertrap =		NULL,
222#ifdef KERN_SA
223	.e_sa =			&saemul_netbsd,
224#else
225	.e_sa =			NULL,
226#endif
227	.e_ucsize =		sizeof(ucontext_t),
228	.e_startlwp =		startlwp
229};
230
231/*
232 * Exec lock. Used to control access to execsw[] structures.
233 * This must not be static so that netbsd32 can access it, too.
234 */
235krwlock_t exec_lock;
236
237static kmutex_t sigobject_lock;
238
239/*
240 * Data used between a loadvm and execve part of an "exec" operation
241 */
242struct execve_data {
243	struct exec_package	ed_pack;
244	struct pathbuf		*ed_pathbuf;
245	struct vattr		ed_attr;
246	struct ps_strings	ed_arginfo;
247	char			*ed_argp;
248	const char		*ed_pathstring;
249	char			*ed_resolvedpathbuf;
250	size_t			ed_ps_strings_sz;
251	int			ed_szsigcode;
252	long			ed_argc;
253	long			ed_envc;
254};
255
256/*
257 * data passed from parent lwp to child during a posix_spawn()
258 */
259struct spawn_exec_data {
260	struct execve_data	sed_exec;
261	struct posix_spawn_file_actions
262				*sed_actions;
263	struct posix_spawnattr	*sed_attrs;
264	struct proc		*sed_parent;
265	kcondvar_t		sed_cv_child_ready;
266	kmutex_t		sed_mtx_child;
267	int			sed_error;
268	volatile uint32_t	sed_refcnt;
269};
270
271static void *
272exec_pool_alloc(struct pool *pp, int flags)
273{
274
275	return (void *)uvm_km_alloc(kernel_map, NCARGS, 0,
276	    UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
277}
278
279static void
280exec_pool_free(struct pool *pp, void *addr)
281{
282
283	uvm_km_free(kernel_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
284}
285
286static struct pool exec_pool;
287
288static struct pool_allocator exec_palloc = {
289	.pa_alloc = exec_pool_alloc,
290	.pa_free = exec_pool_free,
291	.pa_pagesz = NCARGS
292};
293
294/*
295 * check exec:
296 * given an "executable" described in the exec package's namei info,
297 * see what we can do with it.
298 *
299 * ON ENTRY:
300 *	exec package with appropriate namei info
301 *	lwp pointer of exec'ing lwp
302 *	NO SELF-LOCKED VNODES
303 *
304 * ON EXIT:
305 *	error:	nothing held, etc.  exec header still allocated.
306 *	ok:	filled exec package, executable's vnode (unlocked).
307 *
308 * EXEC SWITCH ENTRY:
309 * 	Locked vnode to check, exec package, proc.
310 *
311 * EXEC SWITCH EXIT:
312 *	ok:	return 0, filled exec package, executable's vnode (unlocked).
313 *	error:	destructive:
314 *			everything deallocated execept exec header.
315 *		non-destructive:
316 *			error code, executable's vnode (unlocked),
317 *			exec header unmodified.
318 */
319int
320/*ARGSUSED*/
321check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb)
322{
323	int		error, i;
324	struct vnode	*vp;
325	struct nameidata nd;
326	size_t		resid;
327
328	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
329
330	/* first get the vnode */
331	if ((error = namei(&nd)) != 0)
332		return error;
333	epp->ep_vp = vp = nd.ni_vp;
334	/* this cannot overflow as both are size PATH_MAX */
335	strcpy(epp->ep_resolvedname, nd.ni_pnbuf);
336
337#ifdef DIAGNOSTIC
338	/* paranoia (take this out once namei stuff stabilizes) */
339	memset(nd.ni_pnbuf, '~', PATH_MAX);
340#endif
341
342	/* check access and type */
343	if (vp->v_type != VREG) {
344		error = EACCES;
345		goto bad1;
346	}
347	if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
348		goto bad1;
349
350	/* get attributes */
351	if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
352		goto bad1;
353
354	/* Check mount point */
355	if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
356		error = EACCES;
357		goto bad1;
358	}
359	if (vp->v_mount->mnt_flag & MNT_NOSUID)
360		epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
361
362	/* try to open it */
363	if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
364		goto bad1;
365
366	/* unlock vp, since we need it unlocked from here on out. */
367	VOP_UNLOCK(vp);
368
369#if NVERIEXEC > 0
370	error = veriexec_verify(l, vp, epp->ep_resolvedname,
371	    epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
372	    NULL);
373	if (error)
374		goto bad2;
375#endif /* NVERIEXEC > 0 */
376
377#ifdef PAX_SEGVGUARD
378	error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
379	if (error)
380		goto bad2;
381#endif /* PAX_SEGVGUARD */
382
383	/* now we have the file, get the exec header */
384	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
385			UIO_SYSSPACE, 0, l->l_cred, &resid, NULL);
386	if (error)
387		goto bad2;
388	epp->ep_hdrvalid = epp->ep_hdrlen - resid;
389
390	/*
391	 * Set up default address space limits.  Can be overridden
392	 * by individual exec packages.
393	 *
394	 * XXX probably should be all done in the exec packages.
395	 */
396	epp->ep_vm_minaddr = VM_MIN_ADDRESS;
397	epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
398	/*
399	 * set up the vmcmds for creation of the process
400	 * address space
401	 */
402	error = ENOEXEC;
403	for (i = 0; i < nexecs; i++) {
404		int newerror;
405
406		epp->ep_esch = execsw[i];
407		newerror = (*execsw[i]->es_makecmds)(l, epp);
408
409		if (!newerror) {
410			/* Seems ok: check that entry point is not too high */
411			if (epp->ep_entry > epp->ep_vm_maxaddr) {
412#ifdef DIAGNOSTIC
413				printf("%s: rejecting %p due to "
414				    "too high entry address (> %p)\n",
415					 __func__, (void *)epp->ep_entry,
416					 (void *)epp->ep_vm_maxaddr);
417#endif
418				error = ENOEXEC;
419				break;
420			}
421			/* Seems ok: check that entry point is not too low */
422			if (epp->ep_entry < epp->ep_vm_minaddr) {
423#ifdef DIAGNOSTIC
424				printf("%s: rejecting %p due to "
425				    "too low entry address (< %p)\n",
426				     __func__, (void *)epp->ep_entry,
427				     (void *)epp->ep_vm_minaddr);
428#endif
429				error = ENOEXEC;
430				break;
431			}
432
433			/* check limits */
434			if ((epp->ep_tsize > MAXTSIZ) ||
435			    (epp->ep_dsize > (u_quad_t)l->l_proc->p_rlimit
436						    [RLIMIT_DATA].rlim_cur)) {
437#ifdef DIAGNOSTIC
438				printf("%s: rejecting due to "
439				    "limits (t=%llu > %llu || d=%llu > %llu)\n",
440				    __func__,
441				    (unsigned long long)epp->ep_tsize,
442				    (unsigned long long)MAXTSIZ,
443				    (unsigned long long)epp->ep_dsize,
444				    (unsigned long long)
445				    l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur);
446#endif
447				error = ENOMEM;
448				break;
449			}
450			return 0;
451		}
452
453		if (epp->ep_emul_root != NULL) {
454			vrele(epp->ep_emul_root);
455			epp->ep_emul_root = NULL;
456		}
457		if (epp->ep_interp != NULL) {
458			vrele(epp->ep_interp);
459			epp->ep_interp = NULL;
460		}
461
462		/* make sure the first "interesting" error code is saved. */
463		if (error == ENOEXEC)
464			error = newerror;
465
466		if (epp->ep_flags & EXEC_DESTR)
467			/* Error from "#!" code, tidied up by recursive call */
468			return error;
469	}
470
471	/* not found, error */
472
473	/*
474	 * free any vmspace-creation commands,
475	 * and release their references
476	 */
477	kill_vmcmds(&epp->ep_vmcmds);
478
479bad2:
480	/*
481	 * close and release the vnode, restore the old one, free the
482	 * pathname buf, and punt.
483	 */
484	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
485	VOP_CLOSE(vp, FREAD, l->l_cred);
486	vput(vp);
487	return error;
488
489bad1:
490	/*
491	 * free the namei pathname buffer, and put the vnode
492	 * (which we don't yet have open).
493	 */
494	vput(vp);				/* was still locked */
495	return error;
496}
497
/*
 * Extra stack space accounted for on machines whose stack grows up:
 * one NBPG there, nothing elsewhere.
 */
#ifndef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE 0
#else
#define STACK_PTHREADSPACE NBPG
#endif
503
504static int
505execve_fetch_element(char * const *array, size_t index, char **value)
506{
507	return copyin(array + index, value, sizeof(*value));
508}
509
510/*
511 * exec system call
512 */
513/* ARGSUSED */
514int
515sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
516{
517	/* {
518		syscallarg(const char *)	path;
519		syscallarg(char * const *)	argp;
520		syscallarg(char * const *)	envp;
521	} */
522
523	return execve1(l, SCARG(uap, path), SCARG(uap, argp),
524	    SCARG(uap, envp), execve_fetch_element);
525}
526
/*
 * fexecve system call: not implemented; every call fails with ENOSYS.
 * All arguments are ignored.
 */
int
sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			fd;
		syscallarg(char * const *)	argp;
		syscallarg(char * const *)	envp;
	} */
	const int error = ENOSYS;

	return error;
}
539
/*
 * Load modules to try and execute an image that we do not understand.
 * If no execsw entries are present, we load those likely to be needed
 * in order to run native images only.  Otherwise, we autoload all
 * possible modules that could let us run the binary.  XXX lame
 *
 * Load failures are ignored; after each successful load we yield().
 * Without MODULAR this is a no-op.
 */
static void
exec_autoload(void)
{
#ifdef MODULAR
	static const char * const native[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		NULL
	};
	static const char * const compat[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		"exec_aout",
		"exec_coff",
		"exec_ecoff",
		"compat_aoutm68k",
		"compat_freebsd",
		"compat_ibcs2",
		"compat_linux",
		"compat_linux32",
		"compat_netbsd32",
		"compat_sunos",
		"compat_sunos32",
		"compat_svr4",
		"compat_svr4_32",
		"compat_ultrix",
		NULL
	};
	char const * const *modp;

	if (nexecs == 0)
		modp = native;
	else
		modp = compat;

	for (; *modp != NULL; modp++) {
		if (module_autoload(*modp, MODULE_CLASS_MISC) == 0)
			yield();
	}
#endif
}
588
589static int
590execve_loadvm(struct lwp *l, const char *path, char * const *args,
591	char * const *envs, execve_fetch_element_t fetch_element,
592	struct execve_data * restrict data)
593{
594	int			error;
595	struct proc		*p;
596	char			*dp, *sp;
597	size_t			i, len;
598	struct exec_fakearg	*tmpfap;
599	int			oldlwpflags;
600	u_int			modgen;
601
602	KASSERT(data != NULL);
603
604	p = l->l_proc;
605 	modgen = 0;
606
607	SDT_PROBE(proc,,,exec, path, 0, 0, 0, 0);
608
609	/*
610	 * Check if we have exceeded our number of processes limit.
611	 * This is so that we handle the case where a root daemon
612	 * forked, ran setuid to become the desired user and is trying
613	 * to exec. The obvious place to do the reference counting check
614	 * is setuid(), but we don't do the reference counting check there
615	 * like other OS's do because then all the programs that use setuid()
616	 * must be modified to check the return code of setuid() and exit().
617	 * It is dangerous to make setuid() fail, because it fails open and
618	 * the program will continue to run as root. If we make it succeed
619	 * and return an error code, again we are not enforcing the limit.
620	 * The best place to enforce the limit is here, when the process tries
621	 * to execute a new image, because eventually the process will need
622	 * to call exec in order to do something useful.
623	 */
624 retry:
625	if ((p->p_flag & PK_SUGID) && kauth_authorize_generic(l->l_cred,
626	    KAUTH_GENERIC_ISSUSER, NULL) != 0 && chgproccnt(kauth_cred_getuid(
627	    l->l_cred), 0) > p->p_rlimit[RLIMIT_NPROC].rlim_cur)
628		return EAGAIN;
629
630	oldlwpflags = l->l_flag & (LW_SA | LW_SA_UPCALL);
631	if (l->l_flag & LW_SA) {
632		lwp_lock(l);
633		l->l_flag &= ~(LW_SA | LW_SA_UPCALL);
634		lwp_unlock(l);
635	}
636
637	/*
638	 * Drain existing references and forbid new ones.  The process
639	 * should be left alone until we're done here.  This is necessary
640	 * to avoid race conditions - e.g. in ptrace() - that might allow
641	 * a local user to illicitly obtain elevated privileges.
642	 */
643	rw_enter(&p->p_reflock, RW_WRITER);
644
645	/*
646	 * Init the namei data to point the file user's program name.
647	 * This is done here rather than in check_exec(), so that it's
648	 * possible to override this settings if any of makecmd/probe
649	 * functions call check_exec() recursively - for example,
650	 * see exec_script_makecmds().
651	 */
652	error = pathbuf_copyin(path, &data->ed_pathbuf);
653	if (error) {
654		DPRINTF(("%s: pathbuf_copyin path @%p %d\n", __func__,
655		    path, error));
656		goto clrflg;
657	}
658	data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
659
660	data->ed_resolvedpathbuf = PNBUF_GET();
661#ifdef DIAGNOSTIC
662	strcpy(data->ed_resolvedpathbuf, "/wrong");
663#endif
664
665	/*
666	 * initialize the fields of the exec package.
667	 */
668	data->ed_pack.ep_name = path;
669	data->ed_pack.ep_kname = data->ed_pathstring;
670	data->ed_pack.ep_resolvedname = data->ed_resolvedpathbuf;
671	data->ed_pack.ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
672	data->ed_pack.ep_hdrlen = exec_maxhdrsz;
673	data->ed_pack.ep_hdrvalid = 0;
674	data->ed_pack.ep_emul_arg = NULL;
675	data->ed_pack.ep_emul_arg_free = NULL;
676	data->ed_pack.ep_vmcmds.evs_cnt = 0;
677	data->ed_pack.ep_vmcmds.evs_used = 0;
678	data->ed_pack.ep_vap = &data->ed_attr;
679	data->ed_pack.ep_flags = 0;
680	data->ed_pack.ep_emul_root = NULL;
681	data->ed_pack.ep_interp = NULL;
682	data->ed_pack.ep_esch = NULL;
683	data->ed_pack.ep_pax_flags = 0;
684
685	rw_enter(&exec_lock, RW_READER);
686
687	/* see if we can run it. */
688	if ((error = check_exec(l, &data->ed_pack, data->ed_pathbuf)) != 0) {
689		if (error != ENOENT) {
690			DPRINTF(("%s: check exec failed %d\n",
691			    __func__, error));
692		}
693		goto freehdr;
694	}
695
696	/* XXX -- THE FOLLOWING SECTION NEEDS MAJOR CLEANUP */
697
698	/* allocate an argument buffer */
699	data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
700	KASSERT(data->ed_argp != NULL);
701	dp = data->ed_argp;
702	data->ed_argc = 0;
703
704	/* copy the fake args list, if there's one, freeing it as we go */
705	if (data->ed_pack.ep_flags & EXEC_HASARGL) {
706		tmpfap = data->ed_pack.ep_fa;
707		while (tmpfap->fa_arg != NULL) {
708			const char *cp;
709
710			cp = tmpfap->fa_arg;
711			while (*cp)
712				*dp++ = *cp++;
713			*dp++ = '\0';
714			ktrexecarg(tmpfap->fa_arg, cp - tmpfap->fa_arg);
715
716			kmem_free(tmpfap->fa_arg, tmpfap->fa_len);
717			tmpfap++; data->ed_argc++;
718		}
719		kmem_free(data->ed_pack.ep_fa, data->ed_pack.ep_fa_len);
720		data->ed_pack.ep_flags &= ~EXEC_HASARGL;
721	}
722
723	/* Now get argv & environment */
724	if (args == NULL) {
725		DPRINTF(("%s: null args\n", __func__));
726		error = EINVAL;
727		goto bad;
728	}
729	/* 'i' will index the argp/envp element to be retrieved */
730	i = 0;
731	if (data->ed_pack.ep_flags & EXEC_SKIPARG)
732		i++;
733
734	while (1) {
735		len = data->ed_argp + ARG_MAX - dp;
736		if ((error = (*fetch_element)(args, i, &sp)) != 0) {
737			DPRINTF(("%s: fetch_element args %d\n",
738			    __func__, error));
739			goto bad;
740		}
741		if (!sp)
742			break;
743		if ((error = copyinstr(sp, dp, len, &len)) != 0) {
744			DPRINTF(("%s: copyinstr args %d\n", __func__, error));
745			if (error == ENAMETOOLONG)
746				error = E2BIG;
747			goto bad;
748		}
749		ktrexecarg(dp, len - 1);
750		dp += len;
751		i++;
752		data->ed_argc++;
753	}
754
755	data->ed_envc = 0;
756	/* environment need not be there */
757	if (envs != NULL) {
758		i = 0;
759		while (1) {
760			len = data->ed_argp + ARG_MAX - dp;
761			if ((error = (*fetch_element)(envs, i, &sp)) != 0) {
762				DPRINTF(("%s: fetch_element env %d\n",
763				    __func__, error));
764				goto bad;
765			}
766			if (!sp)
767				break;
768			if ((error = copyinstr(sp, dp, len, &len)) != 0) {
769				DPRINTF(("%s: copyinstr env %d\n",
770				    __func__, error));
771				if (error == ENAMETOOLONG)
772					error = E2BIG;
773				goto bad;
774			}
775
776			ktrexecenv(dp, len - 1);
777			dp += len;
778			i++;
779			data->ed_envc++;
780		}
781	}
782
783	dp = (char *) ALIGN(dp);
784
785	data->ed_szsigcode = data->ed_pack.ep_esch->es_emul->e_esigcode -
786	    data->ed_pack.ep_esch->es_emul->e_sigcode;
787
788#ifdef __MACHINE_STACK_GROWS_UP
789/* See big comment lower down */
790#define	RTLD_GAP	32
791#else
792#define	RTLD_GAP	0
793#endif
794
795	/* Now check if args & environ fit into new stack */
796	if (data->ed_pack.ep_flags & EXEC_32) {
797		data->ed_ps_strings_sz = sizeof(struct ps_strings32);
798		len = ((data->ed_argc + data->ed_envc + 2 +
799		    data->ed_pack.ep_esch->es_arglen) *
800		    sizeof(int) + sizeof(int) + dp + RTLD_GAP +
801		    data->ed_szsigcode + data->ed_ps_strings_sz + STACK_PTHREADSPACE)
802		    - data->ed_argp;
803	} else {
804		data->ed_ps_strings_sz = sizeof(struct ps_strings);
805		len = ((data->ed_argc + data->ed_envc + 2 +
806		    data->ed_pack.ep_esch->es_arglen) *
807		    sizeof(char *) + sizeof(int) + dp + RTLD_GAP +
808		    data->ed_szsigcode + data->ed_ps_strings_sz + STACK_PTHREADSPACE)
809		    - data->ed_argp;
810	}
811
812#ifdef PAX_ASLR
813	if (pax_aslr_active(l))
814		len += (cprng_fast32() % PAGE_SIZE);
815#endif /* PAX_ASLR */
816
817	/* make the stack "safely" aligned */
818	len = STACK_LEN_ALIGN(len, STACK_ALIGNBYTES);
819
820	if (len > data->ed_pack.ep_ssize) {
821		/* in effect, compare to initial limit */
822		DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
823		error = ENOMEM;
824		goto bad;
825	}
826	/* adjust "active stack depth" for process VSZ */
827	data->ed_pack.ep_ssize = len;
828
829	return 0;
830
831 bad:
832	/* free the vmspace-creation commands, and release their references */
833	kill_vmcmds(&data->ed_pack.ep_vmcmds);
834	/* kill any opened file descriptor, if necessary */
835	if (data->ed_pack.ep_flags & EXEC_HASFD) {
836		data->ed_pack.ep_flags &= ~EXEC_HASFD;
837		fd_close(data->ed_pack.ep_fd);
838	}
839	/* close and put the exec'd file */
840	vn_lock(data->ed_pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
841	VOP_CLOSE(data->ed_pack.ep_vp, FREAD, l->l_cred);
842	vput(data->ed_pack.ep_vp);
843	pool_put(&exec_pool, data->ed_argp);
844
845 freehdr:
846	kmem_free(data->ed_pack.ep_hdr, data->ed_pack.ep_hdrlen);
847	if (data->ed_pack.ep_emul_root != NULL)
848		vrele(data->ed_pack.ep_emul_root);
849	if (data->ed_pack.ep_interp != NULL)
850		vrele(data->ed_pack.ep_interp);
851
852	rw_exit(&exec_lock);
853
854	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
855	pathbuf_destroy(data->ed_pathbuf);
856	PNBUF_PUT(data->ed_resolvedpathbuf);
857
858 clrflg:
859	lwp_lock(l);
860	l->l_flag |= oldlwpflags;
861	lwp_unlock(l);
862	rw_exit(&p->p_reflock);
863
864	if (modgen != module_gen && error == ENOEXEC) {
865		modgen = module_gen;
866		exec_autoload();
867		goto retry;
868	}
869
870	SDT_PROBE(proc,,,exec_failure, error, 0, 0, 0, 0);
871	return error;
872}
873
874static void
875execve_free_data(struct execve_data *data)
876{
877
878	/* free the vmspace-creation commands, and release their references */
879	kill_vmcmds(&data->ed_pack.ep_vmcmds);
880	/* kill any opened file descriptor, if necessary */
881	if (data->ed_pack.ep_flags & EXEC_HASFD) {
882		data->ed_pack.ep_flags &= ~EXEC_HASFD;
883		fd_close(data->ed_pack.ep_fd);
884	}
885
886	/* close and put the exec'd file */
887	vn_lock(data->ed_pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
888	VOP_CLOSE(data->ed_pack.ep_vp, FREAD, curlwp->l_cred);
889	vput(data->ed_pack.ep_vp);
890	pool_put(&exec_pool, data->ed_argp);
891
892	kmem_free(data->ed_pack.ep_hdr, data->ed_pack.ep_hdrlen);
893	if (data->ed_pack.ep_emul_root != NULL)
894		vrele(data->ed_pack.ep_emul_root);
895	if (data->ed_pack.ep_interp != NULL)
896		vrele(data->ed_pack.ep_interp);
897
898	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
899	pathbuf_destroy(data->ed_pathbuf);
900	PNBUF_PUT(data->ed_resolvedpathbuf);
901}
902
903static int
904execve_runproc(struct lwp *l, struct execve_data * restrict data,
905	bool no_local_exec_lock, bool is_spawn)
906{
907	int error = 0;
908	struct proc		*p;
909	size_t			i;
910	char			*stack, *dp;
911	const char		*commandname;
912	struct ps_strings32	arginfo32;
913	struct exec_vmcmd	*base_vcp;
914	void			*aip;
915	struct vmspace		*vm;
916	ksiginfo_t		ksi;
917	ksiginfoq_t		kq;
918
919	/*
920	 * In case of a posix_spawn operation, the child doing the exec
921	 * might not hold the reader lock on exec_lock, but the parent
922	 * will do this instead.
923	 */
924	KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
925	KASSERT(data != NULL);
926	if (data == NULL)
927		return (EINVAL);
928
929	p = l->l_proc;
930	if (no_local_exec_lock)
931		KASSERT(is_spawn);
932
933	base_vcp = NULL;
934
935	if (data->ed_pack.ep_flags & EXEC_32)
936		aip = &arginfo32;
937	else
938		aip = &data->ed_arginfo;
939
940	/* Get rid of other LWPs. */
941	if (p->p_sa || p->p_nlwps > 1) {
942		mutex_enter(p->p_lock);
943		exit_lwps(l);
944		mutex_exit(p->p_lock);
945	}
946	KDASSERT(p->p_nlwps == 1);
947
948	/* Destroy any lwpctl info. */
949	if (p->p_lwpctl != NULL)
950		lwp_ctl_exit();
951
952#ifdef KERN_SA
953	/* Release any SA state. */
954	if (p->p_sa)
955		sa_release(p);
956#endif /* KERN_SA */
957
958	/* Remove POSIX timers */
959	timers_free(p, TIMERS_POSIX);
960
961	/*
962	 * Do whatever is necessary to prepare the address space
963	 * for remapping.  Note that this might replace the current
964	 * vmspace with another!
965	 */
966	if (is_spawn)
967		uvmspace_spawn(l, data->ed_pack.ep_vm_minaddr,
968		    data->ed_pack.ep_vm_maxaddr);
969	else
970		uvmspace_exec(l, data->ed_pack.ep_vm_minaddr,
971		    data->ed_pack.ep_vm_maxaddr);
972
973	/* record proc's vnode, for use by procfs and others */
974        if (p->p_textvp)
975                vrele(p->p_textvp);
976	vref(data->ed_pack.ep_vp);
977	p->p_textvp = data->ed_pack.ep_vp;
978
979	/* Now map address space */
980	vm = p->p_vmspace;
981	vm->vm_taddr = (void *)data->ed_pack.ep_taddr;
982	vm->vm_tsize = btoc(data->ed_pack.ep_tsize);
983	vm->vm_daddr = (void*)data->ed_pack.ep_daddr;
984	vm->vm_dsize = btoc(data->ed_pack.ep_dsize);
985	vm->vm_ssize = btoc(data->ed_pack.ep_ssize);
986	vm->vm_issize = 0;
987	vm->vm_maxsaddr = (void *)data->ed_pack.ep_maxsaddr;
988	vm->vm_minsaddr = (void *)data->ed_pack.ep_minsaddr;
989
990#ifdef PAX_ASLR
991	pax_aslr_init(l, vm);
992#endif /* PAX_ASLR */
993
994	/* create the new process's VM space by running the vmcmds */
995#ifdef DIAGNOSTIC
996	if (data->ed_pack.ep_vmcmds.evs_used == 0)
997		panic("%s: no vmcmds", __func__);
998#endif
999
1000#ifdef DEBUG_EXEC
1001	{
1002		size_t j;
1003		struct exec_vmcmd *vp = &data->ed_pack.ep_vmcmds.evs_cmds[0];
1004		DPRINTF(("vmcmds %u\n", data->ed_pack.ep_vmcmds.evs_used));
1005		for (j = 0; j < data->ed_pack.ep_vmcmds.evs_used; j++) {
1006			DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
1007			    PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
1008			    PRIxVSIZE" prot=0%o flags=%d\n", j,
1009			    vp[j].ev_proc == vmcmd_map_pagedvn ?
1010			    "pagedvn" :
1011			    vp[j].ev_proc == vmcmd_map_readvn ?
1012			    "readvn" :
1013			    vp[j].ev_proc == vmcmd_map_zero ?
1014			    "zero" : "*unknown*",
1015			    vp[j].ev_addr, vp[j].ev_len,
1016			    vp[j].ev_offset, vp[j].ev_prot,
1017			    vp[j].ev_flags));
1018		}
1019	}
1020#endif	/* DEBUG_EXEC */
1021
1022	for (i = 0; i < data->ed_pack.ep_vmcmds.evs_used && !error; i++) {
1023		struct exec_vmcmd *vcp;
1024
1025		vcp = &data->ed_pack.ep_vmcmds.evs_cmds[i];
1026		if (vcp->ev_flags & VMCMD_RELATIVE) {
1027#ifdef DIAGNOSTIC
1028			if (base_vcp == NULL)
1029				panic("%s: relative vmcmd with no base",
1030				    __func__);
1031			if (vcp->ev_flags & VMCMD_BASE)
1032				panic("%s: illegal base & relative vmcmd",
1033				    __func__);
1034#endif
1035			vcp->ev_addr += base_vcp->ev_addr;
1036		}
1037		error = (*vcp->ev_proc)(l, vcp);
1038#ifdef DEBUG_EXEC
1039		if (error) {
1040			size_t j;
1041			struct exec_vmcmd *vp =
1042			    &data->ed_pack.ep_vmcmds.evs_cmds[0];
1043			DPRINTF(("vmcmds %zu/%u, error %d\n", i,
1044			    data->ed_pack.ep_vmcmds.evs_used, error));
1045			for (j = 0; j < data->ed_pack.ep_vmcmds.evs_used; j++) {
1046				DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
1047				    PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
1048				    PRIxVSIZE" prot=0%o flags=%d\n", j,
1049				    vp[j].ev_proc == vmcmd_map_pagedvn ?
1050				    "pagedvn" :
1051				    vp[j].ev_proc == vmcmd_map_readvn ?
1052				    "readvn" :
1053				    vp[j].ev_proc == vmcmd_map_zero ?
1054				    "zero" : "*unknown*",
1055				    vp[j].ev_addr, vp[j].ev_len,
1056				    vp[j].ev_offset, vp[j].ev_prot,
1057				    vp[j].ev_flags));
1058				if (j == i)
1059					DPRINTF(("     ^--- failed\n"));
1060			}
1061		}
1062#endif /* DEBUG_EXEC */
1063		if (vcp->ev_flags & VMCMD_BASE)
1064			base_vcp = vcp;
1065	}
1066
1067	/* free the vmspace-creation commands, and release their references */
1068	kill_vmcmds(&data->ed_pack.ep_vmcmds);
1069
1070	vn_lock(data->ed_pack.ep_vp, LK_EXCLUSIVE | LK_RETRY);
1071	VOP_CLOSE(data->ed_pack.ep_vp, FREAD, l->l_cred);
1072	vput(data->ed_pack.ep_vp);
1073
1074	/* if an error happened, deallocate and punt */
1075	if (error) {
1076		DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
1077		goto exec_abort;
1078	}
1079
1080	/* remember information about the process */
1081	data->ed_arginfo.ps_nargvstr = data->ed_argc;
1082	data->ed_arginfo.ps_nenvstr = data->ed_envc;
1083
1084	/* set command name & other accounting info */
1085	commandname = strrchr(data->ed_pack.ep_resolvedname, '/');
1086	if (commandname != NULL) {
1087		commandname++;
1088	} else {
1089		commandname = data->ed_pack.ep_resolvedname;
1090	}
1091	i = min(strlen(commandname), MAXCOMLEN);
1092	(void)memcpy(p->p_comm, commandname, i);
1093	p->p_comm[i] = '\0';
1094
1095	dp = PNBUF_GET();
1096	/*
1097	 * If the path starts with /, we don't need to do any work.
1098	 * This handles the majority of the cases.
1099	 * In the future perhaps we could canonicalize it?
1100	 */
1101	if (data->ed_pathstring[0] == '/')
1102		(void)strlcpy(data->ed_pack.ep_path = dp, data->ed_pathstring,
1103		    MAXPATHLEN);
1104#ifdef notyet
1105	/*
1106	 * Although this works most of the time [since the entry was just
1107	 * entered in the cache] we don't use it because it theoretically
1108	 * can fail and it is not the cleanest interface, because there
1109	 * could be races. When the namei cache is re-written, this can
1110	 * be changed to use the appropriate function.
1111	 */
1112	else if (!(error = vnode_to_path(dp, MAXPATHLEN, p->p_textvp, l, p)))
1113		data->ed_pack.ep_path = dp;
1114#endif
1115	else {
1116#ifdef notyet
1117		printf("Cannot get path for pid %d [%s] (error %d)",
1118		    (int)p->p_pid, p->p_comm, error);
1119#endif
1120		data->ed_pack.ep_path = NULL;
1121		PNBUF_PUT(dp);
1122	}
1123
1124	stack = (char *)STACK_ALLOC(STACK_GROW(vm->vm_minsaddr,
1125		STACK_PTHREADSPACE + data->ed_ps_strings_sz + data->ed_szsigcode),
1126		data->ed_pack.ep_ssize - (data->ed_ps_strings_sz + data->ed_szsigcode));
1127
1128#ifdef __MACHINE_STACK_GROWS_UP
1129	/*
1130	 * The copyargs call always copies into lower addresses
1131	 * first, moving towards higher addresses, starting with
1132	 * the stack pointer that we give.  When the stack grows
1133	 * down, this puts argc/argv/envp very shallow on the
1134	 * stack, right at the first user stack pointer.
1135	 * When the stack grows up, the situation is reversed.
1136	 *
1137	 * Normally, this is no big deal.  But the ld_elf.so _rtld()
1138	 * function expects to be called with a single pointer to
1139	 * a region that has a few words it can stash values into,
1140	 * followed by argc/argv/envp.  When the stack grows down,
1141	 * it's easy to decrement the stack pointer a little bit to
1142	 * allocate the space for these few words and pass the new
1143	 * stack pointer to _rtld.  When the stack grows up, however,
1144	 * a few words before argc is part of the signal trampoline, XXX
1145	 * so we have a problem.
1146	 *
1147	 * Instead of changing how _rtld works, we take the easy way
1148	 * out and steal 32 bytes before we call copyargs.
1149	 * This extra space was allowed for when 'pack.ep_ssize' was calculated.
1150	 */
1151	stack += RTLD_GAP;
1152#endif /* __MACHINE_STACK_GROWS_UP */
1153
1154	/* Now copy argc, args & environ to new stack */
1155	error = (*data->ed_pack.ep_esch->es_copyargs)(l, &data->ed_pack,
1156	    &data->ed_arginfo, &stack, data->ed_argp);
1157
1158	if (data->ed_pack.ep_path) {
1159		PNBUF_PUT(data->ed_pack.ep_path);
1160		data->ed_pack.ep_path = NULL;
1161	}
1162	if (error) {
1163		DPRINTF(("%s: copyargs failed %d\n", __func__, error));
1164		goto exec_abort;
1165	}
1166	/* Move the stack back to original point */
1167	stack = (char *)STACK_GROW(vm->vm_minsaddr, data->ed_pack.ep_ssize);
1168
1169	/* fill process ps_strings info */
1170	p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(vm->vm_minsaddr,
1171	    STACK_PTHREADSPACE), data->ed_ps_strings_sz);
1172
1173	if (data->ed_pack.ep_flags & EXEC_32) {
1174		arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
1175		arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
1176		arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
1177		arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
1178	}
1179
1180	/* copy out the process's ps_strings structure */
1181	if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
1182	    != 0) {
1183		DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
1184		    __func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
1185		goto exec_abort;
1186	}
1187
1188	cwdexec(p);
1189	fd_closeexec();		/* handle close on exec */
1190
1191	if (__predict_false(ktrace_on))
1192		fd_ktrexecfd();
1193
1194	execsigs(p);		/* reset catched signals */
1195
1196	l->l_ctxlink = NULL;	/* reset ucontext link */
1197
1198
1199	p->p_acflag &= ~AFORK;
1200	mutex_enter(p->p_lock);
1201	p->p_flag |= PK_EXEC;
1202	mutex_exit(p->p_lock);
1203
1204	/*
1205	 * Stop profiling.
1206	 */
1207	if ((p->p_stflag & PST_PROFIL) != 0) {
1208		mutex_spin_enter(&p->p_stmutex);
1209		stopprofclock(p);
1210		mutex_spin_exit(&p->p_stmutex);
1211	}
1212
1213	/*
1214	 * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
1215	 * exited and exec()/exit() are the only places it will be cleared.
1216	 */
1217	if ((p->p_lflag & PL_PPWAIT) != 0) {
1218		mutex_enter(proc_lock);
1219		l->l_lwpctl = NULL; /* was on loan from blocked parent */
1220		p->p_lflag &= ~PL_PPWAIT;
1221		cv_broadcast(&p->p_pptr->p_waitcv);
1222		mutex_exit(proc_lock);
1223	}
1224
1225	/*
1226	 * Deal with set[ug]id.  MNT_NOSUID has already been used to disable
1227	 * s[ug]id.  It's OK to check for PSL_TRACED here as we have blocked
1228	 * out additional references on the process for the moment.
1229	 */
1230	if ((p->p_slflag & PSL_TRACED) == 0 &&
1231
1232	    (((data->ed_attr.va_mode & S_ISUID) != 0 &&
1233	      kauth_cred_geteuid(l->l_cred) != data->ed_attr.va_uid) ||
1234
1235	     ((data->ed_attr.va_mode & S_ISGID) != 0 &&
1236	      kauth_cred_getegid(l->l_cred) != data->ed_attr.va_gid))) {
1237		/*
1238		 * Mark the process as SUGID before we do
1239		 * anything that might block.
1240		 */
1241		proc_crmod_enter();
1242		proc_crmod_leave(NULL, NULL, true);
1243
1244		/* Make sure file descriptors 0..2 are in use. */
1245		if ((error = fd_checkstd()) != 0) {
1246			DPRINTF(("%s: fdcheckstd failed %d\n",
1247			    __func__, error));
1248			goto exec_abort;
1249		}
1250
1251		/*
1252		 * Copy the credential so other references don't see our
1253		 * changes.
1254		 */
1255		l->l_cred = kauth_cred_copy(l->l_cred);
1256#ifdef KTRACE
1257		/*
1258		 * If the persistent trace flag isn't set, turn off.
1259		 */
1260		if (p->p_tracep) {
1261			mutex_enter(&ktrace_lock);
1262			if (!(p->p_traceflag & KTRFAC_PERSISTENT))
1263				ktrderef(p);
1264			mutex_exit(&ktrace_lock);
1265		}
1266#endif
1267		if (data->ed_attr.va_mode & S_ISUID)
1268			kauth_cred_seteuid(l->l_cred, data->ed_attr.va_uid);
1269		if (data->ed_attr.va_mode & S_ISGID)
1270			kauth_cred_setegid(l->l_cred, data->ed_attr.va_gid);
1271	} else {
1272		if (kauth_cred_geteuid(l->l_cred) ==
1273		    kauth_cred_getuid(l->l_cred) &&
1274		    kauth_cred_getegid(l->l_cred) ==
1275		    kauth_cred_getgid(l->l_cred))
1276			p->p_flag &= ~PK_SUGID;
1277	}
1278
1279	/*
1280	 * Copy the credential so other references don't see our changes.
1281	 * Test to see if this is necessary first, since in the common case
1282	 * we won't need a private reference.
1283	 */
1284	if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
1285	    kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
1286		l->l_cred = kauth_cred_copy(l->l_cred);
1287		kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
1288		kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
1289	}
1290
1291	/* Update the master credentials. */
1292	if (l->l_cred != p->p_cred) {
1293		kauth_cred_t ocred;
1294
1295		kauth_cred_hold(l->l_cred);
1296		mutex_enter(p->p_lock);
1297		ocred = p->p_cred;
1298		p->p_cred = l->l_cred;
1299		mutex_exit(p->p_lock);
1300		kauth_cred_free(ocred);
1301	}
1302
1303#if defined(__HAVE_RAS)
1304	/*
1305	 * Remove all RASs from the address space.
1306	 */
1307	ras_purgeall();
1308#endif
1309
1310	doexechooks(p);
1311
1312	/* setup new registers and do misc. setup. */
1313	(*data->ed_pack.ep_esch->es_emul->e_setregs)(l, &data->ed_pack,
1314	     (vaddr_t)stack);
1315	if (data->ed_pack.ep_esch->es_setregs)
1316		(*data->ed_pack.ep_esch->es_setregs)(l, &data->ed_pack,
1317		    (vaddr_t)stack);
1318
1319	/* Provide a consistent LWP private setting */
1320	(void)lwp_setprivate(l, NULL);
1321
1322	/* Discard all PCU state; need to start fresh */
1323	pcu_discard_all(l);
1324
1325	/* map the process's signal trampoline code */
1326	if ((error = exec_sigcode_map(p, data->ed_pack.ep_esch->es_emul)) != 0) {
1327		DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
1328		goto exec_abort;
1329	}
1330
1331	pool_put(&exec_pool, data->ed_argp);
1332
1333	/* notify others that we exec'd */
1334	KNOTE(&p->p_klist, NOTE_EXEC);
1335
1336	kmem_free(data->ed_pack.ep_hdr, data->ed_pack.ep_hdrlen);
1337
1338	SDT_PROBE(proc,,,exec_success, data->ed_pack.ep_name, 0, 0, 0, 0);
1339
1340	/* The emulation root will usually have been found when we looked
1341	 * for the elf interpreter (or similar), if not look now. */
1342	if (data->ed_pack.ep_esch->es_emul->e_path != NULL &&
1343	    data->ed_pack.ep_emul_root == NULL)
1344		emul_find_root(l, &data->ed_pack);
1345
1346	/* Any old emulation root got removed by fdcloseexec */
1347	rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
1348	p->p_cwdi->cwdi_edir = data->ed_pack.ep_emul_root;
1349	rw_exit(&p->p_cwdi->cwdi_lock);
1350	data->ed_pack.ep_emul_root = NULL;
1351	if (data->ed_pack.ep_interp != NULL)
1352		vrele(data->ed_pack.ep_interp);
1353
1354	/*
1355	 * Call emulation specific exec hook. This can setup per-process
1356	 * p->p_emuldata or do any other per-process stuff an emulation needs.
1357	 *
1358	 * If we are executing process of different emulation than the
1359	 * original forked process, call e_proc_exit() of the old emulation
1360	 * first, then e_proc_exec() of new emulation. If the emulation is
1361	 * same, the exec hook code should deallocate any old emulation
1362	 * resources held previously by this process.
1363	 */
1364	if (p->p_emul && p->p_emul->e_proc_exit
1365	    && p->p_emul != data->ed_pack.ep_esch->es_emul)
1366		(*p->p_emul->e_proc_exit)(p);
1367
1368	/*
1369	 * This is now LWP 1.
1370	 */
1371	mutex_enter(p->p_lock);
1372	p->p_nlwpid = 1;
1373	l->l_lid = 1;
1374	mutex_exit(p->p_lock);
1375
1376	/*
1377	 * Call exec hook. Emulation code may NOT store reference to anything
1378	 * from &pack.
1379	 */
1380	if (data->ed_pack.ep_esch->es_emul->e_proc_exec)
1381		(*data->ed_pack.ep_esch->es_emul->e_proc_exec)(p, &data->ed_pack);
1382
1383	/* update p_emul, the old value is no longer needed */
1384	p->p_emul = data->ed_pack.ep_esch->es_emul;
1385
1386	/* ...and the same for p_execsw */
1387	p->p_execsw = data->ed_pack.ep_esch;
1388
1389#ifdef __HAVE_SYSCALL_INTERN
1390	(*p->p_emul->e_syscall_intern)(p);
1391#endif
1392	ktremul();
1393
1394	/* Allow new references from the debugger/procfs. */
1395	rw_exit(&p->p_reflock);
1396	if (!no_local_exec_lock)
1397		rw_exit(&exec_lock);
1398
1399	mutex_enter(proc_lock);
1400
1401	if ((p->p_slflag & (PSL_TRACED|PSL_SYSCALL)) == PSL_TRACED) {
1402		KSI_INIT_EMPTY(&ksi);
1403		ksi.ksi_signo = SIGTRAP;
1404		ksi.ksi_lid = l->l_lid;
1405		kpsignal(p, &ksi, NULL);
1406	}
1407
1408	if (p->p_sflag & PS_STOPEXEC) {
1409		KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
1410		p->p_pptr->p_nstopchild++;
1411		p->p_pptr->p_waited = 0;
1412		mutex_enter(p->p_lock);
1413		ksiginfo_queue_init(&kq);
1414		sigclearall(p, &contsigmask, &kq);
1415		lwp_lock(l);
1416		l->l_stat = LSSTOP;
1417		p->p_stat = SSTOP;
1418		p->p_nrlwps--;
1419		lwp_unlock(l);
1420		mutex_exit(p->p_lock);
1421		mutex_exit(proc_lock);
1422		lwp_lock(l);
1423		mi_switch(l);
1424		ksiginfo_queue_drain(&kq);
1425		KERNEL_LOCK(l->l_biglocks, l);
1426	} else {
1427		mutex_exit(proc_lock);
1428	}
1429
1430	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
1431	pathbuf_destroy(data->ed_pathbuf);
1432	PNBUF_PUT(data->ed_resolvedpathbuf);
1433	DPRINTF(("%s finished\n", __func__));
1434	return (EJUSTRETURN);
1435
1436 exec_abort:
1437	SDT_PROBE(proc,,,exec_failure, error, 0, 0, 0, 0);
1438	rw_exit(&p->p_reflock);
1439	if (!no_local_exec_lock)
1440		rw_exit(&exec_lock);
1441
1442	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
1443	pathbuf_destroy(data->ed_pathbuf);
1444	PNBUF_PUT(data->ed_resolvedpathbuf);
1445
1446	/*
1447	 * the old process doesn't exist anymore.  exit gracefully.
1448	 * get rid of the (new) address space we have created, if any, get rid
1449	 * of our namei data and vnode, and exit noting failure
1450	 */
1451	uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
1452		VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
1453
1454	exec_free_emul_arg(&data->ed_pack);
1455	pool_put(&exec_pool, data->ed_argp);
1456	kmem_free(data->ed_pack.ep_hdr, data->ed_pack.ep_hdrlen);
1457	if (data->ed_pack.ep_emul_root != NULL)
1458		vrele(data->ed_pack.ep_emul_root);
1459	if (data->ed_pack.ep_interp != NULL)
1460		vrele(data->ed_pack.ep_interp);
1461
1462	/* Acquire the sched-state mutex (exit1() will release it). */
1463	if (!is_spawn) {
1464		mutex_enter(p->p_lock);
1465		exit1(l, W_EXITCODE(error, SIGABRT));
1466	}
1467
1468	return error;
1469}
1470
1471int
1472execve1(struct lwp *l, const char *path, char * const *args,
1473    char * const *envs, execve_fetch_element_t fetch_element)
1474{
1475	struct execve_data data;
1476	int error;
1477
1478	error = execve_loadvm(l, path, args, envs, fetch_element, &data);
1479	if (error)
1480		return error;
1481	error = execve_runproc(l, &data, false, false);
1482	return error;
1483}
1484
1485int
1486copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
1487    char **stackp, void *argp)
1488{
1489	char	**cpp, *dp, *sp;
1490	size_t	len;
1491	void	*nullp;
1492	long	argc, envc;
1493	int	error;
1494
1495	cpp = (char **)*stackp;
1496	nullp = NULL;
1497	argc = arginfo->ps_nargvstr;
1498	envc = arginfo->ps_nenvstr;
1499	if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
1500		COPYPRINTF("", cpp - 1, sizeof(argc));
1501		return error;
1502	}
1503
1504	dp = (char *) (cpp + argc + envc + 2 + pack->ep_esch->es_arglen);
1505	sp = argp;
1506
1507	/* XXX don't copy them out, remap them! */
1508	arginfo->ps_argvstr = cpp; /* remember location of argv for later */
1509
1510	for (; --argc >= 0; sp += len, dp += len) {
1511		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1512			COPYPRINTF("", cpp - 1, sizeof(dp));
1513			return error;
1514		}
1515		if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1516			COPYPRINTF("str", dp, (size_t)ARG_MAX);
1517			return error;
1518		}
1519	}
1520
1521	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1522		COPYPRINTF("", cpp - 1, sizeof(nullp));
1523		return error;
1524	}
1525
1526	arginfo->ps_envstr = cpp; /* remember location of envp for later */
1527
1528	for (; --envc >= 0; sp += len, dp += len) {
1529		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1530			COPYPRINTF("", cpp - 1, sizeof(dp));
1531			return error;
1532		}
1533		if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1534			COPYPRINTF("str", dp, (size_t)ARG_MAX);
1535			return error;
1536		}
1537
1538	}
1539
1540	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1541		COPYPRINTF("", cpp - 1, sizeof(nullp));
1542		return error;
1543	}
1544
1545	*stackp = (char *)cpp;
1546	return 0;
1547}
1548
1549
1550/*
1551 * Add execsw[] entries.
1552 */
1553int
1554exec_add(struct execsw *esp, int count)
1555{
1556	struct exec_entry	*it;
1557	int			i;
1558
1559	if (count == 0) {
1560		return 0;
1561	}
1562
1563	/* Check for duplicates. */
1564	rw_enter(&exec_lock, RW_WRITER);
1565	for (i = 0; i < count; i++) {
1566		LIST_FOREACH(it, &ex_head, ex_list) {
1567			/* assume unique (makecmds, probe_func, emulation) */
1568			if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
1569			    it->ex_sw->u.elf_probe_func ==
1570			    esp[i].u.elf_probe_func &&
1571			    it->ex_sw->es_emul == esp[i].es_emul) {
1572				rw_exit(&exec_lock);
1573				return EEXIST;
1574			}
1575		}
1576	}
1577
1578	/* Allocate new entries. */
1579	for (i = 0; i < count; i++) {
1580		it = kmem_alloc(sizeof(*it), KM_SLEEP);
1581		it->ex_sw = &esp[i];
1582		LIST_INSERT_HEAD(&ex_head, it, ex_list);
1583	}
1584
1585	/* update execsw[] */
1586	exec_init(0);
1587	rw_exit(&exec_lock);
1588	return 0;
1589}
1590
1591/*
1592 * Remove execsw[] entry.
1593 */
1594int
1595exec_remove(struct execsw *esp, int count)
1596{
1597	struct exec_entry	*it, *next;
1598	int			i;
1599	const struct proclist_desc *pd;
1600	proc_t			*p;
1601
1602	if (count == 0) {
1603		return 0;
1604	}
1605
1606	/* Abort if any are busy. */
1607	rw_enter(&exec_lock, RW_WRITER);
1608	for (i = 0; i < count; i++) {
1609		mutex_enter(proc_lock);
1610		for (pd = proclists; pd->pd_list != NULL; pd++) {
1611			PROCLIST_FOREACH(p, pd->pd_list) {
1612				if (p->p_execsw == &esp[i]) {
1613					mutex_exit(proc_lock);
1614					rw_exit(&exec_lock);
1615					return EBUSY;
1616				}
1617			}
1618		}
1619		mutex_exit(proc_lock);
1620	}
1621
1622	/* None are busy, so remove them all. */
1623	for (i = 0; i < count; i++) {
1624		for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
1625			next = LIST_NEXT(it, ex_list);
1626			if (it->ex_sw == &esp[i]) {
1627				LIST_REMOVE(it, ex_list);
1628				kmem_free(it, sizeof(*it));
1629				break;
1630			}
1631		}
1632	}
1633
1634	/* update execsw[] */
1635	exec_init(0);
1636	rw_exit(&exec_lock);
1637	return 0;
1638}
1639
1640/*
1641 * Initialize exec structures. If init_boot is true, also does necessary
1642 * one-time initialization (it's called from main() that way).
1643 * Once system is multiuser, this should be called with exec_lock held,
1644 * i.e. via exec_{add|remove}().
1645 */
1646int
1647exec_init(int init_boot)
1648{
1649	const struct execsw 	**sw;
1650	struct exec_entry	*ex;
1651	SLIST_HEAD(,exec_entry)	first;
1652	SLIST_HEAD(,exec_entry)	any;
1653	SLIST_HEAD(,exec_entry)	last;
1654	int			i, sz;
1655
1656	if (init_boot) {
1657		/* do one-time initializations */
1658		rw_init(&exec_lock);
1659		mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
1660		pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
1661		    "execargs", &exec_palloc, IPL_NONE);
1662		pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
1663	} else {
1664		KASSERT(rw_write_held(&exec_lock));
1665	}
1666
1667	/* Sort each entry onto the appropriate queue. */
1668	SLIST_INIT(&first);
1669	SLIST_INIT(&any);
1670	SLIST_INIT(&last);
1671	sz = 0;
1672	LIST_FOREACH(ex, &ex_head, ex_list) {
1673		switch(ex->ex_sw->es_prio) {
1674		case EXECSW_PRIO_FIRST:
1675			SLIST_INSERT_HEAD(&first, ex, ex_slist);
1676			break;
1677		case EXECSW_PRIO_ANY:
1678			SLIST_INSERT_HEAD(&any, ex, ex_slist);
1679			break;
1680		case EXECSW_PRIO_LAST:
1681			SLIST_INSERT_HEAD(&last, ex, ex_slist);
1682			break;
1683		default:
1684			panic("%s", __func__);
1685			break;
1686		}
1687		sz++;
1688	}
1689
1690	/*
1691	 * Create new execsw[].  Ensure we do not try a zero-sized
1692	 * allocation.
1693	 */
1694	sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
1695	i = 0;
1696	SLIST_FOREACH(ex, &first, ex_slist) {
1697		sw[i++] = ex->ex_sw;
1698	}
1699	SLIST_FOREACH(ex, &any, ex_slist) {
1700		sw[i++] = ex->ex_sw;
1701	}
1702	SLIST_FOREACH(ex, &last, ex_slist) {
1703		sw[i++] = ex->ex_sw;
1704	}
1705
1706	/* Replace old execsw[] and free used memory. */
1707	if (execsw != NULL) {
1708		kmem_free(__UNCONST(execsw),
1709		    nexecs * sizeof(struct execsw *) + 1);
1710	}
1711	execsw = sw;
1712	nexecs = sz;
1713
1714	/* Figure out the maximum size of an exec header. */
1715	exec_maxhdrsz = sizeof(int);
1716	for (i = 0; i < nexecs; i++) {
1717		if (execsw[i]->es_hdrsz > exec_maxhdrsz)
1718			exec_maxhdrsz = execsw[i]->es_hdrsz;
1719	}
1720
1721	return 0;
1722}
1723
1724static int
1725exec_sigcode_map(struct proc *p, const struct emul *e)
1726{
1727	vaddr_t va;
1728	vsize_t sz;
1729	int error;
1730	struct uvm_object *uobj;
1731
1732	sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
1733
1734	if (e->e_sigobject == NULL || sz == 0) {
1735		return 0;
1736	}
1737
1738	/*
1739	 * If we don't have a sigobject for this emulation, create one.
1740	 *
1741	 * sigobject is an anonymous memory object (just like SYSV shared
1742	 * memory) that we keep a permanent reference to and that we map
1743	 * in all processes that need this sigcode. The creation is simple,
1744	 * we create an object, add a permanent reference to it, map it in
1745	 * kernel space, copy out the sigcode to it and unmap it.
1746	 * We map it with PROT_READ|PROT_EXEC into the process just
1747	 * the way sys_mmap() would map it.
1748	 */
1749
1750	uobj = *e->e_sigobject;
1751	if (uobj == NULL) {
1752		mutex_enter(&sigobject_lock);
1753		if ((uobj = *e->e_sigobject) == NULL) {
1754			uobj = uao_create(sz, 0);
1755			(*uobj->pgops->pgo_reference)(uobj);
1756			va = vm_map_min(kernel_map);
1757			if ((error = uvm_map(kernel_map, &va, round_page(sz),
1758			    uobj, 0, 0,
1759			    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
1760			    UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
1761				printf("kernel mapping failed %d\n", error);
1762				(*uobj->pgops->pgo_detach)(uobj);
1763				mutex_exit(&sigobject_lock);
1764				return (error);
1765			}
1766			memcpy((void *)va, e->e_sigcode, sz);
1767#ifdef PMAP_NEED_PROCWR
1768			pmap_procwr(&proc0, va, sz);
1769#endif
1770			uvm_unmap(kernel_map, va, va + round_page(sz));
1771			*e->e_sigobject = uobj;
1772		}
1773		mutex_exit(&sigobject_lock);
1774	}
1775
1776	/* Just a hint to uvm_map where to put it. */
1777	va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
1778	    round_page(sz));
1779
1780#ifdef __alpha__
1781	/*
1782	 * Tru64 puts /sbin/loader at the end of user virtual memory,
1783	 * which causes the above calculation to put the sigcode at
1784	 * an invalid address.  Put it just below the text instead.
1785	 */
1786	if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
1787		va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
1788	}
1789#endif
1790
1791	(*uobj->pgops->pgo_reference)(uobj);
1792	error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
1793			uobj, 0, 0,
1794			UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
1795				    UVM_ADV_RANDOM, 0));
1796	if (error) {
1797		DPRINTF(("%s, %d: map %p "
1798		    "uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
1799		    __func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
1800		    va, error));
1801		(*uobj->pgops->pgo_detach)(uobj);
1802		return (error);
1803	}
1804	p->p_sigctx.ps_sigcode = (void *)va;
1805	return (0);
1806}
1807
1808/*
1809 * Release a refcount on spawn_exec_data and destroy memory, if this
1810 * was the last one.
1811 */
1812static void
1813spawn_exec_data_release(struct spawn_exec_data *data)
1814{
1815	if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
1816		return;
1817
1818	cv_destroy(&data->sed_cv_child_ready);
1819	mutex_destroy(&data->sed_mtx_child);
1820
1821	if (data->sed_actions)
1822		posix_spawn_fa_free(data->sed_actions,
1823		    data->sed_actions->len);
1824	if (data->sed_attrs)
1825		kmem_free(data->sed_attrs,
1826		    sizeof(*data->sed_attrs));
1827	kmem_free(data, sizeof(*data));
1828}
1829
1830/*
1831 * A child lwp of a posix_spawn operation starts here and ends up in
1832 * cpu_spawn_return, dealing with all filedescriptor and scheduler
1833 * manipulations in between.
1834 * The parent waits for the child, as it is not clear wether the child
1835 * will be able to aquire its own exec_lock. If it can, the parent can
1836 * be released early and continue running in parallel. If not (or if the
1837 * magic debug flag is passed in the scheduler attribute struct), the
1838 * child rides on the parent's exec lock untill it is ready to return to
1839 * to userland - and only then releases the parent. This method loses
1840 * concurrency, but improves error reporting.
1841 */
1842static void
1843spawn_return(void *arg)
1844{
1845	struct spawn_exec_data *spawn_data = arg;
1846	struct lwp *l = curlwp;
1847	int error, newfd;
1848	size_t i;
1849	const struct posix_spawn_file_actions_entry *fae;
1850	pid_t ppid;
1851	register_t retval;
1852	bool have_reflock;
1853	bool parent_is_waiting = true;
1854
1855	/*
1856	 * Check if we can release parent early.
1857	 * We either need to have no sed_attrs, or sed_attrs does not
1858	 * have POSIX_SPAWN_RETURNERROR or one of the flags, that require
1859	 * safe access to the parent proc (passed in sed_parent).
1860	 * We then try to get the exec_lock, and only if that works, we can
1861	 * release the parent here already.
1862	 */
1863	ppid = spawn_data->sed_parent->p_pid;
1864	if ((!spawn_data->sed_attrs
1865	    || (spawn_data->sed_attrs->sa_flags
1866	        & (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
1867	    && rw_tryenter(&exec_lock, RW_READER)) {
1868		parent_is_waiting = false;
1869		mutex_enter(&spawn_data->sed_mtx_child);
1870		cv_signal(&spawn_data->sed_cv_child_ready);
1871		mutex_exit(&spawn_data->sed_mtx_child);
1872	}
1873
1874	/* don't allow debugger access yet */
1875	rw_enter(&l->l_proc->p_reflock, RW_WRITER);
1876	have_reflock = true;
1877
1878	error = 0;
1879	/* handle posix_spawn_file_actions */
1880	if (spawn_data->sed_actions != NULL) {
1881		for (i = 0; i < spawn_data->sed_actions->len; i++) {
1882			fae = &spawn_data->sed_actions->fae[i];
1883			switch (fae->fae_action) {
1884			case FAE_OPEN:
1885				if (fd_getfile(fae->fae_fildes) != NULL) {
1886					error = fd_close(fae->fae_fildes);
1887					if (error)
1888						break;
1889				}
1890				error = fd_open(fae->fae_path, fae->fae_oflag,
1891				    fae->fae_mode, &newfd);
1892 				if (error)
1893 					break;
1894				if (newfd != fae->fae_fildes) {
1895					error = dodup(l, newfd,
1896					    fae->fae_fildes, 0, &retval);
1897					if (fd_getfile(newfd) != NULL)
1898						fd_close(newfd);
1899				}
1900				break;
1901			case FAE_DUP2:
1902				error = dodup(l, fae->fae_fildes,
1903				    fae->fae_newfildes, 0, &retval);
1904				break;
1905			case FAE_CLOSE:
1906				if (fd_getfile(fae->fae_fildes) == NULL) {
1907					error = EBADF;
1908					break;
1909				}
1910				error = fd_close(fae->fae_fildes);
1911				break;
1912			}
1913			if (error)
1914				goto report_error;
1915		}
1916	}
1917
1918	/* handle posix_spawnattr */
1919	if (spawn_data->sed_attrs != NULL) {
1920		int ostat;
1921		struct sigaction sigact;
1922		sigact._sa_u._sa_handler = SIG_DFL;
1923		sigact.sa_flags = 0;
1924
1925		/*
1926		 * set state to SSTOP so that this proc can be found by pid.
1927		 * see proc_enterprp, do_sched_setparam below
1928		 */
1929		ostat = l->l_proc->p_stat;
1930		l->l_proc->p_stat = SSTOP;
1931
1932		/* Set process group */
1933		if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
1934			pid_t mypid = l->l_proc->p_pid,
1935			     pgrp = spawn_data->sed_attrs->sa_pgroup;
1936
1937			if (pgrp == 0)
1938				pgrp = mypid;
1939
1940			error = proc_enterpgrp(spawn_data->sed_parent,
1941			    mypid, pgrp, false);
1942			if (error)
1943				goto report_error;
1944		}
1945
1946		/* Set scheduler policy */
1947		if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
1948			error = do_sched_setparam(l->l_proc->p_pid, 0,
1949			    spawn_data->sed_attrs->sa_schedpolicy,
1950			    &spawn_data->sed_attrs->sa_schedparam);
1951		else if (spawn_data->sed_attrs->sa_flags
1952		    & POSIX_SPAWN_SETSCHEDPARAM) {
1953			error = do_sched_setparam(ppid, 0,
1954			    SCHED_NONE, &spawn_data->sed_attrs->sa_schedparam);
1955		}
1956		if (error)
1957			goto report_error;
1958
1959		/* Reset user ID's */
1960		if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
1961			error = do_setresuid(l, -1,
1962			     kauth_cred_getgid(l->l_cred), -1,
1963			     ID_E_EQ_R | ID_E_EQ_S);
1964			if (error)
1965				goto report_error;
1966			error = do_setresuid(l, -1,
1967			    kauth_cred_getuid(l->l_cred), -1,
1968			    ID_E_EQ_R | ID_E_EQ_S);
1969			if (error)
1970				goto report_error;
1971		}
1972
1973		/* Set signal masks/defaults */
1974		if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
1975			mutex_enter(l->l_proc->p_lock);
1976			error = sigprocmask1(l, SIG_SETMASK,
1977			    &spawn_data->sed_attrs->sa_sigmask, NULL);
1978			mutex_exit(l->l_proc->p_lock);
1979			if (error)
1980				goto report_error;
1981		}
1982
1983		if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
1984			for (i = 1; i <= NSIG; i++) {
1985				if (sigismember(
1986				    &spawn_data->sed_attrs->sa_sigdefault, i))
1987					sigaction1(l, i, &sigact, NULL, NULL,
1988					    0);
1989			}
1990		}
1991		l->l_proc->p_stat = ostat;
1992	}
1993
1994	/* now do the real exec */
1995	error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
1996	    true);
1997	have_reflock = false;
1998	if (error == EJUSTRETURN)
1999		error = 0;
2000	else if (error)
2001		goto report_error;
2002
2003	if (parent_is_waiting) {
2004		mutex_enter(&spawn_data->sed_mtx_child);
2005		cv_signal(&spawn_data->sed_cv_child_ready);
2006		mutex_exit(&spawn_data->sed_mtx_child);
2007	}
2008
2009	/* release our refcount on the data */
2010	spawn_exec_data_release(spawn_data);
2011
2012	/* and finaly: leave to userland for the first time */
2013	cpu_spawn_return(l);
2014
2015	/* NOTREACHED */
2016	return;
2017
2018 report_error:
2019 	if (have_reflock) {
2020 		/*
2021		 * We have not passed through execve_runproc(),
2022		 * which would have released the p_reflock and also
2023		 * taken ownership of the sed_exec part of spawn_data,
2024		 * so release/free both here.
2025		 */
2026		rw_exit(&l->l_proc->p_reflock);
2027		execve_free_data(&spawn_data->sed_exec);
2028	}
2029
2030	if (parent_is_waiting) {
2031		/* pass error to parent */
2032		mutex_enter(&spawn_data->sed_mtx_child);
2033		spawn_data->sed_error = error;
2034		cv_signal(&spawn_data->sed_cv_child_ready);
2035		mutex_exit(&spawn_data->sed_mtx_child);
2036	} else {
2037		rw_exit(&exec_lock);
2038	}
2039
2040	/* release our refcount on the data */
2041	spawn_exec_data_release(spawn_data);
2042
2043	/* done, exit */
2044	mutex_enter(l->l_proc->p_lock);
2045	/*
2046	 * Posix explicitly asks for an exit code of 127 if we report
2047	 * errors from the child process - so, unfortunately, there
2048	 * is no way to report a more exact error code.
2049	 * A NetBSD specific workaround is POSIX_SPAWN_RETURNERROR as
2050	 * flag bit in the attrp argument to posix_spawn(2), see above.
2051	 */
2052	exit1(l, W_EXITCODE(127, 0));
2053}
2054
2055void
2056posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
2057{
2058
2059	for (size_t i = 0; i < len; i++) {
2060		struct posix_spawn_file_actions_entry *fae = &fa->fae[i];
2061		if (fae->fae_action != FAE_OPEN)
2062			continue;
2063		kmem_free(fae->fae_path, strlen(fae->fae_path) + 1);
2064	}
2065	if (fa->len > 0)
2066		kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
2067	kmem_free(fa, sizeof(*fa));
2068}
2069
2070static int
2071posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
2072    const struct posix_spawn_file_actions *ufa, rlim_t lim)
2073{
2074	struct posix_spawn_file_actions *fa;
2075	struct posix_spawn_file_actions_entry *fae;
2076	char *pbuf = NULL;
2077	int error;
2078	size_t i = 0;
2079
2080	fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
2081	error = copyin(ufa, fa, sizeof(*fa));
2082	if (error) {
2083		fa->fae = NULL;
2084		fa->len = 0;
2085		goto out;
2086	}
2087
2088	if (fa->len == 0) {
2089		kmem_free(fa, sizeof(*fa));
2090		return 0;
2091	}
2092
2093	if (fa->len > lim) {
2094		kmem_free(fa, sizeof(*fa));
2095		return EINVAL;
2096	}
2097
2098	fa->size = fa->len;
2099	size_t fal = fa->len * sizeof(*fae);
2100	fae = fa->fae;
2101	fa->fae = kmem_alloc(fal, KM_SLEEP);
2102	error = copyin(fae, fa->fae, fal);
2103	if (error)
2104		goto out;
2105
2106	pbuf = PNBUF_GET();
2107	for (; i < fa->len; i++) {
2108		fae = &fa->fae[i];
2109		if (fae->fae_action != FAE_OPEN)
2110			continue;
2111		error = copyinstr(fae->fae_path, pbuf, MAXPATHLEN, &fal);
2112		if (error)
2113			goto out;
2114		fae->fae_path = kmem_alloc(fal, KM_SLEEP);
2115		memcpy(fae->fae_path, pbuf, fal);
2116	}
2117	PNBUF_PUT(pbuf);
2118
2119	*fap = fa;
2120	return 0;
2121out:
2122	if (pbuf)
2123		PNBUF_PUT(pbuf);
2124	posix_spawn_fa_free(fa, i);
2125	return error;
2126}
2127
2128int
2129check_posix_spawn(struct lwp *l1)
2130{
2131	int error, tnprocs, count;
2132	uid_t uid;
2133	struct proc *p1;
2134
2135	p1 = l1->l_proc;
2136	uid = kauth_cred_getuid(l1->l_cred);
2137	tnprocs = atomic_inc_uint_nv(&nprocs);
2138
2139	/*
2140	 * Although process entries are dynamically created, we still keep
2141	 * a global limit on the maximum number we will create.
2142	 */
2143	if (__predict_false(tnprocs >= maxproc))
2144		error = -1;
2145	else
2146		error = kauth_authorize_process(l1->l_cred,
2147		    KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
2148
2149	if (error) {
2150		atomic_dec_uint(&nprocs);
2151		return EAGAIN;
2152	}
2153
2154	/*
2155	 * Enforce limits.
2156	 */
2157	count = chgproccnt(uid, 1);
2158	if (kauth_authorize_generic(l1->l_cred, KAUTH_GENERIC_ISSUSER, NULL) !=
2159	    0 && __predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
2160		(void)chgproccnt(uid, -1);
2161		atomic_dec_uint(&nprocs);
2162		return EAGAIN;
2163	}
2164
2165	return 0;
2166}
2167
/*
 * do_posix_spawn: create a child process of l1's process and have it
 * run the image at `path'.
 *
 * The first half of the exec (execve_loadvm) is performed here, in the
 * context of l1, with its state collected in spawn_data->sed_exec.  A new
 * proc and LWP are then set up; the LWP starts in spawn_return() with
 * spawn_data as its argument.  After making the child runnable this
 * function sleeps on sed_cv_child_ready and returns whatever the child
 * stored in sed_error.
 *
 * l1:		the calling LWP.
 * pid_res:	receives the child's pid.  Written only when a child was
 *		created (even if the child then reports an error); not
 *		written on the early error path.
 * child_ok:	set to true once the child LWP exists and spawn_data is
 *		shared with it; never written otherwise.
 * path, argv, envp, fetch:
 *		passed through unchanged to execve_loadvm().
 * fa, sa:	stored in spawn_data (sed_actions / sed_attrs) for the child;
 *		not examined here and not touched on the early error path.
 *
 * Returns the error from execve_loadvm() (EJUSTRETURN is mapped to 0),
 * ENOMEM if no U-area could be allocated, or otherwise the value of
 * spawn_data->sed_error after the child has signalled.
 */
int
do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
	struct posix_spawn_file_actions *fa,
	struct posix_spawnattr *sa,
	char *const *argv, char *const *envp,
	execve_fetch_element_t fetch)
{

	struct proc *p1, *p2;
	struct lwp *l2;
	int error;
	struct spawn_exec_data *spawn_data;
	vaddr_t uaddr;
	pid_t pid;
	bool have_exec_lock = false;

	p1 = l1->l_proc;

	/* Allocate and init spawn_data */
	spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
	spawn_data->sed_refcnt = 1; /* only parent so far */
	cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
	mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
	/*
	 * Hold the mutex from now until the cv_wait() below.  The child
	 * signals sed_cv_child_ready while holding this same mutex, so it
	 * cannot do so before we are actually waiting.
	 */
	mutex_enter(&spawn_data->sed_mtx_child);

	/*
	 * Do the first part of the exec now, collect state
	 * in spawn_data.
	 */
	error = execve_loadvm(l1, path, argv,
	    envp, fetch, &spawn_data->sed_exec);
	if (error == EJUSTRETURN)
		error = 0;
	else if (error)
		goto error_exit;

	/*
	 * From here on the error path has to undo execve_loadvm(): free the
	 * exec data and drop p1->p_reflock and exec_lock.
	 * NOTE(review): both locks are presumably taken inside
	 * execve_loadvm() on success -- confirm against that function.
	 */
	have_exec_lock = true;

	/*
	 * Allocate virtual address space for the U-area now, while it
	 * is still easy to abort the fork operation if we're out of
	 * kernel virtual address space.
	 */
	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		error = ENOMEM;
		goto error_exit;
	}

	/*
	 * Allocate new proc. Borrow proc0 vmspace for it, we will
	 * replace it with its own before returning to userland
	 * in the child.
	 * This is a point of no return, we will have to go through
	 * the child proc to properly clean it up past this point.
	 */
	p2 = proc_alloc();
	pid = p2->p_pid;

	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */
	memset(&p2->p_startzero, 0,
	    (unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
	memcpy(&p2->p_startcopy, &p1->p_startcopy,
	    (unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
	p2->p_vmspace = proc0.p_vmspace;

	CIRCLEQ_INIT(&p2->p_sigpend.sp_info);

	LIST_INIT(&p2->p_lwps);
	LIST_INIT(&p2->p_sigwaiters);

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 * Inherit flags we want to keep.  The flags related to SIGCHLD
	 * handling are important in order to keep a consistent behaviour
	 * for the child after the fork.  If we are a 32-bit process, the
	 * child will be too.
	 */
	p2->p_flag =
	    p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
	p2->p_emul = p1->p_emul;
	p2->p_execsw = p1->p_execsw;

	mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
	mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
	rw_init(&p2->p_reflock);
	cv_init(&p2->p_waitcv, "wait");
	cv_init(&p2->p_lwpcv, "lwpwait");

	p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	kauth_proc_fork(p1, p2);

	p2->p_raslist = NULL;
	p2->p_fd = fd_copy();

	/* XXX racy */
	p2->p_mqueue_cnt = p1->p_mqueue_cnt;

	p2->p_cwdi = cwdinit();

	/*
	 * Note: p_limit (rlimit stuff) is copy-on-write, so normally
	 * we just need increase pl_refcnt.
	 */
	if (!p1->p_limit->pl_writeable) {
		lim_addref(p1->p_limit);
		p2->p_limit = p1->p_limit;
	} else {
		p2->p_limit = lim_copy(p1->p_limit);
	}

	p2->p_lflag = 0;
	p2->p_sflag = 0;
	p2->p_slflag = 0;
	p2->p_pptr = p1;
	p2->p_ppid = p1->p_pid;
	LIST_INIT(&p2->p_children);

	p2->p_aio = NULL;

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.
	 * If not inherited, these were zeroed above.
	 */
	if (p1->p_traceflag & KTRFAC_INHERIT) {
		mutex_enter(&ktrace_lock);
		p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			ktradref(p2);
		mutex_exit(&ktrace_lock);
	}
#endif

	/*
	 * Create signal actions for the child process.
	 */
	p2->p_sigacts = sigactsinit(p1, 0);
	mutex_enter(p1->p_lock);
	p2->p_sflag |=
	    (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
	sched_proc_fork(p1, p2);
	mutex_exit(p1->p_lock);

	p2->p_stflag = p1->p_stflag;

	/*
	 * p_stats.
	 * Copy parts of p_stats, and zero out the rest.
	 */
	p2->p_stats = pstatscopy(p1->p_stats);

	/* copy over machdep flags to the new proc */
	cpu_proc_fork(p1, p2);

	/*
	 * Prepare remaining parts of spawn data
	 */
	spawn_data->sed_actions = fa;
	spawn_data->sed_attrs = sa;

	spawn_data->sed_parent = p1;

	/* create LWP; it begins execution in spawn_return(spawn_data) */
	lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
	    &l2, l1->l_class);
	l2->l_ctxlink = NULL;	/* reset ucontext link */

	/*
	 * Copy the credential so other references don't see our changes.
	 * Test to see if this is necessary first, since in the common case
	 * we won't need a private reference.
	 */
	if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
	    kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
		l2->l_cred = kauth_cred_copy(l2->l_cred);
		kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
		kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
	}

	/* Update the master credentials. */
	if (l2->l_cred != p2->p_cred) {
		kauth_cred_t ocred;

		kauth_cred_hold(l2->l_cred);
		mutex_enter(p2->p_lock);
		ocred = p2->p_cred;
		p2->p_cred = l2->l_cred;
		mutex_exit(p2->p_lock);
		kauth_cred_free(ocred);
	}

	/*
	 * Tell the caller that a child now exists; from here on spawn_data
	 * carries two references, one for us and one for the child.
	 */
	*child_ok = true;
	spawn_data->sed_refcnt = 2;	/* child gets it as well */
#if 0
	l2->l_nopreempt = 1; /* start it non-preemptable */
#endif

	/*
	 * It's now safe for the scheduler and other processes to see the
	 * child process.
	 */
	mutex_enter(proc_lock);

	if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
		p2->p_lflag |= PL_CONTROLT;

	LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
	p2->p_exitsig = SIGCHLD;	/* signal for parent on exit */

	LIST_INSERT_AFTER(p1, p2, p_pglist);
	LIST_INSERT_HEAD(&allproc, p2, p_list);

	p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
	(*p2->p_emul->e_syscall_intern)(p2);
#endif

	/*
	 * Make child runnable, set start time, and add to run queue except
	 * if the parent requested the child to start in SSTOP state.
	 */
	mutex_enter(p2->p_lock);

	getmicrotime(&p2->p_stats->p_start);

	lwp_lock(l2);
	KASSERT(p2->p_nrlwps == 1);
	p2->p_nrlwps = 1;
	p2->p_stat = SACTIVE;
	l2->l_stat = LSRUN;
	sched_enqueue(l2, false);
	lwp_unlock(l2);

	mutex_exit(p2->p_lock);
	mutex_exit(proc_lock);

	/*
	 * Sleep until the child signals, then pick up its status.
	 * NOTE(review): this is a single cv_wait() with no predicate
	 * re-check; confirm that no other wakeup of this condvar is possible.
	 */
	cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
	error = spawn_data->sed_error;
	mutex_exit(&spawn_data->sed_mtx_child);
	/* drop the parent's reference on spawn_data */
	spawn_exec_data_release(spawn_data);

	rw_exit(&p1->p_reflock);
	rw_exit(&exec_lock);
	have_exec_lock = false;

	/* The pid is reported even when the child handed back an error. */
	*pid_res = pid;
	return error;

 error_exit:
	/*
	 * Reached only before a child proc was allocated: undo the effects
	 * of execve_loadvm() if it succeeded, then drop the mutex taken at
	 * the top and our (sole) reference on spawn_data.
	 */
 	if (have_exec_lock) {
		execve_free_data(&spawn_data->sed_exec);
		rw_exit(&p1->p_reflock);
 		rw_exit(&exec_lock);
	}
	mutex_exit(&spawn_data->sed_mtx_child);
	spawn_exec_data_release(spawn_data);

	return error;
}
2434
2435int
2436sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
2437    register_t *retval)
2438{
2439	/* {
2440		syscallarg(pid_t *) pid;
2441		syscallarg(const char *) path;
2442		syscallarg(const struct posix_spawn_file_actions *) file_actions;
2443		syscallarg(const struct posix_spawnattr *) attrp;
2444		syscallarg(char *const *) argv;
2445		syscallarg(char *const *) envp;
2446	} */
2447
2448	int error;
2449	struct posix_spawn_file_actions *fa = NULL;
2450	struct posix_spawnattr *sa = NULL;
2451	pid_t pid;
2452	bool child_ok = false;
2453	rlim_t max_fileactions;
2454	proc_t *p = l1->l_proc;
2455
2456	error = check_posix_spawn(l1);
2457	if (error) {
2458		*retval = error;
2459		return 0;
2460	}
2461
2462	/* copy in file_actions struct */
2463	if (SCARG(uap, file_actions) != NULL) {
2464		max_fileactions = 2 * min(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
2465		    maxfiles);
2466		error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
2467		    max_fileactions);
2468		if (error)
2469			goto error_exit;
2470	}
2471
2472	/* copyin posix_spawnattr struct */
2473	if (SCARG(uap, attrp) != NULL) {
2474		sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
2475		error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
2476		if (error)
2477			goto error_exit;
2478	}
2479
2480	/*
2481	 * Do the spawn
2482	 */
2483	error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
2484	    SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
2485	if (error)
2486		goto error_exit;
2487
2488	if (error == 0 && SCARG(uap, pid) != NULL)
2489		error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
2490
2491	*retval = error;
2492	return 0;
2493
2494 error_exit:
2495	if (!child_ok) {
2496		(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
2497		atomic_dec_uint(&nprocs);
2498
2499		if (sa)
2500			kmem_free(sa, sizeof(*sa));
2501		if (fa)
2502			posix_spawn_fa_free(fa, fa->len);
2503	}
2504
2505	*retval = error;
2506	return 0;
2507}
2508
2509void
2510exec_free_emul_arg(struct exec_package *epp)
2511{
2512	if (epp->ep_emul_arg_free != NULL) {
2513		KASSERT(epp->ep_emul_arg != NULL);
2514		(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
2515		epp->ep_emul_arg_free = NULL;
2516		epp->ep_emul_arg = NULL;
2517	} else {
2518		KASSERT(epp->ep_emul_arg == NULL);
2519	}
2520}
2521