kern_exec.c revision 139804
1/*-
2 * Copyright (c) 1993, David Greenman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/kern_exec.c 139804 2005-01-06 23:35:40Z imp $");
29
30#include "opt_ktrace.h"
31#include "opt_mac.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/eventhandler.h>
36#include <sys/lock.h>
37#include <sys/mutex.h>
38#include <sys/sysproto.h>
39#include <sys/signalvar.h>
40#include <sys/kernel.h>
41#include <sys/mac.h>
42#include <sys/mount.h>
43#include <sys/filedesc.h>
44#include <sys/fcntl.h>
45#include <sys/acct.h>
46#include <sys/exec.h>
47#include <sys/imgact.h>
48#include <sys/imgact_elf.h>
49#include <sys/wait.h>
50#include <sys/malloc.h>
51#include <sys/proc.h>
52#include <sys/pioctl.h>
53#include <sys/namei.h>
54#include <sys/resourcevar.h>
55#include <sys/sf_buf.h>
56#include <sys/syscallsubr.h>
57#include <sys/sysent.h>
58#include <sys/shm.h>
59#include <sys/sysctl.h>
60#include <sys/vnode.h>
61#ifdef KTRACE
62#include <sys/ktrace.h>
63#endif
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/pmap.h>
68#include <vm/vm_page.h>
69#include <vm/vm_map.h>
70#include <vm/vm_kern.h>
71#include <vm/vm_extern.h>
72#include <vm/vm_object.h>
73#include <vm/vm_pager.h>
74
75#include <machine/reg.h>
76
77MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
78
79static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
80static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
81static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
82static int do_execve(struct thread *td, char *fname, char **argv,
83	char **envv, struct mac *mac_p);
84
85/* XXX This should be vm_size_t. */
86SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
87    NULL, 0, sysctl_kern_ps_strings, "LU", "");
88
89/* XXX This should be vm_size_t. */
90SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
91    NULL, 0, sysctl_kern_usrstack, "LU", "");
92
93SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
94    NULL, 0, sysctl_kern_stackprot, "I", "");
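/*
 * Illustrative only (not part of the original file): userland can read
 * these per-ABI values with sysctl(3), e.g.:
 *
 *	u_long usrstack;
 *	size_t len = sizeof(usrstack);
 *
 *	if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 */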
95
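/*
 * Processes whose argument strings fit within this limit have them
 * cached in p->p_args at exec time so that ps(1) and friends can
 * retrieve them later.
 */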
96u_long ps_arg_cache_limit = PAGE_SIZE / 16;
97SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
98    &ps_arg_cache_limit, 0, "");
99
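/*
 * The handlers below report per-ABI constants taken from the process's
 * sysentvec.  When a request carries SCTL_MASK32 (a 32-bit compat
 * consumer on a 64-bit kernel), the value is truncated to 32 bits
 * before being copied out.
 */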
100static int
101sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
102{
103	struct proc *p;
104	int error;
105
106	p = curproc;
107#ifdef SCTL_MASK32
108	if (req->flags & SCTL_MASK32) {
109		unsigned int val;
110		val = (unsigned int)p->p_sysent->sv_psstrings;
111		error = SYSCTL_OUT(req, &val, sizeof(val));
112	} else
113#endif
114		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
115		   sizeof(p->p_sysent->sv_psstrings));
116	return error;
117}
118
119static int
120sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
121{
122	struct proc *p;
123	int error;
124
125	p = curproc;
126#ifdef SCTL_MASK32
127	if (req->flags & SCTL_MASK32) {
128		unsigned int val;
129		val = (unsigned int)p->p_sysent->sv_usrstack;
130		error = SYSCTL_OUT(req, &val, sizeof(val));
131	} else
132#endif
133		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
134		    sizeof(p->p_sysent->sv_usrstack));
135	return error;
136}
137
138static int
139sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
140{
141	struct proc *p;
142
143	p = curproc;
144	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
145	    sizeof(p->p_sysent->sv_stackprot)));
146}
147
148/*
149 * Each of the items is a pointer to a `const struct execsw', hence the
150 * double pointer here.
151 */
152static const struct execsw **execsw;
153
154#ifndef _SYS_SYSPROTO_H_
155struct execve_args {
156	char    *fname;
157	char    **argv;
158	char    **envv;
159};
160#endif
161
162/*
163 * MPSAFE
164 */
165int
166execve(td, uap)
167	struct thread *td;
168	struct execve_args /* {
169		char *fname;
170		char **argv;
171		char **envv;
172	} */ *uap;
173{
174
175	return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL));
176}
177
178#ifndef _SYS_SYSPROTO_H_
179struct __mac_execve_args {
180	char	*fname;
181	char	**argv;
182	char	**envv;
183	struct mac	*mac_p;
184};
185#endif
186
187/*
188 * MPSAFE
189 */
190int
191__mac_execve(td, uap)
192	struct thread *td;
193	struct __mac_execve_args /* {
194		char *fname;
195		char **argv;
196		char **envv;
197		struct mac *mac_p;
198	} */ *uap;
199{
200
201#ifdef MAC
202	return (kern_execve(td, uap->fname, uap->argv, uap->envv,
203	    uap->mac_p));
204#else
205	return (ENOSYS);
206#endif
207}
208
209int
210kern_execve(td, fname, argv, envv, mac_p)
211	struct thread *td;
212	char *fname;
213	char **argv;
214	char **envv;
215	struct mac *mac_p;
216{
217	struct proc *p = td->td_proc;
218	int error;
219
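	/*
	 * If this process has ever had multiple threads, bring it to a
	 * single-threaded state at the kernel boundary before the exec
	 * proper; if that cannot be done now, return ERESTART so the
	 * syscall is retried.
	 */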
220	if (p->p_flag & P_HADTHREADS) {
221		PROC_LOCK(p);
222		if (thread_single(SINGLE_BOUNDARY)) {
223			PROC_UNLOCK(p);
224			return (ERESTART);	/* Try again later. */
225		}
226		PROC_UNLOCK(p);
227	}
228
229	error = do_execve(td, fname, argv, envv, mac_p);
230
231	if (p->p_flag & P_HADTHREADS) {
232		PROC_LOCK(p);
233		/*
234		 * On success, upgrade to the SINGLE_EXIT state to force
235		 * the other threads to exit.
236		 */
237		if (error == 0)
238			thread_single(SINGLE_EXIT);
239		else
240			thread_single_end();
241		PROC_UNLOCK(p);
242	}
243
244	return (error);
245}
246
247/*
248 * In-kernel implementation of execve().  All arguments are assumed to be
249 * userspace pointers from the passed thread.
250 *
251 * MPSAFE
252 */
253static int
254do_execve(td, fname, argv, envv, mac_p)
255	struct thread *td;
256	char *fname;
257	char **argv;
258	char **envv;
259	struct mac *mac_p;
260{
261	struct proc *p = td->td_proc;
262	struct nameidata nd, *ndp;
263	struct ucred *newcred = NULL, *oldcred;
264	struct uidinfo *euip;
265	register_t *stack_base;
266	int error, len, i;
267	struct image_params image_params, *imgp;
268	struct vattr attr;
269	int (*img_first)(struct image_params *);
270	struct pargs *oldargs = NULL, *newargs = NULL;
271	struct sigacts *oldsigacts, *newsigacts;
272#ifdef KTRACE
273	struct vnode *tracevp = NULL;
274	struct ucred *tracecred = NULL;
275#endif
276	struct vnode *textvp = NULL;
277	int credential_changing;
278	int textset;
279#ifdef MAC
280	struct label *interplabel = NULL;
281	int will_transition;
282#endif
283
284	imgp = &image_params;
285
286	/*
287	 * Lock the process and set the P_INEXEC flag to indicate that
288	 * it should be left alone until we're done here.  This is
289	 * necessary to avoid race conditions - e.g. in ptrace() -
290	 * that might allow a local user to illicitly obtain elevated
291	 * privileges.
292	 */
293	PROC_LOCK(p);
294	KASSERT((p->p_flag & P_INEXEC) == 0,
295	    ("%s(): process already has P_INEXEC flag", __func__));
296	p->p_flag |= P_INEXEC;
297	PROC_UNLOCK(p);
298
299	/*
300	 * Initialize part of the common data
301	 */
302	imgp->proc = p;
303	imgp->userspace_argv = argv;
304	imgp->userspace_envv = envv;
305	imgp->execlabel = NULL;
306	imgp->attr = &attr;
307	imgp->argc = imgp->envc = 0;
308	imgp->argv0 = NULL;
309	imgp->entry_addr = 0;
310	imgp->vmspace_destroyed = 0;
311	imgp->interpreted = 0;
312	imgp->interpreter_name[0] = '\0';
313	imgp->auxargs = NULL;
314	imgp->vp = NULL;
315	imgp->object = NULL;
316	imgp->firstpage = NULL;
317	imgp->ps_strings = 0;
318	imgp->auxarg_size = 0;
319
320#ifdef MAC
321	error = mac_execve_enter(imgp, mac_p);
322	if (error) {
323		mtx_lock(&Giant);
324		goto exec_fail;
325	}
326#endif
327
328	/*
329	 * Allocate temporary demand-zeroed space for the argument and
330	 *	environment strings.
331	 */
332	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX);
333	if (imgp->stringbase == NULL) {
334		error = ENOMEM;
335		mtx_lock(&Giant);
336		goto exec_fail;
337	}
338	imgp->stringp = imgp->stringbase;
339	imgp->stringspace = ARG_MAX;
340	imgp->image_header = NULL;
341
342	/*
343	 * Translate the file name. namei() returns a vnode pointer
344	 *	in ni_vp among other things.
345	 */
346	ndp = &nd;
347	NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
348	    UIO_USERSPACE, fname, td);
349
350	mtx_lock(&Giant);
351interpret:
352
353	error = namei(ndp);
354	if (error) {
355		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
356		    ARG_MAX);
357		goto exec_fail;
358	}
359
360	imgp->vp = ndp->ni_vp;
361	imgp->fname = fname;
362
363	/*
364	 * Check file permissions (also 'opens' file)
365	 */
366	error = exec_check_permissions(imgp);
367	if (error)
368		goto exec_fail_dealloc;
369
370	if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0)
371		vm_object_reference(imgp->object);
372
373	/*
374	 * Set VV_TEXT now so no one can write to the executable while we're
375	 * activating it.
376	 *
377	 * Remember if this was set before and unset it in case this is not
378	 * actually an executable image.
379	 */
380	textset = imgp->vp->v_vflag & VV_TEXT;
381	imgp->vp->v_vflag |= VV_TEXT;
382
383	error = exec_map_first_page(imgp);
384	if (error)
385		goto exec_fail_dealloc;
386
387	/*
388	 *	If the current process has a special image activator it
389	 *	wants to try first, call it.   For example, emulating shell
390	 *	scripts differently.
391	 */
392	error = -1;
393	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
394		error = img_first(imgp);
395
396	/*
397	 *	Loop through the list of image activators, calling each one.
398	 *	An activator returns -1 if there is no match, 0 on success,
399	 *	and an error otherwise.
400	 */
401	for (i = 0; error == -1 && execsw[i]; ++i) {
402		if (execsw[i]->ex_imgact == NULL ||
403		    execsw[i]->ex_imgact == img_first) {
404			continue;
405		}
406		error = (*execsw[i]->ex_imgact)(imgp);
407	}
408
409	if (error) {
410		if (error == -1) {
411			if (textset == 0)
412				imgp->vp->v_vflag &= ~VV_TEXT;
413			error = ENOEXEC;
414		}
415		goto exec_fail_dealloc;
416	}
417
418	/*
419	 * Special interpreter operation: clean up and loop back up to try to
420	 * activate the interpreter.
421	 */
422	if (imgp->interpreted) {
423		exec_unmap_first_page(imgp);
424		/*
425		 * VV_TEXT needs to be unset for scripts.  There is a short
426		 * period before we determine that something is a script where
427		 * VV_TEXT will be set. The vnode lock is held over this
428		 * entire period so nothing should illegitimately be blocked.
429		 */
430		imgp->vp->v_vflag &= ~VV_TEXT;
431		/* free name buffer and old vnode */
432		NDFREE(ndp, NDF_ONLY_PNBUF);
433#ifdef MAC
434		interplabel = mac_vnode_label_alloc();
435		mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel);
436#endif
437		vput(ndp->ni_vp);
438		vm_object_deallocate(imgp->object);
439		imgp->object = NULL;
440		/* set new name to that of the interpreter */
441		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
442		    UIO_SYSSPACE, imgp->interpreter_name, td);
443		goto interpret;
444	}
445
446	/*
447	 * Copy out strings (args and env) and initialize stack base
448	 */
449	if (p->p_sysent->sv_copyout_strings)
450		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
451	else
452		stack_base = exec_copyout_strings(imgp);
453
454	/*
455	 * If a custom stack fixup routine is present for this process,
456	 * let it do the stack setup.
457	 * Otherwise, stuff the argument count as the first item on the stack.
458	 */
459	if (p->p_sysent->sv_fixup != NULL)
460		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
461	else
462		suword(--stack_base, imgp->argc);
463
464	/*
465	 * For security and other reasons, the file descriptor table cannot
466	 * be shared after an exec.
467	 */
468	fdunshare(p, td);
469
470	/*
471	 * Malloc things before we need locks.
472	 */
473	newcred = crget();
474	euip = uifind(attr.va_uid);
475	i = imgp->endargs - imgp->stringbase;
476	if (ps_arg_cache_limit >= i + sizeof(struct pargs))
477		newargs = pargs_alloc(i);
478
479	/* close files on exec */
480	fdcloseexec(td);
481
482	/* Get a reference to the vnode prior to locking the proc */
483	VREF(ndp->ni_vp);
484
485	/*
486	 * For security and other reasons, signal handlers cannot
487	 * be shared after an exec. The new process gets a copy of the old
488	 * handlers. In execsigs(), the new process will have its signals
489	 * reset.
490	 */
491	PROC_LOCK(p);
492	if (sigacts_shared(p->p_sigacts)) {
493		oldsigacts = p->p_sigacts;
494		PROC_UNLOCK(p);
495		newsigacts = sigacts_alloc();
496		sigacts_copy(newsigacts, oldsigacts);
497		PROC_LOCK(p);
498		p->p_sigacts = newsigacts;
499	} else
500		oldsigacts = NULL;
501
502	/* Stop profiling */
503	stopprofclock(p);
504
505	/* reset caught signals */
506	execsigs(p);
507
508	/* name this process - nameiexec(p, ndp) */
509	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
510	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
511	p->p_comm[len] = 0;
512
513	/*
514	 * Mark the process as execed, wake up the parent that vforked
515	 * (if any), and tell it that it now has its own resources back.
516	 */
517	p->p_flag |= P_EXEC;
518	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
519		p->p_flag &= ~P_PPWAIT;
520		wakeup(p->p_pptr);
521	}
522
523	/*
524	 * Implement image setuid/setgid.
525	 *
526	 * Don't honor setuid/setgid if the filesystem prohibits it or if
527	 * the process is being traced.
528	 *
529	 * XXXMAC: For the time being, use NOSUID to also prohibit
530	 * transitions on the file system.
531	 */
532	oldcred = p->p_ucred;
533	credential_changing = 0;
534	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
535	    attr.va_uid;
536	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
537	    attr.va_gid;
538#ifdef MAC
539	will_transition = mac_execve_will_transition(oldcred, imgp->vp,
540	    interplabel, imgp);
541	credential_changing |= will_transition;
542#endif
543
544	if (credential_changing &&
545	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
546	    (p->p_flag & P_TRACED) == 0) {
547		/*
548		 * Turn off syscall tracing for set-id programs, except for
549		 * root.  Record any set-id flags first to make sure that
550		 * we do not regain any tracing during a possible block.
551		 */
552		setsugid(p);
553#ifdef KTRACE
554		if (p->p_tracevp != NULL && suser_cred(oldcred, SUSER_ALLOWJAIL)) {
555			mtx_lock(&ktrace_mtx);
556			p->p_traceflag = 0;
557			tracevp = p->p_tracevp;
558			p->p_tracevp = NULL;
559			tracecred = p->p_tracecred;
560			p->p_tracecred = NULL;
561			mtx_unlock(&ktrace_mtx);
562		}
563#endif
564		/*
565		 * Close any file descriptors 0..2 that reference procfs,
566		 * then make sure file descriptors 0..2 are in use.
567		 *
568		 * setugidsafety() may call closef() and then pfind()
569		 * which may grab the process lock.
570		 * fdcheckstd() may call falloc() which may block to
571		 * allocate memory, so temporarily drop the process lock.
572		 */
573		PROC_UNLOCK(p);
574		setugidsafety(td);
575		error = fdcheckstd(td);
576		if (error != 0)
577			goto done1;
578		PROC_LOCK(p);
579		/*
580		 * Set the new credentials.
581		 */
582		crcopy(newcred, oldcred);
583		if (attr.va_mode & VSUID)
584			change_euid(newcred, euip);
585		if (attr.va_mode & VSGID)
586			change_egid(newcred, attr.va_gid);
587#ifdef MAC
588		if (will_transition) {
589			mac_execve_transition(oldcred, newcred, imgp->vp,
590			    interplabel, imgp);
591		}
592#endif
593		/*
594		 * Implement correct POSIX saved-id behavior.
595		 *
596		 * XXXMAC: Note that the current logic will save the
597		 * uid and gid if a MAC domain transition occurs, even
598		 * though maybe it shouldn't.
599		 */
600		change_svuid(newcred, newcred->cr_uid);
601		change_svgid(newcred, newcred->cr_gid);
602		p->p_ucred = newcred;
603		newcred = NULL;
604	} else {
605		if (oldcred->cr_uid == oldcred->cr_ruid &&
606		    oldcred->cr_gid == oldcred->cr_rgid)
607			p->p_flag &= ~P_SUGID;
608		/*
609		 * Implement correct POSIX saved-id behavior.
610		 *
611		 * XXX: It's not clear that the existing behavior is
612		 * POSIX-compliant.  A number of sources indicate that the
613		 * saved uid/gid should only be updated if the new ruid is
614		 * not equal to the old ruid, or the new euid is not equal
615		 * to the old euid and the new euid is not equal to the old
616		 * ruid.  The FreeBSD code always updates the saved uid/gid.
617		 * Also, this code uses the new (replaced) euid and egid as
618		 * the source, which may or may not be the right ones to use.
619		 */
620		if (oldcred->cr_svuid != oldcred->cr_uid ||
621		    oldcred->cr_svgid != oldcred->cr_gid) {
622			crcopy(newcred, oldcred);
623			change_svuid(newcred, newcred->cr_uid);
624			change_svgid(newcred, newcred->cr_gid);
625			p->p_ucred = newcred;
626			newcred = NULL;
627		}
628	}
629
630	/*
631	 * Store the vp for use in procfs.  This vnode was referenced prior
632	 * to locking the proc lock.
633	 */
634	textvp = p->p_textvp;
635	p->p_textvp = ndp->ni_vp;
636
637	/*
638	 * Notify others that we exec'd, and clear the P_INEXEC flag
639	 * as we're now a bona fide freshly-execed process.
640	 */
641	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
642	p->p_flag &= ~P_INEXEC;
643
644	/*
645	 * If tracing the process, trap to debugger so breakpoints
646	 * can be set before the program executes.
647	 * Use tdsignal to deliver the signal to the current thread;
648	 * using psignal could deliver it to the wrong thread, and that
649	 * thread may exit, since we are about to enter single-threaded
650	 * mode.
651	 */
652	if (p->p_flag & P_TRACED)
653		tdsignal(td, SIGTRAP, SIGTARGET_TD);
654
655	/* clear "fork but no exec" flag, as we _are_ execing */
656	p->p_acflag &= ~AFORK;
657
658	/* Free any previous argument cache */
659	oldargs = p->p_args;
660	p->p_args = NULL;
661
662	/* Cache arguments if they fit inside our allowance */
663	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
664		bcopy(imgp->stringbase, newargs->ar_args, i);
665		p->p_args = newargs;
666		newargs = NULL;
667	}
668	PROC_UNLOCK(p);
669
670	/* Set values passed into the program in registers. */
671	if (p->p_sysent->sv_setregs)
672		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
673		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
674	else
675		exec_setregs(td, imgp->entry_addr,
676		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
677
678done1:
679	/*
680	 * Free any resources malloc'd earlier that we didn't use.
681	 */
682	uifree(euip);
683	if (newcred == NULL)
684		crfree(oldcred);
685	else
686		crfree(newcred);
687	/*
688	 * Handle deferred decrement of ref counts.
689	 */
690	if (textvp != NULL)
691		vrele(textvp);
692	if (ndp->ni_vp && error != 0)
693		vrele(ndp->ni_vp);
694#ifdef KTRACE
695	if (tracevp != NULL)
696		vrele(tracevp);
697	if (tracecred != NULL)
698		crfree(tracecred);
699#endif
700	if (oldargs != NULL)
701		pargs_drop(oldargs);
702	if (newargs != NULL)
703		pargs_drop(newargs);
704	if (oldsigacts != NULL)
705		sigacts_free(oldsigacts);
706
707exec_fail_dealloc:
708
709	/*
710	 * free various allocated resources
711	 */
712	if (imgp->firstpage != NULL)
713		exec_unmap_first_page(imgp);
714
715	if (imgp->vp != NULL) {
716		NDFREE(ndp, NDF_ONLY_PNBUF);
717		vput(imgp->vp);
718	}
719
720	if (imgp->stringbase != NULL)
721		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
722		    ARG_MAX);
723
724	if (imgp->object != NULL)
725		vm_object_deallocate(imgp->object);
726
727	if (error == 0) {
728		/*
729		 * Stop the process here if its stop event mask has
730		 * the S_EXEC bit set.
731		 */
732		STOPEVENT(p, S_EXEC, 0);
733		goto done2;
734	}
735
736exec_fail:
737	/* we're done here, clear P_INEXEC */
738	PROC_LOCK(p);
739	p->p_flag &= ~P_INEXEC;
740	PROC_UNLOCK(p);
741
742	if (imgp->vmspace_destroyed) {
743		/* The old process image is gone; exit gracefully. */
744#ifdef MAC
745		mac_execve_exit(imgp);
746		if (interplabel != NULL)
747			mac_vnode_label_free(interplabel);
748#endif
749		mtx_unlock(&Giant);
750		exit1(td, W_EXITCODE(0, SIGABRT));
751		/* NOT REACHED */
752		error = 0;
753	}
754done2:
755#ifdef MAC
756	mac_execve_exit(imgp);
757	if (interplabel != NULL)
758		mac_vnode_label_free(interplabel);
759#endif
760	mtx_unlock(&Giant);
761	return (error);
762}
763
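/*
 * Map the first page of an executable image into the kernel so that
 * image activators can inspect the header.  Up to VM_INITIAL_PAGEIN
 * pages may be paged in with a single pager request; page 0 is held
 * and made accessible through an sf_buf, and is released later by
 * exec_unmap_first_page().
 */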
764int
765exec_map_first_page(imgp)
766	struct image_params *imgp;
767{
768	int rv, i;
769	int initial_pagein;
770	vm_page_t ma[VM_INITIAL_PAGEIN];
771	vm_object_t object;
772
773	GIANT_REQUIRED;
774
775	if (imgp->firstpage != NULL)
776		exec_unmap_first_page(imgp);
777
778	VOP_GETVOBJECT(imgp->vp, &object);
779	VM_OBJECT_LOCK(object);
780	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
781	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
782		initial_pagein = VM_INITIAL_PAGEIN;
783		if (initial_pagein > object->size)
784			initial_pagein = object->size;
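		/*
		 * Gather up to initial_pagein consecutive pages for a
		 * single clustered pager read; stop at the first page
		 * that is already valid, is busy, or cannot be allocated.
		 */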
785		for (i = 1; i < initial_pagein; i++) {
786			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
787				if (ma[i]->valid)
788					break;
789				vm_page_lock_queues();
790				if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) {
791					vm_page_unlock_queues();
792					break;
793				}
794				vm_page_busy(ma[i]);
795				vm_page_unlock_queues();
796			} else {
797				ma[i] = vm_page_alloc(object, i,
798				    VM_ALLOC_NORMAL);
799				if (ma[i] == NULL)
800					break;
801			}
802		}
803		initial_pagein = i;
804		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
805		ma[0] = vm_page_lookup(object, 0);
806		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
807		    (ma[0]->valid == 0)) {
808			if (ma[0]) {
809				vm_page_lock_queues();
810				pmap_remove_all(ma[0]);
811				vm_page_free(ma[0]);
812				vm_page_unlock_queues();
813			}
814			VM_OBJECT_UNLOCK(object);
815			return (EIO);
816		}
817	}
818	vm_page_lock_queues();
819	vm_page_hold(ma[0]);
820	vm_page_wakeup(ma[0]);
821	vm_page_unlock_queues();
822	VM_OBJECT_UNLOCK(object);
823
824	imgp->firstpage = sf_buf_alloc(ma[0], 0);
825	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
826
827	return (0);
828}
829
830void
831exec_unmap_first_page(imgp)
832	struct image_params *imgp;
833{
834	vm_page_t m;
835
836	if (imgp->firstpage != NULL) {
837		m = sf_buf_page(imgp->firstpage);
838		sf_buf_free(imgp->firstpage);
839		imgp->firstpage = NULL;
840		vm_page_lock_queues();
841		vm_page_unhold(m);
842		vm_page_unlock_queues();
843	}
844}
845
846/*
847 * Destroy the old address space, and allocate a new stack.
848 *	The new stack is only SGROWSIZ large because it is grown
849 *	automatically in trap.c.
850 */
851int
852exec_new_vmspace(imgp, sv)
853	struct image_params *imgp;
854	struct sysentvec *sv;
855{
856	int error;
857	struct proc *p = imgp->proc;
858	struct vmspace *vmspace = p->p_vmspace;
859	vm_offset_t stack_addr;
860	vm_map_t map;
861
862	GIANT_REQUIRED;
863
864	imgp->vmspace_destroyed = 1;
865
866	/* Called with Giant held, do not depend on it! */
867	EVENTHANDLER_INVOKE(process_exec, p);
868
869	/*
870	 * Here is as good a place as any to do any resource limit cleanups.
871	 * This is needed if a 64 bit binary exec's a 32 bit binary - the
872	 * data size limit may need to be changed to a value that makes
873	 * sense for the 32 bit binary.
874	 */
875	if (sv->sv_fixlimits != NULL)
876		sv->sv_fixlimits(imgp);
877
878	/*
879	 * Blow away the entire process VM if the address space is not
880	 * shared; otherwise, create a new VM space so that other
881	 * threads are not disrupted.
882	 */
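	/*
	 * The range comparison below ensures the map is only reused when
	 * it already spans exactly the user address range the new
	 * sysentvec expects; otherwise, or when the address space is
	 * shared, a fresh vmspace is created instead.
	 */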
883	map = &vmspace->vm_map;
884	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
885	    vm_map_max(map) == sv->sv_maxuser) {
886		shmexit(vmspace);
887		pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
888		    vm_map_max(map));
889		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
890	} else {
891		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
892		vmspace = p->p_vmspace;
893		map = &vmspace->vm_map;
894	}
895
896	/* Allocate a new stack */
897	stack_addr = sv->sv_usrstack - maxssiz;
898	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
899	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
900	if (error)
901		return (error);
902
903#ifdef __ia64__
904	/* Allocate a new register stack */
905	stack_addr = IA64_BACKINGSTORE;
906	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
907	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
908	if (error)
909		return (error);
910#endif
911
912	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
913	 * VM_STACK case, but they are still used to monitor the size of the
914	 * process stack so we can check the stack rlimit.
915	 */
916	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
917	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
918
919	return (0);
920}
921
922/*
923 * Copy out argument and environment strings from the old process
924 *	address space into the temporary string buffer.
925 */
926int
927exec_extract_strings(imgp)
928	struct image_params *imgp;
929{
930	char	**argv, **envv;
931	char	*argp, *envp;
932	int	error;
933	size_t	length;
934
935	/*
936	 * extract arguments first
937	 */
938
939	argv = imgp->userspace_argv;
940
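	/*
	 * Walk the user-space argv array with fuword(): a return of -1
	 * means the pointer could not be read, and a NULL entry
	 * terminates the vector.  If imgp->argv0 was set earlier, it
	 * overrides the first argument string.
	 */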
941	if (argv) {
942		argp = (caddr_t)(intptr_t)fuword(argv);
943		if (argp == (caddr_t)-1)
944			return (EFAULT);
945		if (argp)
946			argv++;
947		if (imgp->argv0)
948			argp = imgp->argv0;
949		if (argp) {
950			do {
951				if (argp == (caddr_t)-1)
952					return (EFAULT);
953				if ((error = copyinstr(argp, imgp->stringp,
954				    imgp->stringspace, &length))) {
955					if (error == ENAMETOOLONG)
956						return (E2BIG);
957					return (error);
958				}
959				imgp->stringspace -= length;
960				imgp->stringp += length;
961				imgp->argc++;
962			} while ((argp = (caddr_t)(intptr_t)fuword(argv++)));
963		}
964	} else
965		return (EFAULT);
966
967	imgp->endargs = imgp->stringp;
968
969	/*
970	 * extract environment strings
971	 */
972
973	envv = imgp->userspace_envv;
974
975	if (envv) {
976		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
977			if (envp == (caddr_t)-1)
978				return (EFAULT);
979			if ((error = copyinstr(envp, imgp->stringp,
980			    imgp->stringspace, &length))) {
981				if (error == ENAMETOOLONG)
982					return (E2BIG);
983				return (error);
984			}
985			imgp->stringspace -= length;
986			imgp->stringp += length;
987			imgp->envc++;
988		}
989	}
990
991	return (0);
992}
993
994/*
995 * Copy strings out to the new process address space, constructing
996 *	new arg and env vector tables. Return a pointer to the base
997 *	so that it can be used as the initial stack pointer.
998 */
999register_t *
1000exec_copyout_strings(imgp)
1001	struct image_params *imgp;
1002{
1003	int argc, envc;
1004	char **vectp;
1005	char *stringp, *destp;
1006	register_t *stack_base;
1007	struct ps_strings *arginfo;
1008	struct proc *p;
1009	int szsigcode;
1010
1011	/*
1012	 * Calculate string base and vector table pointers.
1013	 * Also deal with signal trampoline code for this exec type.
1014	 */
1015	p = imgp->proc;
1016	szsigcode = 0;
1017	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
1018	if (p->p_sysent->sv_szsigcode != NULL)
1019		szsigcode = *(p->p_sysent->sv_szsigcode);
1020	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
1021	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
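	/*
	 * Rough picture of the resulting layout, from high to low user
	 * addresses (a sketch, not normative):
	 *
	 *	ps_strings struct (arginfo)
	 *	signal trampoline code (szsigcode bytes)
	 *	SPARE_USRSPACE
	 *	argument + environment strings (destp is the base)
	 *	room for ELF auxargs (if any), env vector, arg vector
	 *
	 * The returned stack_base points at the start of the arg vector.
	 */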
1022
1023	/*
1024	 * install sigcode
1025	 */
1026	if (szsigcode)
1027		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
1028		    szsigcode), szsigcode);
1029
1030	/*
1031	 * If we have a valid auxargs ptr, prepare some room
1032	 * on the stack.
1033	 */
1034	if (imgp->auxargs) {
1035		/*
1036		 * 'AT_COUNT * 2' is the size of the ELF auxargs data; it is
1037		 * kept at this size for backward compatibility.
1038		 */
1039		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
1040		    (AT_COUNT * 2);
1041		/*
1042		 * The '+ 2' is for the null pointers at the end of each of
1043		 * the arg and env vector sets, and imgp->auxarg_size is room
1044		 * for the arguments of the runtime loader.
1045		 */
1046		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
1047		    imgp->auxarg_size) * sizeof(char *));
1048
1049	} else
1050		/*
1051		 * The '+ 2' is for the null pointers at the end of each of
1052		 * the arg and env vector sets
1053		 */
1054		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) *
1055		    sizeof(char *));
1056
1057	/*
1058	 * vectp also becomes our initial stack base
1059	 */
1060	stack_base = (register_t *)vectp;
1061
1062	stringp = imgp->stringbase;
1063	argc = imgp->argc;
1064	envc = imgp->envc;
1065
1066	/*
1067	 * Copy out strings - arguments and environment.
1068	 */
1069	copyout(stringp, destp, ARG_MAX - imgp->stringspace);
1070
1071	/*
1072	 * Fill in "ps_strings" struct for ps, w, etc.
1073	 */
1074	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
1075	suword(&arginfo->ps_nargvstr, argc);
1076
1077	/*
1078	 * Fill in argument portion of vector table.
1079	 */
1080	for (; argc > 0; --argc) {
1081		suword(vectp++, (long)(intptr_t)destp);
1082		while (*stringp++ != 0)
1083			destp++;
1084		destp++;
1085	}
1086
1087	/* a null vector table pointer separates the argp's from the envp's */
1088	suword(vectp++, 0);
1089
1090	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
1091	suword(&arginfo->ps_nenvstr, envc);
1092
1093	/*
1094	 * Fill in environment portion of vector table.
1095	 */
1096	for (; envc > 0; --envc) {
1097		suword(vectp++, (long)(intptr_t)destp);
1098		while (*stringp++ != 0)
1099			destp++;
1100		destp++;
1101	}
1102
1103	/* end of vector table is a null pointer */
1104	suword(vectp, 0);
1105
1106	return (stack_base);
1107}
1108
1109/*
1110 * Check permissions of file to execute.
1111 *	Called with imgp->vp locked.
1112 *	Return 0 for success or error code on failure.
1113 */
1114int
1115exec_check_permissions(imgp)
1116	struct image_params *imgp;
1117{
1118	struct vnode *vp = imgp->vp;
1119	struct vattr *attr = imgp->attr;
1120	struct thread *td;
1121	int error;
1122
1123	td = curthread;			/* XXXKSE */
1124
1125	/* Get file attributes */
1126	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
1127	if (error)
1128		return (error);
1129
1130#ifdef MAC
1131	error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp);
1132	if (error)
1133		return (error);
1134#endif
1135
1136	/*
1137	 * 1) Check if file execution is disabled for the filesystem that this
1138	 *	file resides on.
1139	 * 2) Ensure that at least one execute bit is on - otherwise root
1140	 *	would always succeed, and we don't want that to happen unless
1141	 *	the file really is executable.
1142	 * 3) Ensure that the file is a regular file.
1143	 */
1144	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
1145	    ((attr->va_mode & 0111) == 0) ||
1146	    (attr->va_type != VREG))
1147		return (EACCES);
1148
1149	/*
1150	 * Zero length files can't be exec'd
1151	 */
1152	if (attr->va_size == 0)
1153		return (ENOEXEC);
1154
1155	/*
1156	 *  Check for execute permission to file based on current credentials.
1157	 */
1158	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
1159	if (error)
1160		return (error);
1161
1162	/*
1163	 * Check number of open-for-writes on the file and deny execution
1164	 * if there are any.
1165	 */
1166	if (vp->v_writecount)
1167		return (ETXTBSY);
1168
1169	/*
1170	 * Call filesystem specific open routine (which does nothing in the
1171	 * general case).
1172	 */
1173	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1);
1174	return (error);
1175}
1176
1177/*
1178 * Exec handler registration
1179 */
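/*
 * exec_register() and exec_unregister() rebuild the NULL-terminated
 * execsw vector each time a handler is added or removed; image
 * activators (such as the ELF and #! script handlers) are expected to
 * register themselves through this interface.
 */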
1180int
1181exec_register(execsw_arg)
1182	const struct execsw *execsw_arg;
1183{
1184	const struct execsw **es, **xs, **newexecsw;
1185	int count = 2;	/* New slot and trailing NULL */
1186
1187	if (execsw)
1188		for (es = execsw; *es; es++)
1189			count++;
1190	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1191	if (newexecsw == NULL)
1192		return (ENOMEM);
1193	xs = newexecsw;
1194	if (execsw)
1195		for (es = execsw; *es; es++)
1196			*xs++ = *es;
1197	*xs++ = execsw_arg;
1198	*xs = NULL;
1199	if (execsw)
1200		free(execsw, M_TEMP);
1201	execsw = newexecsw;
1202	return (0);
1203}
1204
1205int
1206exec_unregister(execsw_arg)
1207	const struct execsw *execsw_arg;
1208{
1209	const struct execsw **es, **xs, **newexecsw;
1210	int count = 1;
1211
1212	if (execsw == NULL)
1213		panic("unregister with no handlers left?\n");
1214
1215	for (es = execsw; *es; es++) {
1216		if (*es == execsw_arg)
1217			break;
1218	}
1219	if (*es == NULL)
1220		return (ENOENT);
1221	for (es = execsw; *es; es++)
1222		if (*es != execsw_arg)
1223			count++;
1224	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1225	if (newexecsw == NULL)
1226		return (ENOMEM);
1227	xs = newexecsw;
1228	for (es = execsw; *es; es++)
1229		if (*es != execsw_arg)
1230			*xs++ = *es;
1231	*xs = NULL;
1232	if (execsw)
1233		free(execsw, M_TEMP);
1234	execsw = newexecsw;
1235	return (0);
1236}
1237