kern_sharedpage.c revision 106720
1/*
2 * Copyright (c) 1993, David Greenman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/kern/kern_exec.c 106720 2002-11-10 07:12:04Z alc $
27 */
28
29#include "opt_ktrace.h"
30#include "opt_mac.h"
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/lock.h>
35#include <sys/mutex.h>
36#include <sys/sysproto.h>
37#include <sys/signalvar.h>
38#include <sys/kernel.h>
39#include <sys/mac.h>
40#include <sys/mount.h>
41#include <sys/filedesc.h>
42#include <sys/fcntl.h>
43#include <sys/acct.h>
44#include <sys/exec.h>
45#include <sys/imgact.h>
46#include <sys/imgact_elf.h>
47#include <sys/wait.h>
48#include <sys/malloc.h>
49#include <sys/proc.h>
50#include <sys/pioctl.h>
51#include <sys/namei.h>
52#include <sys/sysent.h>
53#include <sys/shm.h>
54#include <sys/sysctl.h>
55#include <sys/user.h>
56#include <sys/vnode.h>
57#ifdef KTRACE
58#include <sys/ktrace.h>
59#endif
60
61#include <vm/vm.h>
62#include <vm/vm_param.h>
63#include <vm/pmap.h>
64#include <vm/vm_page.h>
65#include <vm/vm_map.h>
66#include <vm/vm_kern.h>
67#include <vm/vm_extern.h>
68#include <vm/vm_object.h>
69#include <vm/vm_pager.h>
70
71#include <machine/reg.h>
72
/* Malloc type for process argument storage; not static, so visible outside this file. */
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");

/* Malloc type for the at_exec() callout list entries (struct execlist). */
static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");

/* Sysctl handlers and the common execve() implementation defined below. */
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int kern_execve(struct thread *td, char *fname, char **argv,
	char **envv, struct mac *mac_p);
81
/*
 * callout list for things to do at exec time
 *
 * Entries are added by at_exec(), removed by rm_at_exec(), and every
 * registered function is invoked from exec_new_vmspace().
 */
struct execlist {
	execlist_fn function;		/* callback, called with the exec'ing proc */
	TAILQ_ENTRY(execlist) next;	/* linkage on exec_list */
};

TAILQ_HEAD(exec_list_head, execlist);
/* Head of the list of registered exec-time callbacks. */
static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
92
/*
 * kern.ps_strings: read-only, served by sysctl_kern_ps_strings().
 */
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
    NULL, 0, sysctl_kern_ps_strings, "LU", "");

/*
 * kern.usrstack: read-only, served by sysctl_kern_usrstack().
 */
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
    NULL, 0, sysctl_kern_usrstack, "LU", "");

/*
 * kern.ps_arg_cache_limit (read/write): kern_execve() caches the argument
 * strings in p_args only when their size plus sizeof(struct pargs) does
 * not exceed this value.
 */
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
    &ps_arg_cache_limit, 0, "");

/*
 * kern.ps_argsopen (read/write).  Not consulted in this part of the file;
 * NOTE(review): presumably controls who may read other processes'
 * arguments -- confirm against the consumers of this variable.
 */
int ps_argsopen = 1;
SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
107
#ifdef __ia64__
/* XXX HACK */
/*
 * machdep.regstkpages (read/write): number of pages that
 * exec_new_vmspace() maps for the ia64 backing store.
 */
static int regstkpages = 256;
SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
#endif
113
114static int
115sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
116{
117	struct proc *p;
118
119	p = curproc;
120	return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
121	   sizeof(p->p_sysent->sv_psstrings)));
122}
123
124static int
125sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
126{
127	struct proc *p;
128
129	p = curproc;
130	return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
131	    sizeof(p->p_sysent->sv_usrstack)));
132}
133
/*
 * Each of the items is a pointer to a `const struct execsw', hence the
 * double pointer here.
 *
 * The array is NULL-terminated.  It is rebuilt (allocated anew and the old
 * copy freed) by exec_register() and exec_unregister(), and walked by
 * kern_execve() to try each image activator in turn.
 */
static const struct execsw **execsw;
139
140/*
141 * In-kernel implementation of execve().  All arguments are assumed to be
142 * userspace pointers from the passed thread.
143 *
144 * MPSAFE
145 */
146static int
147kern_execve(td, fname, argv, envv, mac_p)
148	struct thread *td;
149	char *fname;
150	char **argv;
151	char **envv;
152	struct mac *mac_p;
153{
154	struct proc *p = td->td_proc;
155	struct nameidata nd, *ndp;
156	struct ucred *newcred = NULL, *oldcred;
157	struct uidinfo *euip;
158	register_t *stack_base;
159	int error, len, i;
160	struct image_params image_params, *imgp;
161	struct vattr attr;
162	int (*img_first)(struct image_params *);
163	struct pargs *oldargs = NULL, *newargs = NULL;
164	struct procsig *oldprocsig, *newprocsig;
165#ifdef KTRACE
166	struct vnode *tracevp = NULL;
167#endif
168	struct vnode *textvp = NULL;
169	int credential_changing;
170	int textset;
171#ifdef MAC
172	struct label interplabel;	/* label of the interpreted vnode */
173	struct label execlabel;		/* optional label argument */
174	int will_transition, interplabelvalid = 0;
175#endif
176
177	imgp = &image_params;
178
179	/*
180	 * Lock the process and set the P_INEXEC flag to indicate that
181	 * it should be left alone until we're done here.  This is
182	 * necessary to avoid race conditions - e.g. in ptrace() -
183	 * that might allow a local user to illicitly obtain elevated
184	 * privileges.
185	 */
186	PROC_LOCK(p);
187	KASSERT((p->p_flag & P_INEXEC) == 0,
188	    ("%s(): process already has P_INEXEC flag", __func__));
189	if (p->p_flag & P_KSES) {
190		if (thread_single(SINGLE_EXIT)) {
191			PROC_UNLOCK(p);
192			return (ERESTART);	/* Try again later. */
193		}
194		/*
195		 * If we get here all other threads are dead,
196		 * so unset the associated flags and lose KSE mode.
197		 */
198		p->p_flag &= ~P_KSES;
199		td->td_flags &= ~TDF_UNBOUND;
200		thread_single_end();
201	}
202	p->p_flag |= P_INEXEC;
203	PROC_UNLOCK(p);
204
205	/*
206	 * Initialize part of the common data
207	 */
208	imgp->proc = p;
209	imgp->userspace_argv = argv;
210	imgp->userspace_envv = envv;
211	imgp->execlabel = NULL;
212	imgp->attr = &attr;
213	imgp->argc = imgp->envc = 0;
214	imgp->argv0 = NULL;
215	imgp->entry_addr = 0;
216	imgp->vmspace_destroyed = 0;
217	imgp->interpreted = 0;
218	imgp->interpreter_name[0] = '\0';
219	imgp->auxargs = NULL;
220	imgp->vp = NULL;
221	imgp->object = NULL;
222	imgp->firstpage = NULL;
223	imgp->ps_strings = 0;
224	imgp->auxarg_size = 0;
225
226#ifdef MAC
227	error = mac_execve_enter(imgp, mac_p, &execlabel);
228	if (error) {
229		mtx_lock(&Giant);
230		goto exec_fail;
231	}
232#endif
233
234	/*
235	 * Allocate temporary demand zeroed space for argument and
236	 *	environment strings
237	 */
238	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX +
239	    PAGE_SIZE);
240	if (imgp->stringbase == NULL) {
241		error = ENOMEM;
242		mtx_lock(&Giant);
243		goto exec_fail;
244	}
245	imgp->stringp = imgp->stringbase;
246	imgp->stringspace = ARG_MAX;
247	imgp->image_header = imgp->stringbase + ARG_MAX;
248
249	/*
250	 * Translate the file name. namei() returns a vnode pointer
251	 *	in ni_vp amoung other things.
252	 */
253	ndp = &nd;
254	NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
255	    UIO_USERSPACE, fname, td);
256
257	mtx_lock(&Giant);
258interpret:
259
260	error = namei(ndp);
261	if (error) {
262		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
263		    ARG_MAX + PAGE_SIZE);
264		goto exec_fail;
265	}
266
267	imgp->vp = ndp->ni_vp;
268	imgp->fname = fname;
269
270	/*
271	 * Check file permissions (also 'opens' file)
272	 */
273	error = exec_check_permissions(imgp);
274	if (error)
275		goto exec_fail_dealloc;
276
277	if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0)
278		vm_object_reference(imgp->object);
279
280	/*
281	 * Set VV_TEXT now so no one can write to the executable while we're
282	 * activating it.
283	 *
284	 * Remember if this was set before and unset it in case this is not
285	 * actually an executable image.
286	 */
287	textset = imgp->vp->v_vflag & VV_TEXT;
288	imgp->vp->v_vflag |= VV_TEXT;
289
290	error = exec_map_first_page(imgp);
291	if (error)
292		goto exec_fail_dealloc;
293
294	/*
295	 *	If the current process has a special image activator it
296	 *	wants to try first, call it.   For example, emulating shell
297	 *	scripts differently.
298	 */
299	error = -1;
300	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
301		error = img_first(imgp);
302
303	/*
304	 *	Loop through the list of image activators, calling each one.
305	 *	An activator returns -1 if there is no match, 0 on success,
306	 *	and an error otherwise.
307	 */
308	for (i = 0; error == -1 && execsw[i]; ++i) {
309		if (execsw[i]->ex_imgact == NULL ||
310		    execsw[i]->ex_imgact == img_first) {
311			continue;
312		}
313		error = (*execsw[i]->ex_imgact)(imgp);
314	}
315
316	if (error) {
317		if (error == -1) {
318			if (textset == 0)
319				imgp->vp->v_vflag &= ~VV_TEXT;
320			error = ENOEXEC;
321		}
322		goto exec_fail_dealloc;
323	}
324
325	/*
326	 * Special interpreter operation, cleanup and loop up to try to
327	 * activate the interpreter.
328	 */
329	if (imgp->interpreted) {
330		exec_unmap_first_page(imgp);
331		/*
332		 * VV_TEXT needs to be unset for scripts.  There is a short
333		 * period before we determine that something is a script where
334		 * VV_TEXT will be set. The vnode lock is held over this
335		 * entire period so nothing should illegitimately be blocked.
336		 */
337		imgp->vp->v_vflag &= ~VV_TEXT;
338		/* free name buffer and old vnode */
339		NDFREE(ndp, NDF_ONLY_PNBUF);
340#ifdef MAC
341		mac_init_vnode_label(&interplabel);
342		mac_copy_vnode_label(&ndp->ni_vp->v_label, &interplabel);
343		interplabelvalid = 1;
344#endif
345		vput(ndp->ni_vp);
346		vm_object_deallocate(imgp->object);
347		imgp->object = NULL;
348		/* set new name to that of the interpreter */
349		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
350		    UIO_SYSSPACE, imgp->interpreter_name, td);
351		goto interpret;
352	}
353
354	/*
355	 * Copy out strings (args and env) and initialize stack base
356	 */
357	if (p->p_sysent->sv_copyout_strings)
358		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
359	else
360		stack_base = exec_copyout_strings(imgp);
361
362	/*
363	 * If custom stack fixup routine present for this process
364	 * let it do the stack setup.
365	 * Else stuff argument count as first item on stack
366	 */
367	if (p->p_sysent->sv_fixup)
368		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
369	else
370		suword(--stack_base, imgp->argc);
371
372	/*
373	 * For security and other reasons, the file descriptor table cannot
374	 * be shared after an exec.
375	 */
376	FILEDESC_LOCK(p->p_fd);
377	if (p->p_fd->fd_refcnt > 1) {
378		struct filedesc *tmp;
379
380		tmp = fdcopy(td);
381		FILEDESC_UNLOCK(p->p_fd);
382		fdfree(td);
383		p->p_fd = tmp;
384	} else
385		FILEDESC_UNLOCK(p->p_fd);
386
387	/*
388	 * Malloc things before we need locks.
389	 */
390	newcred = crget();
391	euip = uifind(attr.va_uid);
392	i = imgp->endargs - imgp->stringbase;
393	if (ps_arg_cache_limit >= i + sizeof(struct pargs))
394		newargs = pargs_alloc(i);
395
396	/* close files on exec */
397	fdcloseexec(td);
398
399	/* Get a reference to the vnode prior to locking the proc */
400	VREF(ndp->ni_vp);
401
402	/*
403	 * For security and other reasons, signal handlers cannot
404	 * be shared after an exec. The new process gets a copy of the old
405	 * handlers. In execsigs(), the new process will have its signals
406	 * reset.
407	 */
408	PROC_LOCK(p);
409	mp_fixme("procsig needs a lock");
410	if (p->p_procsig->ps_refcnt > 1) {
411		oldprocsig = p->p_procsig;
412		PROC_UNLOCK(p);
413		MALLOC(newprocsig, struct procsig *, sizeof(struct procsig),
414		    M_SUBPROC, M_WAITOK);
415		bcopy(oldprocsig, newprocsig, sizeof(*newprocsig));
416		newprocsig->ps_refcnt = 1;
417		oldprocsig->ps_refcnt--;
418		PROC_LOCK(p);
419		p->p_procsig = newprocsig;
420		if (p->p_sigacts == &p->p_uarea->u_sigacts)
421			panic("shared procsig but private sigacts?");
422
423		p->p_uarea->u_sigacts = *p->p_sigacts;
424		p->p_sigacts = &p->p_uarea->u_sigacts;
425	}
426	/* Stop profiling */
427	stopprofclock(p);
428
429	/* reset caught signals */
430	execsigs(p);
431
432	/* name this process - nameiexec(p, ndp) */
433	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
434	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
435	p->p_comm[len] = 0;
436
437	/*
438	 * mark as execed, wakeup the process that vforked (if any) and tell
439	 * it that it now has its own resources back
440	 */
441	p->p_flag |= P_EXEC;
442	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
443		p->p_flag &= ~P_PPWAIT;
444		wakeup(p->p_pptr);
445	}
446
447	/*
448	 * Implement image setuid/setgid.
449	 *
450	 * Don't honor setuid/setgid if the filesystem prohibits it or if
451	 * the process is being traced.
452	 *
453	 * XXXMAC: For the time being, use NOSUID to also prohibit
454	 * transitions on the file system.
455	 */
456	oldcred = p->p_ucred;
457	credential_changing = 0;
458	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
459	    attr.va_uid;
460	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
461	    attr.va_gid;
462#ifdef MAC
463	will_transition = mac_execve_will_transition(oldcred, imgp->vp,
464	    interplabelvalid ? &interplabel : NULL, imgp);
465	credential_changing |= will_transition;
466#endif
467
468	if (credential_changing &&
469	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
470	    (p->p_flag & P_TRACED) == 0) {
471		/*
472		 * Turn off syscall tracing for set-id programs, except for
473		 * root.  Record any set-id flags first to make sure that
474		 * we do not regain any tracing during a possible block.
475		 */
476		setsugid(p);
477#ifdef KTRACE
478		if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) {
479			mtx_lock(&ktrace_mtx);
480			p->p_traceflag = 0;
481			tracevp = p->p_tracep;
482			p->p_tracep = NULL;
483			mtx_unlock(&ktrace_mtx);
484		}
485#endif
486		/*
487		 * Close any file descriptors 0..2 that reference procfs,
488		 * then make sure file descriptors 0..2 are in use.
489		 *
490		 * setugidsafety() may call closef() and then pfind()
491		 * which may grab the process lock.
492		 * fdcheckstd() may call falloc() which may block to
493		 * allocate memory, so temporarily drop the process lock.
494		 */
495		PROC_UNLOCK(p);
496		setugidsafety(td);
497		error = fdcheckstd(td);
498		if (error != 0)
499			goto done1;
500		PROC_LOCK(p);
501		/*
502		 * Set the new credentials.
503		 */
504		crcopy(newcred, oldcred);
505		if (attr.va_mode & VSUID)
506			change_euid(newcred, euip);
507		if (attr.va_mode & VSGID)
508			change_egid(newcred, attr.va_gid);
509#ifdef MAC
510		if (will_transition) {
511			mac_execve_transition(oldcred, newcred, imgp->vp,
512			    interplabelvalid ? &interplabel : NULL, imgp);
513		}
514#endif
515		/*
516		 * Implement correct POSIX saved-id behavior.
517		 *
518		 * XXXMAC: Note that the current logic will save the
519		 * uid and gid if a MAC domain transition occurs, even
520		 * though maybe it shouldn't.
521		 */
522		change_svuid(newcred, newcred->cr_uid);
523		change_svgid(newcred, newcred->cr_gid);
524		p->p_ucred = newcred;
525		newcred = NULL;
526	} else {
527		if (oldcred->cr_uid == oldcred->cr_ruid &&
528		    oldcred->cr_gid == oldcred->cr_rgid)
529			p->p_flag &= ~P_SUGID;
530		/*
531		 * Implement correct POSIX saved-id behavior.
532		 *
533		 * XXX: It's not clear that the existing behavior is
534		 * POSIX-compliant.  A number of sources indicate that the
535		 * saved uid/gid should only be updated if the new ruid is
536		 * not equal to the old ruid, or the new euid is not equal
537		 * to the old euid and the new euid is not equal to the old
538		 * ruid.  The FreeBSD code always updates the saved uid/gid.
539		 * Also, this code uses the new (replaced) euid and egid as
540		 * the source, which may or may not be the right ones to use.
541		 */
542		if (oldcred->cr_svuid != oldcred->cr_uid ||
543		    oldcred->cr_svgid != oldcred->cr_gid) {
544			crcopy(newcred, oldcred);
545			change_svuid(newcred, newcred->cr_uid);
546			change_svgid(newcred, newcred->cr_gid);
547			p->p_ucred = newcred;
548			newcred = NULL;
549		}
550	}
551
552	/*
553	 * Store the vp for use in procfs.  This vnode was referenced prior
554	 * to locking the proc lock.
555	 */
556	textvp = p->p_textvp;
557	p->p_textvp = ndp->ni_vp;
558
559	/*
560	 * Notify others that we exec'd, and clear the P_INEXEC flag
561	 * as we're now a bona fide freshly-execed process.
562	 */
563	KNOTE(&p->p_klist, NOTE_EXEC);
564	p->p_flag &= ~P_INEXEC;
565
566	/*
567	 * If tracing the process, trap to debugger so breakpoints
568	 * can be set before the program executes.
569	 */
570	_STOPEVENT(p, S_EXEC, 0);
571
572	if (p->p_flag & P_TRACED)
573		psignal(p, SIGTRAP);
574
575	/* clear "fork but no exec" flag, as we _are_ execing */
576	p->p_acflag &= ~AFORK;
577
578	/* Free any previous argument cache */
579	oldargs = p->p_args;
580	p->p_args = NULL;
581
582	/* Cache arguments if they fit inside our allowance */
583	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
584		bcopy(imgp->stringbase, newargs->ar_args, i);
585		p->p_args = newargs;
586		newargs = NULL;
587	}
588	PROC_UNLOCK(p);
589
590	/* Set values passed into the program in registers. */
591	if (p->p_sysent->sv_setregs)
592		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
593		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
594	else
595		exec_setregs(td, imgp->entry_addr,
596		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
597
598done1:
599	/*
600	 * Free any resources malloc'd earlier that we didn't use.
601	 */
602	uifree(euip);
603	if (newcred == NULL)
604		crfree(oldcred);
605	else
606		crfree(newcred);
607	/*
608	 * Handle deferred decrement of ref counts.
609	 */
610	if (textvp != NULL)
611		vrele(textvp);
612	if (ndp->ni_vp && error != 0)
613		vrele(ndp->ni_vp);
614#ifdef KTRACE
615	if (tracevp != NULL)
616		vrele(tracevp);
617#endif
618	if (oldargs != NULL)
619		pargs_drop(oldargs);
620	if (newargs != NULL)
621		pargs_drop(newargs);
622
623exec_fail_dealloc:
624
625	/*
626	 * free various allocated resources
627	 */
628	if (imgp->firstpage)
629		exec_unmap_first_page(imgp);
630
631	if (imgp->stringbase != NULL)
632		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
633		    ARG_MAX + PAGE_SIZE);
634
635	if (imgp->vp) {
636		NDFREE(ndp, NDF_ONLY_PNBUF);
637		vput(imgp->vp);
638	}
639
640	if (imgp->object)
641		vm_object_deallocate(imgp->object);
642
643	if (error == 0)
644		goto done2;
645
646exec_fail:
647	/* we're done here, clear P_INEXEC */
648	PROC_LOCK(p);
649	p->p_flag &= ~P_INEXEC;
650	PROC_UNLOCK(p);
651
652	if (imgp->vmspace_destroyed) {
653		/* sorry, no more process anymore. exit gracefully */
654#ifdef MAC
655		mac_execve_exit(imgp);
656		if (interplabelvalid)
657			mac_destroy_vnode_label(&interplabel);
658#endif
659		exit1(td, W_EXITCODE(0, SIGABRT));
660		/* NOT REACHED */
661		error = 0;
662	}
663done2:
664#ifdef MAC
665	mac_execve_exit(imgp);
666	if (interplabelvalid)
667		mac_destroy_vnode_label(&interplabel);
668#endif
669	mtx_unlock(&Giant);
670	return (error);
671}
672
/*
 * Argument structure for execve(); declared here only when
 * _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
        char    *fname;		/* path of the image to execute */
        char    **argv;		/* argument vector */
        char    **envv;		/* environment vector */
};
#endif
680
681/*
682 * MPSAFE
683 */
684int
685execve(td, uap)
686	struct thread *td;
687	struct execve_args /* {
688		syscallarg(char *) fname;
689		syscallarg(char **) argv;
690		syscallarg(char **) envv;
691	} */ *uap;
692{
693
694	return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL));
695}
696
/*
 * Argument structure for __mac_execve(); declared here only when
 * _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct __mac_execve_args {
	char	*fname;		/* path of the image to execute */
	char	**argv;		/* argument vector */
	char	**envv;		/* environment vector */
	struct mac	*mac_p;	/* MAC label argument passed to kern_execve() */
};
#endif
705
706/*
707 * MPSAFE
708 */
709int
710__mac_execve(td, uap)
711	struct thread *td;
712	struct __mac_execve_args /* {
713		syscallarg(char *) fname;
714		syscallarg(char **) argv;
715		syscallarg(char **) envv;
716		syscallarg(struct mac *) mac_p;
717	} */ *uap;
718{
719
720#ifdef MAC
721	return (kern_execve(td, uap->fname, uap->argv, uap->envv,
722	    uap->mac_p));
723#else
724	return (ENOSYS);
725#endif
726}
727
728int
729exec_map_first_page(imgp)
730	struct image_params *imgp;
731{
732	int rv, i;
733	int initial_pagein;
734	vm_page_t ma[VM_INITIAL_PAGEIN];
735	vm_object_t object;
736
737	GIANT_REQUIRED;
738
739	if (imgp->firstpage) {
740		exec_unmap_first_page(imgp);
741	}
742
743	VOP_GETVOBJECT(imgp->vp, &object);
744
745	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
746
747	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
748		initial_pagein = VM_INITIAL_PAGEIN;
749		if (initial_pagein > object->size)
750			initial_pagein = object->size;
751		for (i = 1; i < initial_pagein; i++) {
752			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
753				if ((ma[i]->flags & PG_BUSY) || ma[i]->busy)
754					break;
755				if (ma[i]->valid)
756					break;
757				vm_page_busy(ma[i]);
758			} else {
759				ma[i] = vm_page_alloc(object, i,
760				    VM_ALLOC_NORMAL);
761				if (ma[i] == NULL)
762					break;
763			}
764		}
765		initial_pagein = i;
766
767		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
768		ma[0] = vm_page_lookup(object, 0);
769
770		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
771		    (ma[0]->valid == 0)) {
772			if (ma[0]) {
773				vm_page_lock_queues();
774				pmap_page_protect(ma[0], VM_PROT_NONE);
775				vm_page_free(ma[0]);
776				vm_page_unlock_queues();
777			}
778			return (EIO);
779		}
780	}
781	vm_page_lock_queues();
782	vm_page_wire(ma[0]);
783	vm_page_wakeup(ma[0]);
784	vm_page_unlock_queues();
785
786	pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
787	imgp->firstpage = ma[0];
788
789	return (0);
790}
791
792void
793exec_unmap_first_page(imgp)
794	struct image_params *imgp;
795{
796	GIANT_REQUIRED;
797
798	if (imgp->firstpage) {
799		pmap_qremove((vm_offset_t)imgp->image_header, 1);
800		vm_page_lock_queues();
801		vm_page_unwire(imgp->firstpage, 1);
802		vm_page_unlock_queues();
803		imgp->firstpage = NULL;
804	}
805}
806
807/*
808 * Destroy old address space, and allocate a new stack
809 *	The new stack is only SGROWSIZ large because it is grown
810 *	automatically in trap.c.
811 */
812int
813exec_new_vmspace(imgp, sv)
814	struct image_params *imgp;
815	struct sysentvec *sv;
816{
817	int error;
818	struct execlist *ep;
819	struct proc *p = imgp->proc;
820	struct vmspace *vmspace = p->p_vmspace;
821	vm_offset_t stack_addr;
822	vm_map_t map;
823
824	GIANT_REQUIRED;
825
826	stack_addr = sv->sv_usrstack - maxssiz;
827
828	imgp->vmspace_destroyed = 1;
829
830	/*
831	 * Perform functions registered with at_exec().
832	 */
833	TAILQ_FOREACH(ep, &exec_list, next)
834		(*ep->function)(p);
835
836	/*
837	 * Blow away entire process VM, if address space not shared,
838	 * otherwise, create a new VM space so that other threads are
839	 * not disrupted
840	 */
841	map = &vmspace->vm_map;
842	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
843	    vm_map_max(map) == sv->sv_maxuser) {
844		if (vmspace->vm_shm)
845			shmexit(p);
846		pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
847		    vm_map_max(map));
848		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
849	} else {
850		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
851		vmspace = p->p_vmspace;
852		map = &vmspace->vm_map;
853	}
854
855	/* Allocate a new stack */
856	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
857	    sv->sv_stackprot, VM_PROT_ALL, 0);
858	if (error)
859		return (error);
860
861#ifdef __ia64__
862	{
863		/*
864		 * Allocate backing store. We really need something
865		 * similar to vm_map_stack which can allow the backing
866		 * store to grow upwards. This will do for now.
867		 */
868		vm_offset_t bsaddr;
869		bsaddr = p->p_sysent->sv_usrstack - 2 * maxssiz;
870		error = vm_map_find(map, 0, 0, &bsaddr,
871		    regstkpages * PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
872		FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr;
873	}
874#endif
875
876	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
877	 * VM_STACK case, but they are still used to monitor the size of the
878	 * process stack so we can check the stack rlimit.
879	 */
880	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
881	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
882
883	return (0);
884}
885
886/*
887 * Copy out argument and environment strings from the old process
888 *	address space into the temporary string buffer.
889 */
890int
891exec_extract_strings(imgp)
892	struct image_params *imgp;
893{
894	char	**argv, **envv;
895	char	*argp, *envp;
896	int	error;
897	size_t	length;
898
899	/*
900	 * extract arguments first
901	 */
902
903	argv = imgp->userspace_argv;
904
905	if (argv) {
906		argp = (caddr_t)(intptr_t)fuword(argv);
907		if (argp == (caddr_t)-1)
908			return (EFAULT);
909		if (argp)
910			argv++;
911		if (imgp->argv0)
912			argp = imgp->argv0;
913		if (argp) {
914			do {
915				if (argp == (caddr_t)-1)
916					return (EFAULT);
917				if ((error = copyinstr(argp, imgp->stringp,
918				    imgp->stringspace, &length))) {
919					if (error == ENAMETOOLONG)
920						return (E2BIG);
921					return (error);
922				}
923				imgp->stringspace -= length;
924				imgp->stringp += length;
925				imgp->argc++;
926			} while ((argp = (caddr_t)(intptr_t)fuword(argv++)));
927		}
928	}
929
930	imgp->endargs = imgp->stringp;
931
932	/*
933	 * extract environment strings
934	 */
935
936	envv = imgp->userspace_envv;
937
938	if (envv) {
939		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
940			if (envp == (caddr_t)-1)
941				return (EFAULT);
942			if ((error = copyinstr(envp, imgp->stringp,
943			    imgp->stringspace, &length))) {
944				if (error == ENAMETOOLONG)
945					return (E2BIG);
946				return (error);
947			}
948			imgp->stringspace -= length;
949			imgp->stringp += length;
950			imgp->envc++;
951		}
952	}
953
954	return (0);
955}
956
957/*
958 * Copy strings out to the new process address space, constructing
959 *	new arg and env vector tables. Return a pointer to the base
960 *	so that it can be used as the initial stack pointer.
961 */
962register_t *
963exec_copyout_strings(imgp)
964	struct image_params *imgp;
965{
966	int argc, envc;
967	char **vectp;
968	char *stringp, *destp;
969	register_t *stack_base;
970	struct ps_strings *arginfo;
971	struct proc *p;
972	int szsigcode;
973
974	/*
975	 * Calculate string base and vector table pointers.
976	 * Also deal with signal trampoline code for this exec type.
977	 */
978	p = imgp->proc;
979	szsigcode = 0;
980	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
981	if (p->p_sysent->sv_szsigcode != NULL)
982		szsigcode = *(p->p_sysent->sv_szsigcode);
983	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
984	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
985
986	/*
987	 * install sigcode
988	 */
989	if (szsigcode)
990		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
991		    szsigcode), szsigcode);
992
993	/*
994	 * If we have a valid auxargs ptr, prepare some room
995	 * on the stack.
996	 */
997	if (imgp->auxargs) {
998		/*
999		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
1000		 * lower compatibility.
1001		 */
1002		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
1003		    (AT_COUNT * 2);
1004		/*
1005		 * The '+ 2' is for the null pointers at the end of each of
1006		 * the arg and env vector sets,and imgp->auxarg_size is room
1007		 * for argument of Runtime loader.
1008		 */
1009		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
1010		    imgp->auxarg_size) * sizeof(char *));
1011
1012	} else
1013		/*
1014		 * The '+ 2' is for the null pointers at the end of each of
1015		 * the arg and env vector sets
1016		 */
1017		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) *
1018		    sizeof(char *));
1019
1020	/*
1021	 * vectp also becomes our initial stack base
1022	 */
1023	stack_base = (register_t *)vectp;
1024
1025	stringp = imgp->stringbase;
1026	argc = imgp->argc;
1027	envc = imgp->envc;
1028
1029	/*
1030	 * Copy out strings - arguments and environment.
1031	 */
1032	copyout(stringp, destp, ARG_MAX - imgp->stringspace);
1033
1034	/*
1035	 * Fill in "ps_strings" struct for ps, w, etc.
1036	 */
1037	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
1038	suword(&arginfo->ps_nargvstr, argc);
1039
1040	/*
1041	 * Fill in argument portion of vector table.
1042	 */
1043	for (; argc > 0; --argc) {
1044		suword(vectp++, (long)(intptr_t)destp);
1045		while (*stringp++ != 0)
1046			destp++;
1047		destp++;
1048	}
1049
1050	/* a null vector table pointer separates the argp's from the envp's */
1051	suword(vectp++, 0);
1052
1053	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
1054	suword(&arginfo->ps_nenvstr, envc);
1055
1056	/*
1057	 * Fill in environment portion of vector table.
1058	 */
1059	for (; envc > 0; --envc) {
1060		suword(vectp++, (long)(intptr_t)destp);
1061		while (*stringp++ != 0)
1062			destp++;
1063		destp++;
1064	}
1065
1066	/* end of vector table is a null pointer */
1067	suword(vectp, 0);
1068
1069	return (stack_base);
1070}
1071
1072/*
1073 * Check permissions of file to execute.
1074 *	Called with imgp->vp locked.
1075 *	Return 0 for success or error code on failure.
1076 */
1077int
1078exec_check_permissions(imgp)
1079	struct image_params *imgp;
1080{
1081	struct vnode *vp = imgp->vp;
1082	struct vattr *attr = imgp->attr;
1083	struct thread *td;
1084	int error;
1085
1086	td = curthread;			/* XXXKSE */
1087
1088#ifdef MAC
1089	error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp);
1090	if (error)
1091		return (error);
1092#endif
1093
1094	/* Get file attributes */
1095	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
1096	if (error)
1097		return (error);
1098
1099	/*
1100	 * 1) Check if file execution is disabled for the filesystem that this
1101	 *	file resides on.
1102	 * 2) Insure that at least one execute bit is on - otherwise root
1103	 *	will always succeed, and we don't want to happen unless the
1104	 *	file really is executable.
1105	 * 3) Insure that the file is a regular file.
1106	 */
1107	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
1108	    ((attr->va_mode & 0111) == 0) ||
1109	    (attr->va_type != VREG))
1110		return (EACCES);
1111
1112	/*
1113	 * Zero length files can't be exec'd
1114	 */
1115	if (attr->va_size == 0)
1116		return (ENOEXEC);
1117
1118	/*
1119	 *  Check for execute permission to file based on current credentials.
1120	 */
1121	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
1122	if (error)
1123		return (error);
1124
1125	/*
1126	 * Check number of open-for-writes on the file and deny execution
1127	 * if there are any.
1128	 */
1129	if (vp->v_writecount)
1130		return (ETXTBSY);
1131
1132	/*
1133	 * Call filesystem specific open routine (which does nothing in the
1134	 * general case).
1135	 */
1136	error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
1137	return (error);
1138}
1139
1140/*
1141 * Exec handler registration
1142 */
1143int
1144exec_register(execsw_arg)
1145	const struct execsw *execsw_arg;
1146{
1147	const struct execsw **es, **xs, **newexecsw;
1148	int count = 2;	/* New slot and trailing NULL */
1149
1150	if (execsw)
1151		for (es = execsw; *es; es++)
1152			count++;
1153	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1154	if (newexecsw == NULL)
1155		return (ENOMEM);
1156	xs = newexecsw;
1157	if (execsw)
1158		for (es = execsw; *es; es++)
1159			*xs++ = *es;
1160	*xs++ = execsw_arg;
1161	*xs = NULL;
1162	if (execsw)
1163		free(execsw, M_TEMP);
1164	execsw = newexecsw;
1165	return (0);
1166}
1167
1168int
1169exec_unregister(execsw_arg)
1170	const struct execsw *execsw_arg;
1171{
1172	const struct execsw **es, **xs, **newexecsw;
1173	int count = 1;
1174
1175	if (execsw == NULL)
1176		panic("unregister with no handlers left?\n");
1177
1178	for (es = execsw; *es; es++) {
1179		if (*es == execsw_arg)
1180			break;
1181	}
1182	if (*es == NULL)
1183		return (ENOENT);
1184	for (es = execsw; *es; es++)
1185		if (*es != execsw_arg)
1186			count++;
1187	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1188	if (newexecsw == NULL)
1189		return (ENOMEM);
1190	xs = newexecsw;
1191	for (es = execsw; *es; es++)
1192		if (*es != execsw_arg)
1193			*xs++ = *es;
1194	*xs = NULL;
1195	if (execsw)
1196		free(execsw, M_TEMP);
1197	execsw = newexecsw;
1198	return (0);
1199}
1200
1201int
1202at_exec(function)
1203	execlist_fn function;
1204{
1205	struct execlist *ep;
1206
1207#ifdef INVARIANTS
1208	/* Be noisy if the programmer has lost track of things */
1209	if (rm_at_exec(function))
1210		printf("WARNING: exec callout entry (%p) already present\n",
1211		    function);
1212#endif
1213	ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
1214	if (ep == NULL)
1215		return (ENOMEM);
1216	ep->function = function;
1217	TAILQ_INSERT_TAIL(&exec_list, ep, next);
1218	return (0);
1219}
1220
1221/*
1222 * Scan the exec callout list for the given item and remove it.
1223 * Returns the number of items removed (0 or 1)
1224 */
1225int
1226rm_at_exec(function)
1227	execlist_fn function;
1228{
1229	struct execlist *ep;
1230
1231	TAILQ_FOREACH(ep, &exec_list, next) {
1232		if (ep->function == function) {
1233			TAILQ_REMOVE(&exec_list, ep, next);
1234			free(ep, M_ATEXEC);
1235			return (1);
1236		}
1237	}
1238	return (0);
1239}
1240