kern_sharedpage.c revision 30451
1/*
2 * Copyright (c) 1993, David Greenman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 *	$Id: kern_exec.c,v 1.66 1997/09/21 04:22:50 dyson Exp $
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/signalvar.h>
33#include <sys/kernel.h>
34#include <sys/mount.h>
35#include <sys/filedesc.h>
36#include <sys/fcntl.h>
37#include <sys/acct.h>
38#include <sys/exec.h>
39#include <sys/imgact.h>
40#include <sys/imgact_elf.h>
41#include <sys/wait.h>
42#include <sys/proc.h>
43#include <sys/malloc.h>
44#include <sys/namei.h>
45#include <sys/sysent.h>
46#include <sys/shm.h>
47#include <sys/sysctl.h>
48#include <sys/vnode.h>
49#include <sys/buf.h>
50
51#include <vm/vm.h>
52#include <vm/vm_param.h>
53#include <vm/vm_prot.h>
54#include <sys/lock.h>
55#include <vm/pmap.h>
56#include <vm/vm_map.h>
57#include <vm/vm_kern.h>
58#include <vm/vm_extern.h>
59#include <vm/vm_object.h>
60
61#include <machine/reg.h>
62
/*
 * Forward declarations of helpers that are defined later in this file.
 * exec_copyout_strings() is defined below without the `static' keyword;
 * this earlier declaration is what gives it internal linkage.
 */
63static int *exec_copyout_strings __P((struct image_params *));
64
65static int exec_check_permissions(struct image_params *);
66
67/*
68 * XXX trouble here if sizeof(caddr_t) != sizeof(int); other parts
69 * of the sysctl code also assume this, and sizeof(int) == sizeof(long).
70 */
/*
 * PS_STRINGS (the user-space address that exec_copyout_strings() fills
 * in), kept in a variable so it can be registered with SYSCTL_INT under
 * _kern with id KERN_PS_STRINGS.  A pointer is being handed to an
 * integer-typed sysctl macro, which is the size assumption the XXX note
 * above warns about.
 */
71static struct ps_strings *ps_strings = PS_STRINGS;
72SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, "");
73
/*
 * USRSTACK (the upper bound exec_new_vmspace() uses when mapping the
 * initial stack), registered the same way under KERN_USRSTACK.
 */
74static caddr_t usrstack = (caddr_t)USRSTACK;
75SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, "");
76
77/*
78 * execsw_set is constructed for us by the linker.  Each of the items
79 * is a pointer to a `const struct execsw', hence the double pointer here.
80 */
/*
 * execve() walks this table from index 0 until it reaches a NULL entry,
 * calling each entry's ex_imgact hook (entries whose hook is NULL are
 * skipped).
 */
81static const struct execsw **execsw =
82	(const struct execsw **)&execsw_set.ls_items[0];
83
/*
 * Argument block handed to execve().  Only declared here when
 * _SYS_SYSPROTO_H_ is not defined (presumably the include guard of
 * <sys/sysproto.h>, which would otherwise supply the declaration).
 */
84#ifndef _SYS_SYSPROTO_H_
85struct execve_args {
86        char    *fname;	/* path of the file to execute (looked up as UIO_USERSPACE) */
87        char    **argv;	/* argument vector, read with fuword() by exec_extract_strings() */
88        char    **envv;	/* environment vector, read the same way */
89};
90#endif
91
92/*
93 * execve() system call.
94 */
95int
96execve(p, uap, retval)
97	struct proc *p;
98	register struct execve_args *uap;
99	int *retval;
100{
101	struct nameidata nd, *ndp;
102	int *stack_base;
103	int error, len, i;
104	struct image_params image_params, *imgp;
105	struct vattr attr;
106	struct buf *bp = NULL;
107
108	imgp = &image_params;
109
110	/*
111	 * Initialize part of the common data
112	 */
113	imgp->proc = p;
114	imgp->uap = uap;
115	imgp->attr = &attr;
116	imgp->image_header = NULL;
117	imgp->argc = imgp->envc = 0;
118	imgp->argv0 = NULL;
119	imgp->entry_addr = 0;
120	imgp->vmspace_destroyed = 0;
121	imgp->interpreted = 0;
122	imgp->interpreter_name[0] = '\0';
123	imgp->auxargs = NULL;
124
125	/*
126	 * Allocate temporary demand zeroed space for argument and
127	 *	environment strings
128	 */
129	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX);
130	if (imgp->stringbase == NULL) {
131		error = ENOMEM;
132		goto exec_fail;
133	}
134	imgp->stringp = imgp->stringbase;
135	imgp->stringspace = ARG_MAX;
136
137	/*
138	 * Translate the file name. namei() returns a vnode pointer
139	 *	in ni_vp amoung other things.
140	 */
141	ndp = &nd;
142	NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
143	    UIO_USERSPACE, uap->fname, p);
144
145interpret:
146
147	error = namei(ndp);
148	if (error) {
149		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
150		goto exec_fail;
151	}
152
153	imgp->vp = ndp->ni_vp;
154
155	/*
156	 * Check file permissions (also 'opens' file)
157	 */
158	error = exec_check_permissions(imgp);
159	if (error) {
160		VOP_UNLOCK(imgp->vp, 0, p);
161		goto exec_fail_dealloc;
162	}
163
164	/*
165	 * Get the image header, which we define here as meaning the first
166	 * page of the executable.
167	 */
168	if (imgp->vp->v_object && imgp->vp->v_mount &&
169	    imgp->vp->v_mount->mnt_stat.f_iosize >= PAGE_SIZE &&
170	    imgp->vp->v_object->un_pager.vnp.vnp_size >=
171	    imgp->vp->v_mount->mnt_stat.f_iosize) {
172		/*
173		 * Get a buffer with (at least) the first page.
174		 */
175		error = bread(imgp->vp, 0, imgp->vp->v_mount->mnt_stat.f_iosize,
176		     p->p_ucred, &bp);
177		imgp->image_header = bp->b_data;
178	} else {
179		int resid;
180
181		/*
182		 * The filesystem block size is too small, so do this the hard
183		 * way. Malloc some space and read PAGE_SIZE worth of the image
184		 * header into it.
185		 */
186		imgp->image_header = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
187		error = vn_rdwr(UIO_READ, imgp->vp, (void *)imgp->image_header, PAGE_SIZE, 0,
188		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p);
189		/*
190		 * Clear out any remaining junk.
191		 */
192		if (!error && resid)
193			bzero((char *)imgp->image_header + PAGE_SIZE - resid, resid);
194	}
195	VOP_UNLOCK(imgp->vp, 0, p);
196	if (error)
197		goto exec_fail_dealloc;
198
199	/*
200	 * Loop through list of image activators, calling each one.
201	 *	If there is no match, the activator returns -1. If there
202	 *	is a match, but there was an error during the activation,
203	 *	the error is returned. Otherwise 0 means success. If the
204	 *	image is interpreted, loop back up and try activating
205	 *	the interpreter.
206	 */
207	for (i = 0; execsw[i]; ++i) {
208		if (execsw[i]->ex_imgact)
209			error = (*execsw[i]->ex_imgact)(imgp);
210		else
211			continue;
212		if (error == -1)
213			continue;
214		if (error)
215			goto exec_fail_dealloc;
216		if (imgp->interpreted) {
217			/* free old bp/image_header */
218			if (bp != NULL) {
219				brelse(bp);
220				bp = NULL;
221			} else {
222				free((void *)imgp->image_header, M_TEMP);
223				imgp->image_header = NULL;
224			}
225			/* free old vnode and name buffer */
226			vrele(ndp->ni_vp);
227			zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
228			/* set new name to that of the interpreter */
229			NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
230			    UIO_SYSSPACE, imgp->interpreter_name, p);
231			goto interpret;
232		}
233		break;
234	}
235	/* If we made it through all the activators and none matched, exit. */
236	if (error == -1) {
237		error = ENOEXEC;
238		goto exec_fail_dealloc;
239	}
240
241	/*
242	 * Copy out strings (args and env) and initialize stack base
243	 */
244	stack_base = exec_copyout_strings(imgp);
245	p->p_vmspace->vm_minsaddr = (char *)stack_base;
246
247	/*
248	 * If custom stack fixup routine present for this process
249	 * let it do the stack setup.
250	 * Else stuff argument count as first item on stack
251	 */
252	if (p->p_sysent->sv_fixup)
253		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
254	else
255		suword(--stack_base, imgp->argc);
256
257	/*
258	 * For security and other reasons, the file descriptor table cannot
259	 * be shared after an exec.
260	 */
261	if (p->p_fd->fd_refcnt > 1) {
262		struct filedesc *tmp;
263
264		tmp = fdcopy(p);
265		fdfree(p);
266		p->p_fd = tmp;
267	}
268
269	/* close files on exec */
270	fdcloseexec(p);
271
272	/* reset caught signals */
273	execsigs(p);
274
275	/* name this process - nameiexec(p, ndp) */
276	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
277	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
278	p->p_comm[len] = 0;
279
280	/*
281	 * mark as execed, wakeup the process that vforked (if any) and tell
282	 * it that it now has it's own resources back
283	 */
284	p->p_flag |= P_EXEC;
285	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
286		p->p_flag &= ~P_PPWAIT;
287		wakeup((caddr_t)p->p_pptr);
288	}
289
290	/*
291	 * Implement image setuid/setgid.
292	 *
293	 * Don't honor setuid/setgid if the filesystem prohibits it or if
294	 * the process is being traced.
295	 */
296	if ((attr.va_mode & VSUID && p->p_ucred->cr_uid != attr.va_uid ||
297	     attr.va_mode & VSGID && p->p_ucred->cr_gid != attr.va_gid) &&
298	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
299	    (p->p_flag & P_TRACED) == 0) {
300		/*
301		 * Turn off syscall tracing for set-id programs, except for
302		 * root.
303		 */
304		if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
305			p->p_traceflag = 0;
306			vrele(p->p_tracep);
307			p->p_tracep = NULL;
308		}
309		/*
310		 * Set the new credentials.
311		 */
312		p->p_ucred = crcopy(p->p_ucred);
313		if (attr.va_mode & VSUID)
314			p->p_ucred->cr_uid = attr.va_uid;
315		if (attr.va_mode & VSGID)
316			p->p_ucred->cr_gid = attr.va_gid;
317		p->p_flag |= P_SUGID;
318	} else {
319	        if (p->p_ucred->cr_uid == p->p_cred->p_ruid &&
320		    p->p_ucred->cr_gid == p->p_cred->p_rgid)
321			p->p_flag &= ~P_SUGID;
322	}
323
324	/*
325	 * Implement correct POSIX saved-id behavior.
326	 */
327	p->p_cred->p_svuid = p->p_ucred->cr_uid;
328	p->p_cred->p_svgid = p->p_ucred->cr_gid;
329
330	/*
331	 * Store the vp for use in procfs
332	 */
333	if (p->p_textvp)		/* release old reference */
334		vrele(p->p_textvp);
335	VREF(ndp->ni_vp);
336	p->p_textvp = ndp->ni_vp;
337
338	/*
339	 * If tracing the process, trap to debugger so breakpoints
340	 * 	can be set before the program executes.
341	 */
342	if (p->p_flag & P_TRACED)
343		psignal(p, SIGTRAP);
344
345	/* clear "fork but no exec" flag, as we _are_ execing */
346	p->p_acflag &= ~AFORK;
347
348	/* Set entry address */
349	setregs(p, imgp->entry_addr, (u_long)stack_base);
350
351	/*
352	 * free various allocated resources
353	 */
354	kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
355	if (bp != NULL)
356		brelse(bp);
357	else if (imgp->image_header != NULL)
358		free((void *)imgp->image_header, M_TEMP);
359	vrele(ndp->ni_vp);
360	zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
361
362	return (0);
363
364exec_fail_dealloc:
365	if (imgp->stringbase != NULL)
366		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
367	if (bp != NULL)
368		brelse(bp);
369	else if (imgp->image_header != NULL)
370		free((void *)imgp->image_header, M_TEMP);
371	if (ndp->ni_vp) {
372		vrele(ndp->ni_vp);
373		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
374	}
375
376exec_fail:
377	if (imgp->vmspace_destroyed) {
378		/* sorry, no more process anymore. exit gracefully */
379		exit1(p, W_EXITCODE(0, SIGABRT));
380		/* NOT REACHED */
381		return(0);
382	} else {
383		return(error);
384	}
385}
386
387/*
388 * Destroy old address space, and allocate a new stack
389 *	The new stack is only SGROWSIZ large because it is grown
390 *	automatically in trap.c.
391 */
392int
393exec_new_vmspace(imgp)
394	struct image_params *imgp;
395{
396	int error;
397	struct vmspace *vmspace = imgp->proc->p_vmspace;
398	caddr_t	stack_addr = (caddr_t) (USRSTACK - SGROWSIZ);
399	vm_map_t map = &vmspace->vm_map;
400
401	imgp->vmspace_destroyed = 1;
402
403	/*
404	 * Blow away entire process VM, if address space not shared,
405	 * otherwise, create a new VM space so that other threads are
406	 * not disrupted
407	 */
408	if (vmspace->vm_refcnt == 1) {
409		if (vmspace->vm_shm)
410			shmexit(imgp->proc);
411		pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK);
412		vm_map_remove(map, 0, USRSTACK);
413	} else {
414		vmspace_exec(imgp->proc);
415		vmspace = imgp->proc->p_vmspace;
416		map = &vmspace->vm_map;
417	}
418
419	/* Allocate a new stack */
420	error = vm_map_find(map, NULL, 0, (vm_offset_t *)&stack_addr,
421	    SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
422	if (error)
423		return(error);
424
425	vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT;
426
427	/* Initialize maximum stack address */
428	vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ;
429
430	return(0);
431}
432
433/*
434 * Copy out argument and environment strings from the old process
435 *	address space into the temporary string buffer.
436 */
437int
438exec_extract_strings(imgp)
439	struct image_params *imgp;
440{
441	char	**argv, **envv;
442	char	*argp, *envp;
443	int	error, length;
444
445	/*
446	 * extract arguments first
447	 */
448
449	argv = imgp->uap->argv;
450
451	if (argv) {
452		argp = (caddr_t) fuword(argv);
453		if (argp == (caddr_t) -1)
454			return (EFAULT);
455		if (argp)
456			argv++;
457		if (imgp->argv0)
458			argp = imgp->argv0;
459		if (argp) {
460			do {
461				if (argp == (caddr_t) -1)
462					return (EFAULT);
463				if ((error = copyinstr(argp, imgp->stringp,
464				    imgp->stringspace, &length))) {
465					if (error == ENAMETOOLONG)
466						return(E2BIG);
467					return (error);
468				}
469				imgp->stringspace -= length;
470				imgp->stringp += length;
471				imgp->argc++;
472			} while ((argp = (caddr_t) fuword(argv++)));
473		}
474	}
475
476	/*
477	 * extract environment strings
478	 */
479
480	envv = imgp->uap->envv;
481
482	if (envv) {
483		while ((envp = (caddr_t) fuword(envv++))) {
484			if (envp == (caddr_t) -1)
485				return (EFAULT);
486			if ((error = copyinstr(envp, imgp->stringp,
487			    imgp->stringspace, &length))) {
488				if (error == ENAMETOOLONG)
489					return(E2BIG);
490				return (error);
491			}
492			imgp->stringspace -= length;
493			imgp->stringp += length;
494			imgp->envc++;
495		}
496	}
497
498	return (0);
499}
500
501/*
502 * Copy strings out to the new process address space, constructing
503 *	new arg and env vector tables. Return a pointer to the base
504 *	so that it can be used as the initial stack pointer.
505 */
506int *
507exec_copyout_strings(imgp)
508	struct image_params *imgp;
509{
510	int argc, envc;
511	char **vectp;
512	char *stringp, *destp;
513	int *stack_base;
514	struct ps_strings *arginfo;
515	int szsigcode;
516
517	/*
518	 * Calculate string base and vector table pointers.
519	 * Also deal with signal trampoline code for this exec type.
520	 */
521	arginfo = PS_STRINGS;
522	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
523	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
524		roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
525
526	/*
527	 * install sigcode
528	 */
529	if (szsigcode)
530		copyout(imgp->proc->p_sysent->sv_sigcode,
531			((caddr_t)arginfo - szsigcode), szsigcode);
532
533	/*
534	 * If we have a valid auxargs ptr, prepare some room
535	 * on the stack.
536	 */
537	if (imgp->auxargs)
538	/*
539	 * The '+ 2' is for the null pointers at the end of each of the
540	 * arg and env vector sets, and 'AT_COUNT*2' is room for the
541	 * ELF Auxargs data.
542	 */
543		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
544				  AT_COUNT*2) * sizeof(char*));
545	else
546	/*
547	 * The '+ 2' is for the null pointers at the end of each of the
548	 * arg and env vector sets
549	 */
550		vectp = (char **)
551			(destp - (imgp->argc + imgp->envc + 2) * sizeof(char*));
552
553	/*
554	 * vectp also becomes our initial stack base
555	 */
556	stack_base = (int *)vectp;
557
558	stringp = imgp->stringbase;
559	argc = imgp->argc;
560	envc = imgp->envc;
561
562	/*
563	 * Copy out strings - arguments and environment.
564	 */
565	copyout(stringp, destp, ARG_MAX - imgp->stringspace);
566
567	/*
568	 * Fill in "ps_strings" struct for ps, w, etc.
569	 */
570	suword(&arginfo->ps_argvstr, (int)vectp);
571	suword(&arginfo->ps_nargvstr, argc);
572
573	/*
574	 * Fill in argument portion of vector table.
575	 */
576	for (; argc > 0; --argc) {
577		suword(vectp++, (int)destp);
578		while (*stringp++ != 0)
579			destp++;
580		destp++;
581	}
582
583	/* a null vector table pointer seperates the argp's from the envp's */
584	suword(vectp++, 0);
585
586	suword(&arginfo->ps_envstr, (int)vectp);
587	suword(&arginfo->ps_nenvstr, envc);
588
589	/*
590	 * Fill in environment portion of vector table.
591	 */
592	for (; envc > 0; --envc) {
593		suword(vectp++, (int)destp);
594		while (*stringp++ != 0)
595			destp++;
596		destp++;
597	}
598
599	/* end of vector table is a null pointer */
600	suword(vectp, 0);
601
602	return (stack_base);
603}
604
605/*
606 * Check permissions of file to execute.
607 *	Return 0 for success or error code on failure.
608 */
609static int
610exec_check_permissions(imgp)
611	struct image_params *imgp;
612{
613	struct proc *p = imgp->proc;
614	struct vnode *vp = imgp->vp;
615	struct vattr *attr = imgp->attr;
616	int error;
617
618	/* Get file attributes */
619	error = VOP_GETATTR(vp, attr, p->p_ucred, p);
620	if (error)
621		return (error);
622
623	/*
624	 * 1) Check if file execution is disabled for the filesystem that this
625	 *	file resides on.
626	 * 2) Insure that at least one execute bit is on - otherwise root
627	 *	will always succeed, and we don't want to happen unless the
628	 *	file really is executable.
629	 * 3) Insure that the file is a regular file.
630	 */
631	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
632	    ((attr->va_mode & 0111) == 0) ||
633	    (attr->va_type != VREG)) {
634		return (EACCES);
635	}
636
637	/*
638	 * Zero length files can't be exec'd
639	 */
640	if (attr->va_size == 0)
641		return (ENOEXEC);
642
643	/*
644	 *  Check for execute permission to file based on current credentials.
645	 */
646	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
647	if (error)
648		return (error);
649
650	/*
651	 * Check number of open-for-writes on the file and deny execution
652	 * if there are any.
653	 */
654	if (vp->v_writecount)
655		return (ETXTBSY);
656
657	/*
658	 * Call filesystem specific open routine (which does nothing in the
659	 * general case).
660	 */
661	error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
662	if (error)
663		return (error);
664
665	return (0);
666}
667