kern_sharedpage.c revision 24828
1/*
2 * Copyright (c) 1993, David Greenman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 *	$Id: kern_exec.c,v 1.57 1997/04/04 09:06:20 davidg Exp $
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/signalvar.h>
33#include <sys/kernel.h>
34#include <sys/mount.h>
35#include <sys/filedesc.h>
36#include <sys/fcntl.h>
37#include <sys/acct.h>
38#include <sys/exec.h>
39#include <sys/imgact.h>
40#include <sys/imgact_elf.h>
41#include <sys/wait.h>
42#include <sys/proc.h>
43#include <sys/malloc.h>
44#include <sys/namei.h>
45#include <sys/sysent.h>
46#include <sys/syslog.h>
47#include <sys/shm.h>
48#include <sys/sysctl.h>
49#include <sys/vnode.h>
50#include <sys/buf.h>
51
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54#include <vm/vm_prot.h>
55#include <sys/lock.h>
56#include <vm/pmap.h>
57#include <vm/vm_map.h>
58#include <vm/vm_kern.h>
59#include <vm/vm_extern.h>
60
61#include <machine/reg.h>
62
63static int *exec_copyout_strings __P((struct image_params *));
64
65static int exec_check_permissions(struct image_params *);
66
67/*
68 * XXX trouble here if sizeof(caddr_t) != sizeof(int), other parts
69 * of the sysctl code also assumes this, and sizeof(int) == sizeof(long).
70 */
71static struct ps_strings *ps_strings = PS_STRINGS;
72SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, "");
73
74static caddr_t usrstack = (caddr_t)USRSTACK;
75SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, "");
76
77/*
78 * execsw_set is constructed for us by the linker.  Each of the items
79 * is a pointer to a `const struct execsw', hence the double pointer here.
80 */
81static const struct execsw **execsw =
82	(const struct execsw **)&execsw_set.ls_items[0];
83
84#ifndef _SYS_SYSPROTO_H_
85struct execve_args {
86        char    *fname;
87        char    **argv;
88        char    **envv;
89};
90#endif
91
92/*
93 * execve() system call.
94 */
95int
96execve(p, uap, retval)
97	struct proc *p;
98	register struct execve_args *uap;
99	int *retval;
100{
101	struct nameidata nd, *ndp;
102	int *stack_base;
103	int error, len, i;
104	struct image_params image_params, *imgp;
105	struct vattr attr;
106	struct buf *bp = NULL;
107
108	imgp = &image_params;
109
110	/*
111	 * Initialize part of the common data
112	 */
113	imgp->proc = p;
114	imgp->uap = uap;
115	imgp->attr = &attr;
116	imgp->image_header = NULL;
117	imgp->argc = imgp->envc = 0;
118	imgp->entry_addr = 0;
119	imgp->vmspace_destroyed = 0;
120	imgp->interpreted = 0;
121	imgp->interpreter_name[0] = '\0';
122	imgp->auxargs = NULL;
123
124	/*
125	 * Allocate temporary demand zeroed space for argument and
126	 *	environment strings
127	 */
128	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX);
129	if (imgp->stringbase == NULL) {
130		error = ENOMEM;
131		goto exec_fail;
132	}
133	imgp->stringp = imgp->stringbase;
134	imgp->stringspace = ARG_MAX;
135
136	/*
137	 * Translate the file name. namei() returns a vnode pointer
138	 *	in ni_vp amoung other things.
139	 */
140	ndp = &nd;
141	NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
142	    UIO_USERSPACE, uap->fname, p);
143
144interpret:
145
146	error = namei(ndp);
147	if (error) {
148		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
149		goto exec_fail;
150	}
151
152	imgp->vp = ndp->ni_vp;
153
154	/*
155	 * Check file permissions (also 'opens' file)
156	 */
157	error = exec_check_permissions(imgp);
158	if (error) {
159		VOP_UNLOCK(imgp->vp, 0, p);
160		goto exec_fail_dealloc;
161	}
162
163	/*
164	 * Get the image header, which we define here as meaning the first
165	 * page of the executable.
166	 */
167	if (imgp->vp->v_mount && imgp->vp->v_mount->mnt_stat.f_iosize >= PAGE_SIZE) {
168		/*
169		 * Get a buffer with (at least) the first page.
170		 */
171		error = bread(imgp->vp, 0, imgp->vp->v_mount->mnt_stat.f_iosize,
172		     p->p_ucred, &bp);
173		imgp->image_header = bp->b_data;
174	} else {
175		/*
176		 * The filesystem block size is too small, so do this the hard
177		 * way. Malloc some space and read PAGE_SIZE worth of the image
178		 * header into it.
179		 */
180		imgp->image_header = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
181		error = vn_rdwr(UIO_READ, imgp->vp, (void *)imgp->image_header, PAGE_SIZE, 0,
182		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, NULL, p);
183	}
184	VOP_UNLOCK(imgp->vp, 0, p);
185	if (error)
186		goto exec_fail_dealloc;
187
188	/*
189	 * Loop through list of image activators, calling each one.
190	 *	If there is no match, the activator returns -1. If there
191	 *	is a match, but there was an error during the activation,
192	 *	the error is returned. Otherwise 0 means success. If the
193	 *	image is interpreted, loop back up and try activating
194	 *	the interpreter.
195	 */
196	for (i = 0; execsw[i]; ++i) {
197		if (execsw[i]->ex_imgact)
198			error = (*execsw[i]->ex_imgact)(imgp);
199		else
200			continue;
201		if (error == -1)
202			continue;
203		if (error)
204			goto exec_fail_dealloc;
205		if (imgp->interpreted) {
206			/* free old bp/image_header */
207			if (bp != NULL) {
208				brelse(bp);
209				bp = NULL;
210			} else {
211				free((void *)imgp->image_header, M_TEMP);
212				imgp->image_header = NULL;
213			}
214			/* free old vnode and name buffer */
215			vrele(ndp->ni_vp);
216			FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
217			/* set new name to that of the interpreter */
218			NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
219			    UIO_SYSSPACE, imgp->interpreter_name, p);
220			goto interpret;
221		}
222		break;
223	}
224	/* If we made it through all the activators and none matched, exit. */
225	if (error == -1) {
226		error = ENOEXEC;
227		goto exec_fail_dealloc;
228	}
229
230	/*
231	 * Copy out strings (args and env) and initialize stack base
232	 */
233	stack_base = exec_copyout_strings(imgp);
234	p->p_vmspace->vm_minsaddr = (char *)stack_base;
235
236	/*
237	 * If custom stack fixup routine present for this process
238	 * let it do the stack setup.
239	 * Else stuff argument count as first item on stack
240	 */
241	if (p->p_sysent->sv_fixup)
242		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
243	else
244		suword(--stack_base, imgp->argc);
245
246	/* close files on exec */
247	fdcloseexec(p);
248
249	/* reset caught signals */
250	execsigs(p);
251
252	/* name this process - nameiexec(p, ndp) */
253	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
254	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
255	p->p_comm[len] = 0;
256
257	/*
258	 * mark as execed, wakeup the process that vforked (if any) and tell
259	 * it that it now has it's own resources back
260	 */
261	p->p_flag |= P_EXEC;
262	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
263		p->p_flag &= ~P_PPWAIT;
264		wakeup((caddr_t)p->p_pptr);
265	}
266
267	/*
268	 * Implement image setuid/setgid. Disallow if the process is
269	 * being traced.
270	 */
271	if ((attr.va_mode & (VSUID | VSGID)) &&
272	    (p->p_flag & P_TRACED) == 0) {
273		/*
274		 * Turn off syscall tracing for set-id programs, except for
275		 * root.
276		 */
277		if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
278			p->p_traceflag = 0;
279			vrele(p->p_tracep);
280			p->p_tracep = NULL;
281		}
282		/*
283		 * Set the new credentials.
284		 */
285		p->p_ucred = crcopy(p->p_ucred);
286		if (attr.va_mode & VSUID)
287			p->p_ucred->cr_uid = attr.va_uid;
288		if (attr.va_mode & VSGID)
289			p->p_ucred->cr_groups[0] = attr.va_gid;
290		p->p_flag |= P_SUGID;
291	} else {
292	        if (p->p_ucred->cr_uid == p->p_cred->p_ruid &&
293		    p->p_ucred->cr_gid == p->p_cred->p_rgid)
294			p->p_flag &= ~P_SUGID;
295	}
296
297	/*
298	 * Implement correct POSIX saved-id behavior.
299	 */
300	p->p_cred->p_svuid = p->p_ucred->cr_uid;
301	p->p_cred->p_svgid = p->p_ucred->cr_gid;
302
303	/*
304	 * Store the vp for use in procfs
305	 */
306	if (p->p_textvp)		/* release old reference */
307		vrele(p->p_textvp);
308	VREF(ndp->ni_vp);
309	p->p_textvp = ndp->ni_vp;
310
311	/*
312	 * If tracing the process, trap to debugger so breakpoints
313	 * 	can be set before the program executes.
314	 */
315	if (p->p_flag & P_TRACED)
316		psignal(p, SIGTRAP);
317
318	/* clear "fork but no exec" flag, as we _are_ execing */
319	p->p_acflag &= ~AFORK;
320
321	/* Set entry address */
322	setregs(p, imgp->entry_addr, (u_long)stack_base);
323
324	/*
325	 * free various allocated resources
326	 */
327	kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
328	if (bp != NULL)
329		brelse(bp);
330	else if (imgp->image_header != NULL)
331		free((void *)imgp->image_header, M_TEMP);
332	vrele(ndp->ni_vp);
333	FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
334
335	return (0);
336
337exec_fail_dealloc:
338	if (imgp->stringbase != NULL)
339		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
340	if (bp != NULL)
341		brelse(bp);
342	else if (imgp->image_header != NULL)
343		free((void *)imgp->image_header, M_TEMP);
344	if (ndp->ni_vp) {
345		vrele(ndp->ni_vp);
346		FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
347	}
348
349exec_fail:
350	if (imgp->vmspace_destroyed) {
351		/* sorry, no more process anymore. exit gracefully */
352		exit1(p, W_EXITCODE(0, SIGABRT));
353		/* NOT REACHED */
354		return(0);
355	} else {
356		return(error);
357	}
358}
359
360/*
361 * Destroy old address space, and allocate a new stack
362 *	The new stack is only SGROWSIZ large because it is grown
363 *	automatically in trap.c.
364 */
365int
366exec_new_vmspace(imgp)
367	struct image_params *imgp;
368{
369	int error;
370	struct vmspace *vmspace = imgp->proc->p_vmspace;
371	caddr_t	stack_addr = (caddr_t) (USRSTACK - SGROWSIZ);
372	vm_map_t map = &vmspace->vm_map;
373
374	imgp->vmspace_destroyed = 1;
375
376	/*
377	 * Blow away entire process VM, if address space not shared,
378	 * otherwise, create a new VM space so that other threads are
379	 * not disrupted
380	 */
381	if (vmspace->vm_refcnt == 1) {
382		if (vmspace->vm_shm)
383			shmexit(imgp->proc);
384		pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK);
385		vm_map_remove(map, 0, USRSTACK);
386	} else {
387		struct vmspace *oldvmspace = vmspace;
388
389		--vmspace->vm_refcnt;
390		vmspace = vmspace_alloc(map->min_offset, map->max_offset,
391		    map->entries_pageable);
392		bcopy(&oldvmspace->vm_startcopy, &vmspace->vm_startcopy,
393		    (caddr_t) (vmspace+ 1) - (caddr_t) &vmspace->vm_startcopy);
394		imgp->proc->p_vmspace = vmspace;
395		map = &vmspace->vm_map;
396	}
397
398	/* Allocate a new stack */
399	error = vm_map_find(map, NULL, 0, (vm_offset_t *)&stack_addr,
400	    SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
401	if (error)
402		return(error);
403
404	vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT;
405
406	/* Initialize maximum stack address */
407	vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ;
408
409	return(0);
410}
411
412/*
413 * Copy out argument and environment strings from the old process
414 *	address space into the temporary string buffer.
415 */
416int
417exec_extract_strings(imgp)
418	struct image_params *imgp;
419{
420	char	**argv, **envv;
421	char	*argp, *envp;
422	int	error, length;
423
424	/*
425	 * extract arguments first
426	 */
427
428	argv = imgp->uap->argv;
429
430	if (argv) {
431		while ((argp = (caddr_t) fuword(argv++))) {
432			if (argp == (caddr_t) -1)
433				return (EFAULT);
434			if ((error = copyinstr(argp, imgp->stringp,
435			    imgp->stringspace, &length))) {
436				if (error == ENAMETOOLONG)
437					return(E2BIG);
438				return (error);
439			}
440			imgp->stringspace -= length;
441			imgp->stringp += length;
442			imgp->argc++;
443		}
444	}
445
446	/*
447	 * extract environment strings
448	 */
449
450	envv = imgp->uap->envv;
451
452	if (envv) {
453		while ((envp = (caddr_t) fuword(envv++))) {
454			if (envp == (caddr_t) -1)
455				return (EFAULT);
456			if ((error = copyinstr(envp, imgp->stringp,
457			    imgp->stringspace, &length))) {
458				if (error == ENAMETOOLONG)
459					return(E2BIG);
460				return (error);
461			}
462			imgp->stringspace -= length;
463			imgp->stringp += length;
464			imgp->envc++;
465		}
466	}
467
468	return (0);
469}
470
471/*
472 * Copy strings out to the new process address space, constructing
473 *	new arg and env vector tables. Return a pointer to the base
474 *	so that it can be used as the initial stack pointer.
475 */
476int *
477exec_copyout_strings(imgp)
478	struct image_params *imgp;
479{
480	int argc, envc;
481	char **vectp;
482	char *stringp, *destp;
483	int *stack_base;
484	struct ps_strings *arginfo;
485	int szsigcode;
486
487	/*
488	 * Calculate string base and vector table pointers.
489	 * Also deal with signal trampoline code for this exec type.
490	 */
491	arginfo = PS_STRINGS;
492	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
493	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
494		roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
495
496	/*
497	 * install sigcode
498	 */
499	if (szsigcode)
500		copyout(imgp->proc->p_sysent->sv_sigcode,
501			((caddr_t)arginfo - szsigcode), szsigcode);
502
503	/*
504	 * If we have a valid auxargs ptr, prepare some room
505	 * on the stack.
506	 */
507	if (imgp->auxargs)
508	/*
509	 * The '+ 2' is for the null pointers at the end of each of the
510	 * arg and env vector sets, and 'AT_COUNT*2' is room for the
511	 * ELF Auxargs data.
512	 */
513		vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
514				  AT_COUNT*2) * sizeof(char*));
515	else
516	/*
517	 * The '+ 2' is for the null pointers at the end of each of the
518	 * arg and env vector sets
519	 */
520		vectp = (char **)
521			(destp - (imgp->argc + imgp->envc + 2) * sizeof(char*));
522
523	/*
524	 * vectp also becomes our initial stack base
525	 */
526	stack_base = (int *)vectp;
527
528	stringp = imgp->stringbase;
529	argc = imgp->argc;
530	envc = imgp->envc;
531
532	/*
533	 * Copy out strings - arguments and environment.
534	 */
535	copyout(stringp, destp, ARG_MAX - imgp->stringspace);
536
537	/*
538	 * Fill in "ps_strings" struct for ps, w, etc.
539	 */
540	suword(&arginfo->ps_argvstr, (int)vectp);
541	suword(&arginfo->ps_nargvstr, argc);
542
543	/*
544	 * Fill in argument portion of vector table.
545	 */
546	for (; argc > 0; --argc) {
547		suword(vectp++, (int)destp);
548		while (*stringp++ != 0)
549			destp++;
550		destp++;
551	}
552
553	/* a null vector table pointer seperates the argp's from the envp's */
554	suword(vectp++, 0);
555
556	suword(&arginfo->ps_envstr, (int)vectp);
557	suword(&arginfo->ps_nenvstr, envc);
558
559	/*
560	 * Fill in environment portion of vector table.
561	 */
562	for (; envc > 0; --envc) {
563		suword(vectp++, (int)destp);
564		while (*stringp++ != 0)
565			destp++;
566		destp++;
567	}
568
569	/* end of vector table is a null pointer */
570	suword(vectp, 0);
571
572	return (stack_base);
573}
574
575/*
576 * Check permissions of file to execute.
577 *	Return 0 for success or error code on failure.
578 */
579static int
580exec_check_permissions(imgp)
581	struct image_params *imgp;
582{
583	struct proc *p = imgp->proc;
584	struct vnode *vp = imgp->vp;
585	struct vattr *attr = imgp->attr;
586	int error;
587
588	/* Get file attributes */
589	error = VOP_GETATTR(vp, attr, p->p_ucred, p);
590	if (error)
591		return (error);
592
593	/*
594	 * 1) Check if file execution is disabled for the filesystem that this
595	 *	file resides on.
596	 * 2) Insure that at least one execute bit is on - otherwise root
597	 *	will always succeed, and we don't want to happen unless the
598	 *	file really is executable.
599	 * 3) Insure that the file is a regular file.
600	 */
601	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
602	    ((attr->va_mode & 0111) == 0) ||
603	    (attr->va_type != VREG)) {
604		return (EACCES);
605	}
606
607	/*
608	 * Zero length files can't be exec'd
609	 */
610	if (attr->va_size == 0)
611		return (ENOEXEC);
612
613	/*
614	 *  Check for execute permission to file based on current credentials.
615	 */
616	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
617	if (error)
618		return (error);
619
620	/*
621	 * Check number of open-for-writes on the file and deny execution
622	 * if there are any.
623	 */
624	if (vp->v_writecount)
625		return (ETXTBSY);
626
627	/*
628	 * Call filesystem specific open routine (which does nothing in the
629	 * general case).
630	 */
631	error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
632	if (error)
633		return (error);
634
635	/*
636	 * Disable setuid/setgid if the filesystem prohibits it or if
637	 * the process is being traced.
638	 */
639        if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
640		attr->va_mode &= ~(VSUID | VSGID);
641
642	return (0);
643}
644