imgact_elf.c revision 108685
/*-
 * Copyright (c) 2000 David O'Brien
 * Copyright (c) 1995-1996 Søren Schmidt
 * Copyright (c) 1996 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/kern/imgact_elf.c 108685 2003-01-04 22:07:48Z jake $
 */

#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>

#include <machine/elf.h>
#include <machine/md_var.h>

#define OLD_EI_BRAND	8
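
/*
 * OLD_EI_BRAND indexes e_ident[8] (EI_ABIVERSION/padding space in the
 * modern ELF spec); FreeBSD 3.x wrote its brand string, e.g. "FreeBSD",
 * into e_ident starting at that byte.  __elfN(get_brandinfo)() below
 * matches compat_3_brand against it.
 */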

__ElfType(Brandinfo);
__ElfType(Auxargs);

static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
    const char *interp);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct proc *p,
    struct vmspace *vmspace, struct vnode *vp, vm_object_t object,
    vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
    vm_prot_t prot, size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);

SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
    "");

static int fallback_brand = -1;
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand,
    CTLFLAG_RW, &fallback_brand, 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
    &fallback_brand);
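
/*
 * The tunable/sysctl above expands to kern.elf32.fallback_brand or
 * kern.elf64.fallback_brand depending on __ELF_WORD_SIZE.  A sketch of
 * how an administrator might force FreeBSD branding (ELFOSABI_FREEBSD
 * is 9) on unbranded binaries:
 *
 *	sysctl kern.elf32.fallback_brand=9	(at runtime)
 *	kern.elf32.fallback_brand="9"		(in /boot/loader.conf)
 */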

static int elf_trace = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, "");

static int elf_legacy_coredump = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
    &elf_legacy_coredump, 0, "");

static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];

int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == NULL) {
			elf_brand_list[i] = entry;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == entry) {
			elf_brand_list[i] = NULL;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
	struct proc *p;
	int rval = FALSE;

	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		if (p->p_sysent == entry->sysvec) {
			rval = TRUE;
			break;
		}
	}
	sx_sunlock(&allproc_lock);

	return (rval);
}
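
/*
 * Per-ABI modules register their brands via __elfN(insert_brand_entry)()
 * at boot or module load.  A rough sketch, modeled on the native brand
 * registration in the MD elf_machdep.c files; the designated-initializer
 * form and the specific names here are illustrative, not copied from a
 * particular file:
 *
 *	static Elf32_Brandinfo freebsd_brand_info = {
 *		.brand		= ELFOSABI_FREEBSD,
 *		.machine	= EM_386,
 *		.compat_3_brand	= "FreeBSD",
 *		.emul_path	= "",
 *		.interp_path	= "/usr/libexec/ld-elf.so.1",
 *		.sysvec		= &elf32_freebsd_sysvec,
 *	};
 *	SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY,
 *	    (sysinit_cfunc_t)elf32_insert_brand_entry, &freebsd_brand_info);
 */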

static Elf_Brandinfo *
__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
{
	Elf_Brandinfo *bi;
	int i;

	/*
	 * We support three types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding within the ELF header, and (3) the path in the
	 * `interp_path' field.  We should also look for an ".note.ABI-tag"
	 * ELF section, now present in all Linux ELF binaries, FreeBSD 4.1+,
	 * and some NetBSD ones.
	 */

	/* If the executable has a brand, search for it in the brand list. */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && hdr->e_machine == bi->machine &&
		    (hdr->e_ident[EI_OSABI] == bi->brand ||
		    strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
		    bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
			return (bi);
	}

	/* Lacking a known brand, search for a recognized interpreter. */
	if (interp != NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			bi = elf_brand_list[i];
			if (bi != NULL && hdr->e_machine == bi->machine &&
			    strcmp(interp, bi->interp_path) == 0)
				return (bi);
		}
	}

	/* Lacking a recognized interpreter, try the default brand. */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && hdr->e_machine == bi->machine &&
		    fallback_brand == bi->brand)
			return (bi);
	}
	return (NULL);
}

static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
	Elf_Brandinfo *bi;
	int i;

	if (!IS_ELF(*hdr) ||
	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
	    hdr->e_ident[EI_VERSION] != EV_CURRENT)
		return (ENOEXEC);

	/*
	 * Make sure we have at least one brand for this machine.
	 */

	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && bi->machine == hdr->e_machine)
			break;
	}
	if (i == MAX_BRANDS)
		return (ENOEXEC);

	if (hdr->e_version != ELF_TARG_VER)
		return (ENOEXEC);

	return (0);
}

static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	vm_offset_t start, vm_offset_t end, vm_prot_t prot,
	vm_prot_t max)
{
	int error, rv;
	vm_offset_t off;
	vm_offset_t data_buf = 0;

	/*
	 * Create the page if it doesn't exist yet. Ignore errors.
	 */
	vm_map_lock(map);
	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), max,
	    max, 0);
	vm_map_unlock(map);

	/*
	 * Find the page from the underlying object.
	 */
	if (object) {
		vm_object_reference(object);
		rv = vm_map_find(exec_map,
				 object,
				 trunc_page(offset),
				 &data_buf,
				 PAGE_SIZE,
				 TRUE,
				 VM_PROT_READ,
				 VM_PROT_ALL,
				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (rv);
		}

		off = offset - trunc_page(offset);
		error = copyout((caddr_t)data_buf + off, (caddr_t)start,
		    end - start);
		vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
		if (error) {
			return (KERN_FAILURE);
		}
	}

	return (KERN_SUCCESS);
}

static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	vm_offset_t start, vm_offset_t end, vm_prot_t prot,
	vm_prot_t max, int cow)
{
	int rv;

	if (start != trunc_page(start)) {
		rv = __elfN(map_partial)(map, object, offset, start,
		    round_page(start), prot, max);
		if (rv)
			return (rv);
		offset += round_page(start) - start;
		start = round_page(start);
	}
	if (end != round_page(end)) {
		rv = __elfN(map_partial)(map, object, offset +
		    trunc_page(end) - start, trunc_page(end), end, prot, max);
		if (rv)
			return (rv);
		end = trunc_page(end);
	}
	if (end > start) {
		if (offset & PAGE_MASK) {
			vm_offset_t data_buf, off;
			vm_size_t sz;
			int error;

			/*
			 * The mapping is not page aligned. This means we have
			 * to copy the data. Sigh.
			 */
			rv = vm_map_find(map, 0, 0, &start, end - start,
			    FALSE, prot, max, 0);
			if (rv)
				return (rv);
			while (start < end) {
				vm_object_reference(object);
				rv = vm_map_find(exec_map,
						 object,
						 trunc_page(offset),
						 &data_buf,
						 2 * PAGE_SIZE,
						 TRUE,
						 VM_PROT_READ,
						 VM_PROT_ALL,
						 (MAP_COPY_ON_WRITE
						  | MAP_PREFAULT_PARTIAL));
				if (rv != KERN_SUCCESS) {
					vm_object_deallocate(object);
					return (rv);
				}
				off = offset - trunc_page(offset);
				sz = end - start;
				if (sz > PAGE_SIZE)
					sz = PAGE_SIZE;
				error = copyout((caddr_t)data_buf + off,
				    (caddr_t)start, sz);
				vm_map_remove(exec_map, data_buf,
				    data_buf + 2 * PAGE_SIZE);
				if (error) {
					return (KERN_FAILURE);
				}
				start += sz;
			}
			rv = KERN_SUCCESS;
		} else {
			vm_map_lock(map);
			rv = vm_map_insert(map, object, offset, start, end,
			    prot, max, cow);
			vm_map_unlock(map);
		}
		return (rv);
	} else {
		return (KERN_SUCCESS);
	}
}
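
/*
 * To recap __elfN(map_insert)(): a sub-page head and a sub-page tail
 * are copied in via __elfN(map_partial)(); the page-aligned middle is
 * mapped directly with vm_map_insert() when the file offset is page
 * aligned, and is otherwise copied page-by-page through a temporary
 * exec_map window, since the VM system cannot map object pages at a
 * sub-page file offset.
 */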

static int
__elfN(load_section)(struct proc *p, struct vmspace *vmspace,
	struct vnode *vp, vm_object_t object, vm_offset_t offset,
	caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
	size_t pagesize)
{
	size_t map_len;
	vm_offset_t map_addr;
	int error, rv, cow;
	size_t copy_len;
	vm_offset_t file_addr;
	vm_offset_t data_buf = 0;

	GIANT_REQUIRED;

	error = 0;

	/*
	 * It's necessary to fail if the filsz + offset taken from the
	 * header is greater than the actual file pager object's size.
	 * If we were to allow this, then the vm_map_find() below would
	 * walk right off the end of the file object and into the ether.
	 *
	 * While I'm here, might as well check for something else that
	 * is invalid: filsz cannot be greater than memsz.
	 */
	if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
	    filsz > memsz) {
		uprintf("elf_load_section: truncated ELF file\n");
		return (ENOEXEC);
	}

#define trunc_page_ps(va, ps)	((va) & ~(ps - 1))
#define round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
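
/*
 * These generalize trunc_page()/round_page() to the ABI's (possibly
 * larger) page size.  Worked example, with pagesize (ps) 0x1000:
 *
 *	trunc_page_ps(0x12345, 0x1000) == 0x12000
 *	round_page_ps(0x12345, 0x1000) == 0x13000
 */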

	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
	file_addr = trunc_page_ps(offset, pagesize);

	/*
	 * We have two choices.  We can either clear the data in the last page
	 * of an oversized mapping, or we can start the anon mapping a page
	 * early and copy the initialized data into that first page.  We
	 * choose the second.
	 */
	if (memsz > filsz)
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
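
	/*
	 * Worked example (pagesize 0x1000, offset 0, filsz 0x1800,
	 * memsz 0x3000): memsz > filsz, so map_len is 0x1000 and only
	 * the fully-initialized first page is mapped from the file;
	 * the trailing 0x800 bytes of file data are copied further
	 * down into the first page of the anonymous (bss) mapping.
	 */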

	if (map_len != 0) {
		vm_object_reference(object);

		/* cow flags: don't dump readonly sections in core */
		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);

		rv = __elfN(map_insert)(&vmspace->vm_map,
				      object,
				      file_addr,	/* file offset */
				      map_addr,		/* virtual start */
				      map_addr + map_len,/* virtual end */
				      prot,
				      VM_PROT_ALL,
				      cow);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (EINVAL);
		}

		/* we can stop now if we've covered it all */
		if (memsz == filsz) {
			return (0);
		}
	}

	/*
	 * We have to get the remaining bit of the file into the first part
	 * of the oversized map segment.  This is normally because the .data
	 * segment in the file is extended to provide bss.  It's a neat idea
	 * to try and save a page, but it's a pain in the behind to implement.
	 */
	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
	    map_addr;

	/* This had damn well better be true! */
	if (map_len != 0) {
		rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
		    map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0);
		if (rv != KERN_SUCCESS) {
			return (EINVAL);
		}
	}

	if (copy_len != 0) {
		vm_offset_t off;

		vm_object_reference(object);
		rv = vm_map_find(exec_map,
				 object,
				 trunc_page(offset + filsz),
				 &data_buf,
				 PAGE_SIZE,
				 TRUE,
				 VM_PROT_READ,
				 VM_PROT_ALL,
				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (EINVAL);
		}

		/* send the page fragment to user space */
		off = trunc_page_ps(offset + filsz, pagesize) -
		    trunc_page(offset + filsz);
		error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr,
		    copy_len);
		vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
		if (error) {
			return (error);
		}
	}

	/*
	 * Set it to the specified protection.
	 * XXX had better undo the damage from pasting over the cracks here!
	 */
	vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
	    round_page(map_addr + map_len), prot, FALSE);

	return (error);
}

/*
 * Load the file "file" into memory.  It may be either a shared object
 * or an executable.
 *
 * The "addr" reference parameter is in/out.  On entry, it specifies
 * the address where a shared object should be loaded.  If the file is
 * an executable, this value is ignored.  On exit, "addr" specifies
 * where the file was actually loaded.
 *
 * The "entry" reference parameter is out only.  On exit, it specifies
 * the entry point for the loaded file.
 */
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
	u_long *entry, size_t pagesize)
{
	struct {
		struct nameidata nd;
		struct vattr attr;
		struct image_params image_params;
	} *tempdata;
	const Elf_Ehdr *hdr = NULL;
	const Elf_Phdr *phdr = NULL;
	struct nameidata *nd;
	struct vmspace *vmspace = p->p_vmspace;
	struct vattr *attr;
	struct image_params *imgp;
	vm_prot_t prot;
	u_long rbase;
	u_long base_addr = 0;
	int error, i, numsegs;

	if (curthread->td_proc != p)
		panic("elf_load_file - thread");	/* XXXKSE DIAGNOSTIC */

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
	nd = &tempdata->nd;
	attr = &tempdata->attr;
	imgp = &tempdata->image_params;

	/*
	 * Initialize part of the common data.
	 */
	imgp->proc = p;
	imgp->userspace_argv = NULL;
	imgp->userspace_envv = NULL;
	imgp->attr = attr;
	imgp->firstpage = NULL;
	imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
	imgp->object = NULL;
	imgp->execlabel = NULL;

	if (imgp->image_header == NULL) {
		nd->ni_vp = NULL;
		error = ENOMEM;
		goto fail;
	}

	/* XXXKSE */
	NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);

	if ((error = namei(nd)) != 0) {
		nd->ni_vp = NULL;
		goto fail;
	}
	NDFREE(nd, NDF_ONLY_PNBUF);
	imgp->vp = nd->ni_vp;

	/*
	 * Check permissions, modes, uid, etc on the file, and "open" it.
	 */
	error = exec_check_permissions(imgp);
	if (error) {
		VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
		goto fail;
	}

	error = exec_map_first_page(imgp);
	/*
	 * Also make certain that the interpreter stays the same, so set
	 * its VV_TEXT flag, too.
	 */
	if (error == 0)
		nd->ni_vp->v_vflag |= VV_TEXT;

	VOP_GETVOBJECT(nd->ni_vp, &imgp->object);
	vm_object_reference(imgp->object);

	VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
	if (error)
		goto fail;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	if ((error = __elfN(check_header)(hdr)) != 0)
		goto fail;
	if (hdr->e_type == ET_DYN)
		rbase = *addr;
	else if (hdr->e_type == ET_EXEC)
		rbase = 0;
	else {
		error = ENOEXEC;
		goto fail;
	}

	/* Only support headers that fit within the first page for now. */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
		error = ENOEXEC;
		goto fail;
	}

	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);

	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
			prot = 0;
			if (phdr[i].p_flags & PF_X)
				prot |= VM_PROT_EXECUTE;
			if (phdr[i].p_flags & PF_W)
				prot |= VM_PROT_WRITE;
			if (phdr[i].p_flags & PF_R)
				prot |= VM_PROT_READ;

			if ((error = __elfN(load_section)(p, vmspace,
			    nd->ni_vp, imgp->object, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    pagesize)) != 0)
				goto fail;
			/*
			 * Establish the base address if this is the
			 * first segment.
			 */
			if (numsegs == 0)
				base_addr = trunc_page(phdr[i].p_vaddr +
				    rbase);
			numsegs++;
		}
	}
	*addr = base_addr;
	*entry = (unsigned long)hdr->e_entry + rbase;

fail:
	if (imgp->firstpage)
		exec_unmap_first_page(imgp);
	if (imgp->image_header)
		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
		    PAGE_SIZE);
	if (imgp->object)
		vm_object_deallocate(imgp->object);

	if (nd->ni_vp)
		vrele(nd->ni_vp);

	free(tempdata, M_TEMP);

	return (error);
}

static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	const Elf_Phdr *phdr;
	Elf_Auxargs *elf_auxargs = NULL;
	struct vmspace *vmspace;
	vm_prot_t prot;
	u_long text_size = 0, data_size = 0, total_size = 0;
	u_long text_addr = 0, data_addr = 0;
	u_long seg_size, seg_addr;
	u_long addr, entry = 0, proghdr = 0;
	int error, i;
	const char *interp = NULL;
	Elf_Brandinfo *brand_info;
	char *path;
	struct thread *td = curthread;
	struct sysentvec *sv;

	GIANT_REQUIRED;

	/*
	 * Do we have a valid ELF header?
	 */
	if (__elfN(check_header)(hdr) != 0 || hdr->e_type != ET_EXEC)
		return (-1);

	/*
	 * From here on down, we return an errno, not -1, as we've
	 * detected an ELF file.
	 */

	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
		/* Only support headers in the first page for now. */
		return (ENOEXEC);
	}
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);

	/*
	 * From this point on, we may have resources that need to be freed.
	 */

	VOP_UNLOCK(imgp->vp, 0, td);

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_INTERP:	/* Path to interpreter */
			if (phdr[i].p_filesz > MAXPATHLEN ||
			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
				error = ENOEXEC;
				goto fail;
			}
			interp = imgp->image_header + phdr[i].p_offset;
			break;
		default:
			break;
		}
	}

	brand_info = __elfN(get_brandinfo)(hdr, interp);
	if (brand_info == NULL) {
		uprintf("ELF binary type \"%u\" not known.\n",
		    hdr->e_ident[EI_OSABI]);
		error = ENOEXEC;
		goto fail;
	}
	sv = brand_info->sysvec;

	if ((error = exec_extract_strings(imgp)) != 0)
		goto fail;

	exec_new_vmspace(imgp, sv);

	vmspace = imgp->proc->p_vmspace;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:	/* Loadable segment */
			prot = 0;
			if (phdr[i].p_flags & PF_X)
				prot |= VM_PROT_EXECUTE;
			if (phdr[i].p_flags & PF_W)
				prot |= VM_PROT_WRITE;
			if (phdr[i].p_flags & PF_R)
				prot |= VM_PROT_READ;

#if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
			/*
			 * Some x86 binaries assume read == executable,
			 * notably the M3 runtime and therefore cvsup.
			 */
			if (prot & VM_PROT_READ)
				prot |= VM_PROT_EXECUTE;
#endif

			if ((error = __elfN(load_section)(imgp->proc, vmspace,
			    imgp->vp, imgp->object, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    sv->sv_pagesize)) != 0)
				goto fail;

			seg_addr = trunc_page(phdr[i].p_vaddr);
			seg_size = round_page(phdr[i].p_memsz +
			    phdr[i].p_vaddr - seg_addr);

			/*
			 * Is this .text or .data?  We can't use
			 * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
			 * alpha terribly and possibly does other bad
			 * things so we stick to the old way of figuring
			 * it out:  If the segment contains the program
			 * entry point, it's a text segment, otherwise it
			 * is a data segment.
			 *
			 * Note that obreak() assumes that data_addr +
			 * data_size == end of data load area, and the ELF
			 * file format expects segments to be sorted by
			 * address.  If multiple data segments exist, the
			 * last one will be used.
			 */
			if (hdr->e_entry >= phdr[i].p_vaddr &&
			    hdr->e_entry < (phdr[i].p_vaddr +
			    phdr[i].p_memsz)) {
				text_size = seg_size;
				text_addr = seg_addr;
				entry = (u_long)hdr->e_entry;
			} else {
				data_size = seg_size;
				data_addr = seg_addr;
			}
			total_size += seg_size;
			break;
		case PT_PHDR:	/* Program header table info */
			proghdr = phdr[i].p_vaddr;
			break;
		default:
			break;
		}
	}

	if (data_addr == 0 && data_size == 0) {
		data_addr = text_addr;
		data_size = text_size;
	}

	/*
	 * Check limits.  It should be safe to check the
	 * limits after loading the segments since we do
	 * not actually fault in all the segments' pages.
	 */
	if (data_size >
	    imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur ||
	    text_size > maxtsiz ||
	    total_size >
	    imgp->proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
		error = ENOMEM;
		goto fail;
	}

	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;

	addr = ELF_RTLD_ADDR(vmspace);

	imgp->entry_addr = entry;

	imgp->proc->p_sysent = sv;
	if (interp != NULL) {
		path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
		snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path,
		    interp);
		if ((error = __elfN(load_file)(imgp->proc, path, &addr,
		    &imgp->entry_addr, sv->sv_pagesize)) != 0) {
			if ((error = __elfN(load_file)(imgp->proc, interp,
			    &addr, &imgp->entry_addr, sv->sv_pagesize)) != 0) {
				uprintf("ELF interpreter %s not found\n",
				    path);
				free(path, M_TEMP);
				goto fail;
			}
		}
		free(path, M_TEMP);
	}

	/*
	 * Construct auxargs table (used by the fixup routine).
	 */
	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
	elf_auxargs->execfd = -1;
	elf_auxargs->phdr = proghdr;
	elf_auxargs->phent = hdr->e_phentsize;
	elf_auxargs->phnum = hdr->e_phnum;
	elf_auxargs->pagesz = PAGE_SIZE;
	elf_auxargs->base = addr;
	elf_auxargs->flags = 0;
	elf_auxargs->entry = entry;
	elf_auxargs->trace = elf_trace;

	imgp->auxargs = elf_auxargs;
	imgp->interpreted = 0;

fail:
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
	return (error);
}

#define	suword __CONCAT(suword, __ELF_WORD_SIZE)

int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
	Elf_Addr *base;
	Elf_Addr *pos;

	base = (Elf_Addr *)*stack_base;
	pos = base + (imgp->argc + imgp->envc + 2);

	if (args->trace) {
		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
	}
	if (args->execfd != -1) {
		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
	}
	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY(pos, AT_BASE, args->base);
	AUXARGS_ENTRY(pos, AT_NULL, 0);

	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	base--;
	suword(base, (long)imgp->argc);
	*stack_base = (register_t *)base;
	return (0);
}
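
/*
 * The resulting layout at the returned *stack_base, derived from the
 * pointer arithmetic above (each slot is an Elf_Addr; auxv entries are
 * the name/value pairs emitted by AUXARGS_ENTRY()):
 *
 *	argc
 *	argv[0] ... argv[argc - 1]
 *	NULL
 *	envv[0] ... envv[envc - 1]
 *	NULL
 *	auxv: AT_PHDR ... AT_BASE, terminated by AT_NULL
 */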

/*
 * Code for generating ELF core dumps.
 */

typedef void (*segment_callback)(vm_map_entry_t, void *);

/* Closure for cb_put_phdr(). */
struct phdr_closure {
	Elf_Phdr *phdr;		/* Program header to fill in */
	Elf_Off offset;		/* Offset of segment in core file */
};

/* Closure for cb_size_segment(). */
struct sseg_closure {
	int count;		/* Count of writable segments. */
	size_t size;		/* Total size of all writable segments. */
};

static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_writable_segment(struct proc *, segment_callback, void *);
static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
    int, void *, size_t);
static void __elfN(puthdr)(struct proc *, void *, size_t *,
    const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int);
static void __elfN(putnote)(void *, size_t *, const char *, int,
    const void *, size_t);

extern int osreldate;

int
__elfN(coredump)(td, vp, limit)
	struct thread *td;
	register struct vnode *vp;
	off_t limit;
{
	register struct proc *p = td->td_proc;
	register struct ucred *cred = td->td_ucred;
	int error = 0;
	struct sseg_closure seginfo;
	void *hdr;
	size_t hdrsize;

	/* Size the program segments. */
	seginfo.count = 0;
	seginfo.size = 0;
	each_writable_segment(p, cb_size_segment, &seginfo);

	/*
	 * Calculate the size of the core file header area by making
	 * a dry run of generating it.  Nothing is written, but the
	 * size is calculated.
	 */
	hdrsize = 0;
	__elfN(puthdr)((struct proc *)NULL, (void *)NULL, &hdrsize,
	    (const prstatus_t *)NULL, (const prfpregset_t *)NULL,
	    (const prpsinfo_t *)NULL, seginfo.count);

	if (hdrsize + seginfo.size >= limit)
		return (EFAULT);

	/*
	 * Allocate memory for building the header, fill it up,
	 * and write it out.
	 */
	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
	if (hdr == NULL) {
		return (EINVAL);
	}
	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);

	/* Write the contents of all of the writable segments. */
	if (error == 0) {
		Elf_Phdr *php;
		off_t offset;
		int i;

		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
		offset = hdrsize;
		for (i = 0; i < seginfo.count; i++) {
			error = vn_rdwr_inchunks(UIO_WRITE, vp,
			    (caddr_t)(uintptr_t)php->p_vaddr,
			    php->p_filesz, offset, UIO_USERSPACE,
			    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *)NULL,
			    curthread); /* XXXKSE */
			if (error != 0)
				break;
			offset += php->p_filesz;
			php++;
		}
	}
	free(hdr, M_TEMP);

	return (error);
}
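
/*
 * Shape of the resulting core file, per __elfN(puthdr)() below:
 *
 *	Elf_Ehdr
 *	Elf_Phdr for the PT_NOTE segment
 *	Elf_Phdr for each writable segment (seginfo.count of them)
 *	note data: NT_PRSTATUS, NT_FPREGSET, NT_PRPSINFO
 *	<pad to a page boundary>
 *	contents of each writable segment, at the file offsets
 *	recorded in its program header
 */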

/*
 * A callback for each_writable_segment() to write out the segment's
 * program header entry.
 */
static void
cb_put_phdr(entry, closure)
	vm_map_entry_t entry;
	void *closure;
{
	struct phdr_closure *phc = (struct phdr_closure *)closure;
	Elf_Phdr *phdr = phc->phdr;

	phc->offset = round_page(phc->offset);

	phdr->p_type = PT_LOAD;
	phdr->p_offset = phc->offset;
	phdr->p_vaddr = entry->start;
	phdr->p_paddr = 0;
	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
	phdr->p_align = PAGE_SIZE;
	phdr->p_flags = 0;
	if (entry->protection & VM_PROT_READ)
		phdr->p_flags |= PF_R;
	if (entry->protection & VM_PROT_WRITE)
		phdr->p_flags |= PF_W;
	if (entry->protection & VM_PROT_EXECUTE)
		phdr->p_flags |= PF_X;

	phc->offset += phdr->p_filesz;
	phc->phdr++;
}

/*
 * A callback for each_writable_segment() to gather information about
 * the number of segments and their total size.
 */
static void
cb_size_segment(entry, closure)
	vm_map_entry_t entry;
	void *closure;
{
	struct sseg_closure *ssc = (struct sseg_closure *)closure;

	ssc->count++;
	ssc->size += entry->end - entry->start;
}

/*
 * For each writable segment in the process's memory map, call the given
 * function with a pointer to the map entry and some arbitrary
 * caller-supplied data.
 */
static void
each_writable_segment(p, func, closure)
	struct proc *p;
	segment_callback func;
	void *closure;
{
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;

	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		vm_object_t obj;

		/*
		 * Don't dump inaccessible mappings; deal with legacy
		 * coredump mode.
		 *
		 * Note that read-only segments related to the elf binary
		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
		 * need to arbitrarily ignore such segments.
		 */
		if (elf_legacy_coredump) {
			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
				continue;
		} else {
			if ((entry->protection & VM_PROT_ALL) == 0)
				continue;
		}

		/*
		 * Don't include a memory segment in the coredump if
		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
		 * madvise(2).  Do not dump submaps (i.e. parts of the
		 * kernel map).
		 */
		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
			continue;

		if ((obj = entry->object.vm_object) == NULL)
			continue;

		/* Find the deepest backing object. */
		while (obj->backing_object != NULL)
			obj = obj->backing_object;

		/* Ignore memory-mapped devices and such things. */
		if (obj->type != OBJT_DEFAULT &&
		    obj->type != OBJT_SWAP &&
		    obj->type != OBJT_VNODE)
			continue;

		(*func)(entry, closure);
	}
}

/*
 * Write the core file header to the file, including padding up to
 * the page boundary.
 */
static int
__elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize)
	struct thread *td;
	struct vnode *vp;
	struct ucred *cred;
	int numsegs;
	size_t hdrsize;
	void *hdr;
{
	struct {
		prstatus_t status;
		prfpregset_t fpregset;
		prpsinfo_t psinfo;
	} *tempdata;
	struct proc *p = td->td_proc;
	size_t off;
	prstatus_t *status;
	prfpregset_t *fpregset;
	prpsinfo_t *psinfo;

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK);
	status = &tempdata->status;
	fpregset = &tempdata->fpregset;
	psinfo = &tempdata->psinfo;

	/* Gather the information for the header. */
	status->pr_version = PRSTATUS_VERSION;
	status->pr_statussz = sizeof(prstatus_t);
	status->pr_gregsetsz = sizeof(gregset_t);
	status->pr_fpregsetsz = sizeof(fpregset_t);
	status->pr_osreldate = osreldate;
	status->pr_cursig = p->p_sig;
	status->pr_pid = p->p_pid;
	fill_regs(td, &status->pr_reg);

	fill_fpregs(td, fpregset);

	psinfo->pr_version = PRPSINFO_VERSION;
	psinfo->pr_psinfosz = sizeof(prpsinfo_t);
	strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));

	/* XXX - We don't fill in the command line arguments properly yet. */
	strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs));

	/* Fill in the header. */
	bzero(hdr, hdrsize);
	off = 0;
	__elfN(puthdr)(p, hdr, &off, status, fpregset, psinfo, numsegs);

	free(tempdata, M_TEMP);

	/* Write it to the core file. */
	return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
	    UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
	    td)); /* XXXKSE */
}

static void
__elfN(puthdr)(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
    const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
{
	size_t ehoff;
	size_t phoff;
	size_t noteoff;
	size_t notesz;

	ehoff = *off;
	*off += sizeof(Elf_Ehdr);

	phoff = *off;
	*off += (numsegs + 1) * sizeof(Elf_Phdr);

	noteoff = *off;
	__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
	    sizeof *status);
	__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
	    sizeof *fpregset);
	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
	    sizeof *psinfo);
	notesz = *off - noteoff;

	/* Align up to a page boundary for the program segments. */
	*off = round_page(*off);

	if (dst != NULL) {
		Elf_Ehdr *ehdr;
		Elf_Phdr *phdr;
		struct phdr_closure phc;

		/*
		 * Fill in the ELF header.
		 */
		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
		ehdr->e_ident[EI_MAG0] = ELFMAG0;
		ehdr->e_ident[EI_MAG1] = ELFMAG1;
		ehdr->e_ident[EI_MAG2] = ELFMAG2;
		ehdr->e_ident[EI_MAG3] = ELFMAG3;
		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
		ehdr->e_ident[EI_DATA] = ELF_DATA;
		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
		ehdr->e_ident[EI_ABIVERSION] = 0;
		ehdr->e_ident[EI_PAD] = 0;
		ehdr->e_type = ET_CORE;
		ehdr->e_machine = ELF_ARCH;
		ehdr->e_version = EV_CURRENT;
		ehdr->e_entry = 0;
		ehdr->e_phoff = phoff;
		ehdr->e_flags = 0;
		ehdr->e_ehsize = sizeof(Elf_Ehdr);
		ehdr->e_phentsize = sizeof(Elf_Phdr);
		ehdr->e_phnum = numsegs + 1;
		ehdr->e_shentsize = sizeof(Elf_Shdr);
		ehdr->e_shnum = 0;
		ehdr->e_shstrndx = SHN_UNDEF;

		/*
		 * Fill in the program header entries.
		 */
		phdr = (Elf_Phdr *)((char *)dst + phoff);

		/* The note segment. */
		phdr->p_type = PT_NOTE;
		phdr->p_offset = noteoff;
		phdr->p_vaddr = 0;
		phdr->p_paddr = 0;
		phdr->p_filesz = notesz;
		phdr->p_memsz = 0;
		phdr->p_flags = 0;
		phdr->p_align = 0;
		phdr++;

		/* All the writable segments from the program. */
		phc.phdr = phdr;
		phc.offset = *off;
		each_writable_segment(p, cb_put_phdr, &phc);
	}
}

static void
__elfN(putnote)(void *dst, size_t *off, const char *name, int type,
    const void *desc, size_t descsz)
{
	Elf_Note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = descsz;
	note.n_type = type;
	if (dst != NULL)
		bcopy(&note, (char *)dst + *off, sizeof note);
	*off += sizeof note;
	if (dst != NULL)
		bcopy(name, (char *)dst + *off, note.n_namesz);
	*off += roundup2(note.n_namesz, sizeof(Elf_Size));
	if (dst != NULL)
		bcopy(desc, (char *)dst + *off, note.n_descsz);
	*off += roundup2(note.n_descsz, sizeof(Elf_Size));
}
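
/*
 * Layout of each note record emitted above, with the name and desc
 * fields padded to Elf_Size alignment:
 *
 *	n_namesz  n_descsz  n_type
 *	name ("FreeBSD\0": n_namesz == 8, already aligned)
 *	desc (the prstatus_t, prfpregset_t or prpsinfo_t payload)
 */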

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
static struct execsw __elfN(execsw) = {
	__CONCAT(exec_, __elfN(imgact)),
	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
};
EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));