imgact_elf.c revision 102424
/*-
 * Copyright (c) 2000 David O'Brien
 * Copyright (c) 1995-1996 Søren Schmidt
 * Copyright (c) 1996 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/kern/imgact_elf.c 102424 2002-08-25 22:36:52Z jake $
 */

#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>

#include <machine/elf.h>
#include <machine/md_var.h>

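/* e_ident[] index of the FreeBSD 3.x-style brand string. */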
#define OLD_EI_BRAND	8

__ElfType(Brandinfo);
__ElfType(Auxargs);

static int __elfN(check_header)(const Elf_Ehdr *hdr);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct proc *p,
    struct vmspace *vmspace, struct vnode *vp, vm_object_t object,
    vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
    vm_prot_t prot, size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);

static int elf_trace = 0;
#if __ELF_WORD_SIZE == 32
SYSCTL_INT(_debug, OID_AUTO, elf32_trace, CTLFLAG_RW, &elf_trace, 0, "");
#else
SYSCTL_INT(_debug, OID_AUTO, elf64_trace, CTLFLAG_RW, &elf_trace, 0, "");
#endif

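/*
 * Table of registered ELF brands; unused slots are NULL.
 */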
static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];

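/*
 * Register an ELF brand in the first free slot of the brand table.
 * Returns 0 on success or -1 if the table is full.
 */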
int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == NULL) {
			elf_brand_list[i] = entry;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

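/*
 * Remove a previously registered brand from the table.  Returns 0 on
 * success or -1 if the entry was not found.
 */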
int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == entry) {
			elf_brand_list[i] = NULL;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

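/*
 * Return TRUE if some process is currently running under the given
 * brand's sysentvec, i.e. the brand is still in use.
 */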
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
	struct proc *p;
	int rval = FALSE;

	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		if (p->p_sysent == entry->sysvec) {
			rval = TRUE;
			break;
		}
	}
	sx_sunlock(&allproc_lock);

	return (rval);
}

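/*
 * Sanity-check an ELF header: magic number, class, byte order and
 * version, plus whether any registered brand supports the file's
 * machine type.  Returns 0 if the header is acceptable, ENOEXEC
 * otherwise.
 */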
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
	int i;

	if (!IS_ELF(*hdr) ||
	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
	    hdr->e_ident[EI_VERSION] != EV_CURRENT)
		return (ENOEXEC);

	/*
	 * Make sure we have at least one brand for this machine.
	 */

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] != NULL &&
		    elf_brand_list[i]->machine == hdr->e_machine)
			break;
	}
	if (i == MAX_BRANDS)
		return (ENOEXEC);

	if (hdr->e_version != ELF_TARG_VER)
		return (ENOEXEC);

	return (0);
}

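/*
 * Map the sub-page fragment [start, end) of an object into the target
 * map.  Since the fragment does not cover a whole page it cannot simply
 * be mapped: the backing page is mapped temporarily into exec_map and
 * the fragment is copied out to the (freshly created) destination page.
 */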
static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	vm_offset_t start, vm_offset_t end, vm_prot_t prot,
	vm_prot_t max)
{
	int error, rv;
	vm_offset_t off;
	vm_offset_t data_buf = 0;

	/*
	 * Create the page if it doesn't exist yet. Ignore errors.
	 */
	vm_map_lock(map);
	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), max,
	    max, 0);
	vm_map_unlock(map);

	/*
	 * Find the page from the underlying object.
	 */
	if (object) {
		vm_object_reference(object);
		rv = vm_map_find(exec_map,
				 object,
				 trunc_page(offset),
				 &data_buf,
				 PAGE_SIZE,
				 TRUE,
				 VM_PROT_READ,
				 VM_PROT_ALL,
				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (rv);
		}

		off = offset - trunc_page(offset);
		error = copyout((caddr_t)data_buf + off, (caddr_t)start,
		    end - start);
		vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
		if (error) {
			return (KERN_FAILURE);
		}
	}

	return (KERN_SUCCESS);
}

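/*
 * Insert the object range [start, end) into the target map.  Unaligned
 * head and tail fragments are handled by __elfN(map_partial)(); if the
 * file offset itself is not page aligned the data has to be copied a
 * page at a time, otherwise the object is mapped directly with
 * vm_map_insert().
 */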
static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	vm_offset_t start, vm_offset_t end, vm_prot_t prot,
	vm_prot_t max, int cow)
{
	int rv;

	if (start != trunc_page(start)) {
		rv = __elfN(map_partial)(map, object, offset, start,
		    round_page(start), prot, max);
		if (rv)
			return (rv);
		offset += round_page(start) - start;
		start = round_page(start);
	}
	if (end != round_page(end)) {
		rv = __elfN(map_partial)(map, object, offset +
		    trunc_page(end) - start, trunc_page(end), end, prot, max);
		if (rv)
			return (rv);
		end = trunc_page(end);
	}
	if (end > start) {
		if (offset & PAGE_MASK) {
			vm_offset_t data_buf, off;
			vm_size_t sz;
			int error;

			/*
			 * The mapping is not page aligned. This means we have
			 * to copy the data. Sigh.
			 */
			rv = vm_map_find(map, 0, 0, &start, end - start,
			    FALSE, prot, max, 0);
			if (rv)
				return (rv);
			while (start < end) {
				vm_object_reference(object);
				rv = vm_map_find(exec_map,
						 object,
						 trunc_page(offset),
						 &data_buf,
						 2 * PAGE_SIZE,
						 TRUE,
						 VM_PROT_READ,
						 VM_PROT_ALL,
						 (MAP_COPY_ON_WRITE
						  | MAP_PREFAULT_PARTIAL));
				if (rv != KERN_SUCCESS) {
					vm_object_deallocate(object);
					return (rv);
				}
				off = offset - trunc_page(offset);
				sz = end - start;
				if (sz > PAGE_SIZE)
					sz = PAGE_SIZE;
				error = copyout((caddr_t)data_buf + off,
				    (caddr_t)start, sz);
				vm_map_remove(exec_map, data_buf,
				    data_buf + 2 * PAGE_SIZE);
				if (error) {
					return (KERN_FAILURE);
				}
				/* Advance the source along with the dest. */
				offset += sz;
				start += sz;
			}
			rv = KERN_SUCCESS;
		} else {
			vm_map_lock(map);
			rv = vm_map_insert(map, object, offset, start, end,
			    prot, max, cow);
			vm_map_unlock(map);
		}
		return (rv);
	} else {
		return (KERN_SUCCESS);
	}
}

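/*
 * Map a single PT_LOAD segment into the process's address space: the
 * file-backed portion is mapped (or copied, if the offsets are not
 * page aligned), any bss area beyond filsz is backed by zero-filled
 * anonymous memory, and the partial page at the boundary between the
 * two is copied in by hand.
 */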
static int
__elfN(load_section)(struct proc *p, struct vmspace *vmspace,
	struct vnode *vp, vm_object_t object, vm_offset_t offset,
	caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
	size_t pagesize)
{
	size_t map_len;
	vm_offset_t map_addr;
	int error, rv;
	size_t copy_len;
	vm_offset_t file_addr;
	vm_offset_t data_buf = 0;

	GIANT_REQUIRED;

	error = 0;

	/*
	 * It's necessary to fail if the filsz + offset taken from the
	 * header is greater than the actual file pager object's size.
	 * If we were to allow this, then the vm_map_find() below would
	 * walk right off the end of the file object and into the ether.
	 *
	 * While I'm here, might as well check for something else that
	 * is invalid: filsz cannot be greater than memsz.
	 */
	if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
	    filsz > memsz) {
		uprintf("elf_load_section: truncated ELF file\n");
		return (ENOEXEC);
	}

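/*
 * Like trunc_page()/round_page(), but for an arbitrary power-of-two
 * page size "ps", e.g. trunc_page_ps(0x2345, 0x2000) == 0x2000 and
 * round_page_ps(0x2345, 0x2000) == 0x4000.
 */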
#define trunc_page_ps(va, ps)	((va) & ~(ps - 1))
#define round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))

	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
	file_addr = trunc_page_ps(offset, pagesize);

	/*
	 * We have two choices.  We can either clear the data in the last page
	 * of an oversized mapping, or we can start the anon mapping a page
	 * early and copy the initialized data into that first page.  We
	 * choose the second.
	 */
	if (memsz > filsz)
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;

	if (map_len != 0) {
		vm_object_reference(object);
		rv = __elfN(map_insert)(&vmspace->vm_map,
				      object,
				      file_addr,	/* file offset */
				      map_addr,		/* virtual start */
				      map_addr + map_len,/* virtual end */
				      prot,
				      VM_PROT_ALL,
				      MAP_COPY_ON_WRITE | MAP_PREFAULT);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (EINVAL);
		}

		/* we can stop now if we've covered it all */
		if (memsz == filsz) {
			return (0);
		}
	}


	/*
	 * We have to get the remaining bit of the file into the first part
	 * of the oversized map segment.  This is normally because the .data
	 * segment in the file is extended to provide bss.  It's a neat idea
	 * to try and save a page, but it's a pain in the behind to implement.
	 */
	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
	    map_addr;

	/* This had damn well better be true! */
	if (map_len != 0) {
		rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
		    map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0);
		if (rv != KERN_SUCCESS) {
			return (EINVAL);
		}
	}

	if (copy_len != 0) {
		vm_offset_t off;
		vm_object_reference(object);
		rv = vm_map_find(exec_map,
				 object,
				 trunc_page(offset + filsz),
				 &data_buf,
				 PAGE_SIZE,
				 TRUE,
				 VM_PROT_READ,
				 VM_PROT_ALL,
				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
		if (rv != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (EINVAL);
		}

		/* send the page fragment to user space */
		off = trunc_page_ps(offset + filsz, pagesize) -
		    trunc_page(offset + filsz);
		error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr,
		    copy_len);
		vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
		if (error) {
			return (error);
		}
	}

	/*
	 * set it to the specified protection.
	 * XXX had better undo the damage from pasting over the cracks here!
	 */
	vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
	    round_page(map_addr + map_len),  prot, FALSE);

	return (error);
}

/*
 * Load the file "file" into memory.  It may be either a shared object
 * or an executable.
 *
 * The "addr" reference parameter is in/out.  On entry, it specifies
 * the address where a shared object should be loaded.  If the file is
 * an executable, this value is ignored.  On exit, "addr" specifies
 * where the file was actually loaded.
 *
 * The "entry" reference parameter is out only.  On exit, it specifies
 * the entry point for the loaded file.
 */
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
	u_long *entry, size_t pagesize)
{
	struct {
		struct nameidata nd;
		struct vattr attr;
		struct image_params image_params;
	} *tempdata;
	const Elf_Ehdr *hdr = NULL;
	const Elf_Phdr *phdr = NULL;
	struct nameidata *nd;
	struct vmspace *vmspace = p->p_vmspace;
	struct vattr *attr;
	struct image_params *imgp;
	vm_prot_t prot;
	u_long rbase;
	u_long base_addr = 0;
	int error, i, numsegs;

	if (curthread->td_proc != p)
		panic("elf_load_file - thread");	/* XXXKSE DIAGNOSTIC */

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
	nd = &tempdata->nd;
	attr = &tempdata->attr;
	imgp = &tempdata->image_params;

	/*
	 * Initialize part of the common data
	 */
	imgp->proc = p;
	imgp->uap = NULL;
	imgp->attr = attr;
	imgp->firstpage = NULL;
	imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
	imgp->object = NULL;

	if (imgp->image_header == NULL) {
		nd->ni_vp = NULL;
		error = ENOMEM;
		goto fail;
	}

	/* XXXKSE */
	NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);

	if ((error = namei(nd)) != 0) {
		nd->ni_vp = NULL;
		goto fail;
	}
	NDFREE(nd, NDF_ONLY_PNBUF);
	imgp->vp = nd->ni_vp;

	/*
	 * Check permissions, modes, uid, etc on the file, and "open" it.
	 */
	error = exec_check_permissions(imgp);
	if (error) {
		VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
		goto fail;
	}

	error = exec_map_first_page(imgp);
	/*
	 * Also make certain that the interpreter stays the same, so set
	 * its VV_TEXT flag, too.
	 */
	if (error == 0)
		nd->ni_vp->v_vflag |= VV_TEXT;

	VOP_GETVOBJECT(nd->ni_vp, &imgp->object);
	vm_object_reference(imgp->object);

	VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
	if (error)
		goto fail;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	if ((error = __elfN(check_header)(hdr)) != 0)
		goto fail;
	if (hdr->e_type == ET_DYN)
		rbase = *addr;
	else if (hdr->e_type == ET_EXEC)
		rbase = 0;
	else {
		error = ENOEXEC;
		goto fail;
	}

	/* Only support headers that fit within first page for now */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
		error = ENOEXEC;
		goto fail;
	}

	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);

	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
			prot = 0;
			if (phdr[i].p_flags & PF_X)
  				prot |= VM_PROT_EXECUTE;
			if (phdr[i].p_flags & PF_W)
  				prot |= VM_PROT_WRITE;
			if (phdr[i].p_flags & PF_R)
  				prot |= VM_PROT_READ;

			if ((error = __elfN(load_section)(p, vmspace,
			    nd->ni_vp, imgp->object, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    pagesize)) != 0)
				goto fail;
			/*
			 * Establish the base address if this is the
			 * first segment.
			 */
			if (numsegs == 0)
  				base_addr = trunc_page(phdr[i].p_vaddr +
				    rbase);
			numsegs++;
		}
	}
	*addr = base_addr;
	*entry = (unsigned long)hdr->e_entry + rbase;

fail:
	if (imgp->firstpage)
		exec_unmap_first_page(imgp);
	if (imgp->image_header)
		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
		    PAGE_SIZE);
	if (imgp->object)
		vm_object_deallocate(imgp->object);

	if (nd->ni_vp)
		vrele(nd->ni_vp);

	free(tempdata, M_TEMP);

	return (error);
}

extern int fallback_elf_brand;

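/*
 * The ELF image activator: validate the header, pick a brand, build the
 * new vmspace, map each PT_LOAD segment, load the interpreter named by
 * PT_INTERP if there is one, and construct the auxargs vector that
 * __elfN(freebsd_fixup)() will later copy out to the stack.
 */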
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	const Elf_Phdr *phdr;
	Elf_Auxargs *elf_auxargs = NULL;
	struct vmspace *vmspace;
	vm_prot_t prot;
	u_long text_size = 0, data_size = 0;
	u_long text_addr = 0, data_addr = 0;
	u_long addr, entry = 0, proghdr = 0;
	vm_offset_t maxuser, usrstack, pagesize;
	int error, i;
	const char *interp = NULL;
	Elf_Brandinfo *brand_info;
	char *path;
	struct thread *td = curthread;

	GIANT_REQUIRED;

	/*
	 * Do we have a valid ELF header ?
	 */
	if (__elfN(check_header)(hdr) != 0 || hdr->e_type != ET_EXEC)
		return (-1);

	/*
	 * From here on down, we return an errno, not -1, as we've
	 * detected an ELF file.
	 */

	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
		/* Only support headers in first page for now */
		return (ENOEXEC);
	}
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);

	/*
	 * From this point on, we may have resources that need to be freed.
	 */

	VOP_UNLOCK(imgp->vp, 0, td);

	if ((error = exec_extract_strings(imgp)) != 0)
		goto fail;

	/*
	 * Tentatively identify the brand based on the machine so that
	 * we can figure out VM ranges and page sizes.
	 */
	brand_info = NULL;
	for (i = 0; i < MAX_BRANDS; i++) {
		Elf_Brandinfo *bi = elf_brand_list[i];

		if (bi != NULL &&
		    hdr->e_machine == bi->machine &&
		    (hdr->e_ident[EI_OSABI] == bi->brand
		     || 0 ==
		     strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
		     bi->compat_3_brand, strlen(bi->compat_3_brand)))) {
			brand_info = bi;
			break;
		}
	}

	pagesize = PAGE_SIZE;
	maxuser = VM_MAXUSER_ADDRESS;
	usrstack = USRSTACK;
	if (brand_info) {
		if (brand_info->sysvec->sv_pagesize)
			pagesize = brand_info->sysvec->sv_pagesize;
		if (brand_info->sysvec->sv_maxuser)
			maxuser = brand_info->sysvec->sv_maxuser;
		if (brand_info->sysvec->sv_usrstack)
			usrstack = brand_info->sysvec->sv_usrstack;
	}

	exec_new_vmspace(imgp, VM_MIN_ADDRESS, maxuser, usrstack);

	vmspace = imgp->proc->p_vmspace;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:	/* Loadable segment */
			prot = 0;
			if (phdr[i].p_flags & PF_X)
  				prot |= VM_PROT_EXECUTE;
			if (phdr[i].p_flags & PF_W)
  				prot |= VM_PROT_WRITE;
			if (phdr[i].p_flags & PF_R)
  				prot |= VM_PROT_READ;

#if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
			/*
			 * Some x86 binaries assume read == executable,
			 * notably the M3 runtime and therefore cvsup
			 */
			if (prot & VM_PROT_READ)
				prot |= VM_PROT_EXECUTE;
#endif

			if ((error = __elfN(load_section)(imgp->proc, vmspace,
			    imgp->vp, imgp->object, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    pagesize)) != 0)
  				goto fail;

			/*
			 * Is this .text or .data?
			 *
			 * We only handle one of each so far.  XXX
			 */
			if (hdr->e_entry >= phdr[i].p_vaddr &&
			    hdr->e_entry < (phdr[i].p_vaddr +
			    phdr[i].p_memsz)) {
  				text_addr = trunc_page(phdr[i].p_vaddr);
  				text_size = round_page(phdr[i].p_memsz +
				    phdr[i].p_vaddr - text_addr);
				entry = (u_long)hdr->e_entry;
			} else {
  				data_addr = trunc_page(phdr[i].p_vaddr);
  				data_size = round_page(phdr[i].p_memsz +
				    phdr[i].p_vaddr - data_addr);
			}
			break;
	  	case PT_INTERP:	/* Path to interpreter */
			if (phdr[i].p_filesz > MAXPATHLEN ||
			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
				error = ENOEXEC;
				goto fail;
			}
			interp = imgp->image_header + phdr[i].p_offset;
			break;
		case PT_PHDR: 	/* Program header table info */
			proghdr = phdr[i].p_vaddr;
			break;
		default:
			break;
		}
	}

	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;

	addr = ELF_RTLD_ADDR(vmspace);

	imgp->entry_addr = entry;

	brand_info = NULL;

	/* We support three types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding w/in the ELF header, and (3) the path of the `interp_path'
	 * field.  We should also look for an ".note.ABI-tag" ELF section now
	 * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
	 */

	/* If the executable has a brand, search for it in the brand list. */
	if (brand_info == NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			Elf_Brandinfo *bi = elf_brand_list[i];

			if (bi != NULL &&
			    hdr->e_machine == bi->machine &&
			    (hdr->e_ident[EI_OSABI] == bi->brand
			    || 0 ==
			    strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
			    bi->compat_3_brand, strlen(bi->compat_3_brand)))) {
				brand_info = bi;
				break;
			}
		}
	}

	/* Lacking a known brand, search for a recognized interpreter. */
	if (brand_info == NULL && interp != NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			Elf_Brandinfo *bi = elf_brand_list[i];

			if (bi != NULL &&
			    hdr->e_machine == bi->machine &&
			    strcmp(interp, bi->interp_path) == 0) {
				brand_info = bi;
				break;
			}
		}
	}

	/* Lacking a recognized interpreter, try the default brand */
	if (brand_info == NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			Elf_Brandinfo *bi = elf_brand_list[i];

			if (bi != NULL &&
			    hdr->e_machine == bi->machine &&
			    fallback_elf_brand == bi->brand) {
				brand_info = bi;
				break;
			}
		}
	}

	if (brand_info == NULL) {
		uprintf("ELF binary type \"%u\" not known.\n",
		    hdr->e_ident[EI_OSABI]);
		error = ENOEXEC;
		goto fail;
	}

	imgp->proc->p_sysent = brand_info->sysvec;
	if (interp != NULL) {
		path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
		snprintf(path, MAXPATHLEN, "%s%s",
			 brand_info->emul_path, interp);
		if ((error = __elfN(load_file)(imgp->proc, path, &addr,
		    &imgp->entry_addr, pagesize)) != 0) {
			if ((error = __elfN(load_file)(imgp->proc, interp,
			    &addr, &imgp->entry_addr, pagesize)) != 0) {
				uprintf("ELF interpreter %s not found\n",
				    path);
				free(path, M_TEMP);
				goto fail;
			}
		}
		free(path, M_TEMP);
	}

	/*
	 * Construct auxargs table (used by the fixup routine)
	 */
	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
	elf_auxargs->execfd = -1;
	elf_auxargs->phdr = proghdr;
	elf_auxargs->phent = hdr->e_phentsize;
	elf_auxargs->phnum = hdr->e_phnum;
	elf_auxargs->pagesz = PAGE_SIZE;
	elf_auxargs->base = addr;
	elf_auxargs->flags = 0;
	elf_auxargs->entry = entry;
	elf_auxargs->trace = elf_trace;

	imgp->auxargs = elf_auxargs;
	imgp->interpreted = 0;

fail:
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
	return (error);
}

#if __ELF_WORD_SIZE == 32
#define suword	suword32
#define stacktype u_int32_t
#else
#define suword	suword64
#define stacktype u_int64_t
#endif

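/*
 * Place the ELF auxiliary vector on the new process's stack, directly
 * after the argv and envp pointer arrays, then push argc and move
 * *stack_base down to point at it.
 */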
int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
	stacktype *base;
	stacktype *pos;

	base = (stacktype *)*stack_base;
	pos = base + (imgp->argc + imgp->envc + 2);

	if (args->trace) {
		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
	}
	if (args->execfd != -1) {
		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
	}
	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY(pos, AT_BASE, args->base);
	AUXARGS_ENTRY(pos, AT_NULL, 0);

	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	base--;
	suword(base, (long)imgp->argc);
	*stack_base = (register_t *)base;
	return (0);
}

/*
 * Code for generating ELF core dumps.
 */

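/*
 * A core file consists of an ELF header, one program header for the
 * note segment plus one per writable map entry, the notes themselves
 * (NT_PRSTATUS, NT_FPREGSET and NT_PRPSINFO), and finally the raw
 * contents of each writable segment, page aligned.
 */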
typedef void (*segment_callback)(vm_map_entry_t, void *);

/* Closure for cb_put_phdr(). */
struct phdr_closure {
	Elf_Phdr *phdr;		/* Program header to fill in */
	Elf_Off offset;		/* Offset of segment in core file */
};

/* Closure for cb_size_segment(). */
struct sseg_closure {
	int count;		/* Count of writable segments. */
	size_t size;		/* Total size of all writable segments. */
};

static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_writable_segment(struct proc *, segment_callback, void *);
static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
    int, void *, size_t);
static void __elfN(puthdr)(struct proc *, void *, size_t *,
    const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int);
static void __elfN(putnote)(void *, size_t *, const char *, int,
    const void *, size_t);

extern int osreldate;

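/*
 * Write an ELF core dump of the given thread's process to the vnode vp,
 * failing with EFAULT if the header area plus all writable segments
 * would exceed the given size limit.
 */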
int
__elfN(coredump)(td, vp, limit)
	struct thread *td;
	register struct vnode *vp;
	off_t limit;
{
	register struct proc *p = td->td_proc;
	register struct ucred *cred = td->td_ucred;
	int error = 0;
	struct sseg_closure seginfo;
	void *hdr;
	size_t hdrsize;

	/* Size the program segments. */
	seginfo.count = 0;
	seginfo.size = 0;
	each_writable_segment(p, cb_size_segment, &seginfo);

	/*
	 * Calculate the size of the core file header area by making
	 * a dry run of generating it.  Nothing is written, but the
	 * size is calculated.
	 */
	hdrsize = 0;
	__elfN(puthdr)((struct proc *)NULL, (void *)NULL, &hdrsize,
	    (const prstatus_t *)NULL, (const prfpregset_t *)NULL,
	    (const prpsinfo_t *)NULL, seginfo.count);

	if (hdrsize + seginfo.size >= limit)
		return (EFAULT);

	/*
	 * Allocate memory for building the header, fill it up,
	 * and write it out.
	 */
	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
	if (hdr == NULL) {
		return (EINVAL);
	}
	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);

	/* Write the contents of all of the writable segments. */
	if (error == 0) {
		Elf_Phdr *php;
		off_t offset;
		int i;

		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
		offset = hdrsize;
		for (i = 0; i < seginfo.count; i++) {
			error = vn_rdwr_inchunks(UIO_WRITE, vp,
			    (caddr_t)(uintptr_t)php->p_vaddr,
			    php->p_filesz, offset, UIO_USERSPACE,
			    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *)NULL,
			    curthread); /* XXXKSE */
			if (error != 0)
				break;
			offset += php->p_filesz;
			php++;
		}
	}
	free(hdr, M_TEMP);

	return (error);
}

/*
 * A callback for each_writable_segment() to write out the segment's
 * program header entry.
 */
static void
cb_put_phdr(entry, closure)
	vm_map_entry_t entry;
	void *closure;
{
	struct phdr_closure *phc = (struct phdr_closure *)closure;
	Elf_Phdr *phdr = phc->phdr;

	phc->offset = round_page(phc->offset);

	phdr->p_type = PT_LOAD;
	phdr->p_offset = phc->offset;
	phdr->p_vaddr = entry->start;
	phdr->p_paddr = 0;
	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
	phdr->p_align = PAGE_SIZE;
	phdr->p_flags = 0;
	if (entry->protection & VM_PROT_READ)
		phdr->p_flags |= PF_R;
	if (entry->protection & VM_PROT_WRITE)
		phdr->p_flags |= PF_W;
	if (entry->protection & VM_PROT_EXECUTE)
		phdr->p_flags |= PF_X;

	phc->offset += phdr->p_filesz;
	phc->phdr++;
}

/*
 * A callback for each_writable_segment() to gather information about
 * the number of segments and their total size.
 */
static void
cb_size_segment(entry, closure)
	vm_map_entry_t entry;
	void *closure;
{
	struct sseg_closure *ssc = (struct sseg_closure *)closure;

	ssc->count++;
	ssc->size += entry->end - entry->start;
}

/*
 * For each writable segment in the process's memory map, call the given
 * function with a pointer to the map entry and some arbitrary
 * caller-supplied data.
 */
static void
each_writable_segment(p, func, closure)
	struct proc *p;
	segment_callback func;
	void *closure;
{
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;

	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		vm_object_t obj;

		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) !=
		    (VM_PROT_READ|VM_PROT_WRITE))
			continue;

		/*
		 * Don't include memory segments in the coredump if
		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
		 * madvise(2).
		 */
		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
			continue;

		if ((obj = entry->object.vm_object) == NULL)
			continue;

		/* Find the deepest backing object. */
		while (obj->backing_object != NULL)
			obj = obj->backing_object;

		/* Ignore memory-mapped devices and such things. */
		if (obj->type != OBJT_DEFAULT &&
		    obj->type != OBJT_SWAP &&
		    obj->type != OBJT_VNODE)
			continue;

		(*func)(entry, closure);
	}
}

/*
 * Write the core file header to the file, including padding up to
 * the page boundary.
 */
static int
__elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize)
	struct thread *td;
	struct vnode *vp;
	struct ucred *cred;
	int numsegs;
	size_t hdrsize;
	void *hdr;
{
	struct {
		prstatus_t status;
		prfpregset_t fpregset;
		prpsinfo_t psinfo;
	} *tempdata;
	struct proc *p = td->td_proc;
	size_t off;
	prstatus_t *status;
	prfpregset_t *fpregset;
	prpsinfo_t *psinfo;

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK);
	status = &tempdata->status;
	fpregset = &tempdata->fpregset;
	psinfo = &tempdata->psinfo;

	/* Gather the information for the header. */
	status->pr_version = PRSTATUS_VERSION;
	status->pr_statussz = sizeof(prstatus_t);
	status->pr_gregsetsz = sizeof(gregset_t);
	status->pr_fpregsetsz = sizeof(fpregset_t);
	status->pr_osreldate = osreldate;
	status->pr_cursig = p->p_sig;
	status->pr_pid = p->p_pid;
	fill_regs(td, &status->pr_reg);

	fill_fpregs(td, fpregset);

	psinfo->pr_version = PRPSINFO_VERSION;
	psinfo->pr_psinfosz = sizeof(prpsinfo_t);
	strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1);

	/* XXX - We don't fill in the command line arguments properly yet. */
	strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ);

	/* Fill in the header. */
	bzero(hdr, hdrsize);
	off = 0;
	__elfN(puthdr)(p, hdr, &off, status, fpregset, psinfo, numsegs);

	free(tempdata, M_TEMP);

	/* Write it to the core file. */
	return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
	    UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
	    td)); /* XXXKSE */
}

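/*
 * Build the core file header area -- ELF header, program headers and
 * note segment -- at dst, or merely compute its size when dst is NULL.
 * *off is advanced past everything emitted and rounded up to a page
 * boundary, where the segment data will begin.
 */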
static void
__elfN(puthdr)(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
    const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
{
	size_t ehoff;
	size_t phoff;
	size_t noteoff;
	size_t notesz;

	ehoff = *off;
	*off += sizeof(Elf_Ehdr);

	phoff = *off;
	*off += (numsegs + 1) * sizeof(Elf_Phdr);

	noteoff = *off;
	__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
	    sizeof *status);
	__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
	    sizeof *fpregset);
	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
	    sizeof *psinfo);
	notesz = *off - noteoff;

	/* Align up to a page boundary for the program segments. */
	*off = round_page(*off);

	if (dst != NULL) {
		Elf_Ehdr *ehdr;
		Elf_Phdr *phdr;
		struct phdr_closure phc;

		/*
		 * Fill in the ELF header.
		 */
		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
		ehdr->e_ident[EI_MAG0] = ELFMAG0;
		ehdr->e_ident[EI_MAG1] = ELFMAG1;
		ehdr->e_ident[EI_MAG2] = ELFMAG2;
		ehdr->e_ident[EI_MAG3] = ELFMAG3;
		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
		ehdr->e_ident[EI_DATA] = ELF_DATA;
		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
		ehdr->e_ident[EI_ABIVERSION] = 0;
		ehdr->e_ident[EI_PAD] = 0;
		ehdr->e_type = ET_CORE;
		ehdr->e_machine = ELF_ARCH;
		ehdr->e_version = EV_CURRENT;
		ehdr->e_entry = 0;
		ehdr->e_phoff = phoff;
		ehdr->e_flags = 0;
		ehdr->e_ehsize = sizeof(Elf_Ehdr);
		ehdr->e_phentsize = sizeof(Elf_Phdr);
		ehdr->e_phnum = numsegs + 1;
		ehdr->e_shentsize = sizeof(Elf_Shdr);
		ehdr->e_shnum = 0;
		ehdr->e_shstrndx = SHN_UNDEF;

		/*
		 * Fill in the program header entries.
		 */
		phdr = (Elf_Phdr *)((char *)dst + phoff);

		/* The note segment. */
		phdr->p_type = PT_NOTE;
		phdr->p_offset = noteoff;
		phdr->p_vaddr = 0;
		phdr->p_paddr = 0;
		phdr->p_filesz = notesz;
		phdr->p_memsz = 0;
		phdr->p_flags = 0;
		phdr->p_align = 0;
		phdr++;

		/* All the writable segments from the program. */
		phc.phdr = phdr;
		phc.offset = *off;
		each_writable_segment(p, cb_put_phdr, &phc);
	}
}

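/*
 * Emit one ELF note: the Elf_Note header, the NUL-terminated name and
 * the descriptor, with name and descriptor each padded to an Elf_Size
 * boundary.  As in __elfN(puthdr)(), a NULL dst only advances *off,
 * which is how the dry-run size calculation works.
 */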
static void
__elfN(putnote)(void *dst, size_t *off, const char *name, int type,
    const void *desc, size_t descsz)
{
	Elf_Note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = descsz;
	note.n_type = type;
	if (dst != NULL)
		bcopy(&note, (char *)dst + *off, sizeof note);
	*off += sizeof note;
	if (dst != NULL)
		bcopy(name, (char *)dst + *off, note.n_namesz);
	*off += roundup2(note.n_namesz, sizeof(Elf_Size));
	if (dst != NULL)
		bcopy(desc, (char *)dst + *off, note.n_descsz);
	*off += roundup2(note.n_descsz, sizeof(Elf_Size));
}

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
#if __ELF_WORD_SIZE == 32
static struct execsw elf_execsw = {exec_elf32_imgact, "ELF32"};
EXEC_SET(elf32, elf_execsw);
#else
static struct execsw elf_execsw = {exec_elf64_imgact, "ELF64"};
EXEC_SET(elf64, elf_execsw);
#endif
