imgact_elf.c revision 249277
/*-
 * Copyright (c) 2000 David O'Brien
 * Copyright (c) 1995-1996 Søren Schmidt
 * Copyright (c) 1996 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/imgact_elf.c 249277 2013-04-08 19:58:32Z attilio $");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_core.h"

#include <sys/param.h>
#include <sys/capability.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/eventhandler.h>

#include <net/zlib.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>

#include <machine/elf.h>
#include <machine/md_var.h>

#define OLD_EI_BRAND	8

static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
    const char *interp, int interp_name_len, int32_t *osrel);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note,
    int32_t *osrel);
static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static boolean_t __elfN(check_note)(struct image_params *imgp,
    Elf_Brandnote *checknote, int32_t *osrel);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);

SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
    "");

#ifdef COMPRESS_USER_CORES
static int compress_core(gzFile, char *, char *, unsigned int,
    struct thread *td);
#define CORE_BUF_SIZE	(16 * 1024)
#endif

int __elfN(fallback_brand) = -1;
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
    &__elfN(fallback_brand));
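
/*
 * Example (illustrative only, not a shipped default): a loader.conf(5)
 * line such as
 *	kern.elf64.fallback_brand=3
 * would make ELFOSABI_LINUX (3) the brand of last resort for 64-bit
 * binaries that carry no recognized brand.
 */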

static int elf_legacy_coredump = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
    &elf_legacy_coredump, 0, "");

int __elfN(nxstack) =
#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */
	1;
#else
	0;
#endif
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");

#if __ELF_WORD_SIZE == 32
#if defined(__amd64__) || defined(__ia64__)
int i386_read_exec = 0;
SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
    "enable execution from readable segments");
#endif
#endif

static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];

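/*
 * Page-rounding helpers that take an explicit page size, unlike
 * trunc_page()/round_page(); "ps" is assumed to be a power of two.
 */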
#define	trunc_page_ps(va, ps)	((va) & ~(ps - 1))
#define	round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
#define	aligned(a, t)	(trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))

static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";

Elf_Brandnote __elfN(freebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(FREEBSD_ABI_VENDOR),
	.hdr.n_descsz	= sizeof(int32_t),
	.hdr.n_type	= 1,
	.vendor		= FREEBSD_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= __elfN(freebsd_trans_osrel)
};

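/*
 * Extract the osrel value from a FreeBSD ABI note: the 32-bit release
 * number immediately follows the padded note name.
 */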
static boolean_t
__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
{
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
	*osrel = *(const int32_t *)(p);

	return (TRUE);
}

static const char GNU_ABI_VENDOR[] = "GNU";
static int GNU_KFREEBSD_ABI_DESC = 3;

Elf_Brandnote __elfN(kfreebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= kfreebsd_trans_osrel
};

static boolean_t
kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
{
	const Elf32_Word *desc;
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));

	desc = (const Elf32_Word *)p;
	if (desc[0] != GNU_KFREEBSD_ABI_DESC)
		return (FALSE);

	/*
	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
	 * (__FreeBSD_version: <major><two-digit minor>Rxx) in LSB order.
	 */
	*osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];

	return (TRUE);
}

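/*
 * Register "entry" in the first free slot of the brand table; returns
 * -1 (with a console warning) if the table is full.
 */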
int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == NULL) {
			elf_brand_list[i] = entry;
			break;
		}
	}
	if (i == MAX_BRANDS) {
		printf("WARNING: %s: could not insert brandinfo entry: %p\n",
			__func__, entry);
		return (-1);
	}
	return (0);
}

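/*
 * Remove "entry" from the brand table; returns -1 if it was not found.
 */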
int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == entry) {
			elf_brand_list[i] = NULL;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

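/*
 * Report whether any process is currently running under the sysvec
 * belonging to "entry", e.g. so that an ABI module is not unloaded
 * while it is still in use.
 */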
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
	struct proc *p;
	int rval = FALSE;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_sysent == entry->sysvec) {
			rval = TRUE;
			break;
		}
	}
	sx_sunlock(&allproc_lock);

	return (rval);
}

static Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
    int interp_name_len, int32_t *osrel)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	Elf_Brandinfo *bi;
	boolean_t ret;
	int i;

	/*
	 * We support four types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding within the ELF header, (3) the path of the `interp_path'
	 * field, and (4) the ".note.ABI-tag" ELF section.
	 */

	/* Look for a ".note.ABI-tag" ELF section */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL)
			continue;
		if (hdr->e_machine == bi->machine && (bi->flags &
		    (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
			ret = __elfN(check_note)(imgp, bi->brand_note, osrel);
			if (ret)
				return (bi);
		}
	}

	/* If the executable has a brand, search for it in the brand list. */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
			continue;
		if (hdr->e_machine == bi->machine &&
		    (hdr->e_ident[EI_OSABI] == bi->brand ||
		    strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
		    bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
			return (bi);
	}

	/* Lacking a known brand, search for a recognized interpreter. */
	if (interp != NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			bi = elf_brand_list[i];
			if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
				continue;
			if (hdr->e_machine == bi->machine &&
			    /* ELF image p_filesz includes terminating zero */
			    strlen(bi->interp_path) + 1 == interp_name_len &&
			    strncmp(interp, bi->interp_path, interp_name_len)
			    == 0)
				return (bi);
		}
	}

	/* Lacking a recognized interpreter, try the default brand */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
			continue;
		if (hdr->e_machine == bi->machine &&
		    __elfN(fallback_brand) == bi->brand)
			return (bi);
	}
	return (NULL);
}

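/*
 * Sanity-check the ELF header: the magic, class, data encoding, version
 * and phentsize must match this kernel, and at least one registered
 * brand must support the target machine.
 */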
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
	Elf_Brandinfo *bi;
	int i;

	if (!IS_ELF(*hdr) ||
	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
	    hdr->e_version != ELF_TARG_VER)
		return (ENOEXEC);

	/*
	 * Make sure we have at least one brand for this machine.
	 */

	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && bi->machine == hdr->e_machine)
			break;
	}
	if (i == MAX_BRANDS)
		return (ENOEXEC);

	return (0);
}

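/*
 * Copy a partial page into the process address space.  The range is
 * backed with anonymous memory and the file data is copied out by
 * hand, since it does not cover whole pages.
 */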
static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot)
{
	struct sf_buf *sf;
	int error;
	vm_offset_t off;

	/*
	 * Create the page if it doesn't exist yet. Ignore errors.
	 */
	vm_map_lock(map);
	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);

	/*
	 * Find the page from the underlying object.
	 */
	if (object) {
		sf = vm_imgact_map_page(object, offset);
		if (sf == NULL)
			return (KERN_FAILURE);
		off = offset - trunc_page(offset);
		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
		    end - start);
		vm_imgact_unmap_page(sf);
		if (error) {
			return (KERN_FAILURE);
		}
	}

	return (KERN_SUCCESS);
}

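/*
 * Map [start, end) of "object" at "offset" into "map".  Unaligned head
 * and tail fragments go through __elfN(map_partial)(); the page-aligned
 * middle is inserted copy-on-write, unless the file offset itself is
 * not page aligned, in which case the data must be copied in by hand.
 */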
static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
{
	struct sf_buf *sf;
	vm_offset_t off;
	vm_size_t sz;
	int error, rv;

	if (start != trunc_page(start)) {
		rv = __elfN(map_partial)(map, object, offset, start,
		    round_page(start), prot);
		if (rv)
			return (rv);
		offset += round_page(start) - start;
		start = round_page(start);
	}
	if (end != round_page(end)) {
		rv = __elfN(map_partial)(map, object, offset +
		    trunc_page(end) - start, trunc_page(end), end, prot);
		if (rv)
			return (rv);
		end = trunc_page(end);
	}
	if (end > start) {
		if (offset & PAGE_MASK) {
			/*
			 * The mapping is not page aligned. This means we have
			 * to copy the data. Sigh.
			 */
			rv = vm_map_find(map, NULL, 0, &start, end - start,
			    FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
			if (rv)
				return (rv);
			if (object == NULL)
				return (KERN_SUCCESS);
			for (; start < end; start += sz) {
				sf = vm_imgact_map_page(object, offset);
				if (sf == NULL)
					return (KERN_FAILURE);
				off = offset - trunc_page(offset);
				sz = end - start;
				if (sz > PAGE_SIZE - off)
					sz = PAGE_SIZE - off;
				error = copyout((caddr_t)sf_buf_kva(sf) + off,
				    (caddr_t)start, sz);
				vm_imgact_unmap_page(sf);
				if (error) {
					return (KERN_FAILURE);
				}
				offset += sz;
			}
			rv = KERN_SUCCESS;
		} else {
			vm_object_reference(object);
			vm_map_lock(map);
			rv = vm_map_insert(map, object, offset, start, end,
			    prot, VM_PROT_ALL, cow);
			vm_map_unlock(map);
			if (rv != KERN_SUCCESS)
				vm_object_deallocate(object);
		}
		return (rv);
	} else {
		return (KERN_SUCCESS);
	}
}

static int
__elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize)
{
	struct sf_buf *sf;
	size_t map_len;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t map_addr;
	int error, rv, cow;
	size_t copy_len;
	vm_offset_t file_addr;

	/*
	 * It's necessary to fail if the filsz + offset taken from the
	 * header is greater than the actual file pager object's size.
	 * If we were to allow this, then the vm_map_find() below would
	 * walk right off the end of the file object and into the ether.
	 *
	 * While I'm here, might as well check for something else that
	 * is invalid: filsz cannot be greater than memsz.
	 */
	if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) {
		uprintf("elf_load_section: truncated ELF file\n");
		return (ENOEXEC);
	}

	object = imgp->object;
	map = &imgp->proc->p_vmspace->vm_map;
	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
	file_addr = trunc_page_ps(offset, pagesize);

	/*
	 * We have two choices.  We can either clear the data in the last page
	 * of an oversized mapping, or we can start the anon mapping a page
	 * early and copy the initialized data into that first page.  We
	 * choose the second.
	 */
	if (memsz > filsz)
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;

	if (map_len != 0) {
		/* cow flags: don't dump readonly sections in core */
		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);

		rv = __elfN(map_insert)(map,
				      object,
				      file_addr,	/* file offset */
				      map_addr,		/* virtual start */
				      map_addr + map_len,/* virtual end */
				      prot,
				      cow);
		if (rv != KERN_SUCCESS)
			return (EINVAL);

		/* we can stop now if we've covered it all */
		if (memsz == filsz) {
			return (0);
		}
	}

	/*
	 * We have to get the remaining bit of the file into the first part
	 * of the oversized map segment.  This is normally because the .data
	 * segment in the file is extended to provide bss.  It's a neat idea
	 * to try and save a page, but it's a pain in the behind to implement.
	 */
	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
	    map_addr;

	/* This had damn well better be true! */
	if (map_len != 0) {
		rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr +
		    map_len, VM_PROT_ALL, 0);
		if (rv != KERN_SUCCESS) {
			return (EINVAL);
		}
	}

	if (copy_len != 0) {
		vm_offset_t off;

		sf = vm_imgact_map_page(object, offset + filsz);
		if (sf == NULL)
			return (EIO);

		/* send the page fragment to user space */
		off = trunc_page_ps(offset + filsz, pagesize) -
		    trunc_page(offset + filsz);
		error = copyout((caddr_t)sf_buf_kva(sf) + off,
		    (caddr_t)map_addr, copy_len);
		vm_imgact_unmap_page(sf);
		if (error) {
			return (error);
		}
	}

	/*
	 * Set it to the specified protection.
	 * XXX had better undo the damage from pasting over the cracks here!
	 */
	vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
	    map_len), prot, FALSE);

	return (0);
}

/*
 * Load the file "file" into memory.  It may be either a shared object
 * or an executable.
 *
 * The "addr" reference parameter is in/out.  On entry, it specifies
 * the address where a shared object should be loaded.  If the file is
 * an executable, this value is ignored.  On exit, "addr" specifies
 * where the file was actually loaded.
 *
 * The "entry" reference parameter is out only.  On exit, it specifies
 * the entry point for the loaded file.
 */
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
	u_long *entry, size_t pagesize)
{
	struct {
		struct nameidata nd;
		struct vattr attr;
		struct image_params image_params;
	} *tempdata;
	const Elf_Ehdr *hdr = NULL;
	const Elf_Phdr *phdr = NULL;
	struct nameidata *nd;
	struct vattr *attr;
	struct image_params *imgp;
	vm_prot_t prot;
	u_long rbase;
	u_long base_addr = 0;
	int error, i, numsegs;

#ifdef CAPABILITY_MODE
	/*
	 * XXXJA: This check can go away once we are sufficiently confident
	 * that the checks in namei() are correct.
	 */
	if (IN_CAPABILITY_MODE(curthread))
		return (ECAPMODE);
#endif

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
	nd = &tempdata->nd;
	attr = &tempdata->attr;
	imgp = &tempdata->image_params;

	/*
	 * Initialize part of the common data
	 */
	imgp->proc = p;
	imgp->attr = attr;
	imgp->firstpage = NULL;
	imgp->image_header = NULL;
	imgp->object = NULL;
	imgp->execlabel = NULL;

	NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
	if ((error = namei(nd)) != 0) {
		nd->ni_vp = NULL;
		goto fail;
	}
	NDFREE(nd, NDF_ONLY_PNBUF);
	imgp->vp = nd->ni_vp;

	/*
	 * Check permissions, modes, uid, etc on the file, and "open" it.
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto fail;

	error = exec_map_first_page(imgp);
	if (error)
		goto fail;

	/*
	 * Also make certain that the interpreter stays the same, so set
	 * its VV_TEXT flag, too.
	 */
	VOP_SET_TEXT(nd->ni_vp);

	imgp->object = nd->ni_vp->v_object;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	if ((error = __elfN(check_header)(hdr)) != 0)
		goto fail;
	if (hdr->e_type == ET_DYN)
		rbase = *addr;
	else if (hdr->e_type == ET_EXEC)
		rbase = 0;
	else {
		error = ENOEXEC;
		goto fail;
	}

	/* Only support headers that fit within the first page for now */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		error = ENOEXEC;
		goto fail;
	}

	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr)) {
		error = ENOEXEC;
		goto fail;
	}

	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
			/* Loadable segment */
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize);
			if (error != 0)
				goto fail;
			/*
			 * Establish the base address if this is the
			 * first segment.
			 */
			if (numsegs == 0)
				base_addr = trunc_page(phdr[i].p_vaddr +
				    rbase);
			numsegs++;
		}
	}
	*addr = base_addr;
	*entry = (unsigned long)hdr->e_entry + rbase;

fail:
	if (imgp->firstpage)
		exec_unmap_first_page(imgp);

	if (nd->ni_vp)
		vput(nd->ni_vp);

	free(tempdata, M_TEMP);

	return (error);
}

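/*
 * Image activator for ELF binaries: validate the headers, select a
 * brand, replace the current address space, map in the PT_LOAD
 * segments and, if a PT_INTERP entry is present, load the interpreter
 * as well.
 */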
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	const Elf_Phdr *phdr;
	Elf_Auxargs *elf_auxargs;
	struct vmspace *vmspace;
	vm_prot_t prot;
	u_long text_size = 0, data_size = 0, total_size = 0;
	u_long text_addr = 0, data_addr = 0;
	u_long seg_size, seg_addr;
	u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0;
	int32_t osrel = 0;
	int error = 0, i, n, interp_name_len = 0;
	const char *interp = NULL, *newinterp = NULL;
	Elf_Brandinfo *brand_info;
	char *path;
	struct sysentvec *sv;

	/*
	 * Do we have a valid ELF header ?
	 *
	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
	 * if the particular brand doesn't support it.
	 */
	if (__elfN(check_header)(hdr) != 0 ||
	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
		return (-1);

	/*
	 * From here on down, we return an errno, not -1, as we've
	 * detected an ELF file.
	 */

	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		/* Only support headers in first page for now */
		return (ENOEXEC);
	}
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr))
		return (ENOEXEC);
	n = 0;
	baddr = 0;
	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:
			if (n == 0)
				baddr = phdr[i].p_vaddr;
			n++;
			break;
		case PT_INTERP:
			/* Path to interpreter */
			if (phdr[i].p_filesz > MAXPATHLEN ||
			    phdr[i].p_offset > PAGE_SIZE ||
			    phdr[i].p_filesz > PAGE_SIZE - phdr[i].p_offset)
				return (ENOEXEC);
			interp = imgp->image_header + phdr[i].p_offset;
			interp_name_len = phdr[i].p_filesz;
			break;
		case PT_GNU_STACK:
			if (__elfN(nxstack))
				imgp->stack_prot =
				    __elfN(trans_prot)(phdr[i].p_flags);
			break;
		}
	}

	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
	    &osrel);
	if (brand_info == NULL) {
		uprintf("ELF binary type \"%u\" not known.\n",
		    hdr->e_ident[EI_OSABI]);
		return (ENOEXEC);
	}
	if (hdr->e_type == ET_DYN) {
		if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0)
			return (ENOEXEC);
		/*
		 * Honour the base load address from the DSO if it is
		 * non-zero for some reason.
		 */
		if (baddr == 0)
			et_dyn_addr = ET_DYN_LOAD_ADDR;
		else
			et_dyn_addr = 0;
	} else
		et_dyn_addr = 0;
	sv = brand_info->sysvec;
	if (interp != NULL && brand_info->interp_newpath != NULL)
		newinterp = brand_info->interp_newpath;

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
	 *
	 * The VV_TEXT flag prevents modifications to the executable while
	 * the vnode is unlocked.
	 */
	VOP_UNLOCK(imgp->vp, 0);

	error = exec_new_vmspace(imgp, sv);
	imgp->proc->p_sysent = sv;

	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		return (error);

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:	/* Loadable segment */
			if (phdr[i].p_memsz == 0)
				break;
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    sv->sv_pagesize);
			if (error != 0)
				return (error);

			/*
			 * If this segment contains the program headers,
			 * remember their virtual address for the AT_PHDR
			 * aux entry. Static binaries don't usually include
			 * a PT_PHDR entry.
			 */
			if (phdr[i].p_offset == 0 &&
			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
				<= phdr[i].p_filesz)
				proghdr = phdr[i].p_vaddr + hdr->e_phoff +
				    et_dyn_addr;

			seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
			seg_size = round_page(phdr[i].p_memsz +
			    phdr[i].p_vaddr + et_dyn_addr - seg_addr);

			/*
			 * Make the largest executable segment the official
			 * text segment and all others data.
			 *
			 * Note that obreak() assumes that data_addr +
			 * data_size == end of data load area, and the ELF
			 * file format expects segments to be sorted by
			 * address.  If multiple data segments exist, the
			 * last one will be used.
			 */

			if (phdr[i].p_flags & PF_X && text_size < seg_size) {
				text_size = seg_size;
				text_addr = seg_addr;
			} else {
				data_size = seg_size;
				data_addr = seg_addr;
			}
			total_size += seg_size;
			break;
		case PT_PHDR:	/* Program header table info */
			proghdr = phdr[i].p_vaddr + et_dyn_addr;
			break;
		default:
			break;
		}
	}

	if (data_addr == 0 && data_size == 0) {
		data_addr = text_addr;
		data_size = text_size;
	}

	entry = (u_long)hdr->e_entry + et_dyn_addr;

	/*
	 * Check limits.  It should be safe to check the
	 * limits after loading the segments since we do
	 * not actually fault in all the segments pages.
	 */
	PROC_LOCK(imgp->proc);
	if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
	    text_size > maxtsiz ||
	    total_size > lim_cur(imgp->proc, RLIMIT_VMEM) ||
	    racct_set(imgp->proc, RACCT_DATA, data_size) != 0 ||
	    racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) {
		PROC_UNLOCK(imgp->proc);
		return (ENOMEM);
	}

	vmspace = imgp->proc->p_vmspace;
	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;

	/*
	 * We load the dynamic linker where a userland call
	 * to mmap(0, ...) would put it.  The rationale behind this
	 * calculation is that it leaves room for the heap to grow to
	 * its maximum allowed size.
	 */
	addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(imgp->proc,
	    RLIMIT_DATA));
	PROC_UNLOCK(imgp->proc);

	imgp->entry_addr = entry;

	if (interp != NULL) {
		int have_interp = FALSE;
		VOP_UNLOCK(imgp->vp, 0);
		if (brand_info->emul_path != NULL &&
		    brand_info->emul_path[0] != '\0') {
			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
			snprintf(path, MAXPATHLEN, "%s%s",
			    brand_info->emul_path, interp);
			error = __elfN(load_file)(imgp->proc, path, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			free(path, M_TEMP);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp && newinterp != NULL) {
			error = __elfN(load_file)(imgp->proc, newinterp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp) {
			error = __elfN(load_file)(imgp->proc, interp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
		}
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
		if (error != 0) {
			uprintf("ELF interpreter %s not found\n", interp);
			return (error);
		}
	} else
		addr = et_dyn_addr;

	/*
	 * Construct auxargs table (used by the fixup routine)
	 */
	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
	elf_auxargs->execfd = -1;
	elf_auxargs->phdr = proghdr;
	elf_auxargs->phent = hdr->e_phentsize;
	elf_auxargs->phnum = hdr->e_phnum;
	elf_auxargs->pagesz = PAGE_SIZE;
	elf_auxargs->base = addr;
	elf_auxargs->flags = 0;
	elf_auxargs->entry = entry;

	imgp->auxargs = elf_auxargs;
	imgp->interpreted = 0;
	imgp->reloc_base = addr;
	imgp->proc->p_osrel = osrel;

	return (error);
}

#define	suword __CONCAT(suword, __ELF_WORD_SIZE)

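/*
 * Build the ELF auxiliary vector (AT_* entries) just past the argument
 * and environment pointer arrays on the new process stack, then push
 * argc below the arrays; the runtime linker consumes these entries.
 */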
int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
	Elf_Addr *base;
	Elf_Addr *pos;

	base = (Elf_Addr *)*stack_base;
	pos = base + (imgp->args->argc + imgp->args->envc + 2);

	if (args->execfd != -1)
		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY(pos, AT_BASE, args->base);
	if (imgp->execpathp != 0)
		AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
	AUXARGS_ENTRY(pos, AT_OSRELDATE, osreldate);
	if (imgp->canary != 0) {
		AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
		AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
	}
	AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
	if (imgp->pagesizes != 0) {
		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
	}
	if (imgp->sysent->sv_timekeep_base != 0) {
		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
		    imgp->sysent->sv_timekeep_base);
	}
	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
	    imgp->sysent->sv_stackprot);
	AUXARGS_ENTRY(pos, AT_NULL, 0);

	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	base--;
	suword(base, (long)imgp->args->argc);
	*stack_base = (register_t *)base;
	return (0);
}


/*
 * Code for generating ELF core dumps.
 */

typedef void (*segment_callback)(vm_map_entry_t, void *);

/* Closure for cb_put_phdr(). */
struct phdr_closure {
	Elf_Phdr *phdr;		/* Program header to fill in */
	Elf_Off offset;		/* Offset of segment in core file */
};

/* Closure for cb_size_segment(). */
struct sseg_closure {
	int count;		/* Count of writable segments. */
	size_t size;		/* Total size of all writable segments. */
};

static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_writable_segment(struct thread *, segment_callback, void *);
static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
    int, void *, size_t, gzFile);
static void __elfN(puthdr)(struct thread *, void *, size_t *, int);
static void __elfN(putnote)(void *, size_t *, const char *, int,
    const void *, size_t);

#ifdef COMPRESS_USER_CORES
extern int compress_user_cores;
extern int compress_user_cores_gzlevel;
#endif

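/*
 * Write one segment of a user process to the core file, compressing it
 * on the way out when a gzFile handle is supplied.
 */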
static int
core_output(struct vnode *vp, void *base, size_t len, off_t offset,
    struct ucred *active_cred, struct ucred *file_cred,
    struct thread *td, char *core_buf, gzFile gzfile)
{
	int error;

	if (gzfile) {
#ifdef COMPRESS_USER_CORES
		error = compress_core(gzfile, base, core_buf, len, td);
#else
		panic("shouldn't be here");
#endif
	} else {
		error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset,
		    UIO_USERSPACE, IO_UNIT | IO_DIRECT, active_cred, file_cred,
		    NULL, td);
	}
	return (error);
}

int
__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
{
	struct ucred *cred = td->td_ucred;
	int error = 0;
	struct sseg_closure seginfo;
	void *hdr;
	size_t hdrsize;

	gzFile gzfile = Z_NULL;
	char *core_buf = NULL;
#ifdef COMPRESS_USER_CORES
	char gzopen_flags[8];
	char *p;
	int doing_compress = flags & IMGACT_CORE_COMPRESS;
#endif

	hdr = NULL;

#ifdef COMPRESS_USER_CORES
	if (doing_compress) {
		p = gzopen_flags;
		*p++ = 'w';
		if (compress_user_cores_gzlevel >= 0 &&
		    compress_user_cores_gzlevel <= 9)
			*p++ = '0' + compress_user_cores_gzlevel;
		*p = 0;
		gzfile = gz_open("", gzopen_flags, vp);
		if (gzfile == Z_NULL) {
			error = EFAULT;
			goto done;
		}
		core_buf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
		if (!core_buf) {
			error = ENOMEM;
			goto done;
		}
	}
#endif

	/* Size the program segments. */
	seginfo.count = 0;
	seginfo.size = 0;
	each_writable_segment(td, cb_size_segment, &seginfo);

	/*
	 * Calculate the size of the core file header area by making
	 * a dry run of generating it.  Nothing is written, but the
	 * size is calculated.
	 */
	hdrsize = 0;
	__elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count);

#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_add(td->td_proc, RACCT_CORE, hdrsize + seginfo.size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0) {
		error = EFAULT;
		goto done;
	}
#endif
	if (hdrsize + seginfo.size >= limit) {
		error = EFAULT;
		goto done;
	}

	/*
	 * Allocate memory for building the header, fill it up,
	 * and write it out.
	 */
	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
	if (hdr == NULL) {
		error = EINVAL;
		goto done;
	}
	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize,
	    gzfile);

	/* Write the contents of all of the writable segments. */
	if (error == 0) {
		Elf_Phdr *php;
		off_t offset;
		int i;

		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
		offset = hdrsize;
		for (i = 0; i < seginfo.count; i++) {
			error = core_output(vp,
			    (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz,
			    offset, cred, NOCRED, curthread, core_buf, gzfile);
			if (error != 0)
				break;
			offset += php->p_filesz;
			php++;
		}
	}
	if (error) {
		log(LOG_WARNING,
		    "Failed to write core file for process %s (error %d)\n",
		    curproc->p_comm, error);
	}

done:
#ifdef COMPRESS_USER_CORES
	if (core_buf)
		free(core_buf, M_TEMP);
	if (gzfile)
		gzclose(gzfile);
#endif

	free(hdr, M_TEMP);

	return (error);
}

1189
1190/*
1191 * A callback for each_writable_segment() to write out the segment's
1192 * program header entry.
1193 */
1194static void
1195cb_put_phdr(entry, closure)
1196	vm_map_entry_t entry;
1197	void *closure;
1198{
1199	struct phdr_closure *phc = (struct phdr_closure *)closure;
1200	Elf_Phdr *phdr = phc->phdr;
1201
1202	phc->offset = round_page(phc->offset);
1203
1204	phdr->p_type = PT_LOAD;
1205	phdr->p_offset = phc->offset;
1206	phdr->p_vaddr = entry->start;
1207	phdr->p_paddr = 0;
1208	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
1209	phdr->p_align = PAGE_SIZE;
1210	phdr->p_flags = __elfN(untrans_prot)(entry->protection);
1211
1212	phc->offset += phdr->p_filesz;
1213	phc->phdr++;
1214}
1215
1216/*
1217 * A callback for each_writable_segment() to gather information about
1218 * the number of segments and their total size.
1219 */
1220static void
1221cb_size_segment(entry, closure)
1222	vm_map_entry_t entry;
1223	void *closure;
1224{
1225	struct sseg_closure *ssc = (struct sseg_closure *)closure;
1226
1227	ssc->count++;
1228	ssc->size += entry->end - entry->start;
1229}
1230
1231/*
1232 * For each writable segment in the process's memory map, call the given
1233 * function with a pointer to the map entry and some arbitrary
1234 * caller-supplied data.
1235 */
1236static void
1237each_writable_segment(td, func, closure)
1238	struct thread *td;
1239	segment_callback func;
1240	void *closure;
1241{
1242	struct proc *p = td->td_proc;
1243	vm_map_t map = &p->p_vmspace->vm_map;
1244	vm_map_entry_t entry;
1245	vm_object_t backing_object, object;
1246	boolean_t ignore_entry;
1247
1248	vm_map_lock_read(map);
1249	for (entry = map->header.next; entry != &map->header;
1250	    entry = entry->next) {
1251		/*
1252		 * Don't dump inaccessible mappings, deal with legacy
1253		 * coredump mode.
1254		 *
1255		 * Note that read-only segments related to the elf binary
1256		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
1257		 * need to arbitrarily ignore such segments.
1258		 */
1259		if (elf_legacy_coredump) {
1260			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
1261				continue;
1262		} else {
1263			if ((entry->protection & VM_PROT_ALL) == 0)
1264				continue;
1265		}
1266
1267		/*
1268		 * Dont include memory segment in the coredump if
1269		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
1270		 * madvise(2).  Do not dump submaps (i.e. parts of the
1271		 * kernel map).
1272		 */
1273		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
1274			continue;
1275
1276		if ((object = entry->object.vm_object) == NULL)
1277			continue;
1278
1279		/* Ignore memory-mapped devices and such things. */
1280		VM_OBJECT_RLOCK(object);
1281		while ((backing_object = object->backing_object) != NULL) {
1282			VM_OBJECT_RLOCK(backing_object);
1283			VM_OBJECT_RUNLOCK(object);
1284			object = backing_object;
1285		}
1286		ignore_entry = object->type != OBJT_DEFAULT &&
1287		    object->type != OBJT_SWAP && object->type != OBJT_VNODE;
1288		VM_OBJECT_RUNLOCK(object);
1289		if (ignore_entry)
1290			continue;
1291
1292		(*func)(entry, closure);
1293	}
1294	vm_map_unlock_read(map);
1295}
1296
/*
 * Write the core file header to the file, including padding up to
 * the page boundary.
 */
static int
__elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred,
    int numsegs, void *hdr, size_t hdrsize, gzFile gzfile)
{
	size_t off;

	/* Fill in the header. */
	bzero(hdr, hdrsize);
	off = 0;
	__elfN(puthdr)(td, hdr, &off, numsegs);

	if (!gzfile) {
		/* Write it to the core file. */
		return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
			UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
			td));
	} else {
#ifdef COMPRESS_USER_CORES
		if (gzwrite(gzfile, hdr, hdrsize) != hdrsize) {
			log(LOG_WARNING,
			    "Failed to compress core file header for process"
			    " %s.\n", curproc->p_comm);
			return (EFAULT);
		} else {
			return (0);
		}
#else
		panic("shouldn't be here");
#endif
	}
}

#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
#include <compat/freebsd32/freebsd32.h>

typedef struct prstatus32 elf_prstatus_t;
typedef struct prpsinfo32 elf_prpsinfo_t;
typedef struct fpreg32 elf_prfpregset_t;
typedef struct fpreg32 elf_fpregset_t;
typedef struct reg32 elf_gregset_t;
typedef struct thrmisc32 elf_thrmisc_t;
#else
typedef prstatus_t elf_prstatus_t;
typedef prpsinfo_t elf_prpsinfo_t;
typedef prfpregset_t elf_prfpregset_t;
typedef prfpregset_t elf_fpregset_t;
typedef gregset_t elf_gregset_t;
typedef thrmisc_t elf_thrmisc_t;
#endif

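/*
 * Lay out the core file header: the ELF header, the program header
 * table and the per-thread note area.  Called twice: the first pass
 * (dst == NULL) only computes the size, the second fills in the
 * allocated buffer.
 */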
static void
__elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs)
{
	struct {
		elf_prstatus_t status;
		elf_prfpregset_t fpregset;
		elf_prpsinfo_t psinfo;
		elf_thrmisc_t thrmisc;
	} *tempdata;
	elf_prstatus_t *status;
	elf_prfpregset_t *fpregset;
	elf_prpsinfo_t *psinfo;
	elf_thrmisc_t *thrmisc;
	struct proc *p;
	struct thread *thr;
	size_t ehoff, noteoff, notesz, phoff;

	p = td->td_proc;

	ehoff = *off;
	*off += sizeof(Elf_Ehdr);

	phoff = *off;
	*off += (numsegs + 1) * sizeof(Elf_Phdr);

	noteoff = *off;
	/*
	 * Don't allocate space for the notes if we're just calculating
	 * the size of the header. We also don't collect the data.
	 */
	if (dst != NULL) {
		tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK);
		status = &tempdata->status;
		fpregset = &tempdata->fpregset;
		psinfo = &tempdata->psinfo;
		thrmisc = &tempdata->thrmisc;
	} else {
		tempdata = NULL;
		status = NULL;
		fpregset = NULL;
		psinfo = NULL;
		thrmisc = NULL;
	}

	if (dst != NULL) {
		psinfo->pr_version = PRPSINFO_VERSION;
		psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
		strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
		/*
		 * XXX - We don't fill in the command line arguments properly
		 * yet.
		 */
		strlcpy(psinfo->pr_psargs, p->p_comm,
		    sizeof(psinfo->pr_psargs));
	}
	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
	    sizeof *psinfo);

	/*
	 * To have the debugger select the right thread (LWP) as the initial
	 * thread, we dump the state of the thread passed to us in td first.
	 * This is the thread that causes the core dump and thus likely to
	 * be the right thread one wants to have selected in the debugger.
	 */
	thr = td;
	while (thr != NULL) {
		if (dst != NULL) {
			status->pr_version = PRSTATUS_VERSION;
			status->pr_statussz = sizeof(elf_prstatus_t);
			status->pr_gregsetsz = sizeof(elf_gregset_t);
			status->pr_fpregsetsz = sizeof(elf_fpregset_t);
			status->pr_osreldate = osreldate;
			status->pr_cursig = p->p_sig;
			status->pr_pid = thr->td_tid;
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
			fill_regs32(thr, &status->pr_reg);
			fill_fpregs32(thr, fpregset);
#else
			fill_regs(thr, &status->pr_reg);
			fill_fpregs(thr, fpregset);
#endif
			memset(&thrmisc->_pad, 0, sizeof (thrmisc->_pad));
			strcpy(thrmisc->pr_tname, thr->td_name);
		}
		__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
		    sizeof *status);
		__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
		    sizeof *fpregset);
		__elfN(putnote)(dst, off, "FreeBSD", NT_THRMISC, thrmisc,
		    sizeof *thrmisc);
		/*
		 * Allow for MD specific notes, as well as any MD
		 * specific preparations for writing MI notes.
		 */
		__elfN(dump_thread)(thr, dst, off);

		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
		    TAILQ_NEXT(thr, td_plist);
		if (thr == td)
			thr = TAILQ_NEXT(thr, td_plist);
	}

	notesz = *off - noteoff;

	if (dst != NULL)
		free(tempdata, M_TEMP);

	/* Align up to a page boundary for the program segments. */
	*off = round_page(*off);

	if (dst != NULL) {
		Elf_Ehdr *ehdr;
		Elf_Phdr *phdr;
		struct phdr_closure phc;

		/*
		 * Fill in the ELF header.
		 */
		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
		ehdr->e_ident[EI_MAG0] = ELFMAG0;
		ehdr->e_ident[EI_MAG1] = ELFMAG1;
		ehdr->e_ident[EI_MAG2] = ELFMAG2;
		ehdr->e_ident[EI_MAG3] = ELFMAG3;
		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
		ehdr->e_ident[EI_DATA] = ELF_DATA;
		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
		ehdr->e_ident[EI_ABIVERSION] = 0;
		ehdr->e_ident[EI_PAD] = 0;
		ehdr->e_type = ET_CORE;
#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
		ehdr->e_machine = ELF_ARCH32;
#else
		ehdr->e_machine = ELF_ARCH;
#endif
		ehdr->e_version = EV_CURRENT;
		ehdr->e_entry = 0;
		ehdr->e_phoff = phoff;
		ehdr->e_flags = 0;
		ehdr->e_ehsize = sizeof(Elf_Ehdr);
		ehdr->e_phentsize = sizeof(Elf_Phdr);
		ehdr->e_phnum = numsegs + 1;
		ehdr->e_shentsize = sizeof(Elf_Shdr);
		ehdr->e_shnum = 0;
		ehdr->e_shstrndx = SHN_UNDEF;

		/*
		 * Fill in the program header entries.
		 */
		phdr = (Elf_Phdr *)((char *)dst + phoff);

		/* The note segment. */
		phdr->p_type = PT_NOTE;
		phdr->p_offset = noteoff;
		phdr->p_vaddr = 0;
		phdr->p_paddr = 0;
		phdr->p_filesz = notesz;
		phdr->p_memsz = 0;
		phdr->p_flags = PF_R;
		phdr->p_align = sizeof(Elf32_Size);
		phdr++;

		/* All the writable segments from the program. */
		phc.phdr = phdr;
		phc.offset = *off;
		each_writable_segment(td, cb_put_phdr, &phc);
	}
}

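/*
 * Emit a single ELF note: the fixed-size header, then the name and the
 * descriptor, each padded to a 4-byte boundary.  When dst is NULL only
 * *off is advanced, which is how the header size is computed.
 */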
static void
__elfN(putnote)(void *dst, size_t *off, const char *name, int type,
    const void *desc, size_t descsz)
{
	Elf_Note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = descsz;
	note.n_type = type;
	if (dst != NULL)
		bcopy(&note, (char *)dst + *off, sizeof note);
	*off += sizeof note;
	if (dst != NULL)
		bcopy(name, (char *)dst + *off, note.n_namesz);
	*off += roundup2(note.n_namesz, sizeof(Elf32_Size));
	if (dst != NULL)
		bcopy(desc, (char *)dst + *off, note.n_descsz);
	*off += roundup2(note.n_descsz, sizeof(Elf32_Size));
}

static boolean_t
__elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote,
    int32_t *osrel, const Elf_Phdr *pnote)
{
	const Elf_Note *note, *note0, *note_end;
	const char *note_name;
	int i;

	if (pnote == NULL || pnote->p_offset > PAGE_SIZE ||
	    pnote->p_filesz > PAGE_SIZE - pnote->p_offset)
		return (FALSE);

	note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
	note_end = (const Elf_Note *)(imgp->image_header +
	    pnote->p_offset + pnote->p_filesz);
	for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
		if (!aligned(note, Elf32_Addr) || (const char *)note_end -
		    (const char *)note < sizeof(Elf_Note))
			return (FALSE);
		if (note->n_namesz != checknote->hdr.n_namesz ||
		    note->n_descsz != checknote->hdr.n_descsz ||
		    note->n_type != checknote->hdr.n_type)
			goto nextnote;
		note_name = (const char *)(note + 1);
		if (note_name + checknote->hdr.n_namesz >=
		    (const char *)note_end || strncmp(checknote->vendor,
		    note_name, checknote->hdr.n_namesz) != 0)
			goto nextnote;

		/*
		 * Fetch the osreldate for the binary
		 * from the ELF OSABI-note if necessary.
		 */
		if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 &&
		    checknote->trans_osrel != NULL)
			return (checknote->trans_osrel(note, osrel));
		return (TRUE);

nextnote:
		note = (const Elf_Note *)((const char *)(note + 1) +
		    roundup2(note->n_namesz, sizeof(Elf32_Addr)) +
		    roundup2(note->n_descsz, sizeof(Elf32_Addr)));
	}

	return (FALSE);
}

/*
 * Try to find the appropriate ABI-note section for checknote, and
 * fetch the osreldate for the binary from the ELF OSABI-note.  Only
 * the first page of the image is searched, the same as for headers.
 */
static boolean_t
__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote,
    int32_t *osrel)
{
	const Elf_Phdr *phdr;
	const Elf_Ehdr *hdr;
	int i;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);

	for (i = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_NOTE &&
		    __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i]))
			return (TRUE);
	}
	return (FALSE);
}

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
static struct execsw __elfN(execsw) = {
	__CONCAT(exec_, __elfN(imgact)),
	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
};
EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));

#ifdef COMPRESS_USER_CORES
/*
 * Compress and write out a core segment for a user process.
 *
 * 'inbuf' is the starting address of a VM segment in the process' address
 * space that is to be compressed and written out to the core file.  'dest_buf'
 * is a buffer in the kernel's address space.  The segment is copied from
 * 'inbuf' to 'dest_buf' first before being processed by the compression
 * routine gzwrite().  This copying is necessary because the content of the VM
 * segment may change between the compression pass and the crc-computation pass
 * in gzwrite().  This is because realtime threads may preempt the UNIX kernel.
 */
static int
compress_core(gzFile file, char *inbuf, char *dest_buf, unsigned int len,
    struct thread *td)
{
	int len_compressed;
	int error = 0;
	unsigned int chunk_len;

	while (len) {
		chunk_len = (len > CORE_BUF_SIZE) ? CORE_BUF_SIZE : len;
		copyin(inbuf, dest_buf, chunk_len);
		len_compressed = gzwrite(file, dest_buf, chunk_len);

		EVENTHANDLER_INVOKE(app_coredump_progress, td, len_compressed);

		if ((unsigned int)len_compressed != chunk_len) {
			log(LOG_WARNING,
			    "compress_core: length mismatch (0x%x returned, "
			    "0x%x expected)\n", len_compressed, chunk_len);
			EVENTHANDLER_INVOKE(app_coredump_error, td,
			    "compress_core: length mismatch %x -> %x",
			    chunk_len, len_compressed);
			error = EFAULT;
			break;
		}
		inbuf += chunk_len;
		len -= chunk_len;
		maybe_yield();
	}

	return (error);
}
#endif /* COMPRESS_USER_CORES */

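/*
 * Translate ELF segment flags (PF_*) into vm_prot_t values, and back
 * for core dump program headers.
 */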
static vm_prot_t
__elfN(trans_prot)(Elf_Word flags)
{
	vm_prot_t prot;

	prot = 0;
	if (flags & PF_X)
		prot |= VM_PROT_EXECUTE;
	if (flags & PF_W)
		prot |= VM_PROT_WRITE;
	if (flags & PF_R)
		prot |= VM_PROT_READ;
#if __ELF_WORD_SIZE == 32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && (flags & PF_R))
		prot |= VM_PROT_EXECUTE;
#endif
#endif
	return (prot);
}

static Elf_Word
__elfN(untrans_prot)(vm_prot_t prot)
{
	Elf_Word flags;

	flags = 0;
	if (prot & VM_PROT_EXECUTE)
		flags |= PF_X;
	if (prot & VM_PROT_READ)
		flags |= PF_R;
	if (prot & VM_PROT_WRITE)
		flags |= PF_W;
	return (flags);
}
1709