linux32_sysvec.c revision 293540
131921Sbrian/*-
231921Sbrian * Copyright (c) 2004 Tim J. Robbins
331921Sbrian * Copyright (c) 2003 Peter Wemm
431921Sbrian * Copyright (c) 2002 Doug Rabson
531921Sbrian * Copyright (c) 1998-1999 Andrew Gallatin
631921Sbrian * Copyright (c) 1994-1996 Søren Schmidt
731921Sbrian * All rights reserved.
831921Sbrian *
931921Sbrian * Redistribution and use in source and binary forms, with or without
1031921Sbrian * modification, are permitted provided that the following conditions
1131921Sbrian * are met:
1231921Sbrian * 1. Redistributions of source code must retain the above copyright
1331921Sbrian *    notice, this list of conditions and the following disclaimer
1431921Sbrian *    in this position and unchanged.
1531921Sbrian * 2. Redistributions in binary form must reproduce the above copyright
1631921Sbrian *    notice, this list of conditions and the following disclaimer in the
1731921Sbrian *    documentation and/or other materials provided with the distribution.
1831921Sbrian * 3. The name of the author may not be used to endorse or promote products
1931921Sbrian *    derived from this software without specific prior written permission
2031921Sbrian *
2131921Sbrian * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
2231921Sbrian * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
2331921Sbrian * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
2431921Sbrian * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
2531921Sbrian * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
2650479Speter * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
2731061Sbrian * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
2831061Sbrian * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2964802Sbrian * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30202192Sed * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3136285Sbrian */
3233603Sbrian
3331061Sbrian#include <sys/cdefs.h>
3431061Sbrian__FBSDID("$FreeBSD: stable/10/sys/amd64/linux32/linux32_sysvec.c 293540 2016-01-09 16:29:51Z dchagin $");
3531061Sbrian#include "opt_compat.h"
3631061Sbrian
3731061Sbrian#ifndef COMPAT_FREEBSD32
3831061Sbrian#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
3936285Sbrian#endif
4032025Sbrian
4131061Sbrian#define	__ELF_WORD_SIZE	32
4236450Sbrian
4331061Sbrian#include <sys/param.h>
44202192Sed#include <sys/systm.h>
45202192Sed#include <sys/exec.h>
4636285Sbrian#include <sys/fcntl.h>
4736285Sbrian#include <sys/imgact.h>
4836467Sbrian#include <sys/imgact_elf.h>
4953241Sbrian#include <sys/kernel.h>
5051525Sbrian#include <sys/lock.h>
5151525Sbrian#include <sys/malloc.h>
5253535Sbrian#include <sys/module.h>
5353535Sbrian#include <sys/mutex.h>
5453535Sbrian#include <sys/proc.h>
5564802Sbrian#include <sys/resourcevar.h>
5664802Sbrian#include <sys/signalvar.h>
5764802Sbrian#include <sys/sysctl.h>
5864802Sbrian#include <sys/syscallsubr.h>
5964802Sbrian#include <sys/sysent.h>
6064802Sbrian#include <sys/sysproto.h>
6164802Sbrian#include <sys/vnode.h>
6264802Sbrian#include <sys/eventhandler.h>
6364802Sbrian
6464802Sbrian#include <vm/vm.h>
6564802Sbrian#include <vm/pmap.h>
6664802Sbrian#include <vm/vm_extern.h>
67202192Sed#include <vm/vm_map.h>
68202192Sed#include <vm/vm_object.h>
6964802Sbrian#include <vm/vm_page.h>
7064802Sbrian#include <vm/vm_param.h>
7164802Sbrian
7264802Sbrian#include <machine/cpu.h>
7364802Sbrian#include <machine/md_var.h>
74134885Smarcel#include <machine/pcb.h>
75134885Smarcel#include <machine/specialreg.h>
7664802Sbrian
7764802Sbrian#include <amd64/linux32/linux.h>
7864802Sbrian#include <amd64/linux32/linux32_proto.h>
7964802Sbrian#include <compat/linux/linux_emul.h>
8064802Sbrian#include <compat/linux/linux_futex.h>
8164802Sbrian#include <compat/linux/linux_ioctl.h>
82#include <compat/linux/linux_mib.h>
83#include <compat/linux/linux_misc.h>
84#include <compat/linux/linux_signal.h>
85#include <compat/linux/linux_util.h>
86#include <compat/linux/linux_vdso.h>
87
/* Version tag registered for the "linux" module. */
MODULE_VERSION(linux, 1);

/*
 * Store one 32-bit auxiliary vector entry at pos: the id word followed by
 * the value word, each written with suword32().  pos is post-incremented
 * twice, so it must be a plain modifiable pointer variable.  The result of
 * suword32() is not checked.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)	\
	do {				\
		suword32(pos++, id);	\
		suword32(pos++, val);	\
	} while (0)

/*
 * The first two bytes of an interpreter script ("#!") read as a single
 * 16-bit word; the numeric value depends on the host byte order.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * The sendsig functions are not system calls, but they still want to use
 * the ldebug() facility.  Give them syscall number 0 for that purpose;
 * this is slightly less bogus than using ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/*
 * File-scope state.  None of these objects is assigned in the part of the
 * file visible here; presumably they are set up by the VDSO install code
 * further down (TODO confirm).
 */
const char *linux_kplatform;
static int linux_szsigcode;
static vm_object_t linux_shared_page_obj;
static char *linux_shared_page_mapping;
/* Linker-generated bounds of the embedded linux32_locore.o image. */
extern char _binary_linux32_locore_o_start;
extern char _binary_linux32_locore_o_end;

/* System call table, defined in another translation unit. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations for the static helpers of this file. */
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static register_t *linux_copyout_strings(struct image_params *imgp);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td,
				   struct image_params *imgp, u_long stack);
static void	linux32_fixlimit(struct rlimit *rl, int which);
static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void	linux_vdso_install(void *param);
static void	linux_vdso_deinstall(void *param);
132
133/*
134 * Linux syscalls return negative errno's, we do positive and map them
135 * Reference:
136 *   FreeBSD: src/sys/sys/errno.h
137 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
138 *            linux-2.6.17.8/include/asm-generic/errno.h
139 */
140static int bsd_to_linux_errno[ELAST + 1] = {
141	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
142	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
149	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
150	 -72, -67, -71
151};
152
/*
 * FreeBSD -> Linux signal number translation.
 *
 * The slot numbers in the comments are array indices.  The layout is
 * consistent with slot i describing FreeBSD signal i + 1 (the lookup macro
 * BSD_TO_LINUX_SIGNAL() is defined elsewhere -- TODO confirm).  A 0 entry
 * means no Linux signal is assigned to that slot.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP,		/*  0 */
	LINUX_SIGINT,		/*  1 */
	LINUX_SIGQUIT,		/*  2 */
	LINUX_SIGILL,		/*  3 */
	LINUX_SIGTRAP,		/*  4 */
	LINUX_SIGABRT,		/*  5 */
	0,			/*  6: no Linux signal assigned */
	LINUX_SIGFPE,		/*  7 */
	LINUX_SIGKILL,		/*  8 */
	LINUX_SIGBUS,		/*  9 */
	LINUX_SIGSEGV,		/* 10 */
	LINUX_SIGSYS,		/* 11 */
	LINUX_SIGPIPE,		/* 12 */
	LINUX_SIGALRM,		/* 13 */
	LINUX_SIGTERM,		/* 14 */
	LINUX_SIGURG,		/* 15 */
	LINUX_SIGSTOP,		/* 16 */
	LINUX_SIGTSTP,		/* 17 */
	LINUX_SIGCONT,		/* 18 */
	LINUX_SIGCHLD,		/* 19 */
	LINUX_SIGTTIN,		/* 20 */
	LINUX_SIGTTOU,		/* 21 */
	LINUX_SIGIO,		/* 22 */
	LINUX_SIGXCPU,		/* 23 */
	LINUX_SIGXFSZ,		/* 24 */
	LINUX_SIGVTALRM,	/* 25 */
	LINUX_SIGPROF,		/* 26 */
	LINUX_SIGWINCH,		/* 27 */
	0,			/* 28: no Linux signal assigned */
	LINUX_SIGUSR1,		/* 29 */
	LINUX_SIGUSR2		/* 30 */
};
163
/*
 * Linux -> FreeBSD signal number translation.
 *
 * The slot numbers in the comments are array indices; presumably slot i
 * describes Linux signal i + 1 (the lookup macro is defined elsewhere --
 * TODO confirm).  Note that SIGBUS and SIGURG each appear twice: two
 * different slots are folded onto the same FreeBSD signal.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP,			/*  0 */
	SIGINT,			/*  1 */
	SIGQUIT,		/*  2 */
	SIGILL,			/*  3 */
	SIGTRAP,		/*  4 */
	SIGABRT,		/*  5 */
	SIGBUS,			/*  6 */
	SIGFPE,			/*  7 */
	SIGKILL,		/*  8 */
	SIGUSR1,		/*  9 */
	SIGSEGV,		/* 10 */
	SIGUSR2,		/* 11 */
	SIGPIPE,		/* 12 */
	SIGALRM,		/* 13 */
	SIGTERM,		/* 14 */
	SIGBUS,			/* 15: second slot mapped to SIGBUS */
	SIGCHLD,		/* 16 */
	SIGCONT,		/* 17 */
	SIGSTOP,		/* 18 */
	SIGTSTP,		/* 19 */
	SIGTTIN,		/* 20 */
	SIGTTOU,		/* 21 */
	SIGURG,			/* 22 */
	SIGXCPU,		/* 23 */
	SIGXFSZ,		/* 24 */
	SIGVTALRM,		/* 25 */
	SIGPROF,		/* 26 */
	SIGWINCH,		/* 27 */
	SIGIO,			/* 28 */
	SIGURG,			/* 29: second slot mapped to SIGURG */
	SIGSYS			/* 30 */
};
174
/* Linux trap number reported when a FreeBSD trap code has no mapping. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD code.
 * Codes without a named FreeBSD trap map to LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0 */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2 */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4 */
	LINUX_T_UNKNOWN,	/*  5 */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8 */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/*
 * Translate a FreeBSD trap code.  Anything outside the table yields
 * LINUX_T_UNKNOWN; a negative int argument also lands there because the
 * comparison against the (unsigned) element count converts it to a large
 * value.  The argument is evaluated twice.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
	_bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
213
/*
 * Argument/environment descriptor stored in the 32-bit process at
 * LINUX32_PS_STRINGS.  Pointer members are 32-bit user addresses.
 */
struct linux32_ps_strings {
	uint32_t	ps_argvstr;	/* user address of the argv vector */
	unsigned int	ps_nargvstr;	/* count of argument strings */
	uint32_t	ps_envstr;	/* user address of the envp vector */
	unsigned int	ps_nenvstr;	/* count of environment strings */
};
220
/*
 * Symbols declared through the LINUX_VDSO_SYM_*() macros.  In this file
 * linux32_sigcode and linux32_rt_sigcode are loaded into %rip as the
 * signal trampolines, linux32_vsyscall is exported as AT_SYSINFO and
 * linux_platform as AT_PLATFORM.
 */
LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
LINUX_VDSO_SYM_CHAR(linux_platform);
225
226/*
227 * If FreeBSD & Linux have a difference of opinion about what a trap
228 * means, deal with it here.
229 *
230 * MPSAFE
231 */
232static int
233translate_traps(int signal, int trap_code)
234{
235	if (signal != SIGBUS)
236		return signal;
237	switch (trap_code) {
238	case T_PROTFLT:
239	case T_TSSFLT:
240	case T_DOUBLEFLT:
241	case T_PAGEFLT:
242		return SIGSEGV;
243	default:
244		return signal;
245	}
246}
247
248static int
249elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
250{
251	Elf32_Auxargs *args;
252	Elf32_Addr *base;
253	Elf32_Addr *pos;
254	struct linux32_ps_strings *arginfo;
255
256	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
257
258	KASSERT(curthread->td_proc == imgp->proc,
259	    ("unsafe elf_linux_fixup(), should be curproc"));
260	base = (Elf32_Addr *)*stack_base;
261	args = (Elf32_Auxargs *)imgp->auxargs;
262	pos = base + (imgp->args->argc + imgp->args->envc + 2);
263
264	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
265	    imgp->proc->p_sysent->sv_shared_page_base);
266	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
267	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
268
269	/*
270	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
271	 * as it has appeared in the 2.4.0-rc7 first time.
272	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
273	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
274	 * is not present.
275	 * Also see linux_times() implementation.
276	 */
277	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
278		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
279	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
280	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
281	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
282	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
283	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
284	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
285	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
286	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
287	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
288	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
289	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
290	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
291	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
292	AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary));
293	if (imgp->execpathp != 0)
294		AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp));
295	if (args->execfd != -1)
296		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
297	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
298
299	free(imgp->auxargs, M_TEMP);
300	imgp->auxargs = NULL;
301
302	base--;
303	suword32(base, (uint32_t)imgp->args->argc);
304	*stack_base = (register_t *)base;
305	return (0);
306}
307
308static void
309linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
310{
311	struct thread *td = curthread;
312	struct proc *p = td->td_proc;
313	struct sigacts *psp;
314	struct trapframe *regs;
315	struct l_rt_sigframe *fp, frame;
316	int oonstack;
317	int sig;
318	int code;
319
320	sig = ksi->ksi_signo;
321	code = ksi->ksi_code;
322	PROC_LOCK_ASSERT(p, MA_OWNED);
323	psp = p->p_sigacts;
324	mtx_assert(&psp->ps_mtx, MA_OWNED);
325	regs = td->td_frame;
326	oonstack = sigonstack(regs->tf_rsp);
327
328#ifdef DEBUG
329	if (ldebug(rt_sendsig))
330		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
331		    catcher, sig, (void*)mask, code);
332#endif
333	/*
334	 * Allocate space for the signal handler context.
335	 */
336	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
337	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
338		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
339		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
340	} else
341		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
342	mtx_unlock(&psp->ps_mtx);
343
344	/*
345	 * Build the argument list for the signal handler.
346	 */
347	sig = BSD_TO_LINUX_SIGNAL(sig);
348
349	bzero(&frame, sizeof(frame));
350
351	frame.sf_handler = PTROUT(catcher);
352	frame.sf_sig = sig;
353	frame.sf_siginfo = PTROUT(&fp->sf_si);
354	frame.sf_ucontext = PTROUT(&fp->sf_sc);
355
356	/* Fill in POSIX parts */
357	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
358
359	/*
360	 * Build the signal context to be used by sigreturn
361	 * and libgcc unwind.
362	 */
363	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
364	frame.sf_sc.uc_link = 0;		/* XXX ??? */
365
366	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
367	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
368	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
369	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
370	PROC_UNLOCK(p);
371
372	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
373
374	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
375	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
376	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
377	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
378	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
379	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
380	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
381	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
382	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
383	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
384	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
385	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
386	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
387	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
388	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
389	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
390	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
391	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
392	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
393	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
394	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
395
396#ifdef DEBUG
397	if (ldebug(rt_sendsig))
398		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
399		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
400		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
401#endif
402
403	if (copyout(&frame, fp, sizeof(frame)) != 0) {
404		/*
405		 * Process has trashed its stack; give it an illegal
406		 * instruction to halt it in its tracks.
407		 */
408#ifdef DEBUG
409		if (ldebug(rt_sendsig))
410			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
411			    fp, oonstack);
412#endif
413		PROC_LOCK(p);
414		sigexit(td, SIGILL);
415	}
416
417	/*
418	 * Build context to run handler in.
419	 */
420	regs->tf_rsp = PTROUT(fp);
421	regs->tf_rip = linux32_rt_sigcode;
422	regs->tf_rflags &= ~(PSL_T | PSL_D);
423	regs->tf_cs = _ucode32sel;
424	regs->tf_ss = _udatasel;
425	regs->tf_ds = _udatasel;
426	regs->tf_es = _udatasel;
427	regs->tf_fs = _ufssel;
428	regs->tf_gs = _ugssel;
429	regs->tf_flags = TF_HASSEGS;
430	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
431	PROC_LOCK(p);
432	mtx_lock(&psp->ps_mtx);
433}
434
435
436/*
437 * Send an interrupt to process.
438 *
439 * Stack is set up to allow sigcode stored
440 * in u. to call routine, followed by kcall
441 * to sigreturn routine below.  After sigreturn
442 * resets the signal mask, the stack, and the
443 * frame pointer, it returns to the user
444 * specified pc, psl.
445 */
446static void
447linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
448{
449	struct thread *td = curthread;
450	struct proc *p = td->td_proc;
451	struct sigacts *psp;
452	struct trapframe *regs;
453	struct l_sigframe *fp, frame;
454	l_sigset_t lmask;
455	int oonstack, i;
456	int sig, code;
457
458	sig = ksi->ksi_signo;
459	code = ksi->ksi_code;
460	PROC_LOCK_ASSERT(p, MA_OWNED);
461	psp = p->p_sigacts;
462	mtx_assert(&psp->ps_mtx, MA_OWNED);
463	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
464		/* Signal handler installed with SA_SIGINFO. */
465		linux_rt_sendsig(catcher, ksi, mask);
466		return;
467	}
468
469	regs = td->td_frame;
470	oonstack = sigonstack(regs->tf_rsp);
471
472#ifdef DEBUG
473	if (ldebug(sendsig))
474		printf(ARGS(sendsig, "%p, %d, %p, %u"),
475		    catcher, sig, (void*)mask, code);
476#endif
477
478	/*
479	 * Allocate space for the signal handler context.
480	 */
481	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
482	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
483		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
484		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
485	} else
486		fp = (struct l_sigframe *)regs->tf_rsp - 1;
487	mtx_unlock(&psp->ps_mtx);
488	PROC_UNLOCK(p);
489
490	/*
491	 * Build the argument list for the signal handler.
492	 */
493	sig = BSD_TO_LINUX_SIGNAL(sig);
494
495	bzero(&frame, sizeof(frame));
496
497	frame.sf_handler = PTROUT(catcher);
498	frame.sf_sig = sig;
499
500	bsd_to_linux_sigset(mask, &lmask);
501
502	/*
503	 * Build the signal context to be used by sigreturn.
504	 */
505	frame.sf_sc.sc_mask   = lmask.__bits[0];
506	frame.sf_sc.sc_gs     = regs->tf_gs;
507	frame.sf_sc.sc_fs     = regs->tf_fs;
508	frame.sf_sc.sc_es     = regs->tf_es;
509	frame.sf_sc.sc_ds     = regs->tf_ds;
510	frame.sf_sc.sc_edi    = regs->tf_rdi;
511	frame.sf_sc.sc_esi    = regs->tf_rsi;
512	frame.sf_sc.sc_ebp    = regs->tf_rbp;
513	frame.sf_sc.sc_ebx    = regs->tf_rbx;
514	frame.sf_sc.sc_esp    = regs->tf_rsp;
515	frame.sf_sc.sc_edx    = regs->tf_rdx;
516	frame.sf_sc.sc_ecx    = regs->tf_rcx;
517	frame.sf_sc.sc_eax    = regs->tf_rax;
518	frame.sf_sc.sc_eip    = regs->tf_rip;
519	frame.sf_sc.sc_cs     = regs->tf_cs;
520	frame.sf_sc.sc_eflags = regs->tf_rflags;
521	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
522	frame.sf_sc.sc_ss     = regs->tf_ss;
523	frame.sf_sc.sc_err    = regs->tf_err;
524	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
525	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
526
527	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
528		frame.sf_extramask[i] = lmask.__bits[i+1];
529
530	if (copyout(&frame, fp, sizeof(frame)) != 0) {
531		/*
532		 * Process has trashed its stack; give it an illegal
533		 * instruction to halt it in its tracks.
534		 */
535		PROC_LOCK(p);
536		sigexit(td, SIGILL);
537	}
538
539	/*
540	 * Build context to run handler in.
541	 */
542	regs->tf_rsp = PTROUT(fp);
543	regs->tf_rip = linux32_sigcode;
544	regs->tf_rflags &= ~(PSL_T | PSL_D);
545	regs->tf_cs = _ucode32sel;
546	regs->tf_ss = _udatasel;
547	regs->tf_ds = _udatasel;
548	regs->tf_es = _udatasel;
549	regs->tf_fs = _ufssel;
550	regs->tf_gs = _ugssel;
551	regs->tf_flags = TF_HASSEGS;
552	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
553	PROC_LOCK(p);
554	mtx_lock(&psp->ps_mtx);
555}
556
557/*
558 * System call to cleanup state after a signal
559 * has been taken.  Reset signal mask and
560 * stack state from context left by sendsig (above).
561 * Return to previous pc and psl as specified by
562 * context left by sendsig. Check carefully to
563 * make sure that the user has not modified the
564 * psl to gain improper privileges or to cause
565 * a machine fault.
566 */
567int
568linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
569{
570	struct l_sigframe frame;
571	struct trapframe *regs;
572	sigset_t bmask;
573	l_sigset_t lmask;
574	int eflags, i;
575	ksiginfo_t ksi;
576
577	regs = td->td_frame;
578
579#ifdef DEBUG
580	if (ldebug(sigreturn))
581		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
582#endif
583	/*
584	 * The trampoline code hands us the sigframe.
585	 * It is unsafe to keep track of it ourselves, in the event that a
586	 * program jumps out of a signal handler.
587	 */
588	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
589		return (EFAULT);
590
591	/*
592	 * Check for security violations.
593	 */
594#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
595	eflags = frame.sf_sc.sc_eflags;
596	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
597		return(EINVAL);
598
599	/*
600	 * Don't allow users to load a valid privileged %cs.  Let the
601	 * hardware check for invalid selectors, excess privilege in
602	 * other selectors, invalid %eip's and invalid %esp's.
603	 */
604#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
605	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
606		ksiginfo_init_trap(&ksi);
607		ksi.ksi_signo = SIGBUS;
608		ksi.ksi_code = BUS_OBJERR;
609		ksi.ksi_trapno = T_PROTFLT;
610		ksi.ksi_addr = (void *)regs->tf_rip;
611		trapsignal(td, &ksi);
612		return(EINVAL);
613	}
614
615	lmask.__bits[0] = frame.sf_sc.sc_mask;
616	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
617		lmask.__bits[i+1] = frame.sf_extramask[i];
618	linux_to_bsd_sigset(&lmask, &bmask);
619	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
620
621	/*
622	 * Restore signal context.
623	 */
624	regs->tf_rdi    = frame.sf_sc.sc_edi;
625	regs->tf_rsi    = frame.sf_sc.sc_esi;
626	regs->tf_rbp    = frame.sf_sc.sc_ebp;
627	regs->tf_rbx    = frame.sf_sc.sc_ebx;
628	regs->tf_rdx    = frame.sf_sc.sc_edx;
629	regs->tf_rcx    = frame.sf_sc.sc_ecx;
630	regs->tf_rax    = frame.sf_sc.sc_eax;
631	regs->tf_rip    = frame.sf_sc.sc_eip;
632	regs->tf_cs     = frame.sf_sc.sc_cs;
633	regs->tf_ds     = frame.sf_sc.sc_ds;
634	regs->tf_es     = frame.sf_sc.sc_es;
635	regs->tf_fs     = frame.sf_sc.sc_fs;
636	regs->tf_gs     = frame.sf_sc.sc_gs;
637	regs->tf_rflags = eflags;
638	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
639	regs->tf_ss     = frame.sf_sc.sc_ss;
640	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
641
642	return (EJUSTRETURN);
643}
644
645/*
646 * System call to cleanup state after a signal
647 * has been taken.  Reset signal mask and
648 * stack state from context left by rt_sendsig (above).
649 * Return to previous pc and psl as specified by
650 * context left by sendsig. Check carefully to
651 * make sure that the user has not modified the
652 * psl to gain improper privileges or to cause
653 * a machine fault.
654 */
655int
656linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
657{
658	struct l_ucontext uc;
659	struct l_sigcontext *context;
660	sigset_t bmask;
661	l_stack_t *lss;
662	stack_t ss;
663	struct trapframe *regs;
664	int eflags;
665	ksiginfo_t ksi;
666
667	regs = td->td_frame;
668
669#ifdef DEBUG
670	if (ldebug(rt_sigreturn))
671		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
672#endif
673	/*
674	 * The trampoline code hands us the ucontext.
675	 * It is unsafe to keep track of it ourselves, in the event that a
676	 * program jumps out of a signal handler.
677	 */
678	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
679		return (EFAULT);
680
681	context = &uc.uc_mcontext;
682
683	/*
684	 * Check for security violations.
685	 */
686#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
687	eflags = context->sc_eflags;
688	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
689		return(EINVAL);
690
691	/*
692	 * Don't allow users to load a valid privileged %cs.  Let the
693	 * hardware check for invalid selectors, excess privilege in
694	 * other selectors, invalid %eip's and invalid %esp's.
695	 */
696#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
697	if (!CS_SECURE(context->sc_cs)) {
698		ksiginfo_init_trap(&ksi);
699		ksi.ksi_signo = SIGBUS;
700		ksi.ksi_code = BUS_OBJERR;
701		ksi.ksi_trapno = T_PROTFLT;
702		ksi.ksi_addr = (void *)regs->tf_rip;
703		trapsignal(td, &ksi);
704		return(EINVAL);
705	}
706
707	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
708	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
709
710	/*
711	 * Restore signal context
712	 */
713	regs->tf_gs	= context->sc_gs;
714	regs->tf_fs	= context->sc_fs;
715	regs->tf_es	= context->sc_es;
716	regs->tf_ds	= context->sc_ds;
717	regs->tf_rdi    = context->sc_edi;
718	regs->tf_rsi    = context->sc_esi;
719	regs->tf_rbp    = context->sc_ebp;
720	regs->tf_rbx    = context->sc_ebx;
721	regs->tf_rdx    = context->sc_edx;
722	regs->tf_rcx    = context->sc_ecx;
723	regs->tf_rax    = context->sc_eax;
724	regs->tf_rip    = context->sc_eip;
725	regs->tf_cs     = context->sc_cs;
726	regs->tf_rflags = eflags;
727	regs->tf_rsp    = context->sc_esp_at_signal;
728	regs->tf_ss     = context->sc_ss;
729	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
730
731	/*
732	 * call sigaltstack & ignore results..
733	 */
734	lss = &uc.uc_stack;
735	ss.ss_sp = PTRIN(lss->ss_sp);
736	ss.ss_size = lss->ss_size;
737	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
738
739#ifdef DEBUG
740	if (ldebug(rt_sigreturn))
741		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
742		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
743#endif
744	(void)kern_sigaltstack(td, &ss, NULL);
745
746	return (EJUSTRETURN);
747}
748
749static int
750linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
751{
752	struct proc *p;
753	struct trapframe *frame;
754
755	p = td->td_proc;
756	frame = td->td_frame;
757
758	sa->args[0] = frame->tf_rbx;
759	sa->args[1] = frame->tf_rcx;
760	sa->args[2] = frame->tf_rdx;
761	sa->args[3] = frame->tf_rsi;
762	sa->args[4] = frame->tf_rdi;
763	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
764	sa->code = frame->tf_rax;
765
766	if (sa->code >= p->p_sysent->sv_size)
767		sa->callp = &p->p_sysent->sv_table[0];
768	else
769		sa->callp = &p->p_sysent->sv_table[sa->code];
770	sa->narg = sa->callp->sy_narg;
771
772	td->td_retval[0] = 0;
773	td->td_retval[1] = frame->tf_rdx;
774
775	return (0);
776}
777
778/*
779 * If a linux binary is exec'ing something, try this image activator
780 * first.  We override standard shell script execution in order to
781 * be able to modify the interpreter path.  We only do this if a linux
782 * binary is doing the exec, so we do not create an EXEC module for it.
783 */
784static int	exec_linux_imgact_try(struct image_params *iparams);
785
786static int
787exec_linux_imgact_try(struct image_params *imgp)
788{
789	const char *head = (const char *)imgp->image_header;
790	char *rpath;
791	int error = -1;
792
793	/*
794	* The interpreter for shell scripts run from a linux binary needs
795	* to be located in /compat/linux if possible in order to recursively
796	* maintain linux path emulation.
797	*/
798	if (((const short *)head)[0] == SHELLMAGIC) {
799		/*
800		* Run our normal shell image activator.  If it succeeds attempt
801		* to use the alternate path for the interpreter.  If an
802		* alternate * path is found, use our stringspace to store it.
803		*/
804		if ((error = exec_shell_imgact(imgp)) == 0) {
805			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
806			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
807			    AT_FDCWD);
808			if (rpath != NULL)
809				imgp->args->fname_buf =
810				    imgp->interpreter_name = rpath;
811		}
812	}
813	return (error);
814}
815
816/*
817 * Clear registers on exec
818 * XXX copied from ia32_signal.c.
819 */
820static void
821exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
822{
823	struct trapframe *regs = td->td_frame;
824	struct pcb *pcb = td->td_pcb;
825
826	mtx_lock(&dt_lock);
827	if (td->td_proc->p_md.md_ldt != NULL)
828		user_ldt_free(td);
829	else
830		mtx_unlock(&dt_lock);
831
832	critical_enter();
833	wrmsr(MSR_FSBASE, 0);
834	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
835	pcb->pcb_fsbase = 0;
836	pcb->pcb_gsbase = 0;
837	critical_exit();
838	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
839
840	bzero((char *)regs, sizeof(struct trapframe));
841	regs->tf_rip = imgp->entry_addr;
842	regs->tf_rsp = stack;
843	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
844	regs->tf_gs = _ugssel;
845	regs->tf_fs = _ufssel;
846	regs->tf_es = _udatasel;
847	regs->tf_ds = _udatasel;
848	regs->tf_ss = _udatasel;
849	regs->tf_flags = TF_HASSEGS;
850	regs->tf_cs = _ucode32sel;
851	regs->tf_rbx = imgp->ps_strings;
852
853	fpstate_drop(td);
854
855	/* Do full restore on return so that we can change to a different %cs */
856	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
857	td->td_retval[1] = 0;
858}
859
860/*
861 * XXX copied from ia32_sysvec.c.
862 */
863static register_t *
864linux_copyout_strings(struct image_params *imgp)
865{
866	int argc, envc;
867	u_int32_t *vectp;
868	char *stringp, *destp;
869	u_int32_t *stack_base;
870	struct linux32_ps_strings *arginfo;
871	char canary[LINUX_AT_RANDOM_LEN];
872	size_t execpath_len;
873
874	/*
875	 * Calculate string base and vector table pointers.
876	 */
877	if (imgp->execpath != NULL && imgp->auxargs != NULL)
878		execpath_len = strlen(imgp->execpath) + 1;
879	else
880		execpath_len = 0;
881
882	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
883	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
884	    roundup(sizeof(canary), sizeof(char *)) -
885	    roundup(execpath_len, sizeof(char *)) -
886	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
887
888	if (execpath_len != 0) {
889		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
890		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
891	}
892
893	/*
894	 * Prepare the canary for SSP.
895	 */
896	arc4rand(canary, sizeof(canary), 0);
897	imgp->canary = (uintptr_t)arginfo -
898	    roundup(execpath_len, sizeof(char *)) -
899	    roundup(sizeof(canary), sizeof(char *));
900	copyout(canary, (void *)imgp->canary, sizeof(canary));
901
902	/*
903	 * If we have a valid auxargs ptr, prepare some room
904	 * on the stack.
905	 */
906	if (imgp->auxargs) {
907		/*
908		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
909		 * lower compatibility.
910		 */
911		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
912		    (LINUX_AT_COUNT * 2);
913		/*
914		 * The '+ 2' is for the null pointers at the end of each of
915		 * the arg and env vector sets,and imgp->auxarg_size is room
916		 * for argument of Runtime loader.
917		 */
918		vectp = (u_int32_t *) (destp - (imgp->args->argc +
919		    imgp->args->envc + 2 + imgp->auxarg_size) *
920		    sizeof(u_int32_t));
921
922	} else
923		/*
924		 * The '+ 2' is for the null pointers at the end of each of
925		 * the arg and env vector sets
926		 */
927		vectp = (u_int32_t *)(destp - (imgp->args->argc +
928		    imgp->args->envc + 2) * sizeof(u_int32_t));
929
930	/*
931	 * vectp also becomes our initial stack base
932	 */
933	stack_base = vectp;
934
935	stringp = imgp->args->begin_argv;
936	argc = imgp->args->argc;
937	envc = imgp->args->envc;
938	/*
939	 * Copy out strings - arguments and environment.
940	 */
941	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
942
943	/*
944	 * Fill in "ps_strings" struct for ps, w, etc.
945	 */
946	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
947	suword32(&arginfo->ps_nargvstr, argc);
948
949	/*
950	 * Fill in argument portion of vector table.
951	 */
952	for (; argc > 0; --argc) {
953		suword32(vectp++, (uint32_t)(intptr_t)destp);
954		while (*stringp++ != 0)
955			destp++;
956		destp++;
957	}
958
959	/* a null vector table pointer separates the argp's from the envp's */
960	suword32(vectp++, 0);
961
962	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
963	suword32(&arginfo->ps_nenvstr, envc);
964
965	/*
966	 * Fill in environment portion of vector table.
967	 */
968	for (; envc > 0; --envc) {
969		suword32(vectp++, (uint32_t)(intptr_t)destp);
970		while (*stringp++ != 0)
971			destp++;
972		destp++;
973	}
974
975	/* end of vector table is a null pointer */
976	suword32(vectp, 0);
977
978	return ((register_t *)stack_base);
979}
980
981static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
982    "32-bit Linux emulation");
983
984static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
985SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
986    &linux32_maxdsiz, 0, "");
987static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
988SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
989    &linux32_maxssiz, 0, "");
990static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
991SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
992    &linux32_maxvmem, 0, "");
993
#ifdef DEBUG
/* compat.linux32.debug: string-typed knob serviced by linux_sysctl_debug. */
SYSCTL_PROC(_compat_linux32, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, linux_sysctl_debug, "A", "Linux debugging control");
#endif
1000
1001static void
1002linux32_fixlimit(struct rlimit *rl, int which)
1003{
1004
1005	switch (which) {
1006	case RLIMIT_DATA:
1007		if (linux32_maxdsiz != 0) {
1008			if (rl->rlim_cur > linux32_maxdsiz)
1009				rl->rlim_cur = linux32_maxdsiz;
1010			if (rl->rlim_max > linux32_maxdsiz)
1011				rl->rlim_max = linux32_maxdsiz;
1012		}
1013		break;
1014	case RLIMIT_STACK:
1015		if (linux32_maxssiz != 0) {
1016			if (rl->rlim_cur > linux32_maxssiz)
1017				rl->rlim_cur = linux32_maxssiz;
1018			if (rl->rlim_max > linux32_maxssiz)
1019				rl->rlim_max = linux32_maxssiz;
1020		}
1021		break;
1022	case RLIMIT_VMEM:
1023		if (linux32_maxvmem != 0) {
1024			if (rl->rlim_cur > linux32_maxvmem)
1025				rl->rlim_cur = linux32_maxvmem;
1026			if (rl->rlim_max > linux32_maxvmem)
1027				rl->rlim_max = linux32_maxvmem;
1028		}
1029		break;
1030	}
1031}
1032
1033struct sysentvec elf_linux_sysvec = {
1034	.sv_size	= LINUX_SYS_MAXSYSCALL,
1035	.sv_table	= linux_sysent,
1036	.sv_mask	= 0,
1037	.sv_sigsize	= LINUX_SIGTBLSZ,
1038	.sv_sigtbl	= bsd_to_linux_signal,
1039	.sv_errsize	= ELAST + 1,
1040	.sv_errtbl	= bsd_to_linux_errno,
1041	.sv_transtrap	= translate_traps,
1042	.sv_fixup	= elf_linux_fixup,
1043	.sv_sendsig	= linux_sendsig,
1044	.sv_sigcode	= &_binary_linux32_locore_o_start,
1045	.sv_szsigcode	= &linux_szsigcode,
1046	.sv_prepsyscall	= NULL,
1047	.sv_name	= "Linux ELF32",
1048	.sv_coredump	= elf32_coredump,
1049	.sv_imgact_try	= exec_linux_imgact_try,
1050	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1051	.sv_pagesize	= PAGE_SIZE,
1052	.sv_minuser	= VM_MIN_ADDRESS,
1053	.sv_maxuser	= LINUX32_MAXUSER,
1054	.sv_usrstack	= LINUX32_USRSTACK,
1055	.sv_psstrings	= LINUX32_PS_STRINGS,
1056	.sv_stackprot	= VM_PROT_ALL,
1057	.sv_copyout_strings = linux_copyout_strings,
1058	.sv_setregs	= exec_linux_setregs,
1059	.sv_fixlimit	= linux32_fixlimit,
1060	.sv_maxssiz	= &linux32_maxssiz,
1061	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1062	.sv_set_syscall_retval = cpu_set_syscall_retval,
1063	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1064	.sv_syscallnames = NULL,
1065	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1066	.sv_shared_page_len = PAGE_SIZE,
1067	.sv_schedtail	= linux_schedtail,
1068	.sv_thread_detach = linux_thread_detach,
1069};
1070
1071static void
1072linux_vdso_install(void *param)
1073{
1074
1075	linux_szsigcode = (&_binary_linux32_locore_o_end -
1076	    &_binary_linux32_locore_o_start);
1077
1078	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1079		panic("Linux invalid vdso size\n");
1080
1081	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1082
1083	linux_shared_page_obj = __elfN(linux_shared_page_init)
1084	    (&linux_shared_page_mapping);
1085
1086	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1087
1088	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1089	    linux_szsigcode);
1090	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1091
1092	linux_kplatform = linux_shared_page_mapping +
1093	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1094}
1095SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1096    (sysinit_cfunc_t)linux_vdso_install, NULL);
1097
1098static void
1099linux_vdso_deinstall(void *param)
1100{
1101
1102	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1103};
1104SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1105    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1106
/*
 * Values matched against an ELF note: the vendor name used by
 * linux32_brandnote and the first descriptor word checked by
 * linux32_trans_osrel().
 */
static char	GNU_ABI_VENDOR[] = "GNU";
static int	GNULINUX_ABI_DESC = 0;
1109
1110static boolean_t
1111linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1112{
1113	const Elf32_Word *desc;
1114	uintptr_t p;
1115
1116	p = (uintptr_t)(note + 1);
1117	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1118
1119	desc = (const Elf32_Word *)p;
1120	if (desc[0] != GNULINUX_ABI_DESC)
1121		return (FALSE);
1122
1123	/*
1124	 * For linux we encode osrel as follows (see linux_mib.c):
1125	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1126	 */
1127	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1128
1129	return (TRUE);
1130}
1131
1132static Elf_Brandnote linux32_brandnote = {
1133	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1134	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1135	.hdr.n_type	= 1,
1136	.vendor		= GNU_ABI_VENDOR,
1137	.flags		= BN_TRANSLATE_OSREL,
1138	.trans_osrel	= linux32_trans_osrel
1139};
1140
1141static Elf32_Brandinfo linux_brand = {
1142	.brand		= ELFOSABI_LINUX,
1143	.machine	= EM_386,
1144	.compat_3_brand	= "Linux",
1145	.emul_path	= "/compat/linux",
1146	.interp_path	= "/lib/ld-linux.so.1",
1147	.sysvec		= &elf_linux_sysvec,
1148	.interp_newpath	= NULL,
1149	.brand_note	= &linux32_brandnote,
1150	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1151};
1152
1153static Elf32_Brandinfo linux_glibc2brand = {
1154	.brand		= ELFOSABI_LINUX,
1155	.machine	= EM_386,
1156	.compat_3_brand	= "Linux",
1157	.emul_path	= "/compat/linux",
1158	.interp_path	= "/lib/ld-linux.so.2",
1159	.sysvec		= &elf_linux_sysvec,
1160	.interp_newpath	= NULL,
1161	.brand_note	= &linux32_brandnote,
1162	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1163};
1164
1165Elf32_Brandinfo *linux_brandlist[] = {
1166	&linux_brand,
1167	&linux_glibc2brand,
1168	NULL
1169};
1170
1171static int
1172linux_elf_modevent(module_t mod, int type, void *data)
1173{
1174	Elf32_Brandinfo **brandinfo;
1175	int error;
1176	struct linux_ioctl_handler **lihp;
1177
1178	error = 0;
1179
1180	switch(type) {
1181	case MOD_LOAD:
1182		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1183		     ++brandinfo)
1184			if (elf32_insert_brand_entry(*brandinfo) < 0)
1185				error = EINVAL;
1186		if (error == 0) {
1187			SET_FOREACH(lihp, linux_ioctl_handler_set)
1188				linux_ioctl_register_handler(*lihp);
1189			LIST_INIT(&futex_list);
1190			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1191			stclohz = (stathz ? stathz : hz);
1192			if (bootverbose)
1193				printf("Linux ELF exec handler installed\n");
1194		} else
1195			printf("cannot insert Linux ELF brand handler\n");
1196		break;
1197	case MOD_UNLOAD:
1198		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1199		     ++brandinfo)
1200			if (elf32_brand_inuse(*brandinfo))
1201				error = EBUSY;
1202		if (error == 0) {
1203			for (brandinfo = &linux_brandlist[0];
1204			     *brandinfo != NULL; ++brandinfo)
1205				if (elf32_remove_brand_entry(*brandinfo) < 0)
1206					error = EINVAL;
1207		}
1208		if (error == 0) {
1209			SET_FOREACH(lihp, linux_ioctl_handler_set)
1210				linux_ioctl_unregister_handler(*lihp);
1211			mtx_destroy(&futex_mtx);
1212			if (bootverbose)
1213				printf("Linux ELF exec handler removed\n");
1214		} else
1215			printf("Could not deinstall ELF interpreter entry\n");
1216		break;
1217	default:
1218		return (EOPNOTSUPP);
1219	}
1220	return (error);
1221}
1222
1223static moduledata_t linux_elf_mod = {
1224	"linuxelf",
1225	linux_elf_modevent,
1226	0
1227};
1228
1229DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1230MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1);
1231