linux_sysvec.c revision 293516
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/i386/linux/linux_sysvec.c 293516 2016-01-09 15:48:11Z dchagin $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/exec.h>
35#include <sys/fcntl.h>
36#include <sys/imgact.h>
37#include <sys/imgact_aout.h>
38#include <sys/imgact_elf.h>
39#include <sys/kernel.h>
40#include <sys/lock.h>
41#include <sys/malloc.h>
42#include <sys/module.h>
43#include <sys/mutex.h>
44#include <sys/proc.h>
45#include <sys/signalvar.h>
46#include <sys/syscallsubr.h>
47#include <sys/sysent.h>
48#include <sys/sysproto.h>
49#include <sys/vnode.h>
50#include <sys/eventhandler.h>
51
52#include <vm/vm.h>
53#include <vm/pmap.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57#include <vm/vm_page.h>
58#include <vm/vm_param.h>
59
60#include <machine/cpu.h>
61#include <machine/cputypes.h>
62#include <machine/md_var.h>
63#include <machine/pcb.h>
64
65#include <i386/linux/linux.h>
66#include <i386/linux/linux_proto.h>
67#include <compat/linux/linux_emul.h>
68#include <compat/linux/linux_futex.h>
69#include <compat/linux/linux_ioctl.h>
70#include <compat/linux/linux_mib.h>
71#include <compat/linux/linux_misc.h>
72#include <compat/linux/linux_signal.h>
73#include <compat/linux/linux_util.h>
74#include <compat/linux/linux_vdso.h>
75
76MODULE_VERSION(linux, 1);
77
78MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
79
80#if BYTE_ORDER == LITTLE_ENDIAN
81#define SHELLMAGIC      0x2123 /* #! */
82#else
83#define SHELLMAGIC      0x2321
84#endif
85
86/*
87 * Allow the sendsig functions to use the ldebug() facility
88 * even though they are not syscalls themselves. Map them
89 * to syscall 0. This is slightly less bogus than using
90 * ldebug(sigreturn).
91 */
92#define	LINUX_SYS_linux_rt_sendsig	0
93#define	LINUX_SYS_linux_sendsig		0
94
95#define	LINUX_PS_STRINGS	(LINUX_USRSTACK - sizeof(struct ps_strings))
96
97static int linux_szsigcode;
98static vm_object_t linux_shared_page_obj;
99static char *linux_shared_page_mapping;
100extern char _binary_linux_locore_o_start;
101extern char _binary_linux_locore_o_end;
102
103extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
104
105SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
106SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
107
108static int	linux_fixup(register_t **stack_base,
109		    struct image_params *iparams);
110static int	elf_linux_fixup(register_t **stack_base,
111		    struct image_params *iparams);
112static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
113static void	exec_linux_setregs(struct thread *td,
114		    struct image_params *imgp, u_long stack);
115static register_t *linux_copyout_strings(struct image_params *imgp);
116static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
117static void	linux_vdso_install(void *param);
118static void	linux_vdso_deinstall(void *param);
119
120static int linux_szplatform;
121const char *linux_kplatform;
122
123static eventhandler_tag linux_exit_tag;
124static eventhandler_tag linux_exec_tag;
125static eventhandler_tag linux_thread_dtor_tag;
126
127/*
128 * Linux syscalls return negative errno's, we do positive and map them
129 * Reference:
130 *   FreeBSD: src/sys/sys/errno.h
131 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
132 *            linux-2.6.17.8/include/asm-generic/errno.h
133 */
134static int bsd_to_linux_errno[ELAST + 1] = {
135	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
136	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
137	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
138	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
139	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
140	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
141	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
142	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
143	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
144	 -72, -67, -71
145};
146
147int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
148	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
149	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
150	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
151	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
152	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
153	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
154	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
155	0, LINUX_SIGUSR1, LINUX_SIGUSR2
156};
157
158int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
159	SIGHUP, SIGINT, SIGQUIT, SIGILL,
160	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
161	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
162	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
163	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
164	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
165	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
166	SIGIO, SIGURG, SIGSYS
167};
168
/* Value reported for any trap that has no entry in the table below. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap number -> Linux trap number, indexed by the FreeBSD value.
 * Each row is annotated with its index and, where one exists, the FreeBSD
 * trap name.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0 */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2 */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4 */
	LINUX_T_UNKNOWN,	/*  5 */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8 */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/*
 * Bounds-checked lookup.  The comparison is against a size_t, so an index
 * past the end of the table yields LINUX_T_UNKNOWN instead of an
 * out-of-range read.
 */
#define bsd_to_linux_trapcode(code)					\
	((code) < sizeof(_bsd_to_linux_trapcode) /			\
	    sizeof(_bsd_to_linux_trapcode[0]) ?				\
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
207
/*
 * Symbols declared through LINUX_VDSO_SYM_INTPTR.  This file uses them as
 * plain values: linux_vsyscall is exported as the LINUX_AT_SYSINFO aux
 * entry, and the two sigcode symbols are loaded into tf_eip when a signal
 * is delivered.
 */
LINUX_VDSO_SYM_INTPTR(linux_vsyscall);
LINUX_VDSO_SYM_INTPTR(linux_sigcode);
LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
211
212/*
213 * If FreeBSD & Linux have a difference of opinion about what a trap
214 * means, deal with it here.
215 *
216 * MPSAFE
217 */
218static int
219translate_traps(int signal, int trap_code)
220{
221	if (signal != SIGBUS)
222		return (signal);
223	switch (trap_code) {
224	case T_PROTFLT:
225	case T_TSSFLT:
226	case T_DOUBLEFLT:
227	case T_PAGEFLT:
228		return (SIGSEGV);
229	default:
230		return (signal);
231	}
232}
233
234static int
235linux_fixup(register_t **stack_base, struct image_params *imgp)
236{
237	register_t *argv, *envp;
238
239	argv = *stack_base;
240	envp = *stack_base + (imgp->args->argc + 1);
241	(*stack_base)--;
242	suword(*stack_base, (intptr_t)(void *)envp);
243	(*stack_base)--;
244	suword(*stack_base, (intptr_t)(void *)argv);
245	(*stack_base)--;
246	suword(*stack_base, imgp->args->argc);
247	return (0);
248}
249
250static int
251elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
252{
253	struct proc *p;
254	Elf32_Auxargs *args;
255	Elf32_Addr *uplatform;
256	struct ps_strings *arginfo;
257	register_t *pos;
258
259	KASSERT(curthread->td_proc == imgp->proc,
260	    ("unsafe elf_linux_fixup(), should be curproc"));
261
262	p = imgp->proc;
263	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
264	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
265	args = (Elf32_Auxargs *)imgp->auxargs;
266	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
267
268	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
269	    imgp->proc->p_sysent->sv_shared_page_base);
270	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall);
271	AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
272
273	/*
274	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
275	 * as it has appeared in the 2.4.0-rc7 first time.
276	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
277	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
278	 * is not present.
279	 * Also see linux_times() implementation.
280	 */
281	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
282		AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
283	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
284	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
285	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
286	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
287	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
288	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
289	AUXARGS_ENTRY(pos, AT_BASE, args->base);
290	AUXARGS_ENTRY(pos, LINUX_AT_SECURE, 0);
291	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
292	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
293	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
294	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
295	AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
296	if (args->execfd != -1)
297		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
298	AUXARGS_ENTRY(pos, AT_NULL, 0);
299
300	free(imgp->auxargs, M_TEMP);
301	imgp->auxargs = NULL;
302
303	(*stack_base)--;
304	suword(*stack_base, (register_t)imgp->args->argc);
305	return (0);
306}
307
308/*
309 * Copied from kern/kern_exec.c
310 */
311static register_t *
312linux_copyout_strings(struct image_params *imgp)
313{
314	int argc, envc;
315	char **vectp;
316	char *stringp, *destp;
317	register_t *stack_base;
318	struct ps_strings *arginfo;
319	struct proc *p;
320
321	/*
322	 * Calculate string base and vector table pointers.
323	 * Also deal with signal trampoline code for this exec type.
324	 */
325	p = imgp->proc;
326	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
327	destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
328	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
329
330	/*
331	 * install LINUX_PLATFORM
332	 */
333	copyout(linux_kplatform, ((caddr_t)arginfo - linux_szplatform),
334	    linux_szplatform);
335
336	/*
337	 * If we have a valid auxargs ptr, prepare some room
338	 * on the stack.
339	 */
340	if (imgp->auxargs) {
341		/*
342		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
343		 * lower compatibility.
344		 */
345		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
346		    (LINUX_AT_COUNT * 2);
347		/*
348		 * The '+ 2' is for the null pointers at the end of each of
349		 * the arg and env vector sets,and imgp->auxarg_size is room
350		 * for argument of Runtime loader.
351		 */
352		vectp = (char **)(destp - (imgp->args->argc +
353		    imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *));
354	} else {
355		/*
356		 * The '+ 2' is for the null pointers at the end of each of
357		 * the arg and env vector sets
358		 */
359		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
360		    sizeof(char *));
361	}
362
363	/*
364	 * vectp also becomes our initial stack base
365	 */
366	stack_base = (register_t *)vectp;
367
368	stringp = imgp->args->begin_argv;
369	argc = imgp->args->argc;
370	envc = imgp->args->envc;
371
372	/*
373	 * Copy out strings - arguments and environment.
374	 */
375	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
376
377	/*
378	 * Fill in "ps_strings" struct for ps, w, etc.
379	 */
380	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
381	suword(&arginfo->ps_nargvstr, argc);
382
383	/*
384	 * Fill in argument portion of vector table.
385	 */
386	for (; argc > 0; --argc) {
387		suword(vectp++, (long)(intptr_t)destp);
388		while (*stringp++ != 0)
389			destp++;
390		destp++;
391	}
392
393	/* a null vector table pointer separates the argp's from the envp's */
394	suword(vectp++, 0);
395
396	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
397	suword(&arginfo->ps_nenvstr, envc);
398
399	/*
400	 * Fill in environment portion of vector table.
401	 */
402	for (; envc > 0; --envc) {
403		suword(vectp++, (long)(intptr_t)destp);
404		while (*stringp++ != 0)
405			destp++;
406		destp++;
407	}
408
409	/* end of vector table is a null pointer */
410	suword(vectp, 0);
411
412	return (stack_base);
413}
414
415static void
416linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
417{
418	struct thread *td = curthread;
419	struct proc *p = td->td_proc;
420	struct sigacts *psp;
421	struct trapframe *regs;
422	struct l_rt_sigframe *fp, frame;
423	int sig, code;
424	int oonstack;
425
426	sig = ksi->ksi_signo;
427	code = ksi->ksi_code;
428	PROC_LOCK_ASSERT(p, MA_OWNED);
429	psp = p->p_sigacts;
430	mtx_assert(&psp->ps_mtx, MA_OWNED);
431	regs = td->td_frame;
432	oonstack = sigonstack(regs->tf_esp);
433
434#ifdef DEBUG
435	if (ldebug(rt_sendsig))
436		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
437		    catcher, sig, (void*)mask, code);
438#endif
439	/*
440	 * Allocate space for the signal handler context.
441	 */
442	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
443	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
444		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
445		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
446	} else
447		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
448	mtx_unlock(&psp->ps_mtx);
449
450	/*
451	 * Build the argument list for the signal handler.
452	 */
453	if (p->p_sysent->sv_sigtbl)
454		if (sig <= p->p_sysent->sv_sigsize)
455			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
456
457	bzero(&frame, sizeof(frame));
458
459	frame.sf_handler = catcher;
460	frame.sf_sig = sig;
461	frame.sf_siginfo = &fp->sf_si;
462	frame.sf_ucontext = &fp->sf_sc;
463
464	/* Fill in POSIX parts */
465	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
466
467	/*
468	 * Build the signal context to be used by sigreturn.
469	 */
470	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
471	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
472
473	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
474	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
475	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
476	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
477	PROC_UNLOCK(p);
478
479	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
480
481	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
482	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
483	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
484	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
485	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
486	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
487	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
488	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
489	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
490	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_esp;
491	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
492	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
493	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
494	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
495	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
496	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
497	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
498	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
499	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
500	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
501	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
502
503#ifdef DEBUG
504	if (ldebug(rt_sendsig))
505		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
506		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
507		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
508#endif
509
510	if (copyout(&frame, fp, sizeof(frame)) != 0) {
511		/*
512		 * Process has trashed its stack; give it an illegal
513		 * instruction to halt it in its tracks.
514		 */
515#ifdef DEBUG
516		if (ldebug(rt_sendsig))
517			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
518			    fp, oonstack);
519#endif
520		PROC_LOCK(p);
521		sigexit(td, SIGILL);
522	}
523
524	/*
525	 * Build context to run handler in.
526	 */
527	regs->tf_esp = (int)fp;
528	regs->tf_eip = linux_rt_sigcode;
529	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
530	regs->tf_cs = _ucodesel;
531	regs->tf_ds = _udatasel;
532	regs->tf_es = _udatasel;
533	regs->tf_fs = _udatasel;
534	regs->tf_ss = _udatasel;
535	PROC_LOCK(p);
536	mtx_lock(&psp->ps_mtx);
537}
538
539
540/*
541 * Send an interrupt to process.
542 *
543 * Stack is set up to allow sigcode stored
544 * in u. to call routine, followed by kcall
545 * to sigreturn routine below.  After sigreturn
546 * resets the signal mask, the stack, and the
547 * frame pointer, it returns to the user
548 * specified pc, psl.
549 */
550static void
551linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
552{
553	struct thread *td = curthread;
554	struct proc *p = td->td_proc;
555	struct sigacts *psp;
556	struct trapframe *regs;
557	struct l_sigframe *fp, frame;
558	l_sigset_t lmask;
559	int sig, code;
560	int oonstack, i;
561
562	PROC_LOCK_ASSERT(p, MA_OWNED);
563	psp = p->p_sigacts;
564	sig = ksi->ksi_signo;
565	code = ksi->ksi_code;
566	mtx_assert(&psp->ps_mtx, MA_OWNED);
567	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
568		/* Signal handler installed with SA_SIGINFO. */
569		linux_rt_sendsig(catcher, ksi, mask);
570		return;
571	}
572	regs = td->td_frame;
573	oonstack = sigonstack(regs->tf_esp);
574
575#ifdef DEBUG
576	if (ldebug(sendsig))
577		printf(ARGS(sendsig, "%p, %d, %p, %u"),
578		    catcher, sig, (void*)mask, code);
579#endif
580
581	/*
582	 * Allocate space for the signal handler context.
583	 */
584	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
585	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
586		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
587		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
588	} else
589		fp = (struct l_sigframe *)regs->tf_esp - 1;
590	mtx_unlock(&psp->ps_mtx);
591	PROC_UNLOCK(p);
592
593	/*
594	 * Build the argument list for the signal handler.
595	 */
596	if (p->p_sysent->sv_sigtbl)
597		if (sig <= p->p_sysent->sv_sigsize)
598			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
599
600	bzero(&frame, sizeof(frame));
601
602	frame.sf_handler = catcher;
603	frame.sf_sig = sig;
604
605	bsd_to_linux_sigset(mask, &lmask);
606
607	/*
608	 * Build the signal context to be used by sigreturn.
609	 */
610	frame.sf_sc.sc_mask   = lmask.__bits[0];
611	frame.sf_sc.sc_gs     = rgs();
612	frame.sf_sc.sc_fs     = regs->tf_fs;
613	frame.sf_sc.sc_es     = regs->tf_es;
614	frame.sf_sc.sc_ds     = regs->tf_ds;
615	frame.sf_sc.sc_edi    = regs->tf_edi;
616	frame.sf_sc.sc_esi    = regs->tf_esi;
617	frame.sf_sc.sc_ebp    = regs->tf_ebp;
618	frame.sf_sc.sc_ebx    = regs->tf_ebx;
619	frame.sf_sc.sc_esp    = regs->tf_esp;
620	frame.sf_sc.sc_edx    = regs->tf_edx;
621	frame.sf_sc.sc_ecx    = regs->tf_ecx;
622	frame.sf_sc.sc_eax    = regs->tf_eax;
623	frame.sf_sc.sc_eip    = regs->tf_eip;
624	frame.sf_sc.sc_cs     = regs->tf_cs;
625	frame.sf_sc.sc_eflags = regs->tf_eflags;
626	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
627	frame.sf_sc.sc_ss     = regs->tf_ss;
628	frame.sf_sc.sc_err    = regs->tf_err;
629	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
630	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
631
632	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
633		frame.sf_extramask[i] = lmask.__bits[i+1];
634
635	if (copyout(&frame, fp, sizeof(frame)) != 0) {
636		/*
637		 * Process has trashed its stack; give it an illegal
638		 * instruction to halt it in its tracks.
639		 */
640		PROC_LOCK(p);
641		sigexit(td, SIGILL);
642	}
643
644	/*
645	 * Build context to run handler in.
646	 */
647	regs->tf_esp = (int)fp;
648	regs->tf_eip = linux_sigcode;
649	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
650	regs->tf_cs = _ucodesel;
651	regs->tf_ds = _udatasel;
652	regs->tf_es = _udatasel;
653	regs->tf_fs = _udatasel;
654	regs->tf_ss = _udatasel;
655	PROC_LOCK(p);
656	mtx_lock(&psp->ps_mtx);
657}
658
659/*
660 * System call to cleanup state after a signal
661 * has been taken.  Reset signal mask and
662 * stack state from context left by sendsig (above).
663 * Return to previous pc and psl as specified by
664 * context left by sendsig. Check carefully to
665 * make sure that the user has not modified the
666 * psl to gain improper privileges or to cause
667 * a machine fault.
668 */
669int
670linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
671{
672	struct l_sigframe frame;
673	struct trapframe *regs;
674	l_sigset_t lmask;
675	sigset_t bmask;
676	int eflags, i;
677	ksiginfo_t ksi;
678
679	regs = td->td_frame;
680
681#ifdef DEBUG
682	if (ldebug(sigreturn))
683		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
684#endif
685	/*
686	 * The trampoline code hands us the sigframe.
687	 * It is unsafe to keep track of it ourselves, in the event that a
688	 * program jumps out of a signal handler.
689	 */
690	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
691		return (EFAULT);
692
693	/*
694	 * Check for security violations.
695	 */
696#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
697	eflags = frame.sf_sc.sc_eflags;
698	if (!EFLAGS_SECURE(eflags, regs->tf_eflags))
699		return (EINVAL);
700
701	/*
702	 * Don't allow users to load a valid privileged %cs.  Let the
703	 * hardware check for invalid selectors, excess privilege in
704	 * other selectors, invalid %eip's and invalid %esp's.
705	 */
706#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
707	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
708		ksiginfo_init_trap(&ksi);
709		ksi.ksi_signo = SIGBUS;
710		ksi.ksi_code = BUS_OBJERR;
711		ksi.ksi_trapno = T_PROTFLT;
712		ksi.ksi_addr = (void *)regs->tf_eip;
713		trapsignal(td, &ksi);
714		return (EINVAL);
715	}
716
717	lmask.__bits[0] = frame.sf_sc.sc_mask;
718	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
719		lmask.__bits[i+1] = frame.sf_extramask[i];
720	linux_to_bsd_sigset(&lmask, &bmask);
721	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
722
723	/*
724	 * Restore signal context.
725	 */
726	/* %gs was restored by the trampoline. */
727	regs->tf_fs     = frame.sf_sc.sc_fs;
728	regs->tf_es     = frame.sf_sc.sc_es;
729	regs->tf_ds     = frame.sf_sc.sc_ds;
730	regs->tf_edi    = frame.sf_sc.sc_edi;
731	regs->tf_esi    = frame.sf_sc.sc_esi;
732	regs->tf_ebp    = frame.sf_sc.sc_ebp;
733	regs->tf_ebx    = frame.sf_sc.sc_ebx;
734	regs->tf_edx    = frame.sf_sc.sc_edx;
735	regs->tf_ecx    = frame.sf_sc.sc_ecx;
736	regs->tf_eax    = frame.sf_sc.sc_eax;
737	regs->tf_eip    = frame.sf_sc.sc_eip;
738	regs->tf_cs     = frame.sf_sc.sc_cs;
739	regs->tf_eflags = eflags;
740	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
741	regs->tf_ss     = frame.sf_sc.sc_ss;
742
743	return (EJUSTRETURN);
744}
745
746/*
747 * System call to cleanup state after a signal
748 * has been taken.  Reset signal mask and
749 * stack state from context left by rt_sendsig (above).
750 * Return to previous pc and psl as specified by
751 * context left by sendsig. Check carefully to
752 * make sure that the user has not modified the
753 * psl to gain improper privileges or to cause
754 * a machine fault.
755 */
756int
757linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
758{
759	struct l_ucontext uc;
760	struct l_sigcontext *context;
761	sigset_t bmask;
762	l_stack_t *lss;
763	stack_t ss;
764	struct trapframe *regs;
765	int eflags;
766	ksiginfo_t ksi;
767
768	regs = td->td_frame;
769
770#ifdef DEBUG
771	if (ldebug(rt_sigreturn))
772		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
773#endif
774	/*
775	 * The trampoline code hands us the ucontext.
776	 * It is unsafe to keep track of it ourselves, in the event that a
777	 * program jumps out of a signal handler.
778	 */
779	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
780		return (EFAULT);
781
782	context = &uc.uc_mcontext;
783
784	/*
785	 * Check for security violations.
786	 */
787#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
788	eflags = context->sc_eflags;
789	if (!EFLAGS_SECURE(eflags, regs->tf_eflags))
790		return (EINVAL);
791
792	/*
793	 * Don't allow users to load a valid privileged %cs.  Let the
794	 * hardware check for invalid selectors, excess privilege in
795	 * other selectors, invalid %eip's and invalid %esp's.
796	 */
797#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
798	if (!CS_SECURE(context->sc_cs)) {
799		ksiginfo_init_trap(&ksi);
800		ksi.ksi_signo = SIGBUS;
801		ksi.ksi_code = BUS_OBJERR;
802		ksi.ksi_trapno = T_PROTFLT;
803		ksi.ksi_addr = (void *)regs->tf_eip;
804		trapsignal(td, &ksi);
805		return (EINVAL);
806	}
807
808	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
809	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
810
811	/*
812	 * Restore signal context
813	 */
814	/* %gs was restored by the trampoline. */
815	regs->tf_fs     = context->sc_fs;
816	regs->tf_es     = context->sc_es;
817	regs->tf_ds     = context->sc_ds;
818	regs->tf_edi    = context->sc_edi;
819	regs->tf_esi    = context->sc_esi;
820	regs->tf_ebp    = context->sc_ebp;
821	regs->tf_ebx    = context->sc_ebx;
822	regs->tf_edx    = context->sc_edx;
823	regs->tf_ecx    = context->sc_ecx;
824	regs->tf_eax    = context->sc_eax;
825	regs->tf_eip    = context->sc_eip;
826	regs->tf_cs     = context->sc_cs;
827	regs->tf_eflags = eflags;
828	regs->tf_esp    = context->sc_esp_at_signal;
829	regs->tf_ss     = context->sc_ss;
830
831	/*
832	 * call sigaltstack & ignore results..
833	 */
834	lss = &uc.uc_stack;
835	ss.ss_sp = lss->ss_sp;
836	ss.ss_size = lss->ss_size;
837	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
838
839#ifdef DEBUG
840	if (ldebug(rt_sigreturn))
841		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
842		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
843#endif
844	(void)kern_sigaltstack(td, &ss, NULL);
845
846	return (EJUSTRETURN);
847}
848
849static int
850linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
851{
852	struct proc *p;
853	struct trapframe *frame;
854
855	p = td->td_proc;
856	frame = td->td_frame;
857
858	sa->code = frame->tf_eax;
859	sa->args[0] = frame->tf_ebx;
860	sa->args[1] = frame->tf_ecx;
861	sa->args[2] = frame->tf_edx;
862	sa->args[3] = frame->tf_esi;
863	sa->args[4] = frame->tf_edi;
864	sa->args[5] = frame->tf_ebp;	/* Unconfirmed */
865
866	if (sa->code >= p->p_sysent->sv_size)
867		sa->callp = &p->p_sysent->sv_table[0];
868 	else
869 		sa->callp = &p->p_sysent->sv_table[sa->code];
870	sa->narg = sa->callp->sy_narg;
871
872	td->td_retval[0] = 0;
873	td->td_retval[1] = frame->tf_edx;
874
875	return (0);
876}
877
878/*
879 * If a linux binary is exec'ing something, try this image activator
880 * first.  We override standard shell script execution in order to
881 * be able to modify the interpreter path.  We only do this if a linux
882 * binary is doing the exec, so we do not create an EXEC module for it.
883 */
884static int	exec_linux_imgact_try(struct image_params *iparams);
885
886static int
887exec_linux_imgact_try(struct image_params *imgp)
888{
889    const char *head = (const char *)imgp->image_header;
890    char *rpath;
891    int error = -1;
892
893    /*
894     * The interpreter for shell scripts run from a linux binary needs
895     * to be located in /compat/linux if possible in order to recursively
896     * maintain linux path emulation.
897     */
898    if (((const short *)head)[0] == SHELLMAGIC) {
899	    /*
900	     * Run our normal shell image activator.  If it succeeds attempt
901	     * to use the alternate path for the interpreter.  If an alternate
902	     * path is found, use our stringspace to store it.
903	     */
904	    if ((error = exec_shell_imgact(imgp)) == 0) {
905		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
906			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD);
907		    if (rpath != NULL)
908			    imgp->args->fname_buf =
909				imgp->interpreter_name = rpath;
910	    }
911    }
912    return (error);
913}
914
915/*
916 * exec_setregs may initialize some registers differently than Linux
917 * does, thus potentially confusing Linux binaries. If necessary, we
918 * override the exec_setregs default(s) here.
919 */
920static void
921exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
922{
923	struct pcb *pcb = td->td_pcb;
924
925	exec_setregs(td, imgp, stack);
926
927	/* Linux sets %gs to 0, we default to _udatasel */
928	pcb->pcb_gs = 0;
929	load_gs(0);
930
931	pcb->pcb_initial_npxcw = __LINUX_NPXCW__;
932}
933
934static void
935linux_get_machine(const char **dst)
936{
937
938	switch (cpu_class) {
939	case CPUCLASS_686:
940		*dst = "i686";
941		break;
942	case CPUCLASS_586:
943		*dst = "i586";
944		break;
945	case CPUCLASS_486:
946		*dst = "i486";
947		break;
948	default:
949		*dst = "i386";
950	}
951}
952
953struct sysentvec linux_sysvec = {
954	.sv_size	= LINUX_SYS_MAXSYSCALL,
955	.sv_table	= linux_sysent,
956	.sv_mask	= 0,
957	.sv_sigsize	= LINUX_SIGTBLSZ,
958	.sv_sigtbl	= bsd_to_linux_signal,
959	.sv_errsize	= ELAST + 1,
960	.sv_errtbl	= bsd_to_linux_errno,
961	.sv_transtrap	= translate_traps,
962	.sv_fixup	= linux_fixup,
963	.sv_sendsig	= linux_sendsig,
964	.sv_sigcode	= &_binary_linux_locore_o_start,
965	.sv_szsigcode	= &linux_szsigcode,
966	.sv_prepsyscall	= NULL,
967	.sv_name	= "Linux a.out",
968	.sv_coredump	= NULL,
969	.sv_imgact_try	= exec_linux_imgact_try,
970	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
971	.sv_pagesize	= PAGE_SIZE,
972	.sv_minuser	= VM_MIN_ADDRESS,
973	.sv_maxuser	= VM_MAXUSER_ADDRESS,
974	.sv_usrstack	= LINUX_USRSTACK,
975	.sv_psstrings	= PS_STRINGS,
976	.sv_stackprot	= VM_PROT_ALL,
977	.sv_copyout_strings = exec_copyout_strings,
978	.sv_setregs	= exec_linux_setregs,
979	.sv_fixlimit	= NULL,
980	.sv_maxssiz	= NULL,
981	.sv_flags	= SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32,
982	.sv_set_syscall_retval = cpu_set_syscall_retval,
983	.sv_fetch_syscall_args = linux_fetch_syscall_args,
984	.sv_syscallnames = NULL,
985	.sv_shared_page_base = LINUX_SHAREDPAGE,
986	.sv_shared_page_len = PAGE_SIZE,
987	.sv_schedtail	= linux_schedtail,
988	.sv_thread_detach = linux_thread_detach,
989};
990INIT_SYSENTVEC(aout_sysvec, &linux_sysvec);
991
992struct sysentvec elf_linux_sysvec = {
993	.sv_size	= LINUX_SYS_MAXSYSCALL,
994	.sv_table	= linux_sysent,
995	.sv_mask	= 0,
996	.sv_sigsize	= LINUX_SIGTBLSZ,
997	.sv_sigtbl	= bsd_to_linux_signal,
998	.sv_errsize	= ELAST + 1,
999	.sv_errtbl	= bsd_to_linux_errno,
1000	.sv_transtrap	= translate_traps,
1001	.sv_fixup	= elf_linux_fixup,
1002	.sv_sendsig	= linux_sendsig,
1003	.sv_sigcode	= &_binary_linux_locore_o_start,
1004	.sv_szsigcode	= &linux_szsigcode,
1005	.sv_prepsyscall	= NULL,
1006	.sv_name	= "Linux ELF",
1007	.sv_coredump	= elf32_coredump,
1008	.sv_imgact_try	= exec_linux_imgact_try,
1009	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1010	.sv_pagesize	= PAGE_SIZE,
1011	.sv_minuser	= VM_MIN_ADDRESS,
1012	.sv_maxuser	= VM_MAXUSER_ADDRESS,
1013	.sv_usrstack	= LINUX_USRSTACK,
1014	.sv_psstrings	= LINUX_PS_STRINGS,
1015	.sv_stackprot	= VM_PROT_ALL,
1016	.sv_copyout_strings = linux_copyout_strings,
1017	.sv_setregs	= exec_linux_setregs,
1018	.sv_fixlimit	= NULL,
1019	.sv_maxssiz	= NULL,
1020	.sv_flags	= SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP,
1021	.sv_set_syscall_retval = cpu_set_syscall_retval,
1022	.sv_fetch_syscall_args = linux_fetch_syscall_args,
1023	.sv_syscallnames = NULL,
1024	.sv_shared_page_base = LINUX_SHAREDPAGE,
1025	.sv_shared_page_len = PAGE_SIZE,
1026	.sv_schedtail	= linux_schedtail,
1027	.sv_thread_detach = linux_thread_detach,
1028};
1029
1030static void
1031linux_vdso_install(void *param)
1032{
1033
1034	linux_szsigcode = (&_binary_linux_locore_o_end -
1035	    &_binary_linux_locore_o_start);
1036
1037	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1038		panic("Linux invalid vdso size\n");
1039
1040	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1041
1042	linux_shared_page_obj = __elfN(linux_shared_page_init)
1043	    (&linux_shared_page_mapping);
1044
1045	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX_SHAREDPAGE);
1046
1047	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1048	    linux_szsigcode);
1049	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1050}
1051SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1052    (sysinit_cfunc_t)linux_vdso_install, NULL);
1053
1054static void
1055linux_vdso_deinstall(void *param)
1056{
1057
1058	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1059};
1060SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1061    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1062
1063static char GNU_ABI_VENDOR[] = "GNU";
1064static int GNULINUX_ABI_DESC = 0;
1065
1066static boolean_t
1067linux_trans_osrel(const Elf_Note *note, int32_t *osrel)
1068{
1069	const Elf32_Word *desc;
1070	uintptr_t p;
1071
1072	p = (uintptr_t)(note + 1);
1073	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1074
1075	desc = (const Elf32_Word *)p;
1076	if (desc[0] != GNULINUX_ABI_DESC)
1077		return (FALSE);
1078
1079	/*
1080	 * For linux we encode osrel as follows (see linux_mib.c):
1081	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1082	 */
1083	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1084
1085	return (TRUE);
1086}
1087
1088static Elf_Brandnote linux_brandnote = {
1089	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1090	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1091	.hdr.n_type	= 1,
1092	.vendor		= GNU_ABI_VENDOR,
1093	.flags		= BN_TRANSLATE_OSREL,
1094	.trans_osrel	= linux_trans_osrel
1095};
1096
1097static Elf32_Brandinfo linux_brand = {
1098	.brand		= ELFOSABI_LINUX,
1099	.machine	= EM_386,
1100	.compat_3_brand	= "Linux",
1101	.emul_path	= "/compat/linux",
1102	.interp_path	= "/lib/ld-linux.so.1",
1103	.sysvec		= &elf_linux_sysvec,
1104	.interp_newpath	= NULL,
1105	.brand_note	= &linux_brandnote,
1106	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1107};
1108
1109static Elf32_Brandinfo linux_glibc2brand = {
1110	.brand		= ELFOSABI_LINUX,
1111	.machine	= EM_386,
1112	.compat_3_brand	= "Linux",
1113	.emul_path	= "/compat/linux",
1114	.interp_path	= "/lib/ld-linux.so.2",
1115	.sysvec		= &elf_linux_sysvec,
1116	.interp_newpath	= NULL,
1117	.brand_note	= &linux_brandnote,
1118	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1119};
1120
1121Elf32_Brandinfo *linux_brandlist[] = {
1122	&linux_brand,
1123	&linux_glibc2brand,
1124	NULL
1125};
1126
1127static int
1128linux_elf_modevent(module_t mod, int type, void *data)
1129{
1130	Elf32_Brandinfo **brandinfo;
1131	int error;
1132	struct linux_ioctl_handler **lihp;
1133	struct linux_device_handler **ldhp;
1134
1135	error = 0;
1136
1137	switch(type) {
1138	case MOD_LOAD:
1139		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1140		     ++brandinfo)
1141			if (elf32_insert_brand_entry(*brandinfo) < 0)
1142				error = EINVAL;
1143		if (error == 0) {
1144			SET_FOREACH(lihp, linux_ioctl_handler_set)
1145				linux_ioctl_register_handler(*lihp);
1146			SET_FOREACH(ldhp, linux_device_handler_set)
1147				linux_device_register_handler(*ldhp);
1148			LIST_INIT(&futex_list);
1149			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1150			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
1151			      NULL, 1000);
1152			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
1153			      NULL, 1000);
1154			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1155			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1156			linux_get_machine(&linux_kplatform);
1157			linux_szplatform = roundup(strlen(linux_kplatform) + 1,
1158			    sizeof(char *));
1159			linux_osd_jail_register();
1160			stclohz = (stathz ? stathz : hz);
1161			if (bootverbose)
1162				printf("Linux ELF exec handler installed\n");
1163		} else
1164			printf("cannot insert Linux ELF brand handler\n");
1165		break;
1166	case MOD_UNLOAD:
1167		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1168		     ++brandinfo)
1169			if (elf32_brand_inuse(*brandinfo))
1170				error = EBUSY;
1171		if (error == 0) {
1172			for (brandinfo = &linux_brandlist[0];
1173			     *brandinfo != NULL; ++brandinfo)
1174				if (elf32_remove_brand_entry(*brandinfo) < 0)
1175					error = EINVAL;
1176		}
1177		if (error == 0) {
1178			SET_FOREACH(lihp, linux_ioctl_handler_set)
1179				linux_ioctl_unregister_handler(*lihp);
1180			SET_FOREACH(ldhp, linux_device_handler_set)
1181				linux_device_unregister_handler(*ldhp);
1182			mtx_destroy(&futex_mtx);
1183			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1184			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1185			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1186			linux_osd_jail_deregister();
1187			if (bootverbose)
1188				printf("Linux ELF exec handler removed\n");
1189		} else
1190			printf("Could not deinstall ELF interpreter entry\n");
1191		break;
1192	default:
1193		return (EOPNOTSUPP);
1194	}
1195	return (error);
1196}
1197
1198static moduledata_t linux_elf_mod = {
1199	"linuxelf",
1200	linux_elf_modevent,
1201	0
1202};
1203
1204DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1205