linux_sysvec.c revision 151343
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 151343 2005-10-14 20:22:57Z jhb $");
31
32/* XXX we use functions that might not exist. */
33#include "opt_compat.h"
34
35#ifndef COMPAT_43
36#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/exec.h>
42#include <sys/imgact.h>
43#include <sys/imgact_aout.h>
44#include <sys/imgact_elf.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/module.h>
49#include <sys/mutex.h>
50#include <sys/proc.h>
51#include <sys/signalvar.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysent.h>
54#include <sys/sysproto.h>
55#include <sys/vnode.h>
56
57#include <vm/vm.h>
58#include <vm/pmap.h>
59#include <vm/vm_extern.h>
60#include <vm/vm_map.h>
61#include <vm/vm_object.h>
62#include <vm/vm_page.h>
63#include <vm/vm_param.h>
64
65#include <machine/cpu.h>
66#include <machine/md_var.h>
67#include <machine/pcb.h>
68
69#include <i386/linux/linux.h>
70#include <i386/linux/linux_proto.h>
71#include <compat/linux/linux_mib.h>
72#include <compat/linux/linux_signal.h>
73#include <compat/linux/linux_util.h>
74
75MODULE_VERSION(linux, 1);
76
77MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
78
79#if BYTE_ORDER == LITTLE_ENDIAN
80#define SHELLMAGIC      0x2123 /* #! */
81#else
82#define SHELLMAGIC      0x2321
83#endif
84
85/*
86 * Allow the sendsig functions to use the ldebug() facility
87 * even though they are not syscalls themselves. Map them
88 * to syscall 0. This is slightly less bogus than using
89 * ldebug(sigreturn).
90 */
91#define	LINUX_SYS_linux_rt_sendsig	0
92#define	LINUX_SYS_linux_sendsig		0
93
94#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
95#define	__LINUX_NPXCW__		0x37f
96
97extern char linux_sigcode[];
98extern int linux_szsigcode;
99
100extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
101
102SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
103
104static int	linux_fixup(register_t **stack_base,
105		    struct image_params *iparams);
106static int	elf_linux_fixup(register_t **stack_base,
107		    struct image_params *iparams);
108static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
109		    caddr_t *params);
110static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
111static void	exec_linux_setregs(struct thread *td, u_long entry,
112				   u_long stack, u_long ps_strings);
113
114/*
115 * Linux syscalls return negative errno's, we do positive and map them
116 */
117static int bsd_to_linux_errno[ELAST + 1] = {
118	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
119	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
120	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
121	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
122	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
123	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
124	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
125	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
126	-6, -6, -43, -42, -75, -6, -84
127};
128
129int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
130	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
131	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
132	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
133	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
134	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
135	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
136	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
137	0, LINUX_SIGUSR1, LINUX_SIGUSR2
138};
139
140int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
141	SIGHUP, SIGINT, SIGQUIT, SIGILL,
142	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
143	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
144	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
145	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
146	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
147	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
148	SIGIO, SIGURG, SIGSYS
149};
150
/*
 * Translation of FreeBSD trap types (the T_* constants named beside each
 * entry) into the trap number stored in sc_trapno of a Linux signal
 * context.  Trap types without a translation yield LINUX_T_UNKNOWN.
 */
#define	LINUX_T_UNKNOWN		255
static int _bsd_to_linux_trapcode[] = {
	/*  0              */	LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT  */	6,
	/*  2              */	LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT     */	3,
	/*  4              */	LINUX_T_UNKNOWN,
	/*  5              */	LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP  */	16,
	/*  7 T_ASTFLT     */	254,
	/*  8              */	LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT    */	13,
	/* 10 T_TRCTRAP    */	1,
	/* 11              */	LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT    */	14,
	/* 13              */	LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT   */	17,
	/* 15              */	LINUX_T_UNKNOWN,
	/* 16              */	LINUX_T_UNKNOWN,
	/* 17              */	LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE     */	0,
	/* 19 T_NMI        */	2,
	/* 20 T_OFLOW      */	4,
	/* 21 T_BOUND      */	5,
	/* 22 T_DNA        */	7,
	/* 23 T_DOUBLEFLT  */	8,
	/* 24 T_FPOPFLT    */	9,
	/* 25 T_TSSFLT     */	10,
	/* 26 T_SEGNPFLT   */	11,
	/* 27 T_STKFLT     */	12,
	/* 28 T_MCHK       */	18,
	/* 29 T_XMMFLT     */	19,
	/* 30 T_RESERVED   */	15
};

/*
 * Bounds-checked lookup in the table above.  The comparison is carried out
 * against an unsigned element count, so a negative code also falls through
 * to LINUX_T_UNKNOWN.  The argument is evaluated more than once.
 */
#define	bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
	_bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
189
190/*
191 * If FreeBSD & Linux have a difference of opinion about what a trap
192 * means, deal with it here.
193 *
194 * MPSAFE
195 */
196static int
197translate_traps(int signal, int trap_code)
198{
199	if (signal != SIGBUS)
200		return signal;
201	switch (trap_code) {
202	case T_PROTFLT:
203	case T_TSSFLT:
204	case T_DOUBLEFLT:
205	case T_PAGEFLT:
206		return SIGSEGV;
207	default:
208		return signal;
209	}
210}
211
212static int
213linux_fixup(register_t **stack_base, struct image_params *imgp)
214{
215	register_t *argv, *envp;
216
217	argv = *stack_base;
218	envp = *stack_base + (imgp->args->argc + 1);
219	(*stack_base)--;
220	**stack_base = (intptr_t)(void *)envp;
221	(*stack_base)--;
222	**stack_base = (intptr_t)(void *)argv;
223	(*stack_base)--;
224	**stack_base = imgp->args->argc;
225	return 0;
226}
227
228static int
229elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
230{
231	Elf32_Auxargs *args;
232	register_t *pos;
233
234	KASSERT(curthread->td_proc == imgp->proc &&
235	    (curthread->td_proc->p_flag & P_SA) == 0,
236	    ("unsafe elf_linux_fixup(), should be curproc"));
237	args = (Elf32_Auxargs *)imgp->auxargs;
238	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
239
240	if (args->trace)
241		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
242	if (args->execfd != -1)
243		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
244	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
245	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
246	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
247	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
248	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
249	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
250	AUXARGS_ENTRY(pos, AT_BASE, args->base);
251	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
252	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
253	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
254	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
255	AUXARGS_ENTRY(pos, AT_NULL, 0);
256
257	free(imgp->auxargs, M_TEMP);
258	imgp->auxargs = NULL;
259
260	(*stack_base)--;
261	**stack_base = (register_t)imgp->args->argc;
262	return 0;
263}
264
265extern int _ucodesel, _udatasel;
266extern unsigned long linux_sznonrtsigcode;
267
268static void
269linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
270{
271	struct thread *td = curthread;
272	struct proc *p = td->td_proc;
273	struct sigacts *psp;
274	struct trapframe *regs;
275	struct l_rt_sigframe *fp, frame;
276	int sig, code;
277	int oonstack;
278
279	sig = ksi->ksi_signo;
280	code = ksi->ksi_code;
281	PROC_LOCK_ASSERT(p, MA_OWNED);
282	psp = p->p_sigacts;
283	mtx_assert(&psp->ps_mtx, MA_OWNED);
284	regs = td->td_frame;
285	oonstack = sigonstack(regs->tf_esp);
286
287#ifdef DEBUG
288	if (ldebug(rt_sendsig))
289		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
290		    catcher, sig, (void*)mask, code);
291#endif
292	/*
293	 * Allocate space for the signal handler context.
294	 */
295	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
296	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
297		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
298		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
299	} else
300		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
301	mtx_unlock(&psp->ps_mtx);
302
303	/*
304	 * Build the argument list for the signal handler.
305	 */
306	if (p->p_sysent->sv_sigtbl)
307		if (sig <= p->p_sysent->sv_sigsize)
308			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
309
310	bzero(&frame, sizeof(frame));
311
312	frame.sf_handler = catcher;
313	frame.sf_sig = sig;
314	frame.sf_siginfo = &fp->sf_si;
315	frame.sf_ucontext = &fp->sf_sc;
316
317	/* Fill in POSIX parts */
318	frame.sf_si.lsi_signo = sig;
319	frame.sf_si.lsi_code = code;
320	frame.sf_si.lsi_addr = ksi->ksi_addr;
321
322	/*
323	 * Build the signal context to be used by sigreturn.
324	 */
325	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
326	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
327
328	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
329	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
330	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
331	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
332	PROC_UNLOCK(p);
333
334	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
335
336	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
337	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
338	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
339	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
340	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
341	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
342	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
343	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
344	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
345	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
346	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
347	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
348	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
349	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
350	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
351	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
352	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
353	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
354	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
355
356#ifdef DEBUG
357	if (ldebug(rt_sendsig))
358		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
359		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
360		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
361#endif
362
363	if (copyout(&frame, fp, sizeof(frame)) != 0) {
364		/*
365		 * Process has trashed its stack; give it an illegal
366		 * instruction to halt it in its tracks.
367		 */
368#ifdef DEBUG
369		if (ldebug(rt_sendsig))
370			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
371			    fp, oonstack);
372#endif
373		PROC_LOCK(p);
374		sigexit(td, SIGILL);
375	}
376
377	/*
378	 * Build context to run handler in.
379	 */
380	regs->tf_esp = (int)fp;
381	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
382	    linux_sznonrtsigcode;
383	regs->tf_eflags &= ~(PSL_T | PSL_VM);
384	regs->tf_cs = _ucodesel;
385	regs->tf_ds = _udatasel;
386	regs->tf_es = _udatasel;
387	regs->tf_fs = _udatasel;
388	regs->tf_ss = _udatasel;
389	PROC_LOCK(p);
390	mtx_lock(&psp->ps_mtx);
391}
392
393
394/*
395 * Send an interrupt to process.
396 *
397 * Stack is set up to allow sigcode stored
398 * in u. to call routine, followed by kcall
399 * to sigreturn routine below.  After sigreturn
400 * resets the signal mask, the stack, and the
401 * frame pointer, it returns to the user
402 * specified pc, psl.
403 */
404static void
405linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
406{
407	struct thread *td = curthread;
408	struct proc *p = td->td_proc;
409	struct sigacts *psp;
410	struct trapframe *regs;
411	struct l_sigframe *fp, frame;
412	l_sigset_t lmask;
413	int sig, code;
414	int oonstack, i;
415
416	PROC_LOCK_ASSERT(p, MA_OWNED);
417	psp = p->p_sigacts;
418	sig = ksi->ksi_signo;
419	code = ksi->ksi_code;
420	mtx_assert(&psp->ps_mtx, MA_OWNED);
421	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
422		/* Signal handler installed with SA_SIGINFO. */
423		linux_rt_sendsig(catcher, ksi, mask);
424		return;
425	}
426	regs = td->td_frame;
427	oonstack = sigonstack(regs->tf_esp);
428
429#ifdef DEBUG
430	if (ldebug(sendsig))
431		printf(ARGS(sendsig, "%p, %d, %p, %u"),
432		    catcher, sig, (void*)mask, code);
433#endif
434
435	/*
436	 * Allocate space for the signal handler context.
437	 */
438	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
439	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
440		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
441		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
442	} else
443		fp = (struct l_sigframe *)regs->tf_esp - 1;
444	mtx_unlock(&psp->ps_mtx);
445	PROC_UNLOCK(p);
446
447	/*
448	 * Build the argument list for the signal handler.
449	 */
450	if (p->p_sysent->sv_sigtbl)
451		if (sig <= p->p_sysent->sv_sigsize)
452			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
453
454	bzero(&frame, sizeof(frame));
455
456	frame.sf_handler = catcher;
457	frame.sf_sig = sig;
458
459	bsd_to_linux_sigset(mask, &lmask);
460
461	/*
462	 * Build the signal context to be used by sigreturn.
463	 */
464	frame.sf_sc.sc_mask   = lmask.__bits[0];
465	frame.sf_sc.sc_gs     = rgs();
466	frame.sf_sc.sc_fs     = regs->tf_fs;
467	frame.sf_sc.sc_es     = regs->tf_es;
468	frame.sf_sc.sc_ds     = regs->tf_ds;
469	frame.sf_sc.sc_edi    = regs->tf_edi;
470	frame.sf_sc.sc_esi    = regs->tf_esi;
471	frame.sf_sc.sc_ebp    = regs->tf_ebp;
472	frame.sf_sc.sc_ebx    = regs->tf_ebx;
473	frame.sf_sc.sc_edx    = regs->tf_edx;
474	frame.sf_sc.sc_ecx    = regs->tf_ecx;
475	frame.sf_sc.sc_eax    = regs->tf_eax;
476	frame.sf_sc.sc_eip    = regs->tf_eip;
477	frame.sf_sc.sc_cs     = regs->tf_cs;
478	frame.sf_sc.sc_eflags = regs->tf_eflags;
479	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
480	frame.sf_sc.sc_ss     = regs->tf_ss;
481	frame.sf_sc.sc_err    = regs->tf_err;
482	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
483
484	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
485		frame.sf_extramask[i] = lmask.__bits[i+1];
486
487	if (copyout(&frame, fp, sizeof(frame)) != 0) {
488		/*
489		 * Process has trashed its stack; give it an illegal
490		 * instruction to halt it in its tracks.
491		 */
492		PROC_LOCK(p);
493		sigexit(td, SIGILL);
494	}
495
496	/*
497	 * Build context to run handler in.
498	 */
499	regs->tf_esp = (int)fp;
500	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
501	regs->tf_eflags &= ~(PSL_T | PSL_VM);
502	regs->tf_cs = _ucodesel;
503	regs->tf_ds = _udatasel;
504	regs->tf_es = _udatasel;
505	regs->tf_fs = _udatasel;
506	regs->tf_ss = _udatasel;
507	PROC_LOCK(p);
508	mtx_lock(&psp->ps_mtx);
509}
510
511/*
512 * System call to cleanup state after a signal
513 * has been taken.  Reset signal mask and
514 * stack state from context left by sendsig (above).
515 * Return to previous pc and psl as specified by
516 * context left by sendsig. Check carefully to
517 * make sure that the user has not modified the
518 * psl to gain improper privileges or to cause
519 * a machine fault.
520 */
521int
522linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
523{
524	struct proc *p = td->td_proc;
525	struct l_sigframe frame;
526	struct trapframe *regs;
527	l_sigset_t lmask;
528	int eflags, i;
529	ksiginfo_t ksi;
530
531	regs = td->td_frame;
532
533#ifdef DEBUG
534	if (ldebug(sigreturn))
535		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
536#endif
537	/*
538	 * The trampoline code hands us the sigframe.
539	 * It is unsafe to keep track of it ourselves, in the event that a
540	 * program jumps out of a signal handler.
541	 */
542	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
543		return (EFAULT);
544
545	/*
546	 * Check for security violations.
547	 */
548#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
549	eflags = frame.sf_sc.sc_eflags;
550	/*
551	 * XXX do allow users to change the privileged flag PSL_RF.  The
552	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
553	 * sometimes set it there too.  tf_eflags is kept in the signal
554	 * context during signal handling and there is no other place
555	 * to remember it, so the PSL_RF bit may be corrupted by the
556	 * signal handler without us knowing.  Corruption of the PSL_RF
557	 * bit at worst causes one more or one less debugger trap, so
558	 * allowing it is fairly harmless.
559	 */
560	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
561		return(EINVAL);
562
563	/*
564	 * Don't allow users to load a valid privileged %cs.  Let the
565	 * hardware check for invalid selectors, excess privilege in
566	 * other selectors, invalid %eip's and invalid %esp's.
567	 */
568#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
569	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
570		ksiginfo_init_trap(&ksi);
571		ksi.ksi_signo = SIGBUS;
572		ksi.ksi_code = BUS_OBJERR;
573		ksi.ksi_trapno = T_PROTFLT;
574		ksi.ksi_addr = (void *)regs->tf_eip;
575		trapsignal(td, &ksi);
576		return(EINVAL);
577	}
578
579	lmask.__bits[0] = frame.sf_sc.sc_mask;
580	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
581		lmask.__bits[i+1] = frame.sf_extramask[i];
582	PROC_LOCK(p);
583	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
584	SIG_CANTMASK(td->td_sigmask);
585	signotify(td);
586	PROC_UNLOCK(p);
587
588	/*
589	 * Restore signal context.
590	 */
591	/* %gs was restored by the trampoline. */
592	regs->tf_fs     = frame.sf_sc.sc_fs;
593	regs->tf_es     = frame.sf_sc.sc_es;
594	regs->tf_ds     = frame.sf_sc.sc_ds;
595	regs->tf_edi    = frame.sf_sc.sc_edi;
596	regs->tf_esi    = frame.sf_sc.sc_esi;
597	regs->tf_ebp    = frame.sf_sc.sc_ebp;
598	regs->tf_ebx    = frame.sf_sc.sc_ebx;
599	regs->tf_edx    = frame.sf_sc.sc_edx;
600	regs->tf_ecx    = frame.sf_sc.sc_ecx;
601	regs->tf_eax    = frame.sf_sc.sc_eax;
602	regs->tf_eip    = frame.sf_sc.sc_eip;
603	regs->tf_cs     = frame.sf_sc.sc_cs;
604	regs->tf_eflags = eflags;
605	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
606	regs->tf_ss     = frame.sf_sc.sc_ss;
607
608	return (EJUSTRETURN);
609}
610
611/*
612 * System call to cleanup state after a signal
613 * has been taken.  Reset signal mask and
614 * stack state from context left by rt_sendsig (above).
615 * Return to previous pc and psl as specified by
616 * context left by sendsig. Check carefully to
617 * make sure that the user has not modified the
618 * psl to gain improper privileges or to cause
619 * a machine fault.
620 */
621int
622linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
623{
624	struct proc *p = td->td_proc;
625	struct l_ucontext uc;
626	struct l_sigcontext *context;
627	l_stack_t *lss;
628	stack_t ss;
629	struct trapframe *regs;
630	int eflags;
631	ksiginfo_t ksi;
632
633	regs = td->td_frame;
634
635#ifdef DEBUG
636	if (ldebug(rt_sigreturn))
637		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
638#endif
639	/*
640	 * The trampoline code hands us the ucontext.
641	 * It is unsafe to keep track of it ourselves, in the event that a
642	 * program jumps out of a signal handler.
643	 */
644	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
645		return (EFAULT);
646
647	context = &uc.uc_mcontext;
648
649	/*
650	 * Check for security violations.
651	 */
652#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
653	eflags = context->sc_eflags;
654	/*
655	 * XXX do allow users to change the privileged flag PSL_RF.  The
656	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
657	 * sometimes set it there too.  tf_eflags is kept in the signal
658	 * context during signal handling and there is no other place
659	 * to remember it, so the PSL_RF bit may be corrupted by the
660	 * signal handler without us knowing.  Corruption of the PSL_RF
661	 * bit at worst causes one more or one less debugger trap, so
662	 * allowing it is fairly harmless.
663	 */
664	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
665		return(EINVAL);
666
667	/*
668	 * Don't allow users to load a valid privileged %cs.  Let the
669	 * hardware check for invalid selectors, excess privilege in
670	 * other selectors, invalid %eip's and invalid %esp's.
671	 */
672#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
673	if (!CS_SECURE(context->sc_cs)) {
674		ksiginfo_init_trap(&ksi);
675		ksi.ksi_signo = SIGBUS;
676		ksi.ksi_code = BUS_OBJERR;
677		ksi.ksi_trapno = T_PROTFLT;
678		ksi.ksi_addr = (void *)regs->tf_eip;
679		trapsignal(td, &ksi);
680		return(EINVAL);
681	}
682
683	PROC_LOCK(p);
684	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
685	SIG_CANTMASK(td->td_sigmask);
686	signotify(td);
687	PROC_UNLOCK(p);
688
689	/*
690	 * Restore signal context
691	 */
692	/* %gs was restored by the trampoline. */
693	regs->tf_fs     = context->sc_fs;
694	regs->tf_es     = context->sc_es;
695	regs->tf_ds     = context->sc_ds;
696	regs->tf_edi    = context->sc_edi;
697	regs->tf_esi    = context->sc_esi;
698	regs->tf_ebp    = context->sc_ebp;
699	regs->tf_ebx    = context->sc_ebx;
700	regs->tf_edx    = context->sc_edx;
701	regs->tf_ecx    = context->sc_ecx;
702	regs->tf_eax    = context->sc_eax;
703	regs->tf_eip    = context->sc_eip;
704	regs->tf_cs     = context->sc_cs;
705	regs->tf_eflags = eflags;
706	regs->tf_esp    = context->sc_esp_at_signal;
707	regs->tf_ss     = context->sc_ss;
708
709	/*
710	 * call sigaltstack & ignore results..
711	 */
712	lss = &uc.uc_stack;
713	ss.ss_sp = lss->ss_sp;
714	ss.ss_size = lss->ss_size;
715	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
716
717#ifdef DEBUG
718	if (ldebug(rt_sigreturn))
719		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
720		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
721#endif
722	(void)kern_sigaltstack(td, &ss, NULL);
723
724	return (EJUSTRETURN);
725}
726
727/*
728 * MPSAFE
729 */
730static void
731linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
732{
733	args[0] = tf->tf_ebx;
734	args[1] = tf->tf_ecx;
735	args[2] = tf->tf_edx;
736	args[3] = tf->tf_esi;
737	args[4] = tf->tf_edi;
738	args[5] = tf->tf_ebp;	/* Unconfirmed */
739	*params = NULL;		/* no copyin */
740}
741
742/*
743 * If a linux binary is exec'ing something, try this image activator
744 * first.  We override standard shell script execution in order to
745 * be able to modify the interpreter path.  We only do this if a linux
746 * binary is doing the exec, so we do not create an EXEC module for it.
747 */
748static int	exec_linux_imgact_try(struct image_params *iparams);
749
750static int
751exec_linux_imgact_try(struct image_params *imgp)
752{
753    const char *head = (const char *)imgp->image_header;
754    char *rpath;
755    int error = -1, len;
756
757    /*
758     * The interpreter for shell scripts run from a linux binary needs
759     * to be located in /compat/linux if possible in order to recursively
760     * maintain linux path emulation.
761     */
762    if (((const short *)head)[0] == SHELLMAGIC) {
763	    /*
764	     * Run our normal shell image activator.  If it succeeds attempt
765	     * to use the alternate path for the interpreter.  If an alternate
766	     * path is found, use our stringspace to store it.
767	     */
768	    if ((error = exec_shell_imgact(imgp)) == 0) {
769		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
770			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
771		    if (rpath != NULL) {
772			    len = strlen(rpath) + 1;
773
774			    if (len <= MAXSHELLCMDLEN) {
775				    memcpy(imgp->interpreter_name, rpath, len);
776			    }
777			    free(rpath, M_TEMP);
778		    }
779	    }
780    }
781    return(error);
782}
783
784/*
785 * exec_setregs may initialize some registers differently than Linux
786 * does, thus potentially confusing Linux binaries. If necessary, we
787 * override the exec_setregs default(s) here.
788 */
789static void
790exec_linux_setregs(struct thread *td, u_long entry,
791		   u_long stack, u_long ps_strings)
792{
793	static const u_short control = __LINUX_NPXCW__;
794	struct pcb *pcb = td->td_pcb;
795
796	exec_setregs(td, entry, stack, ps_strings);
797
798	/* Linux sets %gs to 0, we default to _udatasel */
799	pcb->pcb_gs = 0; load_gs(0);
800
801	/* Linux sets the i387 to extended precision. */
802	fldcw(&control);
803}
804
805struct sysentvec linux_sysvec = {
806	LINUX_SYS_MAXSYSCALL,
807	linux_sysent,
808	0xff,
809	LINUX_SIGTBLSZ,
810	bsd_to_linux_signal,
811	ELAST + 1,
812	bsd_to_linux_errno,
813	translate_traps,
814	linux_fixup,
815	linux_sendsig,
816	linux_sigcode,
817	&linux_szsigcode,
818	linux_prepsyscall,
819	"Linux a.out",
820	NULL,
821	exec_linux_imgact_try,
822	LINUX_MINSIGSTKSZ,
823	PAGE_SIZE,
824	VM_MIN_ADDRESS,
825	VM_MAXUSER_ADDRESS,
826	USRSTACK,
827	PS_STRINGS,
828	VM_PROT_ALL,
829	exec_copyout_strings,
830	exec_linux_setregs,
831	NULL
832};
833
834struct sysentvec elf_linux_sysvec = {
835	LINUX_SYS_MAXSYSCALL,
836	linux_sysent,
837	0xff,
838	LINUX_SIGTBLSZ,
839	bsd_to_linux_signal,
840	ELAST + 1,
841	bsd_to_linux_errno,
842	translate_traps,
843	elf_linux_fixup,
844	linux_sendsig,
845	linux_sigcode,
846	&linux_szsigcode,
847	linux_prepsyscall,
848	"Linux ELF",
849	elf32_coredump,
850	exec_linux_imgact_try,
851	LINUX_MINSIGSTKSZ,
852	PAGE_SIZE,
853	VM_MIN_ADDRESS,
854	VM_MAXUSER_ADDRESS,
855	USRSTACK,
856	PS_STRINGS,
857	VM_PROT_ALL,
858	exec_copyout_strings,
859	exec_linux_setregs,
860	NULL
861};
862
863static Elf32_Brandinfo linux_brand = {
864					ELFOSABI_LINUX,
865					EM_386,
866					"Linux",
867					"/compat/linux",
868					"/lib/ld-linux.so.1",
869					&elf_linux_sysvec,
870					NULL,
871				 };
872
873static Elf32_Brandinfo linux_glibc2brand = {
874					ELFOSABI_LINUX,
875					EM_386,
876					"Linux",
877					"/compat/linux",
878					"/lib/ld-linux.so.2",
879					&elf_linux_sysvec,
880					NULL,
881				 };
882
883Elf32_Brandinfo *linux_brandlist[] = {
884					&linux_brand,
885					&linux_glibc2brand,
886					NULL
887				};
888
889static int
890linux_elf_modevent(module_t mod, int type, void *data)
891{
892	Elf32_Brandinfo **brandinfo;
893	int error;
894	struct linux_ioctl_handler **lihp;
895
896	error = 0;
897
898	switch(type) {
899	case MOD_LOAD:
900		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
901		     ++brandinfo)
902			if (elf32_insert_brand_entry(*brandinfo) < 0)
903				error = EINVAL;
904		if (error == 0) {
905			SET_FOREACH(lihp, linux_ioctl_handler_set)
906				linux_ioctl_register_handler(*lihp);
907			if (bootverbose)
908				printf("Linux ELF exec handler installed\n");
909		} else
910			printf("cannot insert Linux ELF brand handler\n");
911		break;
912	case MOD_UNLOAD:
913		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
914		     ++brandinfo)
915			if (elf32_brand_inuse(*brandinfo))
916				error = EBUSY;
917		if (error == 0) {
918			for (brandinfo = &linux_brandlist[0];
919			     *brandinfo != NULL; ++brandinfo)
920				if (elf32_remove_brand_entry(*brandinfo) < 0)
921					error = EINVAL;
922		}
923		if (error == 0) {
924			SET_FOREACH(lihp, linux_ioctl_handler_set)
925				linux_ioctl_unregister_handler(*lihp);
926			if (bootverbose)
927				printf("Linux ELF exec handler removed\n");
928			linux_mib_destroy();
929		} else
930			printf("Could not deinstall ELF interpreter entry\n");
931		break;
932	default:
933		return EOPNOTSUPP;
934	}
935	return error;
936}
937
938static moduledata_t linux_elf_mod = {
939	"linuxelf",
940	linux_elf_modevent,
941	0
942};
943
944DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
945