linux_sysvec.c revision 148540
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 148540 2005-07-29 19:40:39Z jhb $");
31
32/* XXX we use functions that might not exist. */
33#include "opt_compat.h"
34
35#ifndef COMPAT_43
36#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/exec.h>
42#include <sys/imgact.h>
43#include <sys/imgact_aout.h>
44#include <sys/imgact_elf.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/module.h>
49#include <sys/mutex.h>
50#include <sys/proc.h>
51#include <sys/signalvar.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysent.h>
54#include <sys/sysproto.h>
55#include <sys/vnode.h>
56
57#include <vm/vm.h>
58#include <vm/pmap.h>
59#include <vm/vm_extern.h>
60#include <vm/vm_map.h>
61#include <vm/vm_object.h>
62#include <vm/vm_page.h>
63#include <vm/vm_param.h>
64
65#include <machine/cpu.h>
66#include <machine/md_var.h>
67#include <machine/pcb.h>
68
69#include <i386/linux/linux.h>
70#include <i386/linux/linux_proto.h>
71#include <compat/linux/linux_mib.h>
72#include <compat/linux/linux_signal.h>
73#include <compat/linux/linux_util.h>
74
75MODULE_VERSION(linux, 1);
76
77MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
78
79#if BYTE_ORDER == LITTLE_ENDIAN
80#define SHELLMAGIC      0x2123 /* #! */
81#else
82#define SHELLMAGIC      0x2321
83#endif
84
85/*
86 * Allow the sendsig functions to use the ldebug() facility
87 * even though they are not syscalls themselves. Map them
88 * to syscall 0. This is slightly less bogus than using
89 * ldebug(sigreturn).
90 */
91#define	LINUX_SYS_linux_rt_sendsig	0
92#define	LINUX_SYS_linux_sendsig		0
93
94#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
95#define	__LINUX_NPXCW__		0x37f
96
97extern char linux_sigcode[];
98extern int linux_szsigcode;
99
100extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
101
102SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
103
104static int	linux_fixup(register_t **stack_base,
105		    struct image_params *iparams);
106static int	elf_linux_fixup(register_t **stack_base,
107		    struct image_params *iparams);
108static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
109		    caddr_t *params);
110static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
111		    u_long code);
112static void	exec_linux_setregs(struct thread *td, u_long entry,
113				   u_long stack, u_long ps_strings);
114
115/*
116 * Linux syscalls return negative errno's, we do positive and map them
117 */
118static int bsd_to_linux_errno[ELAST + 1] = {
119	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
120	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
121	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
122	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
123	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
124	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
125	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
126	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
127	-6, -6, -43, -42, -75, -6, -84
128};
129
130int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
131	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
132	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
133	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
134	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
135	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
136	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
137	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
138	0, LINUX_SIGUSR1, LINUX_SIGUSR2
139};
140
141int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
142	SIGHUP, SIGINT, SIGQUIT, SIGILL,
143	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
144	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
145	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
146	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
147	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
148	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
149	SIGIO, SIGURG, SIGSYS
150};
151
/* Value reported when a native trap code has no Linux counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * Native trap code -> Linux trap number, indexed by the native code.
 * The table is only ever read (through bsd_to_linux_trapcode() below),
 * so it is const-qualified to keep it from being modified by accident.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a native trap code.  Codes beyond the end of the table
 * yield LINUX_T_UNKNOWN.  The comparison is carried out in the unsigned
 * type of sizeof, so a negative argument is rejected as well.  Note
 * that `code' is evaluated more than once.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) < sizeof(_bsd_to_linux_trapcode) / sizeof(*_bsd_to_linux_trapcode) ? \
     _bsd_to_linux_trapcode[(code)] : \
     LINUX_T_UNKNOWN)
190
191/*
192 * If FreeBSD & Linux have a difference of opinion about what a trap
193 * means, deal with it here.
194 *
195 * MPSAFE
196 */
197static int
198translate_traps(int signal, int trap_code)
199{
200	if (signal != SIGBUS)
201		return signal;
202	switch (trap_code) {
203	case T_PROTFLT:
204	case T_TSSFLT:
205	case T_DOUBLEFLT:
206	case T_PAGEFLT:
207		return SIGSEGV;
208	default:
209		return signal;
210	}
211}
212
213static int
214linux_fixup(register_t **stack_base, struct image_params *imgp)
215{
216	register_t *argv, *envp;
217
218	argv = *stack_base;
219	envp = *stack_base + (imgp->args->argc + 1);
220	(*stack_base)--;
221	**stack_base = (intptr_t)(void *)envp;
222	(*stack_base)--;
223	**stack_base = (intptr_t)(void *)argv;
224	(*stack_base)--;
225	**stack_base = imgp->args->argc;
226	return 0;
227}
228
229static int
230elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
231{
232	Elf32_Auxargs *args;
233	register_t *pos;
234
235	KASSERT(curthread->td_proc == imgp->proc &&
236	    (curthread->td_proc->p_flag & P_SA) == 0,
237	    ("unsafe elf_linux_fixup(), should be curproc"));
238	args = (Elf32_Auxargs *)imgp->auxargs;
239	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
240
241	if (args->trace)
242		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
243	if (args->execfd != -1)
244		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
245	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
246	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
247	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
248	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
249	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
250	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
251	AUXARGS_ENTRY(pos, AT_BASE, args->base);
252	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
253	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
254	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
255	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
256	AUXARGS_ENTRY(pos, AT_NULL, 0);
257
258	free(imgp->auxargs, M_TEMP);
259	imgp->auxargs = NULL;
260
261	(*stack_base)--;
262	**stack_base = (register_t)imgp->args->argc;
263	return 0;
264}
265
266extern int _ucodesel, _udatasel;
267extern unsigned long linux_sznonrtsigcode;
268
269static void
270linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
271{
272	struct thread *td = curthread;
273	struct proc *p = td->td_proc;
274	struct sigacts *psp;
275	struct trapframe *regs;
276	struct l_rt_sigframe *fp, frame;
277	int oonstack;
278
279	PROC_LOCK_ASSERT(p, MA_OWNED);
280	psp = p->p_sigacts;
281	mtx_assert(&psp->ps_mtx, MA_OWNED);
282	regs = td->td_frame;
283	oonstack = sigonstack(regs->tf_esp);
284
285#ifdef DEBUG
286	if (ldebug(rt_sendsig))
287		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
288		    catcher, sig, (void*)mask, code);
289#endif
290	/*
291	 * Allocate space for the signal handler context.
292	 */
293	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
294	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
295		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
296		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
297	} else
298		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
299	mtx_unlock(&psp->ps_mtx);
300
301	/*
302	 * Build the argument list for the signal handler.
303	 */
304	if (p->p_sysent->sv_sigtbl)
305		if (sig <= p->p_sysent->sv_sigsize)
306			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
307
308	bzero(&frame, sizeof(frame));
309
310	frame.sf_handler = catcher;
311	frame.sf_sig = sig;
312	frame.sf_siginfo = &fp->sf_si;
313	frame.sf_ucontext = &fp->sf_sc;
314
315	/* Fill in POSIX parts */
316	frame.sf_si.lsi_signo = sig;
317	frame.sf_si.lsi_code = code;
318	frame.sf_si.lsi_addr = (void *)regs->tf_err;
319
320	/*
321	 * Build the signal context to be used by sigreturn.
322	 */
323	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
324	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
325
326	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
327	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
328	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
329	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
330	PROC_UNLOCK(p);
331
332	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
333
334	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
335	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
336	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
337	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
338	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
339	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
340	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
341	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
342	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
343	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
344	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
345	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
346	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
347	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
348	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
349	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
350	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
351	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
352	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
353
354#ifdef DEBUG
355	if (ldebug(rt_sendsig))
356		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
357		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
358		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
359#endif
360
361	if (copyout(&frame, fp, sizeof(frame)) != 0) {
362		/*
363		 * Process has trashed its stack; give it an illegal
364		 * instruction to halt it in its tracks.
365		 */
366#ifdef DEBUG
367		if (ldebug(rt_sendsig))
368			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
369			    fp, oonstack);
370#endif
371		PROC_LOCK(p);
372		sigexit(td, SIGILL);
373	}
374
375	/*
376	 * Build context to run handler in.
377	 */
378	regs->tf_esp = (int)fp;
379	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
380	    linux_sznonrtsigcode;
381	regs->tf_eflags &= ~(PSL_T | PSL_VM);
382	regs->tf_cs = _ucodesel;
383	regs->tf_ds = _udatasel;
384	regs->tf_es = _udatasel;
385	regs->tf_fs = _udatasel;
386	regs->tf_ss = _udatasel;
387	PROC_LOCK(p);
388	mtx_lock(&psp->ps_mtx);
389}
390
391
392/*
393 * Send an interrupt to process.
394 *
395 * Stack is set up to allow sigcode stored
396 * in u. to call routine, followed by kcall
397 * to sigreturn routine below.  After sigreturn
398 * resets the signal mask, the stack, and the
399 * frame pointer, it returns to the user
400 * specified pc, psl.
401 */
402static void
403linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
404{
405	struct thread *td = curthread;
406	struct proc *p = td->td_proc;
407	struct sigacts *psp;
408	struct trapframe *regs;
409	struct l_sigframe *fp, frame;
410	l_sigset_t lmask;
411	int oonstack, i;
412
413	PROC_LOCK_ASSERT(p, MA_OWNED);
414	psp = p->p_sigacts;
415	mtx_assert(&psp->ps_mtx, MA_OWNED);
416	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417		/* Signal handler installed with SA_SIGINFO. */
418		linux_rt_sendsig(catcher, sig, mask, code);
419		return;
420	}
421
422	regs = td->td_frame;
423	oonstack = sigonstack(regs->tf_esp);
424
425#ifdef DEBUG
426	if (ldebug(sendsig))
427		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
428		    catcher, sig, (void*)mask, code);
429#endif
430
431	/*
432	 * Allocate space for the signal handler context.
433	 */
434	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
435	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
436		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
437		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
438	} else
439		fp = (struct l_sigframe *)regs->tf_esp - 1;
440	mtx_unlock(&psp->ps_mtx);
441	PROC_UNLOCK(p);
442
443	/*
444	 * Build the argument list for the signal handler.
445	 */
446	if (p->p_sysent->sv_sigtbl)
447		if (sig <= p->p_sysent->sv_sigsize)
448			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
449
450	bzero(&frame, sizeof(frame));
451
452	frame.sf_handler = catcher;
453	frame.sf_sig = sig;
454
455	bsd_to_linux_sigset(mask, &lmask);
456
457	/*
458	 * Build the signal context to be used by sigreturn.
459	 */
460	frame.sf_sc.sc_mask   = lmask.__bits[0];
461	frame.sf_sc.sc_gs     = rgs();
462	frame.sf_sc.sc_fs     = regs->tf_fs;
463	frame.sf_sc.sc_es     = regs->tf_es;
464	frame.sf_sc.sc_ds     = regs->tf_ds;
465	frame.sf_sc.sc_edi    = regs->tf_edi;
466	frame.sf_sc.sc_esi    = regs->tf_esi;
467	frame.sf_sc.sc_ebp    = regs->tf_ebp;
468	frame.sf_sc.sc_ebx    = regs->tf_ebx;
469	frame.sf_sc.sc_edx    = regs->tf_edx;
470	frame.sf_sc.sc_ecx    = regs->tf_ecx;
471	frame.sf_sc.sc_eax    = regs->tf_eax;
472	frame.sf_sc.sc_eip    = regs->tf_eip;
473	frame.sf_sc.sc_cs     = regs->tf_cs;
474	frame.sf_sc.sc_eflags = regs->tf_eflags;
475	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
476	frame.sf_sc.sc_ss     = regs->tf_ss;
477	frame.sf_sc.sc_err    = regs->tf_err;
478	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
479
480	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
481		frame.sf_extramask[i] = lmask.__bits[i+1];
482
483	if (copyout(&frame, fp, sizeof(frame)) != 0) {
484		/*
485		 * Process has trashed its stack; give it an illegal
486		 * instruction to halt it in its tracks.
487		 */
488		PROC_LOCK(p);
489		sigexit(td, SIGILL);
490	}
491
492	/*
493	 * Build context to run handler in.
494	 */
495	regs->tf_esp = (int)fp;
496	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
497	regs->tf_eflags &= ~(PSL_T | PSL_VM);
498	regs->tf_cs = _ucodesel;
499	regs->tf_ds = _udatasel;
500	regs->tf_es = _udatasel;
501	regs->tf_fs = _udatasel;
502	regs->tf_ss = _udatasel;
503	PROC_LOCK(p);
504	mtx_lock(&psp->ps_mtx);
505}
506
507/*
508 * System call to cleanup state after a signal
509 * has been taken.  Reset signal mask and
510 * stack state from context left by sendsig (above).
511 * Return to previous pc and psl as specified by
512 * context left by sendsig. Check carefully to
513 * make sure that the user has not modified the
514 * psl to gain improper privileges or to cause
515 * a machine fault.
516 */
517int
518linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
519{
520	struct proc *p = td->td_proc;
521	struct l_sigframe frame;
522	struct trapframe *regs;
523	l_sigset_t lmask;
524	int eflags, i;
525
526	regs = td->td_frame;
527
528#ifdef DEBUG
529	if (ldebug(sigreturn))
530		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
531#endif
532	/*
533	 * The trampoline code hands us the sigframe.
534	 * It is unsafe to keep track of it ourselves, in the event that a
535	 * program jumps out of a signal handler.
536	 */
537	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
538		return (EFAULT);
539
540	/*
541	 * Check for security violations.
542	 */
543#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
544	eflags = frame.sf_sc.sc_eflags;
545	/*
546	 * XXX do allow users to change the privileged flag PSL_RF.  The
547	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
548	 * sometimes set it there too.  tf_eflags is kept in the signal
549	 * context during signal handling and there is no other place
550	 * to remember it, so the PSL_RF bit may be corrupted by the
551	 * signal handler without us knowing.  Corruption of the PSL_RF
552	 * bit at worst causes one more or one less debugger trap, so
553	 * allowing it is fairly harmless.
554	 */
555	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
556		return(EINVAL);
557
558	/*
559	 * Don't allow users to load a valid privileged %cs.  Let the
560	 * hardware check for invalid selectors, excess privilege in
561	 * other selectors, invalid %eip's and invalid %esp's.
562	 */
563#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
564	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
565		trapsignal(td, SIGBUS, T_PROTFLT);
566		return(EINVAL);
567	}
568
569	lmask.__bits[0] = frame.sf_sc.sc_mask;
570	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
571		lmask.__bits[i+1] = frame.sf_extramask[i];
572	PROC_LOCK(p);
573	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
574	SIG_CANTMASK(td->td_sigmask);
575	signotify(td);
576	PROC_UNLOCK(p);
577
578	/*
579	 * Restore signal context.
580	 */
581	/* %gs was restored by the trampoline. */
582	regs->tf_fs     = frame.sf_sc.sc_fs;
583	regs->tf_es     = frame.sf_sc.sc_es;
584	regs->tf_ds     = frame.sf_sc.sc_ds;
585	regs->tf_edi    = frame.sf_sc.sc_edi;
586	regs->tf_esi    = frame.sf_sc.sc_esi;
587	regs->tf_ebp    = frame.sf_sc.sc_ebp;
588	regs->tf_ebx    = frame.sf_sc.sc_ebx;
589	regs->tf_edx    = frame.sf_sc.sc_edx;
590	regs->tf_ecx    = frame.sf_sc.sc_ecx;
591	regs->tf_eax    = frame.sf_sc.sc_eax;
592	regs->tf_eip    = frame.sf_sc.sc_eip;
593	regs->tf_cs     = frame.sf_sc.sc_cs;
594	regs->tf_eflags = eflags;
595	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
596	regs->tf_ss     = frame.sf_sc.sc_ss;
597
598	return (EJUSTRETURN);
599}
600
601/*
602 * System call to cleanup state after a signal
603 * has been taken.  Reset signal mask and
604 * stack state from context left by rt_sendsig (above).
605 * Return to previous pc and psl as specified by
606 * context left by sendsig. Check carefully to
607 * make sure that the user has not modified the
608 * psl to gain improper privileges or to cause
609 * a machine fault.
610 */
611int
612linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
613{
614	struct proc *p = td->td_proc;
615	struct l_ucontext uc;
616	struct l_sigcontext *context;
617	l_stack_t *lss;
618	stack_t ss;
619	struct trapframe *regs;
620	int eflags;
621
622	regs = td->td_frame;
623
624#ifdef DEBUG
625	if (ldebug(rt_sigreturn))
626		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
627#endif
628	/*
629	 * The trampoline code hands us the ucontext.
630	 * It is unsafe to keep track of it ourselves, in the event that a
631	 * program jumps out of a signal handler.
632	 */
633	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
634		return (EFAULT);
635
636	context = &uc.uc_mcontext;
637
638	/*
639	 * Check for security violations.
640	 */
641#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
642	eflags = context->sc_eflags;
643	/*
644	 * XXX do allow users to change the privileged flag PSL_RF.  The
645	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
646	 * sometimes set it there too.  tf_eflags is kept in the signal
647	 * context during signal handling and there is no other place
648	 * to remember it, so the PSL_RF bit may be corrupted by the
649	 * signal handler without us knowing.  Corruption of the PSL_RF
650	 * bit at worst causes one more or one less debugger trap, so
651	 * allowing it is fairly harmless.
652	 */
653	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
654		return(EINVAL);
655
656	/*
657	 * Don't allow users to load a valid privileged %cs.  Let the
658	 * hardware check for invalid selectors, excess privilege in
659	 * other selectors, invalid %eip's and invalid %esp's.
660	 */
661#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
662	if (!CS_SECURE(context->sc_cs)) {
663		trapsignal(td, SIGBUS, T_PROTFLT);
664		return(EINVAL);
665	}
666
667	PROC_LOCK(p);
668	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
669	SIG_CANTMASK(td->td_sigmask);
670	signotify(td);
671	PROC_UNLOCK(p);
672
673	/*
674	 * Restore signal context
675	 */
676	/* %gs was restored by the trampoline. */
677	regs->tf_fs     = context->sc_fs;
678	regs->tf_es     = context->sc_es;
679	regs->tf_ds     = context->sc_ds;
680	regs->tf_edi    = context->sc_edi;
681	regs->tf_esi    = context->sc_esi;
682	regs->tf_ebp    = context->sc_ebp;
683	regs->tf_ebx    = context->sc_ebx;
684	regs->tf_edx    = context->sc_edx;
685	regs->tf_ecx    = context->sc_ecx;
686	regs->tf_eax    = context->sc_eax;
687	regs->tf_eip    = context->sc_eip;
688	regs->tf_cs     = context->sc_cs;
689	regs->tf_eflags = eflags;
690	regs->tf_esp    = context->sc_esp_at_signal;
691	regs->tf_ss     = context->sc_ss;
692
693	/*
694	 * call sigaltstack & ignore results..
695	 */
696	lss = &uc.uc_stack;
697	ss.ss_sp = lss->ss_sp;
698	ss.ss_size = lss->ss_size;
699	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
700
701#ifdef DEBUG
702	if (ldebug(rt_sigreturn))
703		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
704		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
705#endif
706	(void)kern_sigaltstack(td, &ss, NULL);
707
708	return (EJUSTRETURN);
709}
710
711/*
712 * MPSAFE
713 */
714static void
715linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
716{
717	args[0] = tf->tf_ebx;
718	args[1] = tf->tf_ecx;
719	args[2] = tf->tf_edx;
720	args[3] = tf->tf_esi;
721	args[4] = tf->tf_edi;
722	args[5] = tf->tf_ebp;	/* Unconfirmed */
723	*params = NULL;		/* no copyin */
724}
725
726/*
727 * If a linux binary is exec'ing something, try this image activator
728 * first.  We override standard shell script execution in order to
729 * be able to modify the interpreter path.  We only do this if a linux
730 * binary is doing the exec, so we do not create an EXEC module for it.
731 */
732static int	exec_linux_imgact_try(struct image_params *iparams);
733
734static int
735exec_linux_imgact_try(struct image_params *imgp)
736{
737    const char *head = (const char *)imgp->image_header;
738    char *rpath;
739    int error = -1, len;
740
741    /*
742     * The interpreter for shell scripts run from a linux binary needs
743     * to be located in /compat/linux if possible in order to recursively
744     * maintain linux path emulation.
745     */
746    if (((const short *)head)[0] == SHELLMAGIC) {
747	    /*
748	     * Run our normal shell image activator.  If it succeeds attempt
749	     * to use the alternate path for the interpreter.  If an alternate
750	     * path is found, use our stringspace to store it.
751	     */
752	    if ((error = exec_shell_imgact(imgp)) == 0) {
753		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
754			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
755		    if (rpath != NULL) {
756			    len = strlen(rpath) + 1;
757
758			    if (len <= MAXSHELLCMDLEN) {
759				    memcpy(imgp->interpreter_name, rpath, len);
760			    }
761			    free(rpath, M_TEMP);
762		    }
763	    }
764    }
765    return(error);
766}
767
768/*
769 * exec_setregs may initialize some registers differently than Linux
770 * does, thus potentially confusing Linux binaries. If necessary, we
771 * override the exec_setregs default(s) here.
772 */
773static void
774exec_linux_setregs(struct thread *td, u_long entry,
775		   u_long stack, u_long ps_strings)
776{
777	static const u_short control = __LINUX_NPXCW__;
778	struct pcb *pcb = td->td_pcb;
779
780	exec_setregs(td, entry, stack, ps_strings);
781
782	/* Linux sets %gs to 0, we default to _udatasel */
783	pcb->pcb_gs = 0; load_gs(0);
784
785	/* Linux sets the i387 to extended precision. */
786	fldcw(&control);
787}
788
789struct sysentvec linux_sysvec = {
790	LINUX_SYS_MAXSYSCALL,
791	linux_sysent,
792	0xff,
793	LINUX_SIGTBLSZ,
794	bsd_to_linux_signal,
795	ELAST + 1,
796	bsd_to_linux_errno,
797	translate_traps,
798	linux_fixup,
799	linux_sendsig,
800	linux_sigcode,
801	&linux_szsigcode,
802	linux_prepsyscall,
803	"Linux a.out",
804	NULL,
805	exec_linux_imgact_try,
806	LINUX_MINSIGSTKSZ,
807	PAGE_SIZE,
808	VM_MIN_ADDRESS,
809	VM_MAXUSER_ADDRESS,
810	USRSTACK,
811	PS_STRINGS,
812	VM_PROT_ALL,
813	exec_copyout_strings,
814	exec_linux_setregs,
815	NULL
816};
817
818struct sysentvec elf_linux_sysvec = {
819	LINUX_SYS_MAXSYSCALL,
820	linux_sysent,
821	0xff,
822	LINUX_SIGTBLSZ,
823	bsd_to_linux_signal,
824	ELAST + 1,
825	bsd_to_linux_errno,
826	translate_traps,
827	elf_linux_fixup,
828	linux_sendsig,
829	linux_sigcode,
830	&linux_szsigcode,
831	linux_prepsyscall,
832	"Linux ELF",
833	elf32_coredump,
834	exec_linux_imgact_try,
835	LINUX_MINSIGSTKSZ,
836	PAGE_SIZE,
837	VM_MIN_ADDRESS,
838	VM_MAXUSER_ADDRESS,
839	USRSTACK,
840	PS_STRINGS,
841	VM_PROT_ALL,
842	exec_copyout_strings,
843	exec_linux_setregs,
844	NULL
845};
846
847static Elf32_Brandinfo linux_brand = {
848					ELFOSABI_LINUX,
849					EM_386,
850					"Linux",
851					"/compat/linux",
852					"/lib/ld-linux.so.1",
853					&elf_linux_sysvec,
854					NULL,
855				 };
856
857static Elf32_Brandinfo linux_glibc2brand = {
858					ELFOSABI_LINUX,
859					EM_386,
860					"Linux",
861					"/compat/linux",
862					"/lib/ld-linux.so.2",
863					&elf_linux_sysvec,
864					NULL,
865				 };
866
867Elf32_Brandinfo *linux_brandlist[] = {
868					&linux_brand,
869					&linux_glibc2brand,
870					NULL
871				};
872
873static int
874linux_elf_modevent(module_t mod, int type, void *data)
875{
876	Elf32_Brandinfo **brandinfo;
877	int error;
878	struct linux_ioctl_handler **lihp;
879
880	error = 0;
881
882	switch(type) {
883	case MOD_LOAD:
884		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
885		     ++brandinfo)
886			if (elf32_insert_brand_entry(*brandinfo) < 0)
887				error = EINVAL;
888		if (error == 0) {
889			SET_FOREACH(lihp, linux_ioctl_handler_set)
890				linux_ioctl_register_handler(*lihp);
891			if (bootverbose)
892				printf("Linux ELF exec handler installed\n");
893		} else
894			printf("cannot insert Linux ELF brand handler\n");
895		break;
896	case MOD_UNLOAD:
897		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
898		     ++brandinfo)
899			if (elf32_brand_inuse(*brandinfo))
900				error = EBUSY;
901		if (error == 0) {
902			for (brandinfo = &linux_brandlist[0];
903			     *brandinfo != NULL; ++brandinfo)
904				if (elf32_remove_brand_entry(*brandinfo) < 0)
905					error = EINVAL;
906		}
907		if (error == 0) {
908			SET_FOREACH(lihp, linux_ioctl_handler_set)
909				linux_ioctl_unregister_handler(*lihp);
910			if (bootverbose)
911				printf("Linux ELF exec handler removed\n");
912			linux_mib_destroy();
913		} else
914			printf("Could not deinstall ELF interpreter entry\n");
915		break;
916	default:
917		return EOPNOTSUPP;
918	}
919	return error;
920}
921
922static moduledata_t linux_elf_mod = {
923	"linuxelf",
924	linux_elf_modevent,
925	0
926};
927
928DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
929