linux_sysvec.c revision 141469
1/*-
 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 141469 2005-02-07 18:37:51Z jhb $");
31
32/* XXX we use functions that might not exist. */
33#include "opt_compat.h"
34
35#ifndef COMPAT_43
36#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/exec.h>
42#include <sys/imgact.h>
43#include <sys/imgact_aout.h>
44#include <sys/imgact_elf.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/module.h>
49#include <sys/mutex.h>
50#include <sys/proc.h>
51#include <sys/signalvar.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysent.h>
54#include <sys/sysproto.h>
55#include <sys/vnode.h>
56
57#include <vm/vm.h>
58#include <vm/pmap.h>
59#include <vm/vm_extern.h>
60#include <vm/vm_map.h>
61#include <vm/vm_object.h>
62#include <vm/vm_page.h>
63#include <vm/vm_param.h>
64
65#include <machine/cpu.h>
66#include <machine/md_var.h>
67#include <machine/pcb.h>
68
69#include <i386/linux/linux.h>
70#include <i386/linux/linux_proto.h>
71#include <compat/linux/linux_mib.h>
72#include <compat/linux/linux_signal.h>
73#include <compat/linux/linux_util.h>
74
/*
 * Module metadata: this file provides version 1 of the "linux" module,
 * which depends on the sysvmsg, sysvsem and sysvshm modules.
 */
MODULE_VERSION(linux, 1);
MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
MODULE_DEPEND(linux, sysvshm, 1, 1, 1);

/* malloc(9) type used for Linux emulation allocations. */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" read as a 16-bit value in host byte order.
 * exec_linux_imgact_try() compares the first short of the image header
 * against this value to recognize shell scripts.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/*
 * fldcw() loads the x87 control word from *addr; __LINUX_NPXCW__ is the
 * control word that exec_linux_setregs() loads for new Linux images.
 */
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
#define	__LINUX_NPXCW__		0x37f

/* Signal trampoline code and its size; both are defined outside this file. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux syscall table, defined outside this file. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Set of ioctl handlers (un)registered by linux_elf_modevent(). */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations for the sysentvec hooks defined below. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
		    u_long code);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);
117
118/*
119 * Linux syscalls return negative errno's, we do positive and map them
120 */
121static int bsd_to_linux_errno[ELAST + 1] = {
122	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
123	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
124	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
125	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
126	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
127	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
128	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
129	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
130	-6, -6, -43, -42, -75, -6, -84
131};
132
/*
 * FreeBSD -> Linux signal number translation.  Both sysentvecs below
 * install this table, and the sendsig routines index it with
 * _SIG_IDX(sig), so entry N describes FreeBSD signal N + 1.
 * A 0 entry marks a FreeBSD signal for which this table has no Linux
 * number.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
143
/*
 * Linux -> FreeBSD signal number translation; the reverse direction of
 * bsd_to_linux_signal[].  Not referenced in this file (it has external
 * linkage for use elsewhere).
 * NOTE(review): presumably indexed by Linux signal number minus one,
 * mirroring bsd_to_linux_signal[] -- confirm against the callers.
 * Some FreeBSD signals appear twice (SIGBUS, SIGURG), i.e. more than one
 * Linux signal is folded onto them.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
154
/* Linux trap number reported when a FreeBSD trap code has no mapping. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD code.
 * Use bsd_to_linux_trapcode() rather than indexing this directly.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0              */	LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT  */	6,
	/*  2              */	LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT     */	3,
	/*  4              */	LINUX_T_UNKNOWN,
	/*  5              */	LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP  */	16,
	/*  7 T_ASTFLT     */	254,
	/*  8              */	LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT    */	13,
	/* 10 T_TRCTRAP    */	1,
	/* 11              */	LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT    */	14,
	/* 13              */	LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT   */	17,
	/* 15              */	LINUX_T_UNKNOWN,
	/* 16              */	LINUX_T_UNKNOWN,
	/* 17              */	LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE     */	0,
	/* 19 T_NMI        */	2,
	/* 20 T_OFLOW      */	4,
	/* 21 T_BOUND      */	5,
	/* 22 T_DNA        */	7,
	/* 23 T_DOUBLEFLT  */	8,
	/* 24 T_FPOPFLT    */	9,
	/* 25 T_TSSFLT     */	10,
	/* 26 T_SEGNPFLT   */	11,
	/* 27 T_STKFLT     */	12,
	/* 28 T_MCHK       */	18,
	/* 29 T_XMMFLT     */	19,
	/* 30 T_RESERVED   */	15
};

/*
 * Translate a FreeBSD trap code; codes beyond the end of the table
 * yield LINUX_T_UNKNOWN.  Note that `code' is evaluated twice.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(_bsd_to_linux_trapcode[0]) ?				\
	_bsd_to_linux_trapcode[(code)] :				\
	LINUX_T_UNKNOWN)
193
194/*
195 * If FreeBSD & Linux have a difference of opinion about what a trap
196 * means, deal with it here.
197 *
198 * MPSAFE
199 */
200static int
201translate_traps(int signal, int trap_code)
202{
203	if (signal != SIGBUS)
204		return signal;
205	switch (trap_code) {
206	case T_PROTFLT:
207	case T_TSSFLT:
208	case T_DOUBLEFLT:
209	case T_PAGEFLT:
210		return SIGSEGV;
211	default:
212		return signal;
213	}
214}
215
216static int
217linux_fixup(register_t **stack_base, struct image_params *imgp)
218{
219	register_t *argv, *envp;
220
221	argv = *stack_base;
222	envp = *stack_base + (imgp->args->argc + 1);
223	(*stack_base)--;
224	**stack_base = (intptr_t)(void *)envp;
225	(*stack_base)--;
226	**stack_base = (intptr_t)(void *)argv;
227	(*stack_base)--;
228	**stack_base = imgp->args->argc;
229	return 0;
230}
231
232static int
233elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
234{
235	Elf32_Auxargs *args;
236	register_t *pos;
237
238	KASSERT(curthread->td_proc == imgp->proc &&
239	    (curthread->td_proc->p_flag & P_SA) == 0,
240	    ("unsafe elf_linux_fixup(), should be curproc"));
241	args = (Elf32_Auxargs *)imgp->auxargs;
242	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
243
244	if (args->trace)
245		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
246	if (args->execfd != -1)
247		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
248	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
249	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
250	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
251	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
252	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
253	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
254	AUXARGS_ENTRY(pos, AT_BASE, args->base);
255	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
256	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
257	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
258	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
259	AUXARGS_ENTRY(pos, AT_NULL, 0);
260
261	free(imgp->auxargs, M_TEMP);
262	imgp->auxargs = NULL;
263
264	(*stack_base)--;
265	**stack_base = (register_t)imgp->args->argc;
266	return 0;
267}
268
/* User code/data segment selectors and the size of the non-rt trampoline. */
extern int _ucodesel, _udatasel;
extern unsigned long linux_sznonrtsigcode;

/*
 * Deliver a signal to the current thread using a Linux rt signal frame
 * (struct l_rt_sigframe).  linux_sendsig() calls this for signals that
 * are members of ps_siginfo.
 *
 * catcher: user handler address stored in the frame.
 * sig:     signal number; translated through sv_sigtbl when one is set.
 * mask:    signal mask saved in the frame.
 * code:    trap code; stored as lsi_code and translated for sc_trapno.
 *
 * Locking: entered with the process lock and ps_mtx held (asserted).
 * Both are dropped while the frame is built and copied out, and both
 * are re-acquired before returning.
 */
static void
linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_rt_sigframe *fp, frame;
	int oonstack;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	/* Remember whether we were already on the alternate stack. */
	oonstack = sigonstack(regs->tf_esp);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
		    catcher, sig, (void*)mask, code);
#endif
	/*
	 * Allocate space for the signal handler context: at the top of
	 * the alternate stack when one is enabled, not already in use and
	 * requested for this signal; otherwise just below the current
	 * user stack pointer.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
	} else
		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
	/* psp is not consulted again below, so ps_mtx can go now. */
	mtx_unlock(&psp->ps_mtx);

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Start from a zeroed frame so unset fields are copied out as 0. */
	bzero(&frame, sizeof(frame));

	frame.sf_handler = catcher;
	frame.sf_sig = sig;
	/* These point into the user-space copy of the frame, not ours. */
	frame.sf_siginfo = &fp->sf_si;
	frame.sf_ucontext = &fp->sf_sc;

	/* Fill in POSIX parts */
	frame.sf_si.lsi_signo = sig;
	frame.sf_si.lsi_code = code;
	frame.sf_si.lsi_addr = (void *)regs->tf_err;

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
	frame.sf_sc.uc_link = NULL;		/* XXX ??? */

	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
	/* Dropped here; re-taken at the end of the function. */
	PROC_UNLOCK(p);

	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);

	/* Snapshot the interrupted register state from the trap frame. */
	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);

#ifdef DEBUG
	if (ldebug(rt_sendsig))
		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
#endif

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
#ifdef DEBUG
		if (ldebug(rt_sendsig))
			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
			    fp, oonstack);
#endif
		/* NOTE(review): sigexit() presumably does not return. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: stack pointer at the new
	 * frame, instruction pointer inside the signal trampoline (offset
	 * past the non-rt part by linux_sznonrtsigcode), and user
	 * segment selectors.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
	    linux_sznonrtsigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_VM);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Restore the lock state the caller handed us. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
393
394
/*
 * Send an interrupt to process.
 *
 * Builds a Linux (non-rt) signal frame, struct l_sigframe, on the user
 * stack and redirects the thread to the signal trampoline, which calls
 * the handler and then the sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the frame pointer, it returns
 * to the user specified pc, psl.
 *
 * Signals that are members of ps_siginfo are handed to
 * linux_rt_sendsig() instead.
 *
 * Locking: entered with the process lock and ps_mtx held (asserted).
 * Both are dropped while the frame is built and copied out, and both
 * are re-acquired before returning.
 */
static void
linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, sig, mask, code);
		return;
	}

	regs = td->td_frame;
	/* Remember whether we were already on the alternate stack. */
	oonstack = sigonstack(regs->tf_esp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context: at the top of
	 * the alternate stack when one is enabled, not already in use and
	 * requested for this signal; otherwise just below the current
	 * user stack pointer.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_esp - 1;
	/* Both locks are dropped here and re-taken at the end. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Start from a zeroed frame so unset fields are copied out as 0. */
	bzero(&frame, sizeof(frame));

	frame.sf_handler = catcher;
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = rgs();
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_edi;
	frame.sf_sc.sc_esi    = regs->tf_esi;
	frame.sf_sc.sc_ebp    = regs->tf_ebp;
	frame.sf_sc.sc_ebx    = regs->tf_ebx;
	frame.sf_sc.sc_edx    = regs->tf_edx;
	frame.sf_sc.sc_ecx    = regs->tf_ecx;
	frame.sf_sc.sc_eax    = regs->tf_eax;
	frame.sf_sc.sc_eip    = regs->tf_eip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_eflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* Mask words beyond the first live in sf_extramask[]. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		/* NOTE(review): sigexit() presumably does not return. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: stack pointer at the new
	 * frame, instruction pointer at the start of the signal
	 * trampoline, and user segment selectors.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~(PSL_T | PSL_VM);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Restore the lock state the caller handed us. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
509
510/*
511 * System call to cleanup state after a signal
512 * has been taken.  Reset signal mask and
513 * stack state from context left by sendsig (above).
514 * Return to previous pc and psl as specified by
515 * context left by sendsig. Check carefully to
516 * make sure that the user has not modified the
517 * psl to gain improper privileges or to cause
518 * a machine fault.
519 */
520int
521linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
522{
523	struct proc *p = td->td_proc;
524	struct l_sigframe frame;
525	struct trapframe *regs;
526	l_sigset_t lmask;
527	int eflags, i;
528
529	regs = td->td_frame;
530
531#ifdef DEBUG
532	if (ldebug(sigreturn))
533		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
534#endif
535	/*
536	 * The trampoline code hands us the sigframe.
537	 * It is unsafe to keep track of it ourselves, in the event that a
538	 * program jumps out of a signal handler.
539	 */
540	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
541		return (EFAULT);
542
543	/*
544	 * Check for security violations.
545	 */
546#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
547	eflags = frame.sf_sc.sc_eflags;
548	/*
549	 * XXX do allow users to change the privileged flag PSL_RF.  The
550	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
551	 * sometimes set it there too.  tf_eflags is kept in the signal
552	 * context during signal handling and there is no other place
553	 * to remember it, so the PSL_RF bit may be corrupted by the
554	 * signal handler without us knowing.  Corruption of the PSL_RF
555	 * bit at worst causes one more or one less debugger trap, so
556	 * allowing it is fairly harmless.
557	 */
558	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
559		return(EINVAL);
560
561	/*
562	 * Don't allow users to load a valid privileged %cs.  Let the
563	 * hardware check for invalid selectors, excess privilege in
564	 * other selectors, invalid %eip's and invalid %esp's.
565	 */
566#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
567	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
568		trapsignal(td, SIGBUS, T_PROTFLT);
569		return(EINVAL);
570	}
571
572	lmask.__bits[0] = frame.sf_sc.sc_mask;
573	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
574		lmask.__bits[i+1] = frame.sf_extramask[i];
575	PROC_LOCK(p);
576	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
577	SIG_CANTMASK(td->td_sigmask);
578	signotify(td);
579	PROC_UNLOCK(p);
580
581	/*
582	 * Restore signal context.
583	 */
584	/* %gs was restored by the trampoline. */
585	regs->tf_fs     = frame.sf_sc.sc_fs;
586	regs->tf_es     = frame.sf_sc.sc_es;
587	regs->tf_ds     = frame.sf_sc.sc_ds;
588	regs->tf_edi    = frame.sf_sc.sc_edi;
589	regs->tf_esi    = frame.sf_sc.sc_esi;
590	regs->tf_ebp    = frame.sf_sc.sc_ebp;
591	regs->tf_ebx    = frame.sf_sc.sc_ebx;
592	regs->tf_edx    = frame.sf_sc.sc_edx;
593	regs->tf_ecx    = frame.sf_sc.sc_ecx;
594	regs->tf_eax    = frame.sf_sc.sc_eax;
595	regs->tf_eip    = frame.sf_sc.sc_eip;
596	regs->tf_cs     = frame.sf_sc.sc_cs;
597	regs->tf_eflags = eflags;
598	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
599	regs->tf_ss     = frame.sf_sc.sc_ss;
600
601	return (EJUSTRETURN);
602}
603
604/*
605 * System call to cleanup state after a signal
606 * has been taken.  Reset signal mask and
607 * stack state from context left by rt_sendsig (above).
608 * Return to previous pc and psl as specified by
609 * context left by sendsig. Check carefully to
610 * make sure that the user has not modified the
611 * psl to gain improper privileges or to cause
612 * a machine fault.
613 */
614int
615linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
616{
617	struct proc *p = td->td_proc;
618	struct l_ucontext uc;
619	struct l_sigcontext *context;
620	l_stack_t *lss;
621	stack_t ss;
622	struct trapframe *regs;
623	int eflags;
624
625	regs = td->td_frame;
626
627#ifdef DEBUG
628	if (ldebug(rt_sigreturn))
629		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
630#endif
631	/*
632	 * The trampoline code hands us the ucontext.
633	 * It is unsafe to keep track of it ourselves, in the event that a
634	 * program jumps out of a signal handler.
635	 */
636	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
637		return (EFAULT);
638
639	context = &uc.uc_mcontext;
640
641	/*
642	 * Check for security violations.
643	 */
644#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
645	eflags = context->sc_eflags;
646	/*
647	 * XXX do allow users to change the privileged flag PSL_RF.  The
648	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
649	 * sometimes set it there too.  tf_eflags is kept in the signal
650	 * context during signal handling and there is no other place
651	 * to remember it, so the PSL_RF bit may be corrupted by the
652	 * signal handler without us knowing.  Corruption of the PSL_RF
653	 * bit at worst causes one more or one less debugger trap, so
654	 * allowing it is fairly harmless.
655	 */
656	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
657		return(EINVAL);
658
659	/*
660	 * Don't allow users to load a valid privileged %cs.  Let the
661	 * hardware check for invalid selectors, excess privilege in
662	 * other selectors, invalid %eip's and invalid %esp's.
663	 */
664#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
665	if (!CS_SECURE(context->sc_cs)) {
666		trapsignal(td, SIGBUS, T_PROTFLT);
667		return(EINVAL);
668	}
669
670	PROC_LOCK(p);
671	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
672	SIG_CANTMASK(td->td_sigmask);
673	signotify(td);
674	PROC_UNLOCK(p);
675
676	/*
677	 * Restore signal context
678	 */
679	/* %gs was restored by the trampoline. */
680	regs->tf_fs     = context->sc_fs;
681	regs->tf_es     = context->sc_es;
682	regs->tf_ds     = context->sc_ds;
683	regs->tf_edi    = context->sc_edi;
684	regs->tf_esi    = context->sc_esi;
685	regs->tf_ebp    = context->sc_ebp;
686	regs->tf_ebx    = context->sc_ebx;
687	regs->tf_edx    = context->sc_edx;
688	regs->tf_ecx    = context->sc_ecx;
689	regs->tf_eax    = context->sc_eax;
690	regs->tf_eip    = context->sc_eip;
691	regs->tf_cs     = context->sc_cs;
692	regs->tf_eflags = eflags;
693	regs->tf_esp    = context->sc_esp_at_signal;
694	regs->tf_ss     = context->sc_ss;
695
696	/*
697	 * call sigaltstack & ignore results..
698	 */
699	lss = &uc.uc_stack;
700	ss.ss_sp = lss->ss_sp;
701	ss.ss_size = lss->ss_size;
702	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
703
704#ifdef DEBUG
705	if (ldebug(rt_sigreturn))
706		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
707		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
708#endif
709	(void)kern_sigaltstack(td, &ss, NULL);
710
711	return (EJUSTRETURN);
712}
713
714/*
715 * MPSAFE
716 */
717static void
718linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
719{
720	args[0] = tf->tf_ebx;
721	args[1] = tf->tf_ecx;
722	args[2] = tf->tf_edx;
723	args[3] = tf->tf_esi;
724	args[4] = tf->tf_edi;
725	args[5] = tf->tf_ebp;	/* Unconfirmed */
726	*params = NULL;		/* no copyin */
727}
728
729/*
730 * If a linux binary is exec'ing something, try this image activator
731 * first.  We override standard shell script execution in order to
732 * be able to modify the interpreter path.  We only do this if a linux
733 * binary is doing the exec, so we do not create an EXEC module for it.
734 */
735static int	exec_linux_imgact_try(struct image_params *iparams);
736
737static int
738exec_linux_imgact_try(struct image_params *imgp)
739{
740    const char *head = (const char *)imgp->image_header;
741    char *rpath;
742    int error = -1, len;
743
744    /*
745     * The interpreter for shell scripts run from a linux binary needs
746     * to be located in /compat/linux if possible in order to recursively
747     * maintain linux path emulation.
748     */
749    if (((const short *)head)[0] == SHELLMAGIC) {
750	    /*
751	     * Run our normal shell image activator.  If it succeeds attempt
752	     * to use the alternate path for the interpreter.  If an alternate
753	     * path is found, use our stringspace to store it.
754	     */
755	    if ((error = exec_shell_imgact(imgp)) == 0) {
756		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
757			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
758		    if (rpath != NULL) {
759			    len = strlen(rpath) + 1;
760
761			    if (len <= MAXSHELLCMDLEN) {
762				    memcpy(imgp->interpreter_name, rpath, len);
763			    }
764			    free(rpath, M_TEMP);
765		    }
766	    }
767    }
768    return(error);
769}
770
771/*
772 * exec_setregs may initialize some registers differently than Linux
773 * does, thus potentially confusing Linux binaries. If necessary, we
774 * override the exec_setregs default(s) here.
775 */
776static void
777exec_linux_setregs(struct thread *td, u_long entry,
778		   u_long stack, u_long ps_strings)
779{
780	static const u_short control = __LINUX_NPXCW__;
781	struct pcb *pcb = td->td_pcb;
782
783	exec_setregs(td, entry, stack, ps_strings);
784
785	/* Linux sets %gs to 0, we default to _udatasel */
786	pcb->pcb_gs = 0; load_gs(0);
787
788	/* Linux sets the i387 to extended precision. */
789	fldcw(&control);
790}
791
/*
 * System-call vector for Linux a.out binaries.  The initializer is
 * positional, so the entries must stay in the member order of
 * struct sysentvec (declared outside this file).  It differs from
 * elf_linux_sysvec only in the fixup routine, the name string and the
 * NULL where the ELF vector has elf32_coredump.
 */
struct sysentvec linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* dimension of linux_sysent[] */
	linux_sysent,		/* Linux syscall table */
	0xff,
	LINUX_SIGTBLSZ,		/* dimension of bsd_to_linux_signal[] */
	bsd_to_linux_signal,	/* FreeBSD -> Linux signal numbers */
	ELAST + 1,		/* dimension of bsd_to_linux_errno[] */
	bsd_to_linux_errno,	/* FreeBSD -> Linux errno values */
	translate_traps,	/* SIGBUS/SIGSEGV trap adjustment */
	linux_fixup,		/* a.out stack fixup */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* size of the signal trampoline */
	linux_prepsyscall,	/* fetch syscall arguments from registers */
	"Linux a.out",
	NULL,			/* slot where the ELF vector has elf32_coredump */
	exec_linux_imgact_try,	/* "#!" script handling with path translation */
	LINUX_MINSIGSTKSZ,
	PAGE_SIZE,
	VM_MIN_ADDRESS,
	VM_MAXUSER_ADDRESS,
	USRSTACK,
	PS_STRINGS,
	VM_PROT_ALL,
	exec_copyout_strings,
	exec_linux_setregs,	/* initial register state for Linux images */
	NULL
};
820
/*
 * System-call vector for Linux ELF binaries; both brand entries below
 * point at it.  The initializer is positional, so the entries must
 * stay in the member order of struct sysentvec (declared outside this
 * file).  It differs from linux_sysvec only in the fixup routine, the
 * name string and the elf32_coredump entry.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* dimension of linux_sysent[] */
	linux_sysent,		/* Linux syscall table */
	0xff,
	LINUX_SIGTBLSZ,		/* dimension of bsd_to_linux_signal[] */
	bsd_to_linux_signal,	/* FreeBSD -> Linux signal numbers */
	ELAST + 1,		/* dimension of bsd_to_linux_errno[] */
	bsd_to_linux_errno,	/* FreeBSD -> Linux errno values */
	translate_traps,	/* SIGBUS/SIGSEGV trap adjustment */
	elf_linux_fixup,	/* ELF stack fixup (writes the aux vector) */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* size of the signal trampoline */
	linux_prepsyscall,	/* fetch syscall arguments from registers */
	"Linux ELF",
	elf32_coredump,		/* NULL in the a.out vector */
	exec_linux_imgact_try,	/* "#!" script handling with path translation */
	LINUX_MINSIGSTKSZ,
	PAGE_SIZE,
	VM_MIN_ADDRESS,
	VM_MAXUSER_ADDRESS,
	USRSTACK,
	PS_STRINGS,
	VM_PROT_ALL,
	exec_copyout_strings,
	exec_linux_setregs,	/* initial register state for Linux images */
	NULL
};
849
850static Elf32_Brandinfo linux_brand = {
851					ELFOSABI_LINUX,
852					EM_386,
853					"Linux",
854					"/compat/linux",
855					"/lib/ld-linux.so.1",
856					&elf_linux_sysvec,
857					NULL,
858				 };
859
860static Elf32_Brandinfo linux_glibc2brand = {
861					ELFOSABI_LINUX,
862					EM_386,
863					"Linux",
864					"/compat/linux",
865					"/lib/ld-linux.so.2",
866					&elf_linux_sysvec,
867					NULL,
868				 };
869
870Elf32_Brandinfo *linux_brandlist[] = {
871					&linux_brand,
872					&linux_glibc2brand,
873					NULL
874				};
875
876static int
877linux_elf_modevent(module_t mod, int type, void *data)
878{
879	Elf32_Brandinfo **brandinfo;
880	int error;
881	struct linux_ioctl_handler **lihp;
882
883	error = 0;
884
885	switch(type) {
886	case MOD_LOAD:
887		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
888		     ++brandinfo)
889			if (elf32_insert_brand_entry(*brandinfo) < 0)
890				error = EINVAL;
891		if (error == 0) {
892			SET_FOREACH(lihp, linux_ioctl_handler_set)
893				linux_ioctl_register_handler(*lihp);
894			if (bootverbose)
895				printf("Linux ELF exec handler installed\n");
896		} else
897			printf("cannot insert Linux ELF brand handler\n");
898		break;
899	case MOD_UNLOAD:
900		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
901		     ++brandinfo)
902			if (elf32_brand_inuse(*brandinfo))
903				error = EBUSY;
904		if (error == 0) {
905			for (brandinfo = &linux_brandlist[0];
906			     *brandinfo != NULL; ++brandinfo)
907				if (elf32_remove_brand_entry(*brandinfo) < 0)
908					error = EINVAL;
909		}
910		if (error == 0) {
911			SET_FOREACH(lihp, linux_ioctl_handler_set)
912				linux_ioctl_unregister_handler(*lihp);
913			if (bootverbose)
914				printf("Linux ELF exec handler removed\n");
915			linux_mib_destroy();
916		} else
917			printf("Could not deinstall ELF interpreter entry\n");
918		break;
919	default:
920		return EOPNOTSUPP;
921	}
922	return error;
923}
924
/*
 * Module description: the module is named "linuxelf", its events are
 * handled by linux_elf_modevent(), and the last initializer (passed to
 * the handler's `data' argument, presumably -- confirm against
 * moduledata_t) is 0.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Register the module at the SI_SUB_EXEC stage, with SI_ORDER_ANY. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
932