linux_sysvec.c revision 177145
10SN/A/*-
 * Copyright (c) 1994-1996 Søren Schmidt
30SN/A * All rights reserved.
40SN/A *
50SN/A * Redistribution and use in source and binary forms, with or without
60SN/A * modification, are permitted provided that the following conditions
7157SN/A * are met:
80SN/A * 1. Redistributions of source code must retain the above copyright
9157SN/A *    notice, this list of conditions and the following disclaimer
100SN/A *    in this position and unchanged.
110SN/A * 2. Redistributions in binary form must reproduce the above copyright
120SN/A *    notice, this list of conditions and the following disclaimer in the
130SN/A *    documentation and/or other materials provided with the distribution.
140SN/A * 3. The name of the author may not be used to endorse or promote products
150SN/A *    derived from this software without specific prior written permission
160SN/A *
170SN/A * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
180SN/A * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
190SN/A * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
200SN/A * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21157SN/A * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22157SN/A * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23157SN/A * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
240SN/A * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
250SN/A * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
260SN/A * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
270SN/A */
280SN/A
290SN/A#include <sys/cdefs.h>
300SN/A__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 177145 2008-03-13 10:54:38Z kib $");
310SN/A
320SN/A#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/exec.h>
35#include <sys/imgact.h>
36#include <sys/imgact_aout.h>
37#include <sys/imgact_elf.h>
38#include <sys/kernel.h>
39#include <sys/lock.h>
40#include <sys/malloc.h>
41#include <sys/module.h>
42#include <sys/mutex.h>
43#include <sys/proc.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysent.h>
47#include <sys/sysproto.h>
48#include <sys/vnode.h>
49#include <sys/eventhandler.h>
50
51#include <vm/vm.h>
52#include <vm/pmap.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_map.h>
55#include <vm/vm_object.h>
56#include <vm/vm_page.h>
57#include <vm/vm_param.h>
58
59#include <machine/cpu.h>
60#include <machine/md_var.h>
61#include <machine/pcb.h>
62
63#include <i386/linux/linux.h>
64#include <i386/linux/linux_proto.h>
65#include <compat/linux/linux_emul.h>
66#include <compat/linux/linux_mib.h>
67#include <compat/linux/linux_signal.h>
68#include <compat/linux/linux_util.h>
69
/* Declare version 1 of the "linux" module. */
MODULE_VERSION(linux, 1);

/* malloc type for Linux-emulation ("Linux mode") structures. */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" as they appear when the first two bytes of an
 * image header are read as one 16-bit integer; the value therefore
 * depends on byte order.  Compared against in exec_linux_imgact_try().
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/*
 * fldcw(addr) executes the x86 "fldcw" instruction on the object addr
 * points to.  __LINUX_NPXCW__ is the value exec_linux_setregs() loads
 * with it for every freshly exec'ed Linux image.
 */
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
#define	__LINUX_NPXCW__		0x37f

/* Signal trampoline code and its size; both are defined outside this file. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table, defined outside this file. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Handler sets walked by linux_elf_modevent() on load and unload. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations for the callbacks placed in the sysentvecs below. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);

/* Futex bookkeeping; initialized and torn down by linux_elf_modevent(). */
extern LIST_HEAD(futex_list, futex) futex_list;
extern struct sx futex_sx;

/* Tags returned by EVENTHANDLER_REGISTER(), kept for deregistration. */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_schedtail_tag;
static eventhandler_tag linux_exec_tag;
116
/*
 * Linux syscalls return negative errno's, we do positive and map them
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 *
 * The table has one slot per FreeBSD errno value (0 .. ELAST); each slot
 * holds the Linux errno to report, already negated.  Slots whose value
 * is not simply the negated index are errnos numbered differently on
 * the two systems (for example index 11 yields -35 and index 35 yields
 * -11).  Indices 72-76 and 80-81 all collapse onto -6.
 *
 * Both sysentvecs in this file pass this table together with its size,
 * ELAST + 1.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
136
/*
 * BSD -> Linux signal number translation.  The sendsig routines in this
 * file index it with _SIG_IDX(sig) (through the sysentvec's sv_sigtbl),
 * so slot 0 corresponds to the first BSD signal.  A 0 entry means no
 * Linux signal is listed for that BSD signal.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};
147
/*
 * Linux -> BSD signal number translation; every entry is a BSD signal.
 * Nothing in this file reads the table.
 * NOTE(review): by symmetry with bsd_to_linux_signal[] the index is
 * presumably the Linux signal number minus one -- confirm against the
 * users of this table elsewhere in the Linux emulation code.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
158
/*
 * Translation of FreeBSD trap numbers (the T_* value is the array index)
 * into the trap number stored in the sc_trapno field of the Linux signal
 * context.  Traps without a listed counterpart yield LINUX_T_UNKNOWN.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};
/*
 * Bounds-checked lookup in the table above: indices past the end of the
 * table produce LINUX_T_UNKNOWN.  The argument is evaluated twice, so it
 * must be free of side effects.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
197
198/*
199 * If FreeBSD & Linux have a difference of opinion about what a trap
200 * means, deal with it here.
201 *
202 * MPSAFE
203 */
204static int
205translate_traps(int signal, int trap_code)
206{
207	if (signal != SIGBUS)
208		return signal;
209	switch (trap_code) {
210	case T_PROTFLT:
211	case T_TSSFLT:
212	case T_DOUBLEFLT:
213	case T_PAGEFLT:
214		return SIGSEGV;
215	default:
216		return signal;
217	}
218}
219
220static int
221linux_fixup(register_t **stack_base, struct image_params *imgp)
222{
223	register_t *argv, *envp;
224
225	argv = *stack_base;
226	envp = *stack_base + (imgp->args->argc + 1);
227	(*stack_base)--;
228	**stack_base = (intptr_t)(void *)envp;
229	(*stack_base)--;
230	**stack_base = (intptr_t)(void *)argv;
231	(*stack_base)--;
232	**stack_base = imgp->args->argc;
233	return 0;
234}
235
236static int
237elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
238{
239	Elf32_Auxargs *args;
240	register_t *pos;
241
242	KASSERT(curthread->td_proc == imgp->proc,
243	    ("unsafe elf_linux_fixup(), should be curproc"));
244	args = (Elf32_Auxargs *)imgp->auxargs;
245	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
246
247	if (args->trace)
248		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
249	if (args->execfd != -1)
250		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
251	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
252	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
253	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
254	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
255	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
256	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
257	AUXARGS_ENTRY(pos, AT_BASE, args->base);
258	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
259	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
260	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
261	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
262	AUXARGS_ENTRY(pos, AT_NULL, 0);
263
264	free(imgp->auxargs, M_TEMP);
265	imgp->auxargs = NULL;
266
267	(*stack_base)--;
268	**stack_base = (register_t)imgp->args->argc;
269	return 0;
270}
271
272extern int _ucodesel, _udatasel;
273extern unsigned long linux_sznonrtsigcode;
274
275static void
276linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
277{
278	struct thread *td = curthread;
279	struct proc *p = td->td_proc;
280	struct sigacts *psp;
281	struct trapframe *regs;
282	struct l_rt_sigframe *fp, frame;
283	int sig, code;
284	int oonstack;
285
286	sig = ksi->ksi_signo;
287	code = ksi->ksi_code;
288	PROC_LOCK_ASSERT(p, MA_OWNED);
289	psp = p->p_sigacts;
290	mtx_assert(&psp->ps_mtx, MA_OWNED);
291	regs = td->td_frame;
292	oonstack = sigonstack(regs->tf_esp);
293
294#ifdef DEBUG
295	if (ldebug(rt_sendsig))
296		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
297		    catcher, sig, (void*)mask, code);
298#endif
299	/*
300	 * Allocate space for the signal handler context.
301	 */
302	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
303	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
304		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
305		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
306	} else
307		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
308	mtx_unlock(&psp->ps_mtx);
309
310	/*
311	 * Build the argument list for the signal handler.
312	 */
313	if (p->p_sysent->sv_sigtbl)
314		if (sig <= p->p_sysent->sv_sigsize)
315			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
316
317	bzero(&frame, sizeof(frame));
318
319	frame.sf_handler = catcher;
320	frame.sf_sig = sig;
321	frame.sf_siginfo = &fp->sf_si;
322	frame.sf_ucontext = &fp->sf_sc;
323
324	/* Fill in POSIX parts */
325	frame.sf_si.lsi_signo = sig;
326	frame.sf_si.lsi_code = code;
327	frame.sf_si.lsi_addr = ksi->ksi_addr;
328
329	/*
330	 * Build the signal context to be used by sigreturn.
331	 */
332	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
333	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
334
335	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
336	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
337	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
338	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
339	PROC_UNLOCK(p);
340
341	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
342
343	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
344	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
345	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
346	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
347	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
348	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
349	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
350	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
351	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
352	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
353	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
354	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
355	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
356	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
357	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
358	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
359	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
360	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
361	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
362	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
363
364#ifdef DEBUG
365	if (ldebug(rt_sendsig))
366		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
367		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
368		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
369#endif
370
371	if (copyout(&frame, fp, sizeof(frame)) != 0) {
372		/*
373		 * Process has trashed its stack; give it an illegal
374		 * instruction to halt it in its tracks.
375		 */
376#ifdef DEBUG
377		if (ldebug(rt_sendsig))
378			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
379			    fp, oonstack);
380#endif
381		PROC_LOCK(p);
382		sigexit(td, SIGILL);
383	}
384
385	/*
386	 * Build context to run handler in.
387	 */
388	regs->tf_esp = (int)fp;
389	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
390	    linux_sznonrtsigcode;
391	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
392	regs->tf_cs = _ucodesel;
393	regs->tf_ds = _udatasel;
394	regs->tf_es = _udatasel;
395	regs->tf_fs = _udatasel;
396	regs->tf_ss = _udatasel;
397	PROC_LOCK(p);
398	mtx_lock(&psp->ps_mtx);
399}
400
401
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * in u. to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * Handlers installed with SA_SIGINFO are handed to linux_rt_sendsig();
 * everything else gets a struct l_sigframe built here.
 *
 * Locking: entered with the process lock and p->p_sigacts->ps_mtx held
 * (both asserted).  Both are dropped while the frame is assembled and
 * copied out and are re-acquired before returning.  If the copyout
 * fails the process is terminated through sigexit(td, SIGILL).
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int sig, code;
	int oonstack, i;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Top of the alternate signal stack. */
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		/* One frame below the current user stack pointer. */
		fp = (struct l_sigframe *)regs->tf_esp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 */
	/* Translate the BSD signal number to its Linux value. */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = catcher;
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = rgs();
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_edi;
	frame.sf_sc.sc_esi    = regs->tf_esi;
	frame.sf_sc.sc_ebp    = regs->tf_ebp;
	frame.sf_sc.sc_ebx    = regs->tf_ebx;
	frame.sf_sc.sc_edx    = regs->tf_edx;
	frame.sf_sc.sc_ecx    = regs->tf_ecx;
	frame.sf_sc.sc_eax    = regs->tf_eax;
	frame.sf_sc.sc_eip    = regs->tf_eip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_eflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);

	/* The first mask word went into sc_mask; store the remaining ones. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.
	 */
	regs->tf_esp = (int)fp;
	/* Resume at the start of the signal trampoline below PS_STRINGS. */
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Restore the lock state the caller handed us. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
519
/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * Returns EFAULT if the frame at args->sfp cannot be copied in, EINVAL
 * if the saved eflags or %cs fail the checks below (a bad %cs
 * additionally posts SIGBUS), and EJUSTRETURN once the trapframe has
 * been rewritten from the saved context.
 */
int
linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_sigframe frame;
	struct trapframe *regs;
	l_sigset_t lmask;
	int eflags, i;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(sigreturn))
		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
#endif
	/*
	 * The trampoline code hands us the sigframe.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
		return (EFAULT);

	/*
	 * Check for security violations.
	 */
	/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = frame.sf_sc.sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Reassemble the Linux mask from sc_mask and the extra words. */
	lmask.__bits[0] = frame.sf_sc.sc_mask;
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		lmask.__bits[i+1] = frame.sf_extramask[i];
	/* Install it as the thread's signal mask under the process lock. */
	PROC_LOCK(p);
	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context.
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = frame.sf_sc.sc_fs;
	regs->tf_es     = frame.sf_sc.sc_es;
	regs->tf_ds     = frame.sf_sc.sc_ds;
	regs->tf_edi    = frame.sf_sc.sc_edi;
	regs->tf_esi    = frame.sf_sc.sc_esi;
	regs->tf_ebp    = frame.sf_sc.sc_ebp;
	regs->tf_ebx    = frame.sf_sc.sc_ebx;
	regs->tf_edx    = frame.sf_sc.sc_edx;
	regs->tf_ecx    = frame.sf_sc.sc_ecx;
	regs->tf_eax    = frame.sf_sc.sc_eax;
	regs->tf_eip    = frame.sf_sc.sc_eip;
	regs->tf_cs     = frame.sf_sc.sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
	regs->tf_ss     = frame.sf_sc.sc_ss;

	return (EJUSTRETURN);
}
619
/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by rt_sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 *
 * Returns EFAULT if the ucontext at args->ucp cannot be copied in,
 * EINVAL if the saved eflags or %cs fail the checks below (a bad %cs
 * additionally posts SIGBUS), and EJUSTRETURN once the trapframe has
 * been rewritten from the saved context.  The alternate signal stack
 * settings stored in the ucontext are re-applied as a last step; a
 * failure there is ignored.
 */
int
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_ucontext uc;
	struct l_sigcontext *context;
	l_stack_t *lss;
	stack_t ss;
	struct trapframe *regs;
	int eflags;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
#endif
	/*
	 * The trampoline code hands us the ucontext.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
		return (EFAULT);

	context = &uc.uc_mcontext;

	/*
	 * Check for security violations.
	 */
	/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = context->sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(context->sc_cs)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Install the saved mask as the thread's signal mask. */
	PROC_LOCK(p);
	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = context->sc_fs;
	regs->tf_es     = context->sc_es;
	regs->tf_ds     = context->sc_ds;
	regs->tf_edi    = context->sc_edi;
	regs->tf_esi    = context->sc_esi;
	regs->tf_ebp    = context->sc_ebp;
	regs->tf_ebx    = context->sc_ebx;
	regs->tf_edx    = context->sc_edx;
	regs->tf_ecx    = context->sc_ecx;
	regs->tf_eax    = context->sc_eax;
	regs->tf_eip    = context->sc_eip;
	regs->tf_cs     = context->sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = context->sc_esp_at_signal;
	regs->tf_ss     = context->sc_ss;

	/*
	 * call sigaltstack & ignore results..
	 */
	/* Convert the Linux stack description into a native stack_t. */
	lss = &uc.uc_stack;
	ss.ss_sp = lss->ss_sp;
	ss.ss_size = lss->ss_size;
	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
#endif
	(void)kern_sigaltstack(td, &ss, NULL);

	return (EJUSTRETURN);
}
735
736/*
737 * MPSAFE
738 */
739static void
740linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
741{
742	args[0] = tf->tf_ebx;
743	args[1] = tf->tf_ecx;
744	args[2] = tf->tf_edx;
745	args[3] = tf->tf_esi;
746	args[4] = tf->tf_edi;
747	args[5] = tf->tf_ebp;	/* Unconfirmed */
748	*params = NULL;		/* no copyin */
749}
750
751/*
752 * If a linux binary is exec'ing something, try this image activator
753 * first.  We override standard shell script execution in order to
754 * be able to modify the interpreter path.  We only do this if a linux
755 * binary is doing the exec, so we do not create an EXEC module for it.
756 */
757static int	exec_linux_imgact_try(struct image_params *iparams);
758
759static int
760exec_linux_imgact_try(struct image_params *imgp)
761{
762    const char *head = (const char *)imgp->image_header;
763    char *rpath;
764    int error = -1, len;
765
766    /*
767     * The interpreter for shell scripts run from a linux binary needs
768     * to be located in /compat/linux if possible in order to recursively
769     * maintain linux path emulation.
770     */
771    if (((const short *)head)[0] == SHELLMAGIC) {
772	    /*
773	     * Run our normal shell image activator.  If it succeeds attempt
774	     * to use the alternate path for the interpreter.  If an alternate
775	     * path is found, use our stringspace to store it.
776	     */
777	    if ((error = exec_shell_imgact(imgp)) == 0) {
778		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
779			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
780		    if (rpath != NULL) {
781			    len = strlen(rpath) + 1;
782
783			    if (len <= MAXSHELLCMDLEN) {
784				    memcpy(imgp->interpreter_name, rpath, len);
785			    }
786			    free(rpath, M_TEMP);
787		    }
788	    }
789    }
790    return(error);
791}
792
793/*
794 * exec_setregs may initialize some registers differently than Linux
795 * does, thus potentially confusing Linux binaries. If necessary, we
796 * override the exec_setregs default(s) here.
797 */
798static void
799exec_linux_setregs(struct thread *td, u_long entry,
800		   u_long stack, u_long ps_strings)
801{
802	static const u_short control = __LINUX_NPXCW__;
803	struct pcb *pcb = td->td_pcb;
804
805	exec_setregs(td, entry, stack, ps_strings);
806
807	/* Linux sets %gs to 0, we default to _udatasel */
808	pcb->pcb_gs = 0; load_gs(0);
809
810	/* Linux sets the i387 to extended precision. */
811	fldcw(&control);
812}
813
814struct sysentvec linux_sysvec = {
815	LINUX_SYS_MAXSYSCALL,
816	linux_sysent,
817	0,
818	LINUX_SIGTBLSZ,
819	bsd_to_linux_signal,
820	ELAST + 1,
821	bsd_to_linux_errno,
822	translate_traps,
823	linux_fixup,
824	linux_sendsig,
825	linux_sigcode,
826	&linux_szsigcode,
827	linux_prepsyscall,
828	"Linux a.out",
829	NULL,
830	exec_linux_imgact_try,
831	LINUX_MINSIGSTKSZ,
832	PAGE_SIZE,
833	VM_MIN_ADDRESS,
834	VM_MAXUSER_ADDRESS,
835	USRSTACK,
836	PS_STRINGS,
837	VM_PROT_ALL,
838	exec_copyout_strings,
839	exec_linux_setregs,
840	NULL
841};
842
843struct sysentvec elf_linux_sysvec = {
844	LINUX_SYS_MAXSYSCALL,
845	linux_sysent,
846	0,
847	LINUX_SIGTBLSZ,
848	bsd_to_linux_signal,
849	ELAST + 1,
850	bsd_to_linux_errno,
851	translate_traps,
852	elf_linux_fixup,
853	linux_sendsig,
854	linux_sigcode,
855	&linux_szsigcode,
856	linux_prepsyscall,
857	"Linux ELF",
858	elf32_coredump,
859	exec_linux_imgact_try,
860	LINUX_MINSIGSTKSZ,
861	PAGE_SIZE,
862	VM_MIN_ADDRESS,
863	VM_MAXUSER_ADDRESS,
864	USRSTACK,
865	PS_STRINGS,
866	VM_PROT_ALL,
867	exec_copyout_strings,
868	exec_linux_setregs,
869	NULL
870};
871
872static Elf32_Brandinfo linux_brand = {
873					ELFOSABI_LINUX,
874					EM_386,
875					"Linux",
876					"/compat/linux",
877					"/lib/ld-linux.so.1",
878					&elf_linux_sysvec,
879					NULL,
880					BI_CAN_EXEC_DYN,
881				 };
882
883static Elf32_Brandinfo linux_glibc2brand = {
884					ELFOSABI_LINUX,
885					EM_386,
886					"Linux",
887					"/compat/linux",
888					"/lib/ld-linux.so.2",
889					&elf_linux_sysvec,
890					NULL,
891					BI_CAN_EXEC_DYN,
892				 };
893
894Elf32_Brandinfo *linux_brandlist[] = {
895					&linux_brand,
896					&linux_glibc2brand,
897					NULL
898				};
899
900static int
901linux_elf_modevent(module_t mod, int type, void *data)
902{
903	Elf32_Brandinfo **brandinfo;
904	int error;
905	struct linux_ioctl_handler **lihp;
906	struct linux_device_handler **ldhp;
907
908	error = 0;
909
910	switch(type) {
911	case MOD_LOAD:
912		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
913		     ++brandinfo)
914			if (elf32_insert_brand_entry(*brandinfo) < 0)
915				error = EINVAL;
916		if (error == 0) {
917			SET_FOREACH(lihp, linux_ioctl_handler_set)
918				linux_ioctl_register_handler(*lihp);
919			SET_FOREACH(ldhp, linux_device_handler_set)
920				linux_device_register_handler(*ldhp);
921			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
922			sx_init(&emul_shared_lock, "emuldata->shared lock");
923			LIST_INIT(&futex_list);
924			sx_init(&futex_sx, "futex protection lock");
925			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
926			      NULL, 1000);
927			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
928			      NULL, 1000);
929			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
930			      NULL, 1000);
931			if (bootverbose)
932				printf("Linux ELF exec handler installed\n");
933		} else
934			printf("cannot insert Linux ELF brand handler\n");
935		break;
936	case MOD_UNLOAD:
937		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
938		     ++brandinfo)
939			if (elf32_brand_inuse(*brandinfo))
940				error = EBUSY;
941		if (error == 0) {
942			for (brandinfo = &linux_brandlist[0];
943			     *brandinfo != NULL; ++brandinfo)
944				if (elf32_remove_brand_entry(*brandinfo) < 0)
945					error = EINVAL;
946		}
947		if (error == 0) {
948			SET_FOREACH(lihp, linux_ioctl_handler_set)
949				linux_ioctl_unregister_handler(*lihp);
950			SET_FOREACH(ldhp, linux_device_handler_set)
951				linux_device_unregister_handler(*ldhp);
952			mtx_destroy(&emul_lock);
953			sx_destroy(&emul_shared_lock);
954			sx_destroy(&futex_sx);
955			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
956			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
957			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
958			if (bootverbose)
959				printf("Linux ELF exec handler removed\n");
960		} else
961			printf("Could not deinstall ELF interpreter entry\n");
962		break;
963	default:
964		return EOPNOTSUPP;
965	}
966	return error;
967}
968
969static moduledata_t linux_elf_mod = {
970	"linuxelf",
971	linux_elf_modevent,
972	0
973};
974
975DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
976