linux_sysvec.c revision 177997
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 177997 2008-04-08 09:45:49Z kib $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/exec.h>
35#include <sys/fcntl.h>
36#include <sys/imgact.h>
37#include <sys/imgact_aout.h>
38#include <sys/imgact_elf.h>
39#include <sys/kernel.h>
40#include <sys/lock.h>
41#include <sys/malloc.h>
42#include <sys/module.h>
43#include <sys/mutex.h>
44#include <sys/proc.h>
45#include <sys/signalvar.h>
46#include <sys/syscallsubr.h>
47#include <sys/sysent.h>
48#include <sys/sysproto.h>
49#include <sys/vnode.h>
50#include <sys/eventhandler.h>
51
52#include <vm/vm.h>
53#include <vm/pmap.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57#include <vm/vm_page.h>
58#include <vm/vm_param.h>
59
60#include <machine/cpu.h>
61#include <machine/md_var.h>
62#include <machine/pcb.h>
63
64#include <i386/linux/linux.h>
65#include <i386/linux/linux_proto.h>
66#include <compat/linux/linux_emul.h>
67#include <compat/linux/linux_mib.h>
68#include <compat/linux/linux_signal.h>
69#include <compat/linux/linux_util.h>
70
71MODULE_VERSION(linux, 1);
72
73MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
74
75#if BYTE_ORDER == LITTLE_ENDIAN
76#define SHELLMAGIC      0x2123 /* #! */
77#else
78#define SHELLMAGIC      0x2321
79#endif
80
81/*
82 * Allow the sendsig functions to use the ldebug() facility
83 * even though they are not syscalls themselves. Map them
84 * to syscall 0. This is slightly less bogus than using
85 * ldebug(sigreturn).
86 */
87#define	LINUX_SYS_linux_rt_sendsig	0
88#define	LINUX_SYS_linux_sendsig		0
89
90#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
91#define	__LINUX_NPXCW__		0x37f
92
93extern char linux_sigcode[];
94extern int linux_szsigcode;
95
96extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
97
98SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
99SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
100
101static int	linux_fixup(register_t **stack_base,
102		    struct image_params *iparams);
103static int	elf_linux_fixup(register_t **stack_base,
104		    struct image_params *iparams);
105static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
106		    caddr_t *params);
107static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
108static void	exec_linux_setregs(struct thread *td, u_long entry,
109				   u_long stack, u_long ps_strings);
110
111extern LIST_HEAD(futex_list, futex) futex_list;
112extern struct sx futex_sx;
113
114static eventhandler_tag linux_exit_tag;
115static eventhandler_tag linux_schedtail_tag;
116static eventhandler_tag linux_exec_tag;
117
118/*
119 * Linux syscalls return negative errno's, we do positive and map them
120 * Reference:
121 *   FreeBSD: src/sys/sys/errno.h
122 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
123 *            linux-2.6.17.8/include/asm-generic/errno.h
124 */
125static int bsd_to_linux_errno[ELAST + 1] = {
126	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
127	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
128	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
129	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
130	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
131	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
132	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
133	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
134	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
135	 -72, -67, -71
136};
137
138int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
139	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
140	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
141	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
142	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
143	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
144	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
145	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
146	0, LINUX_SIGUSR1, LINUX_SIGUSR2
147};
148
149int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
150	SIGHUP, SIGINT, SIGQUIT, SIGILL,
151	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
152	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
153	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
154	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
155	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
156	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
157	SIGIO, SIGURG, SIGSYS
158};
159
/* Value reported for any FreeBSD trap number that has no table entry. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap number -> Linux trap number, indexed by the FreeBSD
 * T_* value.  Slots without a T_* name map to LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0             */ LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT */ 6,
	/*  2             */ LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT    */ 3,
	/*  4             */ LINUX_T_UNKNOWN,
	/*  5             */ LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP */ 16,
	/*  7 T_ASTFLT    */ 254,
	/*  8             */ LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT   */ 13,
	/* 10 T_TRCTRAP   */ 1,
	/* 11             */ LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT   */ 14,
	/* 13             */ LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT  */ 17,
	/* 15             */ LINUX_T_UNKNOWN,
	/* 16             */ LINUX_T_UNKNOWN,
	/* 17             */ LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE    */ 0,
	/* 19 T_NMI       */ 2,
	/* 20 T_OFLOW     */ 4,
	/* 21 T_BOUND     */ 5,
	/* 22 T_DNA       */ 7,
	/* 23 T_DOUBLEFLT */ 8,
	/* 24 T_FPOPFLT   */ 9,
	/* 25 T_TSSFLT    */ 10,
	/* 26 T_SEGNPFLT  */ 11,
	/* 27 T_STKFLT    */ 12,
	/* 28 T_MCHK      */ 18,
	/* 29 T_XMMFLT    */ 19,
	/* 30 T_RESERVED  */ 15
};

/*
 * Bounds-checked lookup in _bsd_to_linux_trapcode[].  The comparison is
 * made against an unsigned element count, so a negative or too-large
 * code falls through to LINUX_T_UNKNOWN.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) < sizeof(_bsd_to_linux_trapcode) /				\
	sizeof(*_bsd_to_linux_trapcode) ?				\
     _bsd_to_linux_trapcode[(code)] :					\
     LINUX_T_UNKNOWN)
198
199/*
200 * If FreeBSD & Linux have a difference of opinion about what a trap
201 * means, deal with it here.
202 *
203 * MPSAFE
204 */
205static int
206translate_traps(int signal, int trap_code)
207{
208	if (signal != SIGBUS)
209		return signal;
210	switch (trap_code) {
211	case T_PROTFLT:
212	case T_TSSFLT:
213	case T_DOUBLEFLT:
214	case T_PAGEFLT:
215		return SIGSEGV;
216	default:
217		return signal;
218	}
219}
220
221static int
222linux_fixup(register_t **stack_base, struct image_params *imgp)
223{
224	register_t *argv, *envp;
225
226	argv = *stack_base;
227	envp = *stack_base + (imgp->args->argc + 1);
228	(*stack_base)--;
229	**stack_base = (intptr_t)(void *)envp;
230	(*stack_base)--;
231	**stack_base = (intptr_t)(void *)argv;
232	(*stack_base)--;
233	**stack_base = imgp->args->argc;
234	return 0;
235}
236
237static int
238elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
239{
240	Elf32_Auxargs *args;
241	register_t *pos;
242
243	KASSERT(curthread->td_proc == imgp->proc,
244	    ("unsafe elf_linux_fixup(), should be curproc"));
245	args = (Elf32_Auxargs *)imgp->auxargs;
246	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
247
248	if (args->trace)
249		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
250	if (args->execfd != -1)
251		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
252	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
253	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
254	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
255	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
256	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
257	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
258	AUXARGS_ENTRY(pos, AT_BASE, args->base);
259	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
260	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
261	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
262	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
263	AUXARGS_ENTRY(pos, AT_NULL, 0);
264
265	free(imgp->auxargs, M_TEMP);
266	imgp->auxargs = NULL;
267
268	(*stack_base)--;
269	**stack_base = (register_t)imgp->args->argc;
270	return 0;
271}
272
273extern int _ucodesel, _udatasel;
274extern unsigned long linux_sznonrtsigcode;
275
276static void
277linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
278{
279	struct thread *td = curthread;
280	struct proc *p = td->td_proc;
281	struct sigacts *psp;
282	struct trapframe *regs;
283	struct l_rt_sigframe *fp, frame;
284	int sig, code;
285	int oonstack;
286
287	sig = ksi->ksi_signo;
288	code = ksi->ksi_code;
289	PROC_LOCK_ASSERT(p, MA_OWNED);
290	psp = p->p_sigacts;
291	mtx_assert(&psp->ps_mtx, MA_OWNED);
292	regs = td->td_frame;
293	oonstack = sigonstack(regs->tf_esp);
294
295#ifdef DEBUG
296	if (ldebug(rt_sendsig))
297		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
298		    catcher, sig, (void*)mask, code);
299#endif
300	/*
301	 * Allocate space for the signal handler context.
302	 */
303	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
304	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
305		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
306		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
307	} else
308		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
309	mtx_unlock(&psp->ps_mtx);
310
311	/*
312	 * Build the argument list for the signal handler.
313	 */
314	if (p->p_sysent->sv_sigtbl)
315		if (sig <= p->p_sysent->sv_sigsize)
316			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
317
318	bzero(&frame, sizeof(frame));
319
320	frame.sf_handler = catcher;
321	frame.sf_sig = sig;
322	frame.sf_siginfo = &fp->sf_si;
323	frame.sf_ucontext = &fp->sf_sc;
324
325	/* Fill in POSIX parts */
326	frame.sf_si.lsi_signo = sig;
327	frame.sf_si.lsi_code = code;
328	frame.sf_si.lsi_addr = ksi->ksi_addr;
329
330	/*
331	 * Build the signal context to be used by sigreturn.
332	 */
333	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
334	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
335
336	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
337	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
338	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
339	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
340	PROC_UNLOCK(p);
341
342	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
343
344	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
345	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
346	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
347	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
348	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
349	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
350	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
351	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
352	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
353	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
354	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
355	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
356	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
357	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
358	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
359	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
360	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
361	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
362	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
363	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
364
365#ifdef DEBUG
366	if (ldebug(rt_sendsig))
367		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
368		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
369		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
370#endif
371
372	if (copyout(&frame, fp, sizeof(frame)) != 0) {
373		/*
374		 * Process has trashed its stack; give it an illegal
375		 * instruction to halt it in its tracks.
376		 */
377#ifdef DEBUG
378		if (ldebug(rt_sendsig))
379			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
380			    fp, oonstack);
381#endif
382		PROC_LOCK(p);
383		sigexit(td, SIGILL);
384	}
385
386	/*
387	 * Build context to run handler in.
388	 */
389	regs->tf_esp = (int)fp;
390	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
391	    linux_sznonrtsigcode;
392	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
393	regs->tf_cs = _ucodesel;
394	regs->tf_ds = _udatasel;
395	regs->tf_es = _udatasel;
396	regs->tf_fs = _udatasel;
397	regs->tf_ss = _udatasel;
398	PROC_LOCK(p);
399	mtx_lock(&psp->ps_mtx);
400}
401
402
403/*
404 * Send an interrupt to process.
405 *
406 * Stack is set up to allow sigcode stored
407 * in u. to call routine, followed by kcall
408 * to sigreturn routine below.  After sigreturn
409 * resets the signal mask, the stack, and the
410 * frame pointer, it returns to the user
411 * specified pc, psl.
412 */
413static void
414linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
415{
416	struct thread *td = curthread;
417	struct proc *p = td->td_proc;
418	struct sigacts *psp;
419	struct trapframe *regs;
420	struct l_sigframe *fp, frame;
421	l_sigset_t lmask;
422	int sig, code;
423	int oonstack, i;
424
425	PROC_LOCK_ASSERT(p, MA_OWNED);
426	psp = p->p_sigacts;
427	sig = ksi->ksi_signo;
428	code = ksi->ksi_code;
429	mtx_assert(&psp->ps_mtx, MA_OWNED);
430	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
431		/* Signal handler installed with SA_SIGINFO. */
432		linux_rt_sendsig(catcher, ksi, mask);
433		return;
434	}
435	regs = td->td_frame;
436	oonstack = sigonstack(regs->tf_esp);
437
438#ifdef DEBUG
439	if (ldebug(sendsig))
440		printf(ARGS(sendsig, "%p, %d, %p, %u"),
441		    catcher, sig, (void*)mask, code);
442#endif
443
444	/*
445	 * Allocate space for the signal handler context.
446	 */
447	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
448	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
449		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
450		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
451	} else
452		fp = (struct l_sigframe *)regs->tf_esp - 1;
453	mtx_unlock(&psp->ps_mtx);
454	PROC_UNLOCK(p);
455
456	/*
457	 * Build the argument list for the signal handler.
458	 */
459	if (p->p_sysent->sv_sigtbl)
460		if (sig <= p->p_sysent->sv_sigsize)
461			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
462
463	bzero(&frame, sizeof(frame));
464
465	frame.sf_handler = catcher;
466	frame.sf_sig = sig;
467
468	bsd_to_linux_sigset(mask, &lmask);
469
470	/*
471	 * Build the signal context to be used by sigreturn.
472	 */
473	frame.sf_sc.sc_mask   = lmask.__bits[0];
474	frame.sf_sc.sc_gs     = rgs();
475	frame.sf_sc.sc_fs     = regs->tf_fs;
476	frame.sf_sc.sc_es     = regs->tf_es;
477	frame.sf_sc.sc_ds     = regs->tf_ds;
478	frame.sf_sc.sc_edi    = regs->tf_edi;
479	frame.sf_sc.sc_esi    = regs->tf_esi;
480	frame.sf_sc.sc_ebp    = regs->tf_ebp;
481	frame.sf_sc.sc_ebx    = regs->tf_ebx;
482	frame.sf_sc.sc_edx    = regs->tf_edx;
483	frame.sf_sc.sc_ecx    = regs->tf_ecx;
484	frame.sf_sc.sc_eax    = regs->tf_eax;
485	frame.sf_sc.sc_eip    = regs->tf_eip;
486	frame.sf_sc.sc_cs     = regs->tf_cs;
487	frame.sf_sc.sc_eflags = regs->tf_eflags;
488	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
489	frame.sf_sc.sc_ss     = regs->tf_ss;
490	frame.sf_sc.sc_err    = regs->tf_err;
491	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
492	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
493
494	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
495		frame.sf_extramask[i] = lmask.__bits[i+1];
496
497	if (copyout(&frame, fp, sizeof(frame)) != 0) {
498		/*
499		 * Process has trashed its stack; give it an illegal
500		 * instruction to halt it in its tracks.
501		 */
502		PROC_LOCK(p);
503		sigexit(td, SIGILL);
504	}
505
506	/*
507	 * Build context to run handler in.
508	 */
509	regs->tf_esp = (int)fp;
510	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
511	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
512	regs->tf_cs = _ucodesel;
513	regs->tf_ds = _udatasel;
514	regs->tf_es = _udatasel;
515	regs->tf_fs = _udatasel;
516	regs->tf_ss = _udatasel;
517	PROC_LOCK(p);
518	mtx_lock(&psp->ps_mtx);
519}
520
521/*
522 * System call to cleanup state after a signal
523 * has been taken.  Reset signal mask and
524 * stack state from context left by sendsig (above).
525 * Return to previous pc and psl as specified by
526 * context left by sendsig. Check carefully to
527 * make sure that the user has not modified the
528 * psl to gain improper privileges or to cause
529 * a machine fault.
530 */
531int
532linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
533{
534	struct proc *p = td->td_proc;
535	struct l_sigframe frame;
536	struct trapframe *regs;
537	l_sigset_t lmask;
538	int eflags, i;
539	ksiginfo_t ksi;
540
541	regs = td->td_frame;
542
543#ifdef DEBUG
544	if (ldebug(sigreturn))
545		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
546#endif
547	/*
548	 * The trampoline code hands us the sigframe.
549	 * It is unsafe to keep track of it ourselves, in the event that a
550	 * program jumps out of a signal handler.
551	 */
552	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
553		return (EFAULT);
554
555	/*
556	 * Check for security violations.
557	 */
558#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
559	eflags = frame.sf_sc.sc_eflags;
560	/*
561	 * XXX do allow users to change the privileged flag PSL_RF.  The
562	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
563	 * sometimes set it there too.  tf_eflags is kept in the signal
564	 * context during signal handling and there is no other place
565	 * to remember it, so the PSL_RF bit may be corrupted by the
566	 * signal handler without us knowing.  Corruption of the PSL_RF
567	 * bit at worst causes one more or one less debugger trap, so
568	 * allowing it is fairly harmless.
569	 */
570	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
571		return(EINVAL);
572
573	/*
574	 * Don't allow users to load a valid privileged %cs.  Let the
575	 * hardware check for invalid selectors, excess privilege in
576	 * other selectors, invalid %eip's and invalid %esp's.
577	 */
578#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
579	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
580		ksiginfo_init_trap(&ksi);
581		ksi.ksi_signo = SIGBUS;
582		ksi.ksi_code = BUS_OBJERR;
583		ksi.ksi_trapno = T_PROTFLT;
584		ksi.ksi_addr = (void *)regs->tf_eip;
585		trapsignal(td, &ksi);
586		return(EINVAL);
587	}
588
589	lmask.__bits[0] = frame.sf_sc.sc_mask;
590	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
591		lmask.__bits[i+1] = frame.sf_extramask[i];
592	PROC_LOCK(p);
593	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
594	SIG_CANTMASK(td->td_sigmask);
595	signotify(td);
596	PROC_UNLOCK(p);
597
598	/*
599	 * Restore signal context.
600	 */
601	/* %gs was restored by the trampoline. */
602	regs->tf_fs     = frame.sf_sc.sc_fs;
603	regs->tf_es     = frame.sf_sc.sc_es;
604	regs->tf_ds     = frame.sf_sc.sc_ds;
605	regs->tf_edi    = frame.sf_sc.sc_edi;
606	regs->tf_esi    = frame.sf_sc.sc_esi;
607	regs->tf_ebp    = frame.sf_sc.sc_ebp;
608	regs->tf_ebx    = frame.sf_sc.sc_ebx;
609	regs->tf_edx    = frame.sf_sc.sc_edx;
610	regs->tf_ecx    = frame.sf_sc.sc_ecx;
611	regs->tf_eax    = frame.sf_sc.sc_eax;
612	regs->tf_eip    = frame.sf_sc.sc_eip;
613	regs->tf_cs     = frame.sf_sc.sc_cs;
614	regs->tf_eflags = eflags;
615	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
616	regs->tf_ss     = frame.sf_sc.sc_ss;
617
618	return (EJUSTRETURN);
619}
620
621/*
622 * System call to cleanup state after a signal
623 * has been taken.  Reset signal mask and
624 * stack state from context left by rt_sendsig (above).
625 * Return to previous pc and psl as specified by
626 * context left by sendsig. Check carefully to
627 * make sure that the user has not modified the
628 * psl to gain improper privileges or to cause
629 * a machine fault.
630 */
631int
632linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
633{
634	struct proc *p = td->td_proc;
635	struct l_ucontext uc;
636	struct l_sigcontext *context;
637	l_stack_t *lss;
638	stack_t ss;
639	struct trapframe *regs;
640	int eflags;
641	ksiginfo_t ksi;
642
643	regs = td->td_frame;
644
645#ifdef DEBUG
646	if (ldebug(rt_sigreturn))
647		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
648#endif
649	/*
650	 * The trampoline code hands us the ucontext.
651	 * It is unsafe to keep track of it ourselves, in the event that a
652	 * program jumps out of a signal handler.
653	 */
654	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
655		return (EFAULT);
656
657	context = &uc.uc_mcontext;
658
659	/*
660	 * Check for security violations.
661	 */
662#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
663	eflags = context->sc_eflags;
664	/*
665	 * XXX do allow users to change the privileged flag PSL_RF.  The
666	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
667	 * sometimes set it there too.  tf_eflags is kept in the signal
668	 * context during signal handling and there is no other place
669	 * to remember it, so the PSL_RF bit may be corrupted by the
670	 * signal handler without us knowing.  Corruption of the PSL_RF
671	 * bit at worst causes one more or one less debugger trap, so
672	 * allowing it is fairly harmless.
673	 */
674	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
675		return(EINVAL);
676
677	/*
678	 * Don't allow users to load a valid privileged %cs.  Let the
679	 * hardware check for invalid selectors, excess privilege in
680	 * other selectors, invalid %eip's and invalid %esp's.
681	 */
682#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
683	if (!CS_SECURE(context->sc_cs)) {
684		ksiginfo_init_trap(&ksi);
685		ksi.ksi_signo = SIGBUS;
686		ksi.ksi_code = BUS_OBJERR;
687		ksi.ksi_trapno = T_PROTFLT;
688		ksi.ksi_addr = (void *)regs->tf_eip;
689		trapsignal(td, &ksi);
690		return(EINVAL);
691	}
692
693	PROC_LOCK(p);
694	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
695	SIG_CANTMASK(td->td_sigmask);
696	signotify(td);
697	PROC_UNLOCK(p);
698
699	/*
700	 * Restore signal context
701	 */
702	/* %gs was restored by the trampoline. */
703	regs->tf_fs     = context->sc_fs;
704	regs->tf_es     = context->sc_es;
705	regs->tf_ds     = context->sc_ds;
706	regs->tf_edi    = context->sc_edi;
707	regs->tf_esi    = context->sc_esi;
708	regs->tf_ebp    = context->sc_ebp;
709	regs->tf_ebx    = context->sc_ebx;
710	regs->tf_edx    = context->sc_edx;
711	regs->tf_ecx    = context->sc_ecx;
712	regs->tf_eax    = context->sc_eax;
713	regs->tf_eip    = context->sc_eip;
714	regs->tf_cs     = context->sc_cs;
715	regs->tf_eflags = eflags;
716	regs->tf_esp    = context->sc_esp_at_signal;
717	regs->tf_ss     = context->sc_ss;
718
719	/*
720	 * call sigaltstack & ignore results..
721	 */
722	lss = &uc.uc_stack;
723	ss.ss_sp = lss->ss_sp;
724	ss.ss_size = lss->ss_size;
725	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
726
727#ifdef DEBUG
728	if (ldebug(rt_sigreturn))
729		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
730		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
731#endif
732	(void)kern_sigaltstack(td, &ss, NULL);
733
734	return (EJUSTRETURN);
735}
736
737/*
738 * MPSAFE
739 */
740static void
741linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
742{
743	args[0] = tf->tf_ebx;
744	args[1] = tf->tf_ecx;
745	args[2] = tf->tf_edx;
746	args[3] = tf->tf_esi;
747	args[4] = tf->tf_edi;
748	args[5] = tf->tf_ebp;	/* Unconfirmed */
749	*params = NULL;		/* no copyin */
750}
751
752/*
753 * If a linux binary is exec'ing something, try this image activator
754 * first.  We override standard shell script execution in order to
755 * be able to modify the interpreter path.  We only do this if a linux
756 * binary is doing the exec, so we do not create an EXEC module for it.
757 */
758static int	exec_linux_imgact_try(struct image_params *iparams);
759
760static int
761exec_linux_imgact_try(struct image_params *imgp)
762{
763    const char *head = (const char *)imgp->image_header;
764    char *rpath;
765    int error = -1, len;
766
767    /*
768     * The interpreter for shell scripts run from a linux binary needs
769     * to be located in /compat/linux if possible in order to recursively
770     * maintain linux path emulation.
771     */
772    if (((const short *)head)[0] == SHELLMAGIC) {
773	    /*
774	     * Run our normal shell image activator.  If it succeeds attempt
775	     * to use the alternate path for the interpreter.  If an alternate
776	     * path is found, use our stringspace to store it.
777	     */
778	    if ((error = exec_shell_imgact(imgp)) == 0) {
779		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
780			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD);
781		    if (rpath != NULL) {
782			    len = strlen(rpath) + 1;
783
784			    if (len <= MAXSHELLCMDLEN) {
785				    memcpy(imgp->interpreter_name, rpath, len);
786			    }
787			    free(rpath, M_TEMP);
788		    }
789	    }
790    }
791    return(error);
792}
793
794/*
795 * exec_setregs may initialize some registers differently than Linux
796 * does, thus potentially confusing Linux binaries. If necessary, we
797 * override the exec_setregs default(s) here.
798 */
799static void
800exec_linux_setregs(struct thread *td, u_long entry,
801		   u_long stack, u_long ps_strings)
802{
803	static const u_short control = __LINUX_NPXCW__;
804	struct pcb *pcb = td->td_pcb;
805
806	exec_setregs(td, entry, stack, ps_strings);
807
808	/* Linux sets %gs to 0, we default to _udatasel */
809	pcb->pcb_gs = 0; load_gs(0);
810
811	/* Linux sets the i387 to extended precision. */
812	fldcw(&control);
813}
814
815struct sysentvec linux_sysvec = {
816	LINUX_SYS_MAXSYSCALL,
817	linux_sysent,
818	0,
819	LINUX_SIGTBLSZ,
820	bsd_to_linux_signal,
821	ELAST + 1,
822	bsd_to_linux_errno,
823	translate_traps,
824	linux_fixup,
825	linux_sendsig,
826	linux_sigcode,
827	&linux_szsigcode,
828	linux_prepsyscall,
829	"Linux a.out",
830	NULL,
831	exec_linux_imgact_try,
832	LINUX_MINSIGSTKSZ,
833	PAGE_SIZE,
834	VM_MIN_ADDRESS,
835	VM_MAXUSER_ADDRESS,
836	USRSTACK,
837	PS_STRINGS,
838	VM_PROT_ALL,
839	exec_copyout_strings,
840	exec_linux_setregs,
841	NULL
842};
843
844struct sysentvec elf_linux_sysvec = {
845	LINUX_SYS_MAXSYSCALL,
846	linux_sysent,
847	0,
848	LINUX_SIGTBLSZ,
849	bsd_to_linux_signal,
850	ELAST + 1,
851	bsd_to_linux_errno,
852	translate_traps,
853	elf_linux_fixup,
854	linux_sendsig,
855	linux_sigcode,
856	&linux_szsigcode,
857	linux_prepsyscall,
858	"Linux ELF",
859	elf32_coredump,
860	exec_linux_imgact_try,
861	LINUX_MINSIGSTKSZ,
862	PAGE_SIZE,
863	VM_MIN_ADDRESS,
864	VM_MAXUSER_ADDRESS,
865	USRSTACK,
866	PS_STRINGS,
867	VM_PROT_ALL,
868	exec_copyout_strings,
869	exec_linux_setregs,
870	NULL
871};
872
873static Elf32_Brandinfo linux_brand = {
874					ELFOSABI_LINUX,
875					EM_386,
876					"Linux",
877					"/compat/linux",
878					"/lib/ld-linux.so.1",
879					&elf_linux_sysvec,
880					NULL,
881					BI_CAN_EXEC_DYN,
882				 };
883
884static Elf32_Brandinfo linux_glibc2brand = {
885					ELFOSABI_LINUX,
886					EM_386,
887					"Linux",
888					"/compat/linux",
889					"/lib/ld-linux.so.2",
890					&elf_linux_sysvec,
891					NULL,
892					BI_CAN_EXEC_DYN,
893				 };
894
895Elf32_Brandinfo *linux_brandlist[] = {
896					&linux_brand,
897					&linux_glibc2brand,
898					NULL
899				};
900
901static int
902linux_elf_modevent(module_t mod, int type, void *data)
903{
904	Elf32_Brandinfo **brandinfo;
905	int error;
906	struct linux_ioctl_handler **lihp;
907	struct linux_device_handler **ldhp;
908
909	error = 0;
910
911	switch(type) {
912	case MOD_LOAD:
913		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
914		     ++brandinfo)
915			if (elf32_insert_brand_entry(*brandinfo) < 0)
916				error = EINVAL;
917		if (error == 0) {
918			SET_FOREACH(lihp, linux_ioctl_handler_set)
919				linux_ioctl_register_handler(*lihp);
920			SET_FOREACH(ldhp, linux_device_handler_set)
921				linux_device_register_handler(*ldhp);
922			mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
923			sx_init(&emul_shared_lock, "emuldata->shared lock");
924			LIST_INIT(&futex_list);
925			sx_init(&futex_sx, "futex protection lock");
926			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
927			      NULL, 1000);
928			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
929			      NULL, 1000);
930			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
931			      NULL, 1000);
932			if (bootverbose)
933				printf("Linux ELF exec handler installed\n");
934		} else
935			printf("cannot insert Linux ELF brand handler\n");
936		break;
937	case MOD_UNLOAD:
938		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
939		     ++brandinfo)
940			if (elf32_brand_inuse(*brandinfo))
941				error = EBUSY;
942		if (error == 0) {
943			for (brandinfo = &linux_brandlist[0];
944			     *brandinfo != NULL; ++brandinfo)
945				if (elf32_remove_brand_entry(*brandinfo) < 0)
946					error = EINVAL;
947		}
948		if (error == 0) {
949			SET_FOREACH(lihp, linux_ioctl_handler_set)
950				linux_ioctl_unregister_handler(*lihp);
951			SET_FOREACH(ldhp, linux_device_handler_set)
952				linux_device_unregister_handler(*ldhp);
953			mtx_destroy(&emul_lock);
954			sx_destroy(&emul_shared_lock);
955			sx_destroy(&futex_sx);
956			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
957			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
958			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
959			if (bootverbose)
960				printf("Linux ELF exec handler removed\n");
961		} else
962			printf("Could not deinstall ELF interpreter entry\n");
963		break;
964	default:
965		return EOPNOTSUPP;
966	}
967	return error;
968}
969
970static moduledata_t linux_elf_mod = {
971	"linuxelf",
972	linux_elf_modevent,
973	0
974};
975
976DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
977