linux_sysvec.c revision 161310
1/*-
 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 161310 2006-08-15 12:54:30Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/exec.h>
35#include <sys/imgact.h>
36#include <sys/imgact_aout.h>
37#include <sys/imgact_elf.h>
38#include <sys/kernel.h>
39#include <sys/lock.h>
40#include <sys/malloc.h>
41#include <sys/module.h>
42#include <sys/mutex.h>
43#include <sys/proc.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysent.h>
47#include <sys/sysproto.h>
48#include <sys/vnode.h>
49#include <sys/eventhandler.h>
50
51#include <vm/vm.h>
52#include <vm/pmap.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_map.h>
55#include <vm/vm_object.h>
56#include <vm/vm_page.h>
57#include <vm/vm_param.h>
58
59#include <machine/cpu.h>
60#include <machine/md_var.h>
61#include <machine/pcb.h>
62
63#include <i386/linux/linux.h>
64#include <i386/linux/linux_proto.h>
65#include <compat/linux/linux_mib.h>
66#include <compat/linux/linux_signal.h>
67#include <compat/linux/linux_util.h>
68
/* Declare version 1 of the module named "linux". */
MODULE_VERSION(linux, 1);

/* Define the M_LINUX malloc type (described as "Linux mode structures"). */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" read as one 16-bit value in host byte order.
 * exec_linux_imgact_try() compares the first short of an image header
 * against this value to recognize shell scripts.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif
78
79/*
80 * Allow the sendsig functions to use the ldebug() facility
81 * even though they are not syscalls themselves. Map them
82 * to syscall 0. This is slightly less bogus than using
83 * ldebug(sigreturn).
84 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/* Execute the x86 "fldcw" instruction with *addr as its memory operand. */
#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
/*
 * Control word value that exec_linux_setregs() loads with fldcw() for a
 * newly exec'ed Linux image.
 */
#define	__LINUX_NPXCW__		0x37f
90
/*
 * Signal trampoline code and its size; both are defined outside this file
 * and are referenced by the sysentvec initializers below.
 */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table, defined outside this file. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker sets walked by linux_elf_modevent() on module load and unload. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
SET_DECLARE(linux_device_handler_set, struct linux_device_handler);

/* Forward declarations for the sysentvec callbacks defined in this file. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);

/*
 * Event handlers, locks and the futex list that live outside this file;
 * linux_elf_modevent() initializes/registers them on MOD_LOAD and tears
 * them down on MOD_UNLOAD.
 */
extern void linux_proc_exit(void *, struct proc *, struct image_params *);
extern void linux_proc_exec(void *, struct proc *, struct image_params *);
extern void linux_schedtail(void *, struct proc *);
extern LIST_HEAD(futex_list, futex) futex_list;
extern struct sx emul_shared_lock;
extern struct sx emul_lock;
extern struct mtx futex_mtx;

/* Tags returned by EVENTHANDLER_REGISTER(), kept for deregistration. */
static eventhandler_tag linux_exit_tag;
static eventhandler_tag linux_schedtail_tag;
static eventhandler_tag linux_exec_tag;
120
121/*
122 * Linux syscalls return negative errno's, we do positive and map them
123 * Reference:
124 *   FreeBSD: src/sys/sys/errno.h
125 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
126 *            linux-2.6.17.8/include/asm-generic/errno.h
127 */
/*
 * Indexed by FreeBSD errno value (0 .. ELAST); each entry is the
 * corresponding Linux errno, already negated.  Installed as the errno
 * translation table in both sysentvecs below.
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,	/* 80 - 89 */
	 -72, -67, -71						/* 90 - 92 */
};
140
/*
 * FreeBSD -> Linux signal number translation.  The sendsig routines in
 * this file index it through the sysentvec as sv_sigtbl[_SIG_IDX(sig)].
 * NOTE(review): a 0 entry presumably marks a FreeBSD signal that has no
 * Linux counterpart -- confirm against the consumers of this table.
 */
int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
	0, LINUX_SIGUSR1, LINUX_SIGUSR2
};

/*
 * Linux -> FreeBSD signal number translation; the reverse direction of
 * the table above.  It is not referenced in this file.
 * NOTE(review): presumably indexed the same way (signal number minus one)
 * by its users elsewhere -- confirm.
 */
int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
	SIGHUP, SIGINT, SIGQUIT, SIGILL,
	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
	SIGIO, SIGURG, SIGSYS
};
162
/* Value reported to Linux for a FreeBSD trap type with no mapping. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap type -> Linux trap number.  The array index is the FreeBSD
 * trap type; slots without a named trap map to LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0: unused */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2: unused */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4: unused */
	LINUX_T_UNKNOWN,	/*  5: unused */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8: unused */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11: unused */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13: unused */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15: unused */
	LINUX_T_UNKNOWN,	/* 16: unused */
	LINUX_T_UNKNOWN,	/* 17: unused */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/* Number of entries in _bsd_to_linux_trapcode[]. */
#define LINUX_TRAPCODE_COUNT \
    (sizeof(_bsd_to_linux_trapcode) / sizeof(*_bsd_to_linux_trapcode))

/*
 * Translate a FreeBSD trap type.  The bounds test is done against an
 * unsigned element count, so both negative and too-large codes yield
 * LINUX_T_UNKNOWN.  The argument is evaluated more than once.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) < LINUX_TRAPCODE_COUNT ? \
     _bsd_to_linux_trapcode[(code)] : \
     LINUX_T_UNKNOWN)
201
202/*
203 * If FreeBSD & Linux have a difference of opinion about what a trap
204 * means, deal with it here.
205 *
206 * MPSAFE
207 */
208static int
209translate_traps(int signal, int trap_code)
210{
211	if (signal != SIGBUS)
212		return signal;
213	switch (trap_code) {
214	case T_PROTFLT:
215	case T_TSSFLT:
216	case T_DOUBLEFLT:
217	case T_PAGEFLT:
218		return SIGSEGV;
219	default:
220		return signal;
221	}
222}
223
224static int
225linux_fixup(register_t **stack_base, struct image_params *imgp)
226{
227	register_t *argv, *envp;
228
229	argv = *stack_base;
230	envp = *stack_base + (imgp->args->argc + 1);
231	(*stack_base)--;
232	**stack_base = (intptr_t)(void *)envp;
233	(*stack_base)--;
234	**stack_base = (intptr_t)(void *)argv;
235	(*stack_base)--;
236	**stack_base = imgp->args->argc;
237	return 0;
238}
239
240static int
241elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
242{
243	Elf32_Auxargs *args;
244	register_t *pos;
245
246	KASSERT(curthread->td_proc == imgp->proc &&
247	    (curthread->td_proc->p_flag & P_SA) == 0,
248	    ("unsafe elf_linux_fixup(), should be curproc"));
249	args = (Elf32_Auxargs *)imgp->auxargs;
250	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
251
252	if (args->trace)
253		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
254	if (args->execfd != -1)
255		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
256	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
257	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
258	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
259	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
260	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
261	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
262	AUXARGS_ENTRY(pos, AT_BASE, args->base);
263	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
264	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
265	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
266	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
267	AUXARGS_ENTRY(pos, AT_NULL, 0);
268
269	free(imgp->auxargs, M_TEMP);
270	imgp->auxargs = NULL;
271
272	(*stack_base)--;
273	**stack_base = (register_t)imgp->args->argc;
274	return 0;
275}
276
/* User code and data segment selectors loaded into the handler context. */
extern int _ucodesel, _udatasel;
/*
 * Offset added to the trampoline base by linux_rt_sendsig() to find the
 * entry point used for SA_SIGINFO handlers.
 */
extern unsigned long linux_sznonrtsigcode;
279
280static void
281linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
282{
283	struct thread *td = curthread;
284	struct proc *p = td->td_proc;
285	struct sigacts *psp;
286	struct trapframe *regs;
287	struct l_rt_sigframe *fp, frame;
288	int sig, code;
289	int oonstack;
290
291	sig = ksi->ksi_signo;
292	code = ksi->ksi_code;
293	PROC_LOCK_ASSERT(p, MA_OWNED);
294	psp = p->p_sigacts;
295	mtx_assert(&psp->ps_mtx, MA_OWNED);
296	regs = td->td_frame;
297	oonstack = sigonstack(regs->tf_esp);
298
299#ifdef DEBUG
300	if (ldebug(rt_sendsig))
301		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
302		    catcher, sig, (void*)mask, code);
303#endif
304	/*
305	 * Allocate space for the signal handler context.
306	 */
307	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
308	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
309		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
310		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
311	} else
312		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
313	mtx_unlock(&psp->ps_mtx);
314
315	/*
316	 * Build the argument list for the signal handler.
317	 */
318	if (p->p_sysent->sv_sigtbl)
319		if (sig <= p->p_sysent->sv_sigsize)
320			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
321
322	bzero(&frame, sizeof(frame));
323
324	frame.sf_handler = catcher;
325	frame.sf_sig = sig;
326	frame.sf_siginfo = &fp->sf_si;
327	frame.sf_ucontext = &fp->sf_sc;
328
329	/* Fill in POSIX parts */
330	frame.sf_si.lsi_signo = sig;
331	frame.sf_si.lsi_code = code;
332	frame.sf_si.lsi_addr = ksi->ksi_addr;
333
334	/*
335	 * Build the signal context to be used by sigreturn.
336	 */
337	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
338	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
339
340	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
341	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
342	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
343	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
344	PROC_UNLOCK(p);
345
346	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
347
348	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
349	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
350	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
351	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
352	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
353	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
354	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
355	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
356	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
357	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
358	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
359	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
360	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
361	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
362	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
363	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
364	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
365	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
366	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
367
368#ifdef DEBUG
369	if (ldebug(rt_sendsig))
370		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
371		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
372		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
373#endif
374
375	if (copyout(&frame, fp, sizeof(frame)) != 0) {
376		/*
377		 * Process has trashed its stack; give it an illegal
378		 * instruction to halt it in its tracks.
379		 */
380#ifdef DEBUG
381		if (ldebug(rt_sendsig))
382			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
383			    fp, oonstack);
384#endif
385		PROC_LOCK(p);
386		sigexit(td, SIGILL);
387	}
388
389	/*
390	 * Build context to run handler in.
391	 */
392	regs->tf_esp = (int)fp;
393	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
394	    linux_sznonrtsigcode;
395	regs->tf_eflags &= ~(PSL_T | PSL_VM);
396	regs->tf_cs = _ucodesel;
397	regs->tf_ds = _udatasel;
398	regs->tf_es = _udatasel;
399	regs->tf_fs = _udatasel;
400	regs->tf_ss = _udatasel;
401	PROC_LOCK(p);
402	mtx_lock(&psp->ps_mtx);
403}
404
405
/*
 * Deliver a signal to a Linux process.
 *
 * Signals whose handler is a member of ps_siginfo (installed with
 * SA_SIGINFO) are handed to linux_rt_sendsig().  For all others a
 * struct l_sigframe holding the handler, the translated signal number and
 * the interrupted register state is copied out to the top of the
 * alternate signal stack or just below the current user %esp, and the
 * trapframe is redirected to the signal trampoline located at
 * PS_STRINGS - *sv_szsigcode.  linux_sigreturn() later restores the state
 * from that frame.
 *
 * Locking: entered with the process lock and p_sigacts->ps_mtx held (both
 * asserted); both are dropped while the frame is built and copied out and
 * are re-acquired before returning.
 */
static void
linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int sig, code;
	int oonstack, i;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, ksi, mask);
		return;
	}
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %u"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context: on the alternate
	 * stack when one is enabled, we are not already on it and this
	 * signal asked for it; otherwise directly below the user %esp.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_esp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.  The signal
	 * number is translated to its Linux value when it falls inside the
	 * ABI's translation table.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = catcher;
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = rgs();
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_edi;
	frame.sf_sc.sc_esi    = regs->tf_esi;
	frame.sf_sc.sc_ebp    = regs->tf_ebp;
	frame.sf_sc.sc_ebx    = regs->tf_ebx;
	frame.sf_sc.sc_edx    = regs->tf_edx;
	frame.sf_sc.sc_ecx    = regs->tf_ecx;
	frame.sf_sc.sc_eax    = regs->tf_eax;
	frame.sf_sc.sc_eip    = regs->tf_eip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_eflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);

	/* Mask words beyond the first do not fit in sc_mask. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in: stack at the new frame, %eip at
	 * the trampoline, trace and vm86 flags cleared, flat user segments.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~(PSL_T | PSL_VM);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Return with the locks held, as on entry. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
522
/*
 * System call to clean up state after a signal has been taken by a
 * handler that was entered through linux_sendsig().
 *
 * The struct l_sigframe at args->sfp is copied in and used to restore the
 * signal mask and the register state saved at delivery time.  The saved
 * eflags and %cs are validated first so that user code cannot gain
 * improper privileges or cause a machine fault.
 *
 * Returns EFAULT if the frame cannot be copied in, EINVAL if the eflags
 * check fails or %cs is not a user-privilege selector (a SIGBUS is posted
 * in the latter case), and EJUSTRETURN on success.
 */
int
linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_sigframe frame;
	struct trapframe *regs;
	l_sigset_t lmask;
	int eflags, i;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(sigreturn))
		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
#endif
	/*
	 * The trampoline code hands us the sigframe.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
		return (EFAULT);

	/*
	 * Check for security violations.
	 */
	/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = frame.sf_sc.sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
		/* Report the bad selector as a protection-fault SIGBUS. */
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/*
	 * Reassemble the Linux signal mask from sc_mask plus the extra
	 * words, then install it as the thread's mask under the proc lock.
	 */
	lmask.__bits[0] = frame.sf_sc.sc_mask;
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		lmask.__bits[i+1] = frame.sf_extramask[i];
	PROC_LOCK(p);
	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context.
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = frame.sf_sc.sc_fs;
	regs->tf_es     = frame.sf_sc.sc_es;
	regs->tf_ds     = frame.sf_sc.sc_ds;
	regs->tf_edi    = frame.sf_sc.sc_edi;
	regs->tf_esi    = frame.sf_sc.sc_esi;
	regs->tf_ebp    = frame.sf_sc.sc_ebp;
	regs->tf_ebx    = frame.sf_sc.sc_ebx;
	regs->tf_edx    = frame.sf_sc.sc_edx;
	regs->tf_ecx    = frame.sf_sc.sc_ecx;
	regs->tf_eax    = frame.sf_sc.sc_eax;
	regs->tf_eip    = frame.sf_sc.sc_eip;
	regs->tf_cs     = frame.sf_sc.sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
	regs->tf_ss     = frame.sf_sc.sc_ss;

	return (EJUSTRETURN);
}
622
/*
 * System call to clean up state after a signal has been taken by a
 * handler that was entered through linux_rt_sendsig().
 *
 * The struct l_ucontext at args->ucp is copied in and used to restore
 * the signal mask, the register state and the alternate signal stack
 * settings saved at delivery time.  The saved eflags and %cs are
 * validated first so that user code cannot gain improper privileges or
 * cause a machine fault.
 *
 * Returns EFAULT if the context cannot be copied in, EINVAL if the
 * eflags check fails or %cs is not a user-privilege selector (a SIGBUS
 * is posted in the latter case), and EJUSTRETURN on success.
 */
int
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_ucontext uc;
	struct l_sigcontext *context;
	l_stack_t *lss;
	stack_t ss;
	struct trapframe *regs;
	int eflags;
	ksiginfo_t ksi;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
#endif
	/*
	 * The trampoline code hands us the ucontext.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
		return (EFAULT);

	context = &uc.uc_mcontext;

	/*
	 * Check for security violations.
	 */
	/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = context->sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(context->sc_cs)) {
		/* Report the bad selector as a protection-fault SIGBUS. */
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return(EINVAL);
	}

	/* Install the saved signal mask as the thread's mask. */
	PROC_LOCK(p);
	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = context->sc_fs;
	regs->tf_es     = context->sc_es;
	regs->tf_ds     = context->sc_ds;
	regs->tf_edi    = context->sc_edi;
	regs->tf_esi    = context->sc_esi;
	regs->tf_ebp    = context->sc_ebp;
	regs->tf_ebx    = context->sc_ebx;
	regs->tf_edx    = context->sc_edx;
	regs->tf_ecx    = context->sc_ecx;
	regs->tf_eax    = context->sc_eax;
	regs->tf_eip    = context->sc_eip;
	regs->tf_cs     = context->sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = context->sc_esp_at_signal;
	regs->tf_ss     = context->sc_ss;

	/*
	 * Re-apply the alternate stack settings recorded in the context by
	 * calling sigaltstack; its result is deliberately ignored.
	 */
	lss = &uc.uc_stack;
	ss.ss_sp = lss->ss_sp;
	ss.ss_size = lss->ss_size;
	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
#endif
	(void)kern_sigaltstack(td, &ss, NULL);

	return (EJUSTRETURN);
}
738
739/*
740 * MPSAFE
741 */
742static void
743linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
744{
745	args[0] = tf->tf_ebx;
746	args[1] = tf->tf_ecx;
747	args[2] = tf->tf_edx;
748	args[3] = tf->tf_esi;
749	args[4] = tf->tf_edi;
750	args[5] = tf->tf_ebp;	/* Unconfirmed */
751	*params = NULL;		/* no copyin */
752}
753
754/*
755 * If a linux binary is exec'ing something, try this image activator
756 * first.  We override standard shell script execution in order to
757 * be able to modify the interpreter path.  We only do this if a linux
758 * binary is doing the exec, so we do not create an EXEC module for it.
759 */
760static int	exec_linux_imgact_try(struct image_params *iparams);
761
762static int
763exec_linux_imgact_try(struct image_params *imgp)
764{
765    const char *head = (const char *)imgp->image_header;
766    char *rpath;
767    int error = -1, len;
768
769    /*
770     * The interpreter for shell scripts run from a linux binary needs
771     * to be located in /compat/linux if possible in order to recursively
772     * maintain linux path emulation.
773     */
774    if (((const short *)head)[0] == SHELLMAGIC) {
775	    /*
776	     * Run our normal shell image activator.  If it succeeds attempt
777	     * to use the alternate path for the interpreter.  If an alternate
778	     * path is found, use our stringspace to store it.
779	     */
780	    if ((error = exec_shell_imgact(imgp)) == 0) {
781		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
782			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
783		    if (rpath != NULL) {
784			    len = strlen(rpath) + 1;
785
786			    if (len <= MAXSHELLCMDLEN) {
787				    memcpy(imgp->interpreter_name, rpath, len);
788			    }
789			    free(rpath, M_TEMP);
790		    }
791	    }
792    }
793    return(error);
794}
795
796/*
797 * exec_setregs may initialize some registers differently than Linux
798 * does, thus potentially confusing Linux binaries. If necessary, we
799 * override the exec_setregs default(s) here.
800 */
801static void
802exec_linux_setregs(struct thread *td, u_long entry,
803		   u_long stack, u_long ps_strings)
804{
805	static const u_short control = __LINUX_NPXCW__;
806	struct pcb *pcb = td->td_pcb;
807
808	exec_setregs(td, entry, stack, ps_strings);
809
810	/* Linux sets %gs to 0, we default to _udatasel */
811	pcb->pcb_gs = 0; load_gs(0);
812
813	/* Linux sets the i387 to extended precision. */
814	fldcw(&control);
815}
816
/*
 * System entry vector for "Linux a.out" images.
 *
 * This is a positional initializer, so the entries must stay in the order
 * of the members of struct sysentvec.  It differs from elf_linux_sysvec
 * only in the fixup routine, the name and the missing coredump routine.
 * NOTE(review): the member names in the trailing comments are not visible
 * in this file; confirm them against <sys/sysent.h>.
 */
struct sysentvec linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* sv_size */
	linux_sysent,		/* sv_table */
	0,			/* sv_mask */
	LINUX_SIGTBLSZ,		/* sv_sigsize */
	bsd_to_linux_signal,	/* sv_sigtbl */
	ELAST + 1,		/* sv_errsize */
	bsd_to_linux_errno,	/* sv_errtbl */
	translate_traps,	/* sv_transtrap */
	linux_fixup,		/* sv_fixup */
	linux_sendsig,		/* sv_sendsig */
	linux_sigcode,		/* sv_sigcode */
	&linux_szsigcode,	/* sv_szsigcode */
	linux_prepsyscall,	/* sv_prepsyscall */
	"Linux a.out",		/* sv_name */
	NULL,			/* sv_coredump */
	exec_linux_imgact_try,	/* sv_imgact_try */
	LINUX_MINSIGSTKSZ,	/* sv_minsigstksz */
	PAGE_SIZE,		/* sv_pagesize */
	VM_MIN_ADDRESS,		/* sv_minuser */
	VM_MAXUSER_ADDRESS,	/* sv_maxuser */
	USRSTACK,		/* sv_usrstack */
	PS_STRINGS,		/* sv_psstrings */
	VM_PROT_ALL,		/* sv_stackprot */
	exec_copyout_strings,	/* sv_copyout_strings */
	exec_linux_setregs,	/* sv_setregs */
	NULL			/* sv_fixlimit */
};
845
/*
 * System entry vector for "Linux ELF" images; referenced by the ELF brand
 * entries below.
 *
 * This is a positional initializer, so the entries must stay in the order
 * of the members of struct sysentvec.  It differs from linux_sysvec only
 * in the fixup routine, the name and the coredump routine.
 * NOTE(review): the member names in the trailing comments are not visible
 * in this file; confirm them against <sys/sysent.h>.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* sv_size */
	linux_sysent,		/* sv_table */
	0,			/* sv_mask */
	LINUX_SIGTBLSZ,		/* sv_sigsize */
	bsd_to_linux_signal,	/* sv_sigtbl */
	ELAST + 1,		/* sv_errsize */
	bsd_to_linux_errno,	/* sv_errtbl */
	translate_traps,	/* sv_transtrap */
	elf_linux_fixup,	/* sv_fixup */
	linux_sendsig,		/* sv_sendsig */
	linux_sigcode,		/* sv_sigcode */
	&linux_szsigcode,	/* sv_szsigcode */
	linux_prepsyscall,	/* sv_prepsyscall */
	"Linux ELF",		/* sv_name */
	elf32_coredump,		/* sv_coredump */
	exec_linux_imgact_try,	/* sv_imgact_try */
	LINUX_MINSIGSTKSZ,	/* sv_minsigstksz */
	PAGE_SIZE,		/* sv_pagesize */
	VM_MIN_ADDRESS,		/* sv_minuser */
	VM_MAXUSER_ADDRESS,	/* sv_maxuser */
	USRSTACK,		/* sv_usrstack */
	PS_STRINGS,		/* sv_psstrings */
	VM_PROT_ALL,		/* sv_stackprot */
	exec_copyout_strings,	/* sv_copyout_strings */
	exec_linux_setregs,	/* sv_setregs */
	NULL			/* sv_fixlimit */
};
874
/*
 * ELF brand for Linux/i386 binaries whose interpreter is
 * /lib/ld-linux.so.1.  Uses elf_linux_sysvec and /compat/linux.
 */
static Elf32_Brandinfo linux_brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.1",
					&elf_linux_sysvec,
					NULL,
					BI_CAN_EXEC_DYN,
				 };

/*
 * Same as linux_brand except for the interpreter path,
 * /lib/ld-linux.so.2.
 */
static Elf32_Brandinfo linux_glibc2brand = {
					ELFOSABI_LINUX,
					EM_386,
					"Linux",
					"/compat/linux",
					"/lib/ld-linux.so.2",
					&elf_linux_sysvec,
					NULL,
					BI_CAN_EXEC_DYN,
				 };

/*
 * NULL-terminated list of the brands that linux_elf_modevent() inserts on
 * module load and removes on module unload.
 */
Elf32_Brandinfo *linux_brandlist[] = {
					&linux_brand,
					&linux_glibc2brand,
					NULL
				};
902
903static int
904linux_elf_modevent(module_t mod, int type, void *data)
905{
906	Elf32_Brandinfo **brandinfo;
907	int error;
908	struct linux_ioctl_handler **lihp;
909	struct linux_device_handler **ldhp;
910
911	error = 0;
912
913	switch(type) {
914	case MOD_LOAD:
915		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
916		     ++brandinfo)
917			if (elf32_insert_brand_entry(*brandinfo) < 0)
918				error = EINVAL;
919		if (error == 0) {
920			SET_FOREACH(lihp, linux_ioctl_handler_set)
921				linux_ioctl_register_handler(*lihp);
922			SET_FOREACH(ldhp, linux_device_handler_set)
923				linux_device_register_handler(*ldhp);
924			sx_init(&emul_lock, "emuldata lock");
925			sx_init(&emul_shared_lock, "emuldata->shared lock");
926			LIST_INIT(&futex_list);
927			mtx_init(&futex_mtx, "futex protection lock", NULL, MTX_DEF);
928			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
929			      NULL, 1000);
930			linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail, linux_schedtail,
931			      NULL, 1000);
932			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
933			      NULL, 1000);
934			if (bootverbose)
935				printf("Linux ELF exec handler installed\n");
936		} else
937			printf("cannot insert Linux ELF brand handler\n");
938		break;
939	case MOD_UNLOAD:
940		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
941		     ++brandinfo)
942			if (elf32_brand_inuse(*brandinfo))
943				error = EBUSY;
944		if (error == 0) {
945			for (brandinfo = &linux_brandlist[0];
946			     *brandinfo != NULL; ++brandinfo)
947				if (elf32_remove_brand_entry(*brandinfo) < 0)
948					error = EINVAL;
949		}
950		if (error == 0) {
951			SET_FOREACH(lihp, linux_ioctl_handler_set)
952				linux_ioctl_unregister_handler(*lihp);
953			SET_FOREACH(ldhp, linux_device_handler_set)
954				linux_device_unregister_handler(*ldhp);
955			sx_destroy(&emul_lock);
956			sx_destroy(&emul_shared_lock);
957			mtx_destroy(&futex_mtx);
958			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
959			EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
960			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
961			if (bootverbose)
962				printf("Linux ELF exec handler removed\n");
963		} else
964			printf("Could not deinstall ELF interpreter entry\n");
965		break;
966	default:
967		return EOPNOTSUPP;
968	}
969	return error;
970}
971
/*
 * Module description: the name "linuxelf", with linux_elf_modevent() as
 * its event handler and 0 as the third initializer.
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Declare the module with SI_SUB_EXEC / SI_ORDER_ANY. */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
979