linux_sysvec.c revision 116361
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 116361 2003-06-15 00:31:24Z davidxu $");
31
32/* XXX we use functions that might not exist. */
33#include "opt_compat.h"
34
35#ifndef COMPAT_43
36#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/imgact.h>
42#include <sys/imgact_aout.h>
43#include <sys/imgact_elf.h>
44#include <sys/lock.h>
45#include <sys/malloc.h>
46#include <sys/mutex.h>
47#include <sys/proc.h>
48#include <sys/signalvar.h>
49#include <sys/syscallsubr.h>
50#include <sys/sysent.h>
51#include <sys/sysproto.h>
52#include <sys/user.h>
53#include <sys/vnode.h>
54
55#include <vm/vm.h>
56#include <vm/vm_param.h>
57#include <vm/vm_page.h>
58#include <vm/vm_extern.h>
59#include <sys/exec.h>
60#include <sys/kernel.h>
61#include <sys/module.h>
62#include <machine/cpu.h>
63#include <machine/md_var.h>
64#include <sys/mutex.h>
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/pmap.h>
69#include <vm/vm_map.h>
70#include <vm/vm_object.h>
71
72#include <i386/linux/linux.h>
73#include <i386/linux/linux_proto.h>
74#include <compat/linux/linux_mib.h>
75#include <compat/linux/linux_signal.h>
76#include <compat/linux/linux_util.h>
77
78MODULE_VERSION(linux, 1);
79MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
80MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
81MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
82
83MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
84
85#if BYTE_ORDER == LITTLE_ENDIAN
86#define SHELLMAGIC      0x2123 /* #! */
87#else
88#define SHELLMAGIC      0x2321
89#endif
90
91/*
92 * Allow the sendsig functions to use the ldebug() facility
93 * even though they are not syscalls themselves. Map them
94 * to syscall 0. This is slightly less bogus than using
95 * ldebug(sigreturn).
96 */
97#define	LINUX_SYS_linux_rt_sendsig	0
98#define	LINUX_SYS_linux_sendsig		0
99
100extern char linux_sigcode[];
101extern int linux_szsigcode;
102
103extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
104
105SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
106
107static int	linux_fixup(register_t **stack_base,
108		    struct image_params *iparams);
109static int	elf_linux_fixup(register_t **stack_base,
110		    struct image_params *iparams);
111static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
112		    caddr_t *params);
113static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
114		    u_long code);
115static void	exec_linux_setregs(struct thread *td, u_long entry,
116				   u_long stack, u_long ps_strings);
117
118/*
119 * Linux syscalls return negative errno's, we do positive and map them
120 */
121static int bsd_to_linux_errno[ELAST + 1] = {
122	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
123	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
124	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
125	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
126	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
127	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
128	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
129	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
130	-6, -6, -43, -42, -75, -6, -84
131};
132
133int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
134	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
135	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
136	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
137	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
138	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
139	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
140	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
141	0, LINUX_SIGUSR1, LINUX_SIGUSR2
142};
143
144int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
145	SIGHUP, SIGINT, SIGQUIT, SIGILL,
146	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
147	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
148	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
149	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
150	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
151	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
152	SIGIO, SIGURG, SIGSYS
153};
154
/* Value reported when a native trap code has no entry in the table. */
#define LINUX_T_UNKNOWN  255

/*
 * Native trap code -> Linux trap number, indexed by the native code.
 * The T_* names in the comments are the native trap names for each slot.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0              */	LINUX_T_UNKNOWN,
	/*  1 T_PRIVINFLT  */	6,
	/*  2              */	LINUX_T_UNKNOWN,
	/*  3 T_BPTFLT     */	3,
	/*  4              */	LINUX_T_UNKNOWN,
	/*  5              */	LINUX_T_UNKNOWN,
	/*  6 T_ARITHTRAP  */	16,
	/*  7 T_ASTFLT     */	254,
	/*  8              */	LINUX_T_UNKNOWN,
	/*  9 T_PROTFLT    */	13,
	/* 10 T_TRCTRAP    */	1,
	/* 11              */	LINUX_T_UNKNOWN,
	/* 12 T_PAGEFLT    */	14,
	/* 13              */	LINUX_T_UNKNOWN,
	/* 14 T_ALIGNFLT   */	17,
	/* 15              */	LINUX_T_UNKNOWN,
	/* 16              */	LINUX_T_UNKNOWN,
	/* 17              */	LINUX_T_UNKNOWN,
	/* 18 T_DIVIDE     */	0,
	/* 19 T_NMI        */	2,
	/* 20 T_OFLOW      */	4,
	/* 21 T_BOUND      */	5,
	/* 22 T_DNA        */	7,
	/* 23 T_DOUBLEFLT  */	8,
	/* 24 T_FPOPFLT    */	9,
	/* 25 T_TSSFLT     */	10,
	/* 26 T_SEGNPFLT   */	11,
	/* 27 T_STKFLT     */	12,
	/* 28 T_MCHK       */	18,
	/* 29 T_XMMFLT     */	19,
	/* 30 T_RESERVED   */	15
};

/*
 * Bounds-checked lookup: codes past the end of the table yield
 * LINUX_T_UNKNOWN.  The argument is evaluated more than once.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) >= sizeof(_bsd_to_linux_trapcode) / \
	sizeof(_bsd_to_linux_trapcode[0]) ? \
     LINUX_T_UNKNOWN : \
     _bsd_to_linux_trapcode[(code)])
193
194/*
195 * If FreeBSD & Linux have a difference of opinion about what a trap
196 * means, deal with it here.
197 *
198 * MPSAFE
199 */
200static int
201translate_traps(int signal, int trap_code)
202{
203	if (signal != SIGBUS)
204		return signal;
205	switch (trap_code) {
206	case T_PROTFLT:
207	case T_TSSFLT:
208	case T_DOUBLEFLT:
209	case T_PAGEFLT:
210		return SIGSEGV;
211	default:
212		return signal;
213	}
214}
215
216static int
217linux_fixup(register_t **stack_base, struct image_params *imgp)
218{
219	register_t *argv, *envp;
220
221	argv = *stack_base;
222	envp = *stack_base + (imgp->argc + 1);
223	(*stack_base)--;
224	**stack_base = (intptr_t)(void *)envp;
225	(*stack_base)--;
226	**stack_base = (intptr_t)(void *)argv;
227	(*stack_base)--;
228	**stack_base = imgp->argc;
229	return 0;
230}
231
232static int
233elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
234{
235	Elf32_Auxargs *args;
236	register_t *pos;
237
238	KASSERT(curthread->td_proc == imgp->proc &&
239	    (curthread->td_proc->p_flag & P_SA) == 0,
240	    ("unsafe elf_linux_fixup(), should be curproc"));
241	args = (Elf32_Auxargs *)imgp->auxargs;
242	pos = *stack_base + (imgp->argc + imgp->envc + 2);
243
244	if (args->trace)
245		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
246	if (args->execfd != -1)
247		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
248	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
249	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
250	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
251	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
252	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
253	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
254	AUXARGS_ENTRY(pos, AT_BASE, args->base);
255	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
256	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
257	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
258	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
259	AUXARGS_ENTRY(pos, AT_NULL, 0);
260
261	free(imgp->auxargs, M_TEMP);
262	imgp->auxargs = NULL;
263
264	(*stack_base)--;
265	**stack_base = (register_t)imgp->argc;
266	return 0;
267}
268
269extern int _ucodesel, _udatasel;
270extern unsigned long linux_sznonrtsigcode;
271
272static void
273linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
274{
275	struct thread *td = curthread;
276	struct proc *p = td->td_proc;
277	struct sigacts *psp;
278	struct trapframe *regs;
279	struct l_rt_sigframe *fp, frame;
280	int oonstack;
281
282	PROC_LOCK_ASSERT(p, MA_OWNED);
283	psp = p->p_sigacts;
284	mtx_assert(&psp->ps_mtx, MA_OWNED);
285	regs = td->td_frame;
286	oonstack = sigonstack(regs->tf_esp);
287
288#ifdef DEBUG
289	if (ldebug(rt_sendsig))
290		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
291		    catcher, sig, (void*)mask, code);
292#endif
293	/*
294	 * Allocate space for the signal handler context.
295	 */
296	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
297	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
298		fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp +
299		    p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe));
300	} else
301		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
302	mtx_unlock(&psp->ps_mtx);
303
304	/*
305	 * Build the argument list for the signal handler.
306	 */
307	if (p->p_sysent->sv_sigtbl)
308		if (sig <= p->p_sysent->sv_sigsize)
309			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
310
311	bzero(&frame, sizeof(frame));
312
313	frame.sf_handler = catcher;
314	frame.sf_sig = sig;
315	frame.sf_siginfo = &fp->sf_si;
316	frame.sf_ucontext = &fp->sf_sc;
317
318	/* Fill in POSIX parts */
319	frame.sf_si.lsi_signo = sig;
320	frame.sf_si.lsi_code = code;
321	frame.sf_si.lsi_addr = (void *)regs->tf_err;
322
323	/*
324	 * Build the signal context to be used by sigreturn.
325	 */
326	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
327	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
328
329	frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp;
330	frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size;
331	frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
332	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
333	PROC_UNLOCK(p);
334
335	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
336
337	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
338	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
339	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
340	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
341	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
342	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
343	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
344	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
345	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
346	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
347	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
348	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
349	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
350	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
351	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
352	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
353	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
354	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
355	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
356
357#ifdef DEBUG
358	if (ldebug(rt_sendsig))
359		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
360		    frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp,
361		    p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
362#endif
363
364	if (copyout(&frame, fp, sizeof(frame)) != 0) {
365		/*
366		 * Process has trashed its stack; give it an illegal
367		 * instruction to halt it in its tracks.
368		 */
369#ifdef DEBUG
370		if (ldebug(rt_sendsig))
371			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
372			    fp, oonstack);
373#endif
374		PROC_LOCK(p);
375		sigexit(td, SIGILL);
376	}
377
378	/*
379	 * Build context to run handler in.
380	 */
381	regs->tf_esp = (int)fp;
382	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
383	    linux_sznonrtsigcode;
384	regs->tf_eflags &= ~(PSL_T | PSL_VM);
385	regs->tf_cs = _ucodesel;
386	regs->tf_ds = _udatasel;
387	regs->tf_es = _udatasel;
388	regs->tf_fs = _udatasel;
389	regs->tf_ss = _udatasel;
390	PROC_LOCK(p);
391	mtx_lock(&psp->ps_mtx);
392}
393
394
395/*
396 * Send an interrupt to process.
397 *
398 * Stack is set up to allow sigcode stored
399 * in u. to call routine, followed by kcall
400 * to sigreturn routine below.  After sigreturn
401 * resets the signal mask, the stack, and the
402 * frame pointer, it returns to the user
403 * specified pc, psl.
404 */
405static void
406linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
407{
408	struct thread *td = curthread;
409	struct proc *p = td->td_proc;
410	struct sigacts *psp;
411	struct trapframe *regs;
412	struct l_sigframe *fp, frame;
413	l_sigset_t lmask;
414	int oonstack, i;
415
416	PROC_LOCK_ASSERT(p, MA_OWNED);
417	psp = p->p_sigacts;
418	mtx_assert(&psp->ps_mtx, MA_OWNED);
419	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
420		/* Signal handler installed with SA_SIGINFO. */
421		linux_rt_sendsig(catcher, sig, mask, code);
422		return;
423	}
424
425	regs = td->td_frame;
426	oonstack = sigonstack(regs->tf_esp);
427
428#ifdef DEBUG
429	if (ldebug(sendsig))
430		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
431		    catcher, sig, (void*)mask, code);
432#endif
433
434	/*
435	 * Allocate space for the signal handler context.
436	 */
437	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
438	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
439		fp = (struct l_sigframe *)(p->p_sigstk.ss_sp +
440		    p->p_sigstk.ss_size - sizeof(struct l_sigframe));
441	} else
442		fp = (struct l_sigframe *)regs->tf_esp - 1;
443	mtx_unlock(&psp->ps_mtx);
444	PROC_UNLOCK(p);
445
446	/*
447	 * Build the argument list for the signal handler.
448	 */
449	if (p->p_sysent->sv_sigtbl)
450		if (sig <= p->p_sysent->sv_sigsize)
451			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
452
453	bzero(&frame, sizeof(frame));
454
455	frame.sf_handler = catcher;
456	frame.sf_sig = sig;
457
458	bsd_to_linux_sigset(mask, &lmask);
459
460	/*
461	 * Build the signal context to be used by sigreturn.
462	 */
463	frame.sf_sc.sc_mask   = lmask.__bits[0];
464	frame.sf_sc.sc_gs     = rgs();
465	frame.sf_sc.sc_fs     = regs->tf_fs;
466	frame.sf_sc.sc_es     = regs->tf_es;
467	frame.sf_sc.sc_ds     = regs->tf_ds;
468	frame.sf_sc.sc_edi    = regs->tf_edi;
469	frame.sf_sc.sc_esi    = regs->tf_esi;
470	frame.sf_sc.sc_ebp    = regs->tf_ebp;
471	frame.sf_sc.sc_ebx    = regs->tf_ebx;
472	frame.sf_sc.sc_edx    = regs->tf_edx;
473	frame.sf_sc.sc_ecx    = regs->tf_ecx;
474	frame.sf_sc.sc_eax    = regs->tf_eax;
475	frame.sf_sc.sc_eip    = regs->tf_eip;
476	frame.sf_sc.sc_cs     = regs->tf_cs;
477	frame.sf_sc.sc_eflags = regs->tf_eflags;
478	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
479	frame.sf_sc.sc_ss     = regs->tf_ss;
480	frame.sf_sc.sc_err    = regs->tf_err;
481	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
482
483	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
484		frame.sf_extramask[i] = lmask.__bits[i+1];
485
486	if (copyout(&frame, fp, sizeof(frame)) != 0) {
487		/*
488		 * Process has trashed its stack; give it an illegal
489		 * instruction to halt it in its tracks.
490		 */
491		PROC_LOCK(p);
492		sigexit(td, SIGILL);
493	}
494
495	/*
496	 * Build context to run handler in.
497	 */
498	regs->tf_esp = (int)fp;
499	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
500	regs->tf_eflags &= ~(PSL_T | PSL_VM);
501	regs->tf_cs = _ucodesel;
502	regs->tf_ds = _udatasel;
503	regs->tf_es = _udatasel;
504	regs->tf_fs = _udatasel;
505	regs->tf_ss = _udatasel;
506	PROC_LOCK(p);
507	mtx_lock(&psp->ps_mtx);
508}
509
510/*
511 * System call to cleanup state after a signal
512 * has been taken.  Reset signal mask and
513 * stack state from context left by sendsig (above).
514 * Return to previous pc and psl as specified by
515 * context left by sendsig. Check carefully to
516 * make sure that the user has not modified the
517 * psl to gain improper privileges or to cause
518 * a machine fault.
519 */
520int
521linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
522{
523	struct proc *p = td->td_proc;
524	struct l_sigframe frame;
525	struct trapframe *regs;
526	l_sigset_t lmask;
527	int eflags, i;
528
529	regs = td->td_frame;
530
531#ifdef DEBUG
532	if (ldebug(sigreturn))
533		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
534#endif
535	/*
536	 * The trampoline code hands us the sigframe.
537	 * It is unsafe to keep track of it ourselves, in the event that a
538	 * program jumps out of a signal handler.
539	 */
540	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
541		return (EFAULT);
542
543	/*
544	 * Check for security violations.
545	 */
546#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
547	eflags = frame.sf_sc.sc_eflags;
548	/*
549	 * XXX do allow users to change the privileged flag PSL_RF.  The
550	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
551	 * sometimes set it there too.  tf_eflags is kept in the signal
552	 * context during signal handling and there is no other place
553	 * to remember it, so the PSL_RF bit may be corrupted by the
554	 * signal handler without us knowing.  Corruption of the PSL_RF
555	 * bit at worst causes one more or one less debugger trap, so
556	 * allowing it is fairly harmless.
557	 */
558	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
559		return(EINVAL);
560
561	/*
562	 * Don't allow users to load a valid privileged %cs.  Let the
563	 * hardware check for invalid selectors, excess privilege in
564	 * other selectors, invalid %eip's and invalid %esp's.
565	 */
566#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
567	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
568		trapsignal(td, SIGBUS, T_PROTFLT);
569		return(EINVAL);
570	}
571
572	lmask.__bits[0] = frame.sf_sc.sc_mask;
573	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
574		lmask.__bits[i+1] = frame.sf_extramask[i];
575	PROC_LOCK(p);
576	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
577	SIG_CANTMASK(td->td_sigmask);
578	signotify(td);
579	PROC_UNLOCK(p);
580
581	/*
582	 * Restore signal context.
583	 */
584	/* %gs was restored by the trampoline. */
585	regs->tf_fs     = frame.sf_sc.sc_fs;
586	regs->tf_es     = frame.sf_sc.sc_es;
587	regs->tf_ds     = frame.sf_sc.sc_ds;
588	regs->tf_edi    = frame.sf_sc.sc_edi;
589	regs->tf_esi    = frame.sf_sc.sc_esi;
590	regs->tf_ebp    = frame.sf_sc.sc_ebp;
591	regs->tf_ebx    = frame.sf_sc.sc_ebx;
592	regs->tf_edx    = frame.sf_sc.sc_edx;
593	regs->tf_ecx    = frame.sf_sc.sc_ecx;
594	regs->tf_eax    = frame.sf_sc.sc_eax;
595	regs->tf_eip    = frame.sf_sc.sc_eip;
596	regs->tf_cs     = frame.sf_sc.sc_cs;
597	regs->tf_eflags = eflags;
598	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
599	regs->tf_ss     = frame.sf_sc.sc_ss;
600
601	return (EJUSTRETURN);
602}
603
604/*
605 * System call to cleanup state after a signal
606 * has been taken.  Reset signal mask and
607 * stack state from context left by rt_sendsig (above).
608 * Return to previous pc and psl as specified by
609 * context left by sendsig. Check carefully to
610 * make sure that the user has not modified the
611 * psl to gain improper privileges or to cause
612 * a machine fault.
613 */
614int
615linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
616{
617	struct proc *p = td->td_proc;
618	struct l_ucontext uc;
619	struct l_sigcontext *context;
620	l_stack_t *lss;
621	stack_t ss;
622	struct trapframe *regs;
623	int eflags;
624
625	regs = td->td_frame;
626
627#ifdef DEBUG
628	if (ldebug(rt_sigreturn))
629		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
630#endif
631	/*
632	 * The trampoline code hands us the ucontext.
633	 * It is unsafe to keep track of it ourselves, in the event that a
634	 * program jumps out of a signal handler.
635	 */
636	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
637		return (EFAULT);
638
639	context = &uc.uc_mcontext;
640
641	/*
642	 * Check for security violations.
643	 */
644#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
645	eflags = context->sc_eflags;
646	/*
647	 * XXX do allow users to change the privileged flag PSL_RF.  The
648	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
649	 * sometimes set it there too.  tf_eflags is kept in the signal
650	 * context during signal handling and there is no other place
651	 * to remember it, so the PSL_RF bit may be corrupted by the
652	 * signal handler without us knowing.  Corruption of the PSL_RF
653	 * bit at worst causes one more or one less debugger trap, so
654	 * allowing it is fairly harmless.
655	 */
656	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
657		return(EINVAL);
658
659	/*
660	 * Don't allow users to load a valid privileged %cs.  Let the
661	 * hardware check for invalid selectors, excess privilege in
662	 * other selectors, invalid %eip's and invalid %esp's.
663	 */
664#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
665	if (!CS_SECURE(context->sc_cs)) {
666		trapsignal(td, SIGBUS, T_PROTFLT);
667		return(EINVAL);
668	}
669
670	PROC_LOCK(p);
671	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
672	SIG_CANTMASK(td->td_sigmask);
673	signotify(td);
674	PROC_UNLOCK(p);
675
676	/*
677	 * Restore signal context
678	 */
679	/* %gs was restored by the trampoline. */
680	regs->tf_fs     = context->sc_fs;
681	regs->tf_es     = context->sc_es;
682	regs->tf_ds     = context->sc_ds;
683	regs->tf_edi    = context->sc_edi;
684	regs->tf_esi    = context->sc_esi;
685	regs->tf_ebp    = context->sc_ebp;
686	regs->tf_ebx    = context->sc_ebx;
687	regs->tf_edx    = context->sc_edx;
688	regs->tf_ecx    = context->sc_ecx;
689	regs->tf_eax    = context->sc_eax;
690	regs->tf_eip    = context->sc_eip;
691	regs->tf_cs     = context->sc_cs;
692	regs->tf_eflags = eflags;
693	regs->tf_esp    = context->sc_esp_at_signal;
694	regs->tf_ss     = context->sc_ss;
695
696	/*
697	 * call sigaltstack & ignore results..
698	 */
699	lss = &uc.uc_stack;
700	ss.ss_sp = lss->ss_sp;
701	ss.ss_size = lss->ss_size;
702	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
703
704#ifdef DEBUG
705	if (ldebug(rt_sigreturn))
706		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
707		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
708#endif
709	(void)kern_sigaltstack(td, &ss, NULL);
710
711	return (EJUSTRETURN);
712}
713
714/*
715 * MPSAFE
716 */
717static void
718linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
719{
720	args[0] = tf->tf_ebx;
721	args[1] = tf->tf_ecx;
722	args[2] = tf->tf_edx;
723	args[3] = tf->tf_esi;
724	args[4] = tf->tf_edi;
725	args[5] = tf->tf_ebp;	/* Unconfirmed */
726	*params = NULL;		/* no copyin */
727}
728
729
730
731/*
732 * Dump core, into a file named as described in the comments for
733 * expand_name(), unless the process was setuid/setgid.
734 */
735static int
736linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
737{
738	struct proc *p = td->td_proc;
739	struct ucred *cred = td->td_ucred;
740	struct vmspace *vm = p->p_vmspace;
741	char *tempuser;
742	int error;
743
744	if (ctob((uarea_pages + kstack_pages) +
745	    vm->vm_dsize + vm->vm_ssize) >= limit)
746		return (EFAULT);
747	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
748	    M_WAITOK | M_ZERO);
749	if (tempuser == NULL)
750		return (ENOMEM);
751	PROC_LOCK(p);
752	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
753	PROC_UNLOCK(p);
754	bcopy(p->p_uarea, tempuser, sizeof(struct user));
755	bcopy(td->td_frame,
756	    tempuser + ctob(uarea_pages) +
757	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
758	    sizeof(struct trapframe));
759	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
760	    ctob(uarea_pages + kstack_pages),
761	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
762	    (int *)NULL, td);
763	free(tempuser, M_TEMP);
764	if (error == 0)
765		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
766		    (int)ctob(vm->vm_dsize),
767		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
768		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
769	if (error == 0)
770		error = vn_rdwr_inchunks(UIO_WRITE, vp,
771		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
772		    round_page(ctob(vm->vm_ssize)),
773		    (off_t)ctob(uarea_pages + kstack_pages) +
774			ctob(vm->vm_dsize), UIO_USERSPACE,
775		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
776	return (error);
777}
778/*
779 * If a linux binary is exec'ing something, try this image activator
780 * first.  We override standard shell script execution in order to
781 * be able to modify the interpreter path.  We only do this if a linux
782 * binary is doing the exec, so we do not create an EXEC module for it.
783 */
784static int	exec_linux_imgact_try(struct image_params *iparams);
785
786static int
787exec_linux_imgact_try(struct image_params *imgp)
788{
789    const char *head = (const char *)imgp->image_header;
790    int error = -1;
791
792    /*
793     * The interpreter for shell scripts run from a linux binary needs
794     * to be located in /compat/linux if possible in order to recursively
795     * maintain linux path emulation.
796     */
797    if (((const short *)head)[0] == SHELLMAGIC) {
798	    /*
799	     * Run our normal shell image activator.  If it succeeds attempt
800	     * to use the alternate path for the interpreter.  If an alternate
801	     * path is found, use our stringspace to store it.
802	     */
803	    if ((error = exec_shell_imgact(imgp)) == 0) {
804		    char *rpath = NULL;
805
806		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
807			imgp->interpreter_name, &rpath, 0);
808		    if (rpath != imgp->interpreter_name) {
809			    int len = strlen(rpath) + 1;
810
811			    if (len <= MAXSHELLCMDLEN) {
812				    memcpy(imgp->interpreter_name, rpath, len);
813			    }
814			    free(rpath, M_TEMP);
815		    }
816	    }
817    }
818    return(error);
819}
820
821/*
822 * exec_setregs may initialize some registers differently than Linux
823 * does, thus potentially confusing Linux binaries. If necessary, we
824 * override the exec_setregs default(s) here.
825 */
826static void
827exec_linux_setregs(struct thread *td, u_long entry,
828		   u_long stack, u_long ps_strings)
829{
830	struct pcb *pcb = td->td_pcb;
831
832	exec_setregs(td, entry, stack, ps_strings);
833
834	/* Linux sets %gs to 0, we default to _udatasel */
835	pcb->pcb_gs = 0; load_gs(0);
836}
837
838struct sysentvec linux_sysvec = {
839	LINUX_SYS_MAXSYSCALL,
840	linux_sysent,
841	0xff,
842	LINUX_SIGTBLSZ,
843	bsd_to_linux_signal,
844	ELAST + 1,
845	bsd_to_linux_errno,
846	translate_traps,
847	linux_fixup,
848	linux_sendsig,
849	linux_sigcode,
850	&linux_szsigcode,
851	linux_prepsyscall,
852	"Linux a.out",
853	linux_aout_coredump,
854	exec_linux_imgact_try,
855	LINUX_MINSIGSTKSZ,
856	PAGE_SIZE,
857	VM_MIN_ADDRESS,
858	VM_MAXUSER_ADDRESS,
859	USRSTACK,
860	PS_STRINGS,
861	VM_PROT_ALL,
862	exec_copyout_strings,
863	exec_linux_setregs
864};
865
866struct sysentvec elf_linux_sysvec = {
867	LINUX_SYS_MAXSYSCALL,
868	linux_sysent,
869	0xff,
870	LINUX_SIGTBLSZ,
871	bsd_to_linux_signal,
872	ELAST + 1,
873	bsd_to_linux_errno,
874	translate_traps,
875	elf_linux_fixup,
876	linux_sendsig,
877	linux_sigcode,
878	&linux_szsigcode,
879	linux_prepsyscall,
880	"Linux ELF",
881	elf32_coredump,
882	exec_linux_imgact_try,
883	LINUX_MINSIGSTKSZ,
884	PAGE_SIZE,
885	VM_MIN_ADDRESS,
886	VM_MAXUSER_ADDRESS,
887	USRSTACK,
888	PS_STRINGS,
889	VM_PROT_ALL,
890	exec_copyout_strings,
891	exec_linux_setregs
892};
893
894static Elf32_Brandinfo linux_brand = {
895					ELFOSABI_LINUX,
896					EM_386,
897					"Linux",
898					"/compat/linux",
899					"/lib/ld-linux.so.1",
900					&elf_linux_sysvec
901				 };
902
903static Elf32_Brandinfo linux_glibc2brand = {
904					ELFOSABI_LINUX,
905					EM_386,
906					"Linux",
907					"/compat/linux",
908					"/lib/ld-linux.so.2",
909					&elf_linux_sysvec
910				 };
911
912Elf32_Brandinfo *linux_brandlist[] = {
913					&linux_brand,
914					&linux_glibc2brand,
915					NULL
916				};
917
918static int
919linux_elf_modevent(module_t mod, int type, void *data)
920{
921	Elf32_Brandinfo **brandinfo;
922	int error;
923	struct linux_ioctl_handler **lihp;
924
925	error = 0;
926
927	switch(type) {
928	case MOD_LOAD:
929		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
930		     ++brandinfo)
931			if (elf32_insert_brand_entry(*brandinfo) < 0)
932				error = EINVAL;
933		if (error == 0) {
934			SET_FOREACH(lihp, linux_ioctl_handler_set)
935				linux_ioctl_register_handler(*lihp);
936			if (bootverbose)
937				printf("Linux ELF exec handler installed\n");
938		} else
939			printf("cannot insert Linux ELF brand handler\n");
940		break;
941	case MOD_UNLOAD:
942		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
943		     ++brandinfo)
944			if (elf32_brand_inuse(*brandinfo))
945				error = EBUSY;
946		if (error == 0) {
947			for (brandinfo = &linux_brandlist[0];
948			     *brandinfo != NULL; ++brandinfo)
949				if (elf32_remove_brand_entry(*brandinfo) < 0)
950					error = EINVAL;
951		}
952		if (error == 0) {
953			SET_FOREACH(lihp, linux_ioctl_handler_set)
954				linux_ioctl_unregister_handler(*lihp);
955			if (bootverbose)
956				printf("Linux ELF exec handler removed\n");
957			linux_mib_destroy();
958		} else
959			printf("Could not deinstall ELF interpreter entry\n");
960		break;
961	default:
962		break;
963	}
964	return error;
965}
966
967static moduledata_t linux_elf_mod = {
968	"linuxelf",
969	linux_elf_modevent,
970	0
971};
972
973DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
974