linux_sysvec.c revision 124092
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 124092 2004-01-03 02:02:26Z davidxu $");
31
32/* XXX we use functions that might not exist. */
33#include "opt_compat.h"
34
35#ifndef COMPAT_43
36#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/exec.h>
42#include <sys/imgact.h>
43#include <sys/imgact_aout.h>
44#include <sys/imgact_elf.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/module.h>
49#include <sys/mutex.h>
50#include <sys/proc.h>
51#include <sys/signalvar.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysent.h>
54#include <sys/sysproto.h>
55#include <sys/user.h>
56#include <sys/vnode.h>
57
58#include <vm/vm.h>
59#include <vm/pmap.h>
60#include <vm/vm_extern.h>
61#include <vm/vm_map.h>
62#include <vm/vm_object.h>
63#include <vm/vm_page.h>
64#include <vm/vm_param.h>
65
66#include <machine/cpu.h>
67#include <machine/md_var.h>
68
69#include <i386/linux/linux.h>
70#include <i386/linux/linux_proto.h>
71#include <compat/linux/linux_mib.h>
72#include <compat/linux/linux_signal.h>
73#include <compat/linux/linux_util.h>
74
/*
 * Module identity: the "linux" module is version 1 and declares
 * dependencies on the sysvmsg, sysvsem and sysvshm modules.
 */
MODULE_VERSION(linux, 1);
MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
MODULE_DEPEND(linux, sysvshm, 1, 1, 1);

/* Malloc type M_LINUX, used for Linux emulation allocations. */
MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");

/*
 * The two characters "#!" as they appear when the first two bytes of
 * an image header are read as a single short; the value depends on
 * the host byte order.  Used by exec_linux_imgact_try().
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHELLMAGIC      0x2123 /* #! */
#else
#define SHELLMAGIC      0x2321
#endif

/*
 * Allow the sendsig functions to use the ldebug() facility
 * even though they are not syscalls themselves. Map them
 * to syscall 0. This is slightly less bogus than using
 * ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0

/* Signal trampoline code and its size; both are defined elsewhere. */
extern char linux_sigcode[];
extern int linux_szsigcode;

/* Linux system call table; defined elsewhere. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];

/* Linker set of ioctl handlers (un)registered by linux_elf_modevent(). */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations for the sysentvec callbacks defined below. */
static int	linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static int	elf_linux_fixup(register_t **stack_base,
		    struct image_params *iparams);
static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
		    caddr_t *params);
static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
		    u_long code);
static void	exec_linux_setregs(struct thread *td, u_long entry,
				   u_long stack, u_long ps_strings);
114
115/*
116 * Linux syscalls return negative errno's, we do positive and map them
117 */
118static int bsd_to_linux_errno[ELAST + 1] = {
119	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
120	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
121	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
122	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
123	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
124	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
125	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
126	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
127	-6, -6, -43, -42, -75, -6, -84
128};
129
130int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
131	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
132	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
133	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
134	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
135	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
136	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
137	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
138	0, LINUX_SIGUSR1, LINUX_SIGUSR2
139};
140
141int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
142	SIGHUP, SIGINT, SIGQUIT, SIGILL,
143	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
144	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
145	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
146	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
147	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
148	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
149	SIGIO, SIGURG, SIGSYS
150};
151
/* Value reported to Linux when a BSD trap code has no translation. */
#define LINUX_T_UNKNOWN  255

/*
 * BSD trap code -> Linux trap number, indexed by the BSD trap code.
 * Slots without a translation hold LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	/*  0                */ LINUX_T_UNKNOWN,
	/*  1  T_PRIVINFLT   */ 6,
	/*  2                */ LINUX_T_UNKNOWN,
	/*  3  T_BPTFLT      */ 3,
	/*  4                */ LINUX_T_UNKNOWN,
	/*  5                */ LINUX_T_UNKNOWN,
	/*  6  T_ARITHTRAP   */ 16,
	/*  7  T_ASTFLT      */ 254,
	/*  8                */ LINUX_T_UNKNOWN,
	/*  9  T_PROTFLT     */ 13,
	/* 10  T_TRCTRAP     */ 1,
	/* 11                */ LINUX_T_UNKNOWN,
	/* 12  T_PAGEFLT     */ 14,
	/* 13                */ LINUX_T_UNKNOWN,
	/* 14  T_ALIGNFLT    */ 17,
	/* 15                */ LINUX_T_UNKNOWN,
	/* 16                */ LINUX_T_UNKNOWN,
	/* 17                */ LINUX_T_UNKNOWN,
	/* 18  T_DIVIDE      */ 0,
	/* 19  T_NMI         */ 2,
	/* 20  T_OFLOW       */ 4,
	/* 21  T_BOUND       */ 5,
	/* 22  T_DNA         */ 7,
	/* 23  T_DOUBLEFLT   */ 8,
	/* 24  T_FPOPFLT     */ 9,
	/* 25  T_TSSFLT      */ 10,
	/* 26  T_SEGNPFLT    */ 11,
	/* 27  T_STKFLT      */ 12,
	/* 28  T_MCHK        */ 18,
	/* 29  T_XMMFLT      */ 19,
	/* 30  T_RESERVED    */ 15
};

/*
 * Translate a BSD trap code.  Codes beyond the end of the table yield
 * LINUX_T_UNKNOWN instead of indexing out of bounds.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) >= sizeof(_bsd_to_linux_trapcode) / \
	sizeof(_bsd_to_linux_trapcode[0]) ? \
     LINUX_T_UNKNOWN : \
     _bsd_to_linux_trapcode[(code)])
190
191/*
192 * If FreeBSD & Linux have a difference of opinion about what a trap
193 * means, deal with it here.
194 *
195 * MPSAFE
196 */
197static int
198translate_traps(int signal, int trap_code)
199{
200	if (signal != SIGBUS)
201		return signal;
202	switch (trap_code) {
203	case T_PROTFLT:
204	case T_TSSFLT:
205	case T_DOUBLEFLT:
206	case T_PAGEFLT:
207		return SIGSEGV;
208	default:
209		return signal;
210	}
211}
212
213static int
214linux_fixup(register_t **stack_base, struct image_params *imgp)
215{
216	register_t *argv, *envp;
217
218	argv = *stack_base;
219	envp = *stack_base + (imgp->argc + 1);
220	(*stack_base)--;
221	**stack_base = (intptr_t)(void *)envp;
222	(*stack_base)--;
223	**stack_base = (intptr_t)(void *)argv;
224	(*stack_base)--;
225	**stack_base = imgp->argc;
226	return 0;
227}
228
229static int
230elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
231{
232	Elf32_Auxargs *args;
233	register_t *pos;
234
235	KASSERT(curthread->td_proc == imgp->proc &&
236	    (curthread->td_proc->p_flag & P_SA) == 0,
237	    ("unsafe elf_linux_fixup(), should be curproc"));
238	args = (Elf32_Auxargs *)imgp->auxargs;
239	pos = *stack_base + (imgp->argc + imgp->envc + 2);
240
241	if (args->trace)
242		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
243	if (args->execfd != -1)
244		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
245	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
246	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
247	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
248	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
249	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
250	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
251	AUXARGS_ENTRY(pos, AT_BASE, args->base);
252	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
253	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
254	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
255	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
256	AUXARGS_ENTRY(pos, AT_NULL, 0);
257
258	free(imgp->auxargs, M_TEMP);
259	imgp->auxargs = NULL;
260
261	(*stack_base)--;
262	**stack_base = (register_t)imgp->argc;
263	return 0;
264}
265
266extern int _ucodesel, _udatasel;
267extern unsigned long linux_sznonrtsigcode;
268
269static void
270linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
271{
272	struct thread *td = curthread;
273	struct proc *p = td->td_proc;
274	struct sigacts *psp;
275	struct trapframe *regs;
276	struct l_rt_sigframe *fp, frame;
277	int oonstack;
278
279	PROC_LOCK_ASSERT(p, MA_OWNED);
280	psp = p->p_sigacts;
281	mtx_assert(&psp->ps_mtx, MA_OWNED);
282	regs = td->td_frame;
283	oonstack = sigonstack(regs->tf_esp);
284
285#ifdef DEBUG
286	if (ldebug(rt_sendsig))
287		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
288		    catcher, sig, (void*)mask, code);
289#endif
290	/*
291	 * Allocate space for the signal handler context.
292	 */
293	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
294	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
295		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
296		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
297	} else
298		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
299	mtx_unlock(&psp->ps_mtx);
300
301	/*
302	 * Build the argument list for the signal handler.
303	 */
304	if (p->p_sysent->sv_sigtbl)
305		if (sig <= p->p_sysent->sv_sigsize)
306			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
307
308	bzero(&frame, sizeof(frame));
309
310	frame.sf_handler = catcher;
311	frame.sf_sig = sig;
312	frame.sf_siginfo = &fp->sf_si;
313	frame.sf_ucontext = &fp->sf_sc;
314
315	/* Fill in POSIX parts */
316	frame.sf_si.lsi_signo = sig;
317	frame.sf_si.lsi_code = code;
318	frame.sf_si.lsi_addr = (void *)regs->tf_err;
319
320	/*
321	 * Build the signal context to be used by sigreturn.
322	 */
323	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
324	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
325
326	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
327	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
328	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
329	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
330	PROC_UNLOCK(p);
331
332	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
333
334	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
335	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
336	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
337	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
338	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
339	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
340	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
341	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
342	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
343	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
344	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
345	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
346	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
347	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
348	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
349	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
350	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
351	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
352	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
353
354#ifdef DEBUG
355	if (ldebug(rt_sendsig))
356		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
357		    frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp,
358		    p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
359#endif
360
361	if (copyout(&frame, fp, sizeof(frame)) != 0) {
362		/*
363		 * Process has trashed its stack; give it an illegal
364		 * instruction to halt it in its tracks.
365		 */
366#ifdef DEBUG
367		if (ldebug(rt_sendsig))
368			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
369			    fp, oonstack);
370#endif
371		PROC_LOCK(p);
372		sigexit(td, SIGILL);
373	}
374
375	/*
376	 * Build context to run handler in.
377	 */
378	regs->tf_esp = (int)fp;
379	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
380	    linux_sznonrtsigcode;
381	regs->tf_eflags &= ~(PSL_T | PSL_VM);
382	regs->tf_cs = _ucodesel;
383	regs->tf_ds = _udatasel;
384	regs->tf_es = _udatasel;
385	regs->tf_fs = _udatasel;
386	regs->tf_ss = _udatasel;
387	PROC_LOCK(p);
388	mtx_lock(&psp->ps_mtx);
389}
390
391
/*
 * Send a signal to the current thread using a classic (non-RT) Linux
 * sigframe.
 *
 * Handlers installed with SA_SIGINFO are handed to linux_rt_sendsig()
 * instead.  Otherwise a struct l_sigframe holding the handler, the
 * translated signal number, the saved register context and the saved
 * signal mask is copied to the user stack (or the alternate signal
 * stack), and the trapframe is rewritten so that the thread resumes in
 * the signal trampoline.  The trampoline is expected to end by calling
 * sigreturn (below), which restores the signal mask, stack and
 * registers from the frame.
 *
 * Locking: entered with the proc lock and the sigacts mutex held; both
 * are dropped around the frame construction and copyout, and both are
 * re-acquired before returning.
 */
static void
linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct sigacts *psp;
	struct trapframe *regs;
	struct l_sigframe *fp, frame;
	l_sigset_t lmask;
	int oonstack, i;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		linux_rt_sendsig(catcher, sig, mask, code);
		return;
	}

	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

#ifdef DEBUG
	if (ldebug(sendsig))
		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
		    catcher, sig, (void*)mask, code);
#endif

	/*
	 * Allocate space for the signal handler context: top of the
	 * alternate stack if it is enabled, not already in use and
	 * requested for this signal; otherwise just below the current
	 * user stack pointer.
	 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
	} else
		fp = (struct l_sigframe *)regs->tf_esp - 1;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Build the argument list for the signal handler.
	 * The signal number is translated to its Linux value first.
	 */
	if (p->p_sysent->sv_sigtbl)
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	bzero(&frame, sizeof(frame));

	frame.sf_handler = catcher;
	frame.sf_sig = sig;

	bsd_to_linux_sigset(mask, &lmask);

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	frame.sf_sc.sc_mask   = lmask.__bits[0];
	frame.sf_sc.sc_gs     = rgs();
	frame.sf_sc.sc_fs     = regs->tf_fs;
	frame.sf_sc.sc_es     = regs->tf_es;
	frame.sf_sc.sc_ds     = regs->tf_ds;
	frame.sf_sc.sc_edi    = regs->tf_edi;
	frame.sf_sc.sc_esi    = regs->tf_esi;
	frame.sf_sc.sc_ebp    = regs->tf_ebp;
	frame.sf_sc.sc_ebx    = regs->tf_ebx;
	frame.sf_sc.sc_edx    = regs->tf_edx;
	frame.sf_sc.sc_ecx    = regs->tf_ecx;
	frame.sf_sc.sc_eax    = regs->tf_eax;
	frame.sf_sc.sc_eip    = regs->tf_eip;
	frame.sf_sc.sc_cs     = regs->tf_cs;
	frame.sf_sc.sc_eflags = regs->tf_eflags;
	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
	frame.sf_sc.sc_ss     = regs->tf_ss;
	frame.sf_sc.sc_err    = regs->tf_err;
	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);

	/* Mask words beyond the first travel in sf_extramask. */
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		frame.sf_extramask[i] = lmask.__bits[i+1];

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.  Execution resumes at the
	 * address PS_STRINGS minus the trampoline size, with the user
	 * code/data selectors loaded and the trace/VM86 flags cleared.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~(PSL_T | PSL_VM);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Restore the lock state the caller expects. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
506
/*
 * System call to clean up state after a signal has been taken.
 * Restores the signal mask and register state from the l_sigframe
 * left on the user stack by linux_sendsig() (above) and returns to
 * the saved pc and psl.  The frame is user-controlled, so the saved
 * eflags and %cs are validated first to make sure the user has not
 * modified them to gain improper privileges or to cause a machine
 * fault.
 *
 * Returns EFAULT if the frame cannot be read, EINVAL if a security
 * check fails (a bad %cs additionally posts SIGBUS), and EJUSTRETURN
 * on success.
 */
int
linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_sigframe frame;
	struct trapframe *regs;
	l_sigset_t lmask;
	int eflags, i;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(sigreturn))
		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
#endif
	/*
	 * The trampoline code hands us the sigframe.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
		return (EFAULT);

	/*
	 * Check for security violations: only the bits in PSL_USERCHANGE
	 * may differ between the saved and the current eflags.
	 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = frame.sf_sc.sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
		trapsignal(td, SIGBUS, T_PROTFLT);
		return(EINVAL);
	}

	/* Reassemble the Linux mask from sc_mask plus the extra words. */
	lmask.__bits[0] = frame.sf_sc.sc_mask;
	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
		lmask.__bits[i+1] = frame.sf_extramask[i];
	PROC_LOCK(p);
	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
	/* Strip any signals that may never be blocked. */
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context.
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = frame.sf_sc.sc_fs;
	regs->tf_es     = frame.sf_sc.sc_es;
	regs->tf_ds     = frame.sf_sc.sc_ds;
	regs->tf_edi    = frame.sf_sc.sc_edi;
	regs->tf_esi    = frame.sf_sc.sc_esi;
	regs->tf_ebp    = frame.sf_sc.sc_ebp;
	regs->tf_ebx    = frame.sf_sc.sc_ebx;
	regs->tf_edx    = frame.sf_sc.sc_edx;
	regs->tf_ecx    = frame.sf_sc.sc_ecx;
	regs->tf_eax    = frame.sf_sc.sc_eax;
	regs->tf_eip    = frame.sf_sc.sc_eip;
	regs->tf_cs     = frame.sf_sc.sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
	regs->tf_ss     = frame.sf_sc.sc_ss;

	/* Registers were set directly; do not let a return value clobber them. */
	return (EJUSTRETURN);
}
600
/*
 * System call to clean up state after a signal has been taken with an
 * RT frame.  Restores the signal mask, register state and alternate
 * signal stack settings from the l_ucontext left by linux_rt_sendsig()
 * (above) and returns to the saved pc and psl.  The context is
 * user-controlled, so the saved eflags and %cs are validated first to
 * make sure the user has not modified them to gain improper privileges
 * or to cause a machine fault.
 *
 * Returns EFAULT if the context cannot be read, EINVAL if a security
 * check fails (a bad %cs additionally posts SIGBUS), and EJUSTRETURN
 * on success.
 */
int
linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
{
	struct proc *p = td->td_proc;
	struct l_ucontext uc;
	struct l_sigcontext *context;
	l_stack_t *lss;
	stack_t ss;
	struct trapframe *regs;
	int eflags;

	regs = td->td_frame;

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
#endif
	/*
	 * The trampoline code hands us the ucontext.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
		return (EFAULT);

	context = &uc.uc_mcontext;

	/*
	 * Check for security violations: only the bits in PSL_USERCHANGE
	 * may differ between the saved and the current eflags.
	 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
	eflags = context->sc_eflags;
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.  The
	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
	 * sometimes set it there too.  tf_eflags is kept in the signal
	 * context during signal handling and there is no other place
	 * to remember it, so the PSL_RF bit may be corrupted by the
	 * signal handler without us knowing.  Corruption of the PSL_RF
	 * bit at worst causes one more or one less debugger trap, so
	 * allowing it is fairly harmless.
	 */
	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
		return(EINVAL);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
	if (!CS_SECURE(context->sc_cs)) {
		trapsignal(td, SIGBUS, T_PROTFLT);
		return(EINVAL);
	}

	PROC_LOCK(p);
	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
	/* Strip any signals that may never be blocked. */
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);

	/*
	 * Restore signal context
	 */
	/* %gs was restored by the trampoline. */
	regs->tf_fs     = context->sc_fs;
	regs->tf_es     = context->sc_es;
	regs->tf_ds     = context->sc_ds;
	regs->tf_edi    = context->sc_edi;
	regs->tf_esi    = context->sc_esi;
	regs->tf_ebp    = context->sc_ebp;
	regs->tf_ebx    = context->sc_ebx;
	regs->tf_edx    = context->sc_edx;
	regs->tf_ecx    = context->sc_ecx;
	regs->tf_eax    = context->sc_eax;
	regs->tf_eip    = context->sc_eip;
	regs->tf_cs     = context->sc_cs;
	regs->tf_eflags = eflags;
	regs->tf_esp    = context->sc_esp_at_signal;
	regs->tf_ss     = context->sc_ss;

	/*
	 * call sigaltstack & ignore results..
	 * The saved Linux stack description is converted to a native
	 * stack_t first; a failure here does not fail the sigreturn.
	 */
	lss = &uc.uc_stack;
	ss.ss_sp = lss->ss_sp;
	ss.ss_size = lss->ss_size;
	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);

#ifdef DEBUG
	if (ldebug(rt_sigreturn))
		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
#endif
	(void)kern_sigaltstack(td, &ss, NULL);

	/* Registers were set directly; do not let a return value clobber them. */
	return (EJUSTRETURN);
}
710
711/*
712 * MPSAFE
713 */
714static void
715linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
716{
717	args[0] = tf->tf_ebx;
718	args[1] = tf->tf_ecx;
719	args[2] = tf->tf_edx;
720	args[3] = tf->tf_esi;
721	args[4] = tf->tf_edi;
722	args[5] = tf->tf_ebp;	/* Unconfirmed */
723	*params = NULL;		/* no copyin */
724}
725
726
727
/*
 * Dump core, into a file named as described in the comments for
 * expand_name(), unless the process was setuid/setgid.
 *
 * The image written to `vp' consists of three consecutive parts:
 *   1. a uarea + kernel stack sized header holding a copy of the user
 *      area and the thread's trapframe,
 *   2. the data segment (vm_dsize pages starting at vm_daddr),
 *   3. the stack segment (vm_ssize pages ending at USRSTACK).
 *
 * Returns EFAULT when the image would not fit under `limit', otherwise
 * the result of the last write attempted (0 on success).
 */
static int
linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
{
	struct proc *p = td->td_proc;
	struct ucred *cred = td->td_ucred;
	struct vmspace *vm = p->p_vmspace;
	char *tempuser;
	int error;

	/* Refuse to dump if header + data + stack would reach the limit. */
	if (ctob((uarea_pages + kstack_pages) +
	    vm->vm_dsize + vm->vm_ssize) >= limit)
		return (EFAULT);
	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
	    M_WAITOK | M_ZERO);
	/*
	 * NOTE(review): malloc(9) with M_WAITOK is documented not to
	 * return NULL, so this check looks unreachable -- confirm.
	 */
	if (tempuser == NULL)
		return (ENOMEM);
	PROC_LOCK(p);
	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
	PROC_UNLOCK(p);
	bcopy(p->p_uarea, tempuser, sizeof(struct user));
	/*
	 * Place the trapframe at the same offset within the kernel stack
	 * part of the header as it has within the real kernel stack.
	 */
	bcopy(td->td_frame,
	    tempuser + ctob(uarea_pages) +
	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
	    sizeof(struct trapframe));
	/* Part 1: header, from kernel memory, at file offset 0. */
	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
	    ctob(uarea_pages + kstack_pages),
	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
	    (int *)NULL, td);
	free(tempuser, M_TEMP);
	/* Part 2: data segment, from user memory, right after the header. */
	if (error == 0)
		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
		    (int)ctob(vm->vm_dsize),
		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
	/* Part 3: stack segment, from user memory, after the data. */
	if (error == 0)
		error = vn_rdwr_inchunks(UIO_WRITE, vp,
		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
		    round_page(ctob(vm->vm_ssize)),
		    (off_t)ctob(uarea_pages + kstack_pages) +
			ctob(vm->vm_dsize), UIO_USERSPACE,
		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
	return (error);
}
775/*
776 * If a linux binary is exec'ing something, try this image activator
777 * first.  We override standard shell script execution in order to
778 * be able to modify the interpreter path.  We only do this if a linux
779 * binary is doing the exec, so we do not create an EXEC module for it.
780 */
781static int	exec_linux_imgact_try(struct image_params *iparams);
782
783static int
784exec_linux_imgact_try(struct image_params *imgp)
785{
786    const char *head = (const char *)imgp->image_header;
787    int error = -1;
788
789    /*
790     * The interpreter for shell scripts run from a linux binary needs
791     * to be located in /compat/linux if possible in order to recursively
792     * maintain linux path emulation.
793     */
794    if (((const short *)head)[0] == SHELLMAGIC) {
795	    /*
796	     * Run our normal shell image activator.  If it succeeds attempt
797	     * to use the alternate path for the interpreter.  If an alternate
798	     * path is found, use our stringspace to store it.
799	     */
800	    if ((error = exec_shell_imgact(imgp)) == 0) {
801		    char *rpath = NULL;
802
803		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
804			imgp->interpreter_name, &rpath, 0);
805		    if (rpath != imgp->interpreter_name) {
806			    int len = strlen(rpath) + 1;
807
808			    if (len <= MAXSHELLCMDLEN) {
809				    memcpy(imgp->interpreter_name, rpath, len);
810			    }
811			    free(rpath, M_TEMP);
812		    }
813	    }
814    }
815    return(error);
816}
817
818/*
819 * exec_setregs may initialize some registers differently than Linux
820 * does, thus potentially confusing Linux binaries. If necessary, we
821 * override the exec_setregs default(s) here.
822 */
823static void
824exec_linux_setregs(struct thread *td, u_long entry,
825		   u_long stack, u_long ps_strings)
826{
827	struct pcb *pcb = td->td_pcb;
828
829	exec_setregs(td, entry, stack, ps_strings);
830
831	/* Linux sets %gs to 0, we default to _udatasel */
832	pcb->pcb_gs = 0; load_gs(0);
833}
834
/*
 * System entry vector for Linux a.out binaries.
 * The initializer is positional; the per-entry notes describe the
 * values supplied.  NOTE(review): confirm the slot order against
 * struct sysentvec in <sys/sysent.h>.
 */
struct sysentvec linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* number of system calls */
	linux_sysent,		/* system call table */
	0xff,			/* mask applied to the syscall number -- TODO confirm */
	LINUX_SIGTBLSZ,		/* size of the signal translation table */
	bsd_to_linux_signal,	/* signal translation table */
	ELAST + 1,		/* size of the errno translation table */
	bsd_to_linux_errno,	/* errno translation table */
	translate_traps,	/* trap-to-signal translation */
	linux_fixup,		/* a.out stack fixup */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* size of the signal trampoline */
	linux_prepsyscall,	/* syscall argument fetch */
	"Linux a.out",		/* name of this binary type */
	linux_aout_coredump,	/* core dump writer */
	exec_linux_imgact_try,	/* image activator tried first on exec */
	LINUX_MINSIGSTKSZ,	/* minimum signal stack size */
	PAGE_SIZE,		/* page size */
	VM_MIN_ADDRESS,		/* lowest user address */
	VM_MAXUSER_ADDRESS,	/* highest user address */
	USRSTACK,		/* user stack location */
	PS_STRINGS,		/* ps_strings location */
	VM_PROT_ALL,		/* stack protection */
	exec_copyout_strings,	/* argument/environment copyout */
	exec_linux_setregs,	/* initial register setup */
	NULL			/* last slot left unset */
};
863
/*
 * System entry vector for Linux ELF binaries.  Identical to
 * linux_sysvec except for the stack fixup routine, the name and the
 * core dump writer.
 * NOTE(review): the initializer is positional; confirm the slot order
 * against struct sysentvec in <sys/sysent.h>.
 */
struct sysentvec elf_linux_sysvec = {
	LINUX_SYS_MAXSYSCALL,	/* number of system calls */
	linux_sysent,		/* system call table */
	0xff,			/* mask applied to the syscall number -- TODO confirm */
	LINUX_SIGTBLSZ,		/* size of the signal translation table */
	bsd_to_linux_signal,	/* signal translation table */
	ELAST + 1,		/* size of the errno translation table */
	bsd_to_linux_errno,	/* errno translation table */
	translate_traps,	/* trap-to-signal translation */
	elf_linux_fixup,	/* ELF stack fixup (writes the aux vector) */
	linux_sendsig,		/* signal delivery */
	linux_sigcode,		/* signal trampoline code */
	&linux_szsigcode,	/* size of the signal trampoline */
	linux_prepsyscall,	/* syscall argument fetch */
	"Linux ELF",		/* name of this binary type */
	elf32_coredump,		/* core dump writer */
	exec_linux_imgact_try,	/* image activator tried first on exec */
	LINUX_MINSIGSTKSZ,	/* minimum signal stack size */
	PAGE_SIZE,		/* page size */
	VM_MIN_ADDRESS,		/* lowest user address */
	VM_MAXUSER_ADDRESS,	/* highest user address */
	USRSTACK,		/* user stack location */
	PS_STRINGS,		/* ps_strings location */
	VM_PROT_ALL,		/* stack protection */
	exec_copyout_strings,	/* argument/environment copyout */
	exec_linux_setregs,	/* initial register setup */
	NULL			/* last slot left unset */
};
892
893static Elf32_Brandinfo linux_brand = {
894					ELFOSABI_LINUX,
895					EM_386,
896					"Linux",
897					"/compat/linux",
898					"/lib/ld-linux.so.1",
899					&elf_linux_sysvec,
900					NULL,
901				 };
902
903static Elf32_Brandinfo linux_glibc2brand = {
904					ELFOSABI_LINUX,
905					EM_386,
906					"Linux",
907					"/compat/linux",
908					"/lib/ld-linux.so.2",
909					&elf_linux_sysvec,
910					NULL,
911				 };
912
913Elf32_Brandinfo *linux_brandlist[] = {
914					&linux_brand,
915					&linux_glibc2brand,
916					NULL
917				};
918
919static int
920linux_elf_modevent(module_t mod, int type, void *data)
921{
922	Elf32_Brandinfo **brandinfo;
923	int error;
924	struct linux_ioctl_handler **lihp;
925
926	error = 0;
927
928	switch(type) {
929	case MOD_LOAD:
930		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
931		     ++brandinfo)
932			if (elf32_insert_brand_entry(*brandinfo) < 0)
933				error = EINVAL;
934		if (error == 0) {
935			SET_FOREACH(lihp, linux_ioctl_handler_set)
936				linux_ioctl_register_handler(*lihp);
937			if (bootverbose)
938				printf("Linux ELF exec handler installed\n");
939		} else
940			printf("cannot insert Linux ELF brand handler\n");
941		break;
942	case MOD_UNLOAD:
943		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
944		     ++brandinfo)
945			if (elf32_brand_inuse(*brandinfo))
946				error = EBUSY;
947		if (error == 0) {
948			for (brandinfo = &linux_brandlist[0];
949			     *brandinfo != NULL; ++brandinfo)
950				if (elf32_remove_brand_entry(*brandinfo) < 0)
951					error = EINVAL;
952		}
953		if (error == 0) {
954			SET_FOREACH(lihp, linux_ioctl_handler_set)
955				linux_ioctl_unregister_handler(*lihp);
956			if (bootverbose)
957				printf("Linux ELF exec handler removed\n");
958			linux_mib_destroy();
959		} else
960			printf("Could not deinstall ELF interpreter entry\n");
961		break;
962	default:
963		break;
964	}
965	return error;
966}
967
/*
 * Module description: name, event handler, and a third slot left 0
 * (presumably the handler's private data argument -- confirm against
 * moduledata_t).
 */
static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

/* Register the module with the exec subsystem (SI_SUB_EXEC). */
DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
975