linux_sysvec.c revision 156843
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_sysvec.c 156843 2006-03-18 18:24:38Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/exec.h>
35#include <sys/imgact.h>
36#include <sys/imgact_aout.h>
37#include <sys/imgact_elf.h>
38#include <sys/kernel.h>
39#include <sys/lock.h>
40#include <sys/malloc.h>
41#include <sys/module.h>
42#include <sys/mutex.h>
43#include <sys/proc.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysent.h>
47#include <sys/sysproto.h>
48#include <sys/vnode.h>
49
50#include <vm/vm.h>
51#include <vm/pmap.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_map.h>
54#include <vm/vm_object.h>
55#include <vm/vm_page.h>
56#include <vm/vm_param.h>
57
58#include <machine/cpu.h>
59#include <machine/md_var.h>
60#include <machine/pcb.h>
61
62#include <i386/linux/linux.h>
63#include <i386/linux/linux_proto.h>
64#include <compat/linux/linux_mib.h>
65#include <compat/linux/linux_signal.h>
66#include <compat/linux/linux_util.h>
67
68MODULE_VERSION(linux, 1);
69
70MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
71
72#if BYTE_ORDER == LITTLE_ENDIAN
73#define SHELLMAGIC      0x2123 /* #! */
74#else
75#define SHELLMAGIC      0x2321
76#endif
77
78/*
79 * Allow the sendsig functions to use the ldebug() facility
80 * even though they are not syscalls themselves. Map them
81 * to syscall 0. This is slightly less bogus than using
82 * ldebug(sigreturn).
83 */
84#define	LINUX_SYS_linux_rt_sendsig	0
85#define	LINUX_SYS_linux_sendsig		0
86
87#define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
88#define	__LINUX_NPXCW__		0x37f
89
90extern char linux_sigcode[];
91extern int linux_szsigcode;
92
93extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
94
95SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
96
97static int	linux_fixup(register_t **stack_base,
98		    struct image_params *iparams);
99static int	elf_linux_fixup(register_t **stack_base,
100		    struct image_params *iparams);
101static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
102		    caddr_t *params);
103static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
104static void	exec_linux_setregs(struct thread *td, u_long entry,
105				   u_long stack, u_long ps_strings);
106
107/*
108 * Linux syscalls return negative errno's, we do positive and map them
109 */
110static int bsd_to_linux_errno[ELAST + 1] = {
111	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
112	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
113	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
114	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
115	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
116	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
117	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
118	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
119	-6, -6, -43, -42, -75, -6, -84
120};
121
122int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
123	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
124	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
125	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
126	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
127	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
128	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
129	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
130	0, LINUX_SIGUSR1, LINUX_SIGUSR2
131};
132
133int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
134	SIGHUP, SIGINT, SIGQUIT, SIGILL,
135	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
136	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
137	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
138	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
139	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
140	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
141	SIGIO, SIGURG, SIGSYS
142};
143
/* Value reported to Linux when a native trap code has no counterpart. */
#define LINUX_T_UNKNOWN  255

/*
 * Native trap code -> Linux trap number, indexed by the native code.
 * The table is never written, so it is declared const.
 */
static const int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Translate a native trap code, yielding LINUX_T_UNKNOWN for anything
 * outside the table.  The comparison against the (unsigned) element
 * count converts a negative int 'code' to a large unsigned value, so
 * negative codes are rejected as well.  'code' is evaluated more than
 * once; do not pass an expression with side effects.
 */
#define bsd_to_linux_trapcode(code) \
    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
     _bsd_to_linux_trapcode[(code)]: \
     LINUX_T_UNKNOWN)
182
183/*
184 * If FreeBSD & Linux have a difference of opinion about what a trap
185 * means, deal with it here.
186 *
187 * MPSAFE
188 */
189static int
190translate_traps(int signal, int trap_code)
191{
192	if (signal != SIGBUS)
193		return signal;
194	switch (trap_code) {
195	case T_PROTFLT:
196	case T_TSSFLT:
197	case T_DOUBLEFLT:
198	case T_PAGEFLT:
199		return SIGSEGV;
200	default:
201		return signal;
202	}
203}
204
205static int
206linux_fixup(register_t **stack_base, struct image_params *imgp)
207{
208	register_t *argv, *envp;
209
210	argv = *stack_base;
211	envp = *stack_base + (imgp->args->argc + 1);
212	(*stack_base)--;
213	**stack_base = (intptr_t)(void *)envp;
214	(*stack_base)--;
215	**stack_base = (intptr_t)(void *)argv;
216	(*stack_base)--;
217	**stack_base = imgp->args->argc;
218	return 0;
219}
220
221static int
222elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
223{
224	Elf32_Auxargs *args;
225	register_t *pos;
226
227	KASSERT(curthread->td_proc == imgp->proc &&
228	    (curthread->td_proc->p_flag & P_SA) == 0,
229	    ("unsafe elf_linux_fixup(), should be curproc"));
230	args = (Elf32_Auxargs *)imgp->auxargs;
231	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
232
233	if (args->trace)
234		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
235	if (args->execfd != -1)
236		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
237	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
238	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
239	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
240	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
241	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
242	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
243	AUXARGS_ENTRY(pos, AT_BASE, args->base);
244	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
245	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
246	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
247	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
248	AUXARGS_ENTRY(pos, AT_NULL, 0);
249
250	free(imgp->auxargs, M_TEMP);
251	imgp->auxargs = NULL;
252
253	(*stack_base)--;
254	**stack_base = (register_t)imgp->args->argc;
255	return 0;
256}
257
/* Selectors loaded into %cs and the data segment registers for handlers. */
extern int _ucodesel;
extern int _udatasel;
/* Offset added to the trampoline base to reach the SA_SIGINFO entry. */
extern unsigned long linux_sznonrtsigcode;
260
261static void
262linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
263{
264	struct thread *td = curthread;
265	struct proc *p = td->td_proc;
266	struct sigacts *psp;
267	struct trapframe *regs;
268	struct l_rt_sigframe *fp, frame;
269	int sig, code;
270	int oonstack;
271
272	sig = ksi->ksi_signo;
273	code = ksi->ksi_code;
274	PROC_LOCK_ASSERT(p, MA_OWNED);
275	psp = p->p_sigacts;
276	mtx_assert(&psp->ps_mtx, MA_OWNED);
277	regs = td->td_frame;
278	oonstack = sigonstack(regs->tf_esp);
279
280#ifdef DEBUG
281	if (ldebug(rt_sendsig))
282		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
283		    catcher, sig, (void*)mask, code);
284#endif
285	/*
286	 * Allocate space for the signal handler context.
287	 */
288	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
289	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
290		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
291		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
292	} else
293		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
294	mtx_unlock(&psp->ps_mtx);
295
296	/*
297	 * Build the argument list for the signal handler.
298	 */
299	if (p->p_sysent->sv_sigtbl)
300		if (sig <= p->p_sysent->sv_sigsize)
301			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
302
303	bzero(&frame, sizeof(frame));
304
305	frame.sf_handler = catcher;
306	frame.sf_sig = sig;
307	frame.sf_siginfo = &fp->sf_si;
308	frame.sf_ucontext = &fp->sf_sc;
309
310	/* Fill in POSIX parts */
311	frame.sf_si.lsi_signo = sig;
312	frame.sf_si.lsi_code = code;
313	frame.sf_si.lsi_addr = ksi->ksi_addr;
314
315	/*
316	 * Build the signal context to be used by sigreturn.
317	 */
318	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
319	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
320
321	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
322	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
323	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
324	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
325	PROC_UNLOCK(p);
326
327	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
328
329	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
330	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
331	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
332	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
333	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
334	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
335	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
336	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
337	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
338	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
339	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
340	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
341	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
342	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
343	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
344	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
345	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
346	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
347	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
348
349#ifdef DEBUG
350	if (ldebug(rt_sendsig))
351		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
352		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
353		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
354#endif
355
356	if (copyout(&frame, fp, sizeof(frame)) != 0) {
357		/*
358		 * Process has trashed its stack; give it an illegal
359		 * instruction to halt it in its tracks.
360		 */
361#ifdef DEBUG
362		if (ldebug(rt_sendsig))
363			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
364			    fp, oonstack);
365#endif
366		PROC_LOCK(p);
367		sigexit(td, SIGILL);
368	}
369
370	/*
371	 * Build context to run handler in.
372	 */
373	regs->tf_esp = (int)fp;
374	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
375	    linux_sznonrtsigcode;
376	regs->tf_eflags &= ~(PSL_T | PSL_VM);
377	regs->tf_cs = _ucodesel;
378	regs->tf_ds = _udatasel;
379	regs->tf_es = _udatasel;
380	regs->tf_fs = _udatasel;
381	regs->tf_ss = _udatasel;
382	PROC_LOCK(p);
383	mtx_lock(&psp->ps_mtx);
384}
385
386
387/*
388 * Send an interrupt to process.
389 *
390 * Stack is set up to allow sigcode stored
391 * in u. to call routine, followed by kcall
392 * to sigreturn routine below.  After sigreturn
393 * resets the signal mask, the stack, and the
394 * frame pointer, it returns to the user
395 * specified pc, psl.
396 */
397static void
398linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
399{
400	struct thread *td = curthread;
401	struct proc *p = td->td_proc;
402	struct sigacts *psp;
403	struct trapframe *regs;
404	struct l_sigframe *fp, frame;
405	l_sigset_t lmask;
406	int sig, code;
407	int oonstack, i;
408
409	PROC_LOCK_ASSERT(p, MA_OWNED);
410	psp = p->p_sigacts;
411	sig = ksi->ksi_signo;
412	code = ksi->ksi_code;
413	mtx_assert(&psp->ps_mtx, MA_OWNED);
414	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
415		/* Signal handler installed with SA_SIGINFO. */
416		linux_rt_sendsig(catcher, ksi, mask);
417		return;
418	}
419	regs = td->td_frame;
420	oonstack = sigonstack(regs->tf_esp);
421
422#ifdef DEBUG
423	if (ldebug(sendsig))
424		printf(ARGS(sendsig, "%p, %d, %p, %u"),
425		    catcher, sig, (void*)mask, code);
426#endif
427
428	/*
429	 * Allocate space for the signal handler context.
430	 */
431	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
432	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
433		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
434		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
435	} else
436		fp = (struct l_sigframe *)regs->tf_esp - 1;
437	mtx_unlock(&psp->ps_mtx);
438	PROC_UNLOCK(p);
439
440	/*
441	 * Build the argument list for the signal handler.
442	 */
443	if (p->p_sysent->sv_sigtbl)
444		if (sig <= p->p_sysent->sv_sigsize)
445			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
446
447	bzero(&frame, sizeof(frame));
448
449	frame.sf_handler = catcher;
450	frame.sf_sig = sig;
451
452	bsd_to_linux_sigset(mask, &lmask);
453
454	/*
455	 * Build the signal context to be used by sigreturn.
456	 */
457	frame.sf_sc.sc_mask   = lmask.__bits[0];
458	frame.sf_sc.sc_gs     = rgs();
459	frame.sf_sc.sc_fs     = regs->tf_fs;
460	frame.sf_sc.sc_es     = regs->tf_es;
461	frame.sf_sc.sc_ds     = regs->tf_ds;
462	frame.sf_sc.sc_edi    = regs->tf_edi;
463	frame.sf_sc.sc_esi    = regs->tf_esi;
464	frame.sf_sc.sc_ebp    = regs->tf_ebp;
465	frame.sf_sc.sc_ebx    = regs->tf_ebx;
466	frame.sf_sc.sc_edx    = regs->tf_edx;
467	frame.sf_sc.sc_ecx    = regs->tf_ecx;
468	frame.sf_sc.sc_eax    = regs->tf_eax;
469	frame.sf_sc.sc_eip    = regs->tf_eip;
470	frame.sf_sc.sc_cs     = regs->tf_cs;
471	frame.sf_sc.sc_eflags = regs->tf_eflags;
472	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
473	frame.sf_sc.sc_ss     = regs->tf_ss;
474	frame.sf_sc.sc_err    = regs->tf_err;
475	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
476
477	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
478		frame.sf_extramask[i] = lmask.__bits[i+1];
479
480	if (copyout(&frame, fp, sizeof(frame)) != 0) {
481		/*
482		 * Process has trashed its stack; give it an illegal
483		 * instruction to halt it in its tracks.
484		 */
485		PROC_LOCK(p);
486		sigexit(td, SIGILL);
487	}
488
489	/*
490	 * Build context to run handler in.
491	 */
492	regs->tf_esp = (int)fp;
493	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
494	regs->tf_eflags &= ~(PSL_T | PSL_VM);
495	regs->tf_cs = _ucodesel;
496	regs->tf_ds = _udatasel;
497	regs->tf_es = _udatasel;
498	regs->tf_fs = _udatasel;
499	regs->tf_ss = _udatasel;
500	PROC_LOCK(p);
501	mtx_lock(&psp->ps_mtx);
502}
503
504/*
505 * System call to cleanup state after a signal
506 * has been taken.  Reset signal mask and
507 * stack state from context left by sendsig (above).
508 * Return to previous pc and psl as specified by
509 * context left by sendsig. Check carefully to
510 * make sure that the user has not modified the
511 * psl to gain improper privileges or to cause
512 * a machine fault.
513 */
514int
515linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
516{
517	struct proc *p = td->td_proc;
518	struct l_sigframe frame;
519	struct trapframe *regs;
520	l_sigset_t lmask;
521	int eflags, i;
522	ksiginfo_t ksi;
523
524	regs = td->td_frame;
525
526#ifdef DEBUG
527	if (ldebug(sigreturn))
528		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
529#endif
530	/*
531	 * The trampoline code hands us the sigframe.
532	 * It is unsafe to keep track of it ourselves, in the event that a
533	 * program jumps out of a signal handler.
534	 */
535	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
536		return (EFAULT);
537
538	/*
539	 * Check for security violations.
540	 */
541#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
542	eflags = frame.sf_sc.sc_eflags;
543	/*
544	 * XXX do allow users to change the privileged flag PSL_RF.  The
545	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
546	 * sometimes set it there too.  tf_eflags is kept in the signal
547	 * context during signal handling and there is no other place
548	 * to remember it, so the PSL_RF bit may be corrupted by the
549	 * signal handler without us knowing.  Corruption of the PSL_RF
550	 * bit at worst causes one more or one less debugger trap, so
551	 * allowing it is fairly harmless.
552	 */
553	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
554		return(EINVAL);
555
556	/*
557	 * Don't allow users to load a valid privileged %cs.  Let the
558	 * hardware check for invalid selectors, excess privilege in
559	 * other selectors, invalid %eip's and invalid %esp's.
560	 */
561#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
562	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
563		ksiginfo_init_trap(&ksi);
564		ksi.ksi_signo = SIGBUS;
565		ksi.ksi_code = BUS_OBJERR;
566		ksi.ksi_trapno = T_PROTFLT;
567		ksi.ksi_addr = (void *)regs->tf_eip;
568		trapsignal(td, &ksi);
569		return(EINVAL);
570	}
571
572	lmask.__bits[0] = frame.sf_sc.sc_mask;
573	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
574		lmask.__bits[i+1] = frame.sf_extramask[i];
575	PROC_LOCK(p);
576	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
577	SIG_CANTMASK(td->td_sigmask);
578	signotify(td);
579	PROC_UNLOCK(p);
580
581	/*
582	 * Restore signal context.
583	 */
584	/* %gs was restored by the trampoline. */
585	regs->tf_fs     = frame.sf_sc.sc_fs;
586	regs->tf_es     = frame.sf_sc.sc_es;
587	regs->tf_ds     = frame.sf_sc.sc_ds;
588	regs->tf_edi    = frame.sf_sc.sc_edi;
589	regs->tf_esi    = frame.sf_sc.sc_esi;
590	regs->tf_ebp    = frame.sf_sc.sc_ebp;
591	regs->tf_ebx    = frame.sf_sc.sc_ebx;
592	regs->tf_edx    = frame.sf_sc.sc_edx;
593	regs->tf_ecx    = frame.sf_sc.sc_ecx;
594	regs->tf_eax    = frame.sf_sc.sc_eax;
595	regs->tf_eip    = frame.sf_sc.sc_eip;
596	regs->tf_cs     = frame.sf_sc.sc_cs;
597	regs->tf_eflags = eflags;
598	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
599	regs->tf_ss     = frame.sf_sc.sc_ss;
600
601	return (EJUSTRETURN);
602}
603
604/*
605 * System call to cleanup state after a signal
606 * has been taken.  Reset signal mask and
607 * stack state from context left by rt_sendsig (above).
608 * Return to previous pc and psl as specified by
609 * context left by sendsig. Check carefully to
610 * make sure that the user has not modified the
611 * psl to gain improper privileges or to cause
612 * a machine fault.
613 */
614int
615linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
616{
617	struct proc *p = td->td_proc;
618	struct l_ucontext uc;
619	struct l_sigcontext *context;
620	l_stack_t *lss;
621	stack_t ss;
622	struct trapframe *regs;
623	int eflags;
624	ksiginfo_t ksi;
625
626	regs = td->td_frame;
627
628#ifdef DEBUG
629	if (ldebug(rt_sigreturn))
630		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
631#endif
632	/*
633	 * The trampoline code hands us the ucontext.
634	 * It is unsafe to keep track of it ourselves, in the event that a
635	 * program jumps out of a signal handler.
636	 */
637	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
638		return (EFAULT);
639
640	context = &uc.uc_mcontext;
641
642	/*
643	 * Check for security violations.
644	 */
645#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
646	eflags = context->sc_eflags;
647	/*
648	 * XXX do allow users to change the privileged flag PSL_RF.  The
649	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
650	 * sometimes set it there too.  tf_eflags is kept in the signal
651	 * context during signal handling and there is no other place
652	 * to remember it, so the PSL_RF bit may be corrupted by the
653	 * signal handler without us knowing.  Corruption of the PSL_RF
654	 * bit at worst causes one more or one less debugger trap, so
655	 * allowing it is fairly harmless.
656	 */
657	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
658		return(EINVAL);
659
660	/*
661	 * Don't allow users to load a valid privileged %cs.  Let the
662	 * hardware check for invalid selectors, excess privilege in
663	 * other selectors, invalid %eip's and invalid %esp's.
664	 */
665#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
666	if (!CS_SECURE(context->sc_cs)) {
667		ksiginfo_init_trap(&ksi);
668		ksi.ksi_signo = SIGBUS;
669		ksi.ksi_code = BUS_OBJERR;
670		ksi.ksi_trapno = T_PROTFLT;
671		ksi.ksi_addr = (void *)regs->tf_eip;
672		trapsignal(td, &ksi);
673		return(EINVAL);
674	}
675
676	PROC_LOCK(p);
677	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
678	SIG_CANTMASK(td->td_sigmask);
679	signotify(td);
680	PROC_UNLOCK(p);
681
682	/*
683	 * Restore signal context
684	 */
685	/* %gs was restored by the trampoline. */
686	regs->tf_fs     = context->sc_fs;
687	regs->tf_es     = context->sc_es;
688	regs->tf_ds     = context->sc_ds;
689	regs->tf_edi    = context->sc_edi;
690	regs->tf_esi    = context->sc_esi;
691	regs->tf_ebp    = context->sc_ebp;
692	regs->tf_ebx    = context->sc_ebx;
693	regs->tf_edx    = context->sc_edx;
694	regs->tf_ecx    = context->sc_ecx;
695	regs->tf_eax    = context->sc_eax;
696	regs->tf_eip    = context->sc_eip;
697	regs->tf_cs     = context->sc_cs;
698	regs->tf_eflags = eflags;
699	regs->tf_esp    = context->sc_esp_at_signal;
700	regs->tf_ss     = context->sc_ss;
701
702	/*
703	 * call sigaltstack & ignore results..
704	 */
705	lss = &uc.uc_stack;
706	ss.ss_sp = lss->ss_sp;
707	ss.ss_size = lss->ss_size;
708	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
709
710#ifdef DEBUG
711	if (ldebug(rt_sigreturn))
712		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
713		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
714#endif
715	(void)kern_sigaltstack(td, &ss, NULL);
716
717	return (EJUSTRETURN);
718}
719
720/*
721 * MPSAFE
722 */
723static void
724linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
725{
726	args[0] = tf->tf_ebx;
727	args[1] = tf->tf_ecx;
728	args[2] = tf->tf_edx;
729	args[3] = tf->tf_esi;
730	args[4] = tf->tf_edi;
731	args[5] = tf->tf_ebp;	/* Unconfirmed */
732	*params = NULL;		/* no copyin */
733}
734
735/*
736 * If a linux binary is exec'ing something, try this image activator
737 * first.  We override standard shell script execution in order to
738 * be able to modify the interpreter path.  We only do this if a linux
739 * binary is doing the exec, so we do not create an EXEC module for it.
740 */
741static int	exec_linux_imgact_try(struct image_params *iparams);
742
743static int
744exec_linux_imgact_try(struct image_params *imgp)
745{
746    const char *head = (const char *)imgp->image_header;
747    char *rpath;
748    int error = -1, len;
749
750    /*
751     * The interpreter for shell scripts run from a linux binary needs
752     * to be located in /compat/linux if possible in order to recursively
753     * maintain linux path emulation.
754     */
755    if (((const short *)head)[0] == SHELLMAGIC) {
756	    /*
757	     * Run our normal shell image activator.  If it succeeds attempt
758	     * to use the alternate path for the interpreter.  If an alternate
759	     * path is found, use our stringspace to store it.
760	     */
761	    if ((error = exec_shell_imgact(imgp)) == 0) {
762		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
763			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
764		    if (rpath != NULL) {
765			    len = strlen(rpath) + 1;
766
767			    if (len <= MAXSHELLCMDLEN) {
768				    memcpy(imgp->interpreter_name, rpath, len);
769			    }
770			    free(rpath, M_TEMP);
771		    }
772	    }
773    }
774    return(error);
775}
776
777/*
778 * exec_setregs may initialize some registers differently than Linux
779 * does, thus potentially confusing Linux binaries. If necessary, we
780 * override the exec_setregs default(s) here.
781 */
782static void
783exec_linux_setregs(struct thread *td, u_long entry,
784		   u_long stack, u_long ps_strings)
785{
786	static const u_short control = __LINUX_NPXCW__;
787	struct pcb *pcb = td->td_pcb;
788
789	exec_setregs(td, entry, stack, ps_strings);
790
791	/* Linux sets %gs to 0, we default to _udatasel */
792	pcb->pcb_gs = 0; load_gs(0);
793
794	/* Linux sets the i387 to extended precision. */
795	fldcw(&control);
796}
797
798struct sysentvec linux_sysvec = {
799	LINUX_SYS_MAXSYSCALL,
800	linux_sysent,
801	0xff,
802	LINUX_SIGTBLSZ,
803	bsd_to_linux_signal,
804	ELAST + 1,
805	bsd_to_linux_errno,
806	translate_traps,
807	linux_fixup,
808	linux_sendsig,
809	linux_sigcode,
810	&linux_szsigcode,
811	linux_prepsyscall,
812	"Linux a.out",
813	NULL,
814	exec_linux_imgact_try,
815	LINUX_MINSIGSTKSZ,
816	PAGE_SIZE,
817	VM_MIN_ADDRESS,
818	VM_MAXUSER_ADDRESS,
819	USRSTACK,
820	PS_STRINGS,
821	VM_PROT_ALL,
822	exec_copyout_strings,
823	exec_linux_setregs,
824	NULL
825};
826
827struct sysentvec elf_linux_sysvec = {
828	LINUX_SYS_MAXSYSCALL,
829	linux_sysent,
830	0xff,
831	LINUX_SIGTBLSZ,
832	bsd_to_linux_signal,
833	ELAST + 1,
834	bsd_to_linux_errno,
835	translate_traps,
836	elf_linux_fixup,
837	linux_sendsig,
838	linux_sigcode,
839	&linux_szsigcode,
840	linux_prepsyscall,
841	"Linux ELF",
842	elf32_coredump,
843	exec_linux_imgact_try,
844	LINUX_MINSIGSTKSZ,
845	PAGE_SIZE,
846	VM_MIN_ADDRESS,
847	VM_MAXUSER_ADDRESS,
848	USRSTACK,
849	PS_STRINGS,
850	VM_PROT_ALL,
851	exec_copyout_strings,
852	exec_linux_setregs,
853	NULL
854};
855
856static Elf32_Brandinfo linux_brand = {
857					ELFOSABI_LINUX,
858					EM_386,
859					"Linux",
860					"/compat/linux",
861					"/lib/ld-linux.so.1",
862					&elf_linux_sysvec,
863					NULL,
864					BI_CAN_EXEC_DYN,
865				 };
866
867static Elf32_Brandinfo linux_glibc2brand = {
868					ELFOSABI_LINUX,
869					EM_386,
870					"Linux",
871					"/compat/linux",
872					"/lib/ld-linux.so.2",
873					&elf_linux_sysvec,
874					NULL,
875					BI_CAN_EXEC_DYN,
876				 };
877
878Elf32_Brandinfo *linux_brandlist[] = {
879					&linux_brand,
880					&linux_glibc2brand,
881					NULL
882				};
883
884static int
885linux_elf_modevent(module_t mod, int type, void *data)
886{
887	Elf32_Brandinfo **brandinfo;
888	int error;
889	struct linux_ioctl_handler **lihp;
890
891	error = 0;
892
893	switch(type) {
894	case MOD_LOAD:
895		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
896		     ++brandinfo)
897			if (elf32_insert_brand_entry(*brandinfo) < 0)
898				error = EINVAL;
899		if (error == 0) {
900			SET_FOREACH(lihp, linux_ioctl_handler_set)
901				linux_ioctl_register_handler(*lihp);
902			if (bootverbose)
903				printf("Linux ELF exec handler installed\n");
904		} else
905			printf("cannot insert Linux ELF brand handler\n");
906		break;
907	case MOD_UNLOAD:
908		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
909		     ++brandinfo)
910			if (elf32_brand_inuse(*brandinfo))
911				error = EBUSY;
912		if (error == 0) {
913			for (brandinfo = &linux_brandlist[0];
914			     *brandinfo != NULL; ++brandinfo)
915				if (elf32_remove_brand_entry(*brandinfo) < 0)
916					error = EINVAL;
917		}
918		if (error == 0) {
919			SET_FOREACH(lihp, linux_ioctl_handler_set)
920				linux_ioctl_unregister_handler(*lihp);
921			if (bootverbose)
922				printf("Linux ELF exec handler removed\n");
923		} else
924			printf("Could not deinstall ELF interpreter entry\n");
925		break;
926	default:
927		return EOPNOTSUPP;
928	}
929	return error;
930}
931
932static moduledata_t linux_elf_mod = {
933	"linuxelf",
934	linux_elf_modevent,
935	0
936};
937
938DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
939