linux_sysvec.c revision 103086
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_sysvec.c 103086 2002-09-07 22:31:44Z peter $
29 */
30
31/* XXX we use functions that might not exist. */
32#include "opt_compat.h"
33
34#ifndef COMPAT_43
35#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
36#endif
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/imgact.h>
41#include <sys/imgact_aout.h>
42#include <sys/imgact_elf.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/signalvar.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysent.h>
50#include <sys/sysproto.h>
51#include <sys/user.h>
52#include <sys/vnode.h>
53
54#include <vm/vm.h>
55#include <vm/vm_param.h>
56#include <vm/vm_page.h>
57#include <vm/vm_extern.h>
58#include <sys/exec.h>
59#include <sys/kernel.h>
60#include <sys/module.h>
61#include <machine/cpu.h>
62#include <machine/md_var.h>
63#include <sys/mutex.h>
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_object.h>
70
71#include <i386/linux/linux.h>
72#include <i386/linux/linux_proto.h>
73#include <compat/linux/linux_signal.h>
74#include <compat/linux/linux_util.h>
75
76MODULE_VERSION(linux, 1);
77MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
78MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
79MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
80
81MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
82
/*
 * The two characters "#!" read as a single 16-bit word; the value
 * depends on the byte order of the host.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define	SHELLMAGIC	0x2123	/* #! */
#else
#define	SHELLMAGIC	0x2321
#endif

/*
 * The sendsig routines are not system calls, but they still want to use
 * the ldebug() facility.  Give them pseudo syscall numbers of 0, which
 * is slightly less bogus than borrowing ldebug(sigreturn).
 */
#define	LINUX_SYS_linux_rt_sendsig	0
#define	LINUX_SYS_linux_sendsig		0
97
98extern char linux_sigcode[];
99extern int linux_szsigcode;
100
101extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
102
103SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
104
105static int	linux_fixup(register_t **stack_base,
106		    struct image_params *iparams);
107static int	elf_linux_fixup(register_t **stack_base,
108		    struct image_params *iparams);
109static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
110		    caddr_t *params);
111static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
112		    u_long code);
113
114/*
115 * Linux syscalls return negative errno's, we do positive and map them
116 */
117static int bsd_to_linux_errno[ELAST + 1] = {
118  	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
119 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
120 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
121 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
122 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
123	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
124	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
125	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
126  	-6, -6, -43, -42, -75, -6, -84
127};
128
129int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
130	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
131	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
132	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, 0,
133	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
134	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
135	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
136	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
137	0, LINUX_SIGUSR1, LINUX_SIGUSR2
138};
139
140int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
141	SIGHUP, SIGINT, SIGQUIT, SIGILL,
142	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
143	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
144	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
145	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
146	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
147	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
148	SIGIO, SIGURG, 0
149};
150
/* Value reported when a FreeBSD trap code has no entry in the table. */
#define	LINUX_T_UNKNOWN		255

/*
 * Trap number translation, indexed by FreeBSD trap code.  The value is
 * what the sendsig routines store in sc_trapno of the Linux signal
 * context.  Use bsd_to_linux_trapcode() rather than indexing directly.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0 */
	6,			/*  1 T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2 */
	3,			/*  3 T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4 */
	LINUX_T_UNKNOWN,	/*  5 */
	16,			/*  6 T_ARITHTRAP */
	254,			/*  7 T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8 */
	13,			/*  9 T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Bounds-checked lookup: codes past the end of the table translate to
 * LINUX_T_UNKNOWN.  Note that the argument is evaluated twice.
 */
#define	bsd_to_linux_trapcode(code)					\
	((code) < sizeof(_bsd_to_linux_trapcode) /			\
	    sizeof(*_bsd_to_linux_trapcode) ?				\
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
189
190/*
191 * If FreeBSD & Linux have a difference of opinion about what a trap
192 * means, deal with it here.
193 *
194 * MPSAFE
195 */
196static int
197translate_traps(int signal, int trap_code)
198{
199	if (signal != SIGBUS)
200		return signal;
201	switch (trap_code) {
202	case T_PROTFLT:
203	case T_TSSFLT:
204	case T_DOUBLEFLT:
205	case T_PAGEFLT:
206		return SIGSEGV;
207	default:
208		return signal;
209	}
210}
211
212static int
213linux_fixup(register_t **stack_base, struct image_params *imgp)
214{
215	register_t *argv, *envp;
216
217	argv = *stack_base;
218	envp = *stack_base + (imgp->argc + 1);
219	(*stack_base)--;
220	**stack_base = (intptr_t)(void *)envp;
221	(*stack_base)--;
222	**stack_base = (intptr_t)(void *)argv;
223	(*stack_base)--;
224	**stack_base = imgp->argc;
225	return 0;
226}
227
228static int
229elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
230{
231	Elf32_Auxargs *args = (Elf32_Auxargs *)imgp->auxargs;
232	register_t *pos;
233
234	pos = *stack_base + (imgp->argc + imgp->envc + 2);
235
236	if (args->trace) {
237		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
238	}
239	if (args->execfd != -1) {
240		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
241	}
242	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
243	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
244	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
245	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
246	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
247	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
248	AUXARGS_ENTRY(pos, AT_BASE, args->base);
249	PROC_LOCK(imgp->proc);
250	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
251	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
252	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
253	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
254	PROC_UNLOCK(imgp->proc);
255	AUXARGS_ENTRY(pos, AT_NULL, 0);
256
257	free(imgp->auxargs, M_TEMP);
258	imgp->auxargs = NULL;
259
260	(*stack_base)--;
261	**stack_base = (long)imgp->argc;
262	return 0;
263}
264
265extern int _ucodesel, _udatasel;
266extern unsigned long linux_sznonrtsigcode;
267
268static void
269linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
270{
271	register struct thread *td = curthread;
272	register struct proc *p = td->td_proc;
273	register struct trapframe *regs;
274	struct l_rt_sigframe *fp, frame;
275	int oonstack;
276
277	PROC_LOCK_ASSERT(p, MA_OWNED);
278	regs = td->td_frame;
279	oonstack = sigonstack(regs->tf_esp);
280
281#ifdef DEBUG
282	if (ldebug(rt_sendsig))
283		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
284		    catcher, sig, (void*)mask, code);
285#endif
286	/*
287	 * Allocate space for the signal handler context.
288	 */
289	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
290	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
291		fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp +
292		    p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe));
293	} else
294		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
295	PROC_UNLOCK(p);
296
297	/*
298	 * Build the argument list for the signal handler.
299	 */
300	if (p->p_sysent->sv_sigtbl)
301		if (sig <= p->p_sysent->sv_sigsize)
302			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
303
304	frame.sf_handler = catcher;
305	frame.sf_sig = sig;
306	frame.sf_siginfo = &fp->sf_si;
307	frame.sf_ucontext = &fp->sf_sc;
308
309	/* Fill in POSIX parts */
310	frame.sf_si.lsi_signo = sig;
311	frame.sf_si.lsi_code = code;
312	frame.sf_si.lsi_addr = (void *)regs->tf_err;
313
314	/*
315	 * Build the signal context to be used by sigreturn.
316	 */
317	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
318	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
319
320	PROC_LOCK(p);
321	frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp;
322	frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size;
323	frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
324	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
325	PROC_UNLOCK(p);
326
327	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
328
329	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
330	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
331	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
332	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
333	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
334	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
335	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
336	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
337	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
338	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
339	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
340	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
341	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
342	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
343	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
344	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
345	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
346	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
347	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
348
349#ifdef DEBUG
350	if (ldebug(rt_sendsig))
351		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
352		    frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp,
353		    p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
354#endif
355
356	if (copyout(&frame, fp, sizeof(frame)) != 0) {
357		/*
358		 * Process has trashed its stack; give it an illegal
359		 * instruction to halt it in its tracks.
360		 */
361#ifdef DEBUG
362		if (ldebug(rt_sendsig))
363			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
364			    fp, oonstack);
365#endif
366		PROC_LOCK(p);
367		sigexit(td, SIGILL);
368	}
369
370	/*
371	 * Build context to run handler in.
372	 */
373	regs->tf_esp = (int)fp;
374	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
375	    linux_sznonrtsigcode;
376	regs->tf_eflags &= ~(PSL_T | PSL_VM);
377	regs->tf_cs = _ucodesel;
378	regs->tf_ds = _udatasel;
379	regs->tf_es = _udatasel;
380	regs->tf_fs = _udatasel;
381	regs->tf_ss = _udatasel;
382	PROC_LOCK(p);
383}
384
385
386/*
387 * Send an interrupt to process.
388 *
389 * Stack is set up to allow sigcode stored
390 * in u. to call routine, followed by kcall
391 * to sigreturn routine below.  After sigreturn
392 * resets the signal mask, the stack, and the
393 * frame pointer, it returns to the user
394 * specified pc, psl.
395 */
396
397static void
398linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
399{
400	register struct thread *td = curthread;
401	register struct proc *p = td->td_proc;
402	register struct trapframe *regs;
403	struct l_sigframe *fp, frame;
404	l_sigset_t lmask;
405	int oonstack, i;
406
407	PROC_LOCK_ASSERT(p, MA_OWNED);
408	if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) {
409		/* Signal handler installed with SA_SIGINFO. */
410		linux_rt_sendsig(catcher, sig, mask, code);
411		return;
412	}
413
414	regs = td->td_frame;
415	oonstack = sigonstack(regs->tf_esp);
416
417#ifdef DEBUG
418	if (ldebug(sendsig))
419		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
420		    catcher, sig, (void*)mask, code);
421#endif
422
423	/*
424	 * Allocate space for the signal handler context.
425	 */
426	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
427	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
428		fp = (struct l_sigframe *)(p->p_sigstk.ss_sp +
429		    p->p_sigstk.ss_size - sizeof(struct l_sigframe));
430	} else
431		fp = (struct l_sigframe *)regs->tf_esp - 1;
432	PROC_UNLOCK(p);
433
434	/*
435	 * Build the argument list for the signal handler.
436	 */
437	if (p->p_sysent->sv_sigtbl)
438		if (sig <= p->p_sysent->sv_sigsize)
439			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
440
441	frame.sf_handler = catcher;
442	frame.sf_sig = sig;
443
444	bsd_to_linux_sigset(mask, &lmask);
445
446	/*
447	 * Build the signal context to be used by sigreturn.
448	 */
449	frame.sf_sc.sc_mask   = lmask.__bits[0];
450	frame.sf_sc.sc_gs     = rgs();
451	frame.sf_sc.sc_fs     = regs->tf_fs;
452	frame.sf_sc.sc_es     = regs->tf_es;
453	frame.sf_sc.sc_ds     = regs->tf_ds;
454	frame.sf_sc.sc_edi    = regs->tf_edi;
455	frame.sf_sc.sc_esi    = regs->tf_esi;
456	frame.sf_sc.sc_ebp    = regs->tf_ebp;
457	frame.sf_sc.sc_ebx    = regs->tf_ebx;
458	frame.sf_sc.sc_edx    = regs->tf_edx;
459	frame.sf_sc.sc_ecx    = regs->tf_ecx;
460	frame.sf_sc.sc_eax    = regs->tf_eax;
461	frame.sf_sc.sc_eip    = regs->tf_eip;
462	frame.sf_sc.sc_cs     = regs->tf_cs;
463	frame.sf_sc.sc_eflags = regs->tf_eflags;
464	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
465	frame.sf_sc.sc_ss     = regs->tf_ss;
466	frame.sf_sc.sc_err    = regs->tf_err;
467	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
468
469	bzero(&frame.sf_fpstate, sizeof(struct l_fpstate));
470
471	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
472		frame.sf_extramask[i] = lmask.__bits[i+1];
473
474	if (copyout(&frame, fp, sizeof(frame)) != 0) {
475		/*
476		 * Process has trashed its stack; give it an illegal
477		 * instruction to halt it in its tracks.
478		 */
479		PROC_LOCK(p);
480		sigexit(td, SIGILL);
481	}
482
483	/*
484	 * Build context to run handler in.
485	 */
486	regs->tf_esp = (int)fp;
487	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
488	regs->tf_eflags &= ~(PSL_T | PSL_VM);
489	regs->tf_cs = _ucodesel;
490	regs->tf_ds = _udatasel;
491	regs->tf_es = _udatasel;
492	regs->tf_fs = _udatasel;
493	regs->tf_ss = _udatasel;
494	PROC_LOCK(p);
495}
496
497/*
498 * System call to cleanup state after a signal
499 * has been taken.  Reset signal mask and
500 * stack state from context left by sendsig (above).
501 * Return to previous pc and psl as specified by
502 * context left by sendsig. Check carefully to
503 * make sure that the user has not modified the
504 * psl to gain improper privileges or to cause
505 * a machine fault.
506 */
507int
508linux_sigreturn(td, args)
509	struct thread *td;
510	struct linux_sigreturn_args *args;
511{
512	struct proc *p = td->td_proc;
513	struct l_sigframe frame;
514	register struct trapframe *regs;
515	l_sigset_t lmask;
516	int eflags, i;
517
518	regs = td->td_frame;
519
520#ifdef DEBUG
521	if (ldebug(sigreturn))
522		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
523#endif
524	/*
525	 * The trampoline code hands us the sigframe.
526	 * It is unsafe to keep track of it ourselves, in the event that a
527	 * program jumps out of a signal handler.
528	 */
529	if (copyin((caddr_t)args->sfp, &frame, sizeof(frame)) != 0)
530		return (EFAULT);
531
532	/*
533	 * Check for security violations.
534	 */
535#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
536	eflags = frame.sf_sc.sc_eflags;
537	/*
538	 * XXX do allow users to change the privileged flag PSL_RF.  The
539	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
540	 * sometimes set it there too.  tf_eflags is kept in the signal
541	 * context during signal handling and there is no other place
542	 * to remember it, so the PSL_RF bit may be corrupted by the
543	 * signal handler without us knowing.  Corruption of the PSL_RF
544	 * bit at worst causes one more or one less debugger trap, so
545	 * allowing it is fairly harmless.
546	 */
547	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
548    		return(EINVAL);
549	}
550
551	/*
552	 * Don't allow users to load a valid privileged %cs.  Let the
553	 * hardware check for invalid selectors, excess privilege in
554	 * other selectors, invalid %eip's and invalid %esp's.
555	 */
556#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
557	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
558		trapsignal(p, SIGBUS, T_PROTFLT);
559		return(EINVAL);
560	}
561
562	lmask.__bits[0] = frame.sf_sc.sc_mask;
563	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
564		lmask.__bits[i+1] = frame.sf_extramask[i];
565	PROC_LOCK(p);
566	linux_to_bsd_sigset(&lmask, &p->p_sigmask);
567	SIG_CANTMASK(p->p_sigmask);
568	signotify(p);
569	PROC_UNLOCK(p);
570
571	/*
572	 * Restore signal context.
573	 */
574	/* %gs was restored by the trampoline. */
575	regs->tf_fs     = frame.sf_sc.sc_fs;
576	regs->tf_es     = frame.sf_sc.sc_es;
577	regs->tf_ds     = frame.sf_sc.sc_ds;
578	regs->tf_edi    = frame.sf_sc.sc_edi;
579	regs->tf_esi    = frame.sf_sc.sc_esi;
580	regs->tf_ebp    = frame.sf_sc.sc_ebp;
581	regs->tf_ebx    = frame.sf_sc.sc_ebx;
582	regs->tf_edx    = frame.sf_sc.sc_edx;
583	regs->tf_ecx    = frame.sf_sc.sc_ecx;
584	regs->tf_eax    = frame.sf_sc.sc_eax;
585	regs->tf_eip    = frame.sf_sc.sc_eip;
586	regs->tf_cs     = frame.sf_sc.sc_cs;
587	regs->tf_eflags = eflags;
588	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
589	regs->tf_ss     = frame.sf_sc.sc_ss;
590
591	return (EJUSTRETURN);
592}
593
594/*
595 * System call to cleanup state after a signal
596 * has been taken.  Reset signal mask and
597 * stack state from context left by rt_sendsig (above).
598 * Return to previous pc and psl as specified by
599 * context left by sendsig. Check carefully to
600 * make sure that the user has not modified the
601 * psl to gain improper privileges or to cause
602 * a machine fault.
603 */
604int
605linux_rt_sigreturn(td, args)
606	struct thread *td;
607	struct linux_rt_sigreturn_args *args;
608{
609	struct proc *p = td->td_proc;
610	struct l_ucontext uc;
611	struct l_sigcontext *context;
612	l_stack_t *lss;
613	stack_t ss;
614	register struct trapframe *regs;
615	int eflags;
616
617	regs = td->td_frame;
618
619#ifdef DEBUG
620	if (ldebug(rt_sigreturn))
621		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
622#endif
623	/*
624	 * The trampoline code hands us the ucontext.
625	 * It is unsafe to keep track of it ourselves, in the event that a
626	 * program jumps out of a signal handler.
627	 */
628	if (copyin((caddr_t)args->ucp, &uc, sizeof(uc)) != 0)
629		return (EFAULT);
630
631	context = &uc.uc_mcontext;
632
633	/*
634	 * Check for security violations.
635	 */
636#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
637	eflags = context->sc_eflags;
638	/*
639	 * XXX do allow users to change the privileged flag PSL_RF.  The
640	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
641	 * sometimes set it there too.  tf_eflags is kept in the signal
642	 * context during signal handling and there is no other place
643	 * to remember it, so the PSL_RF bit may be corrupted by the
644	 * signal handler without us knowing.  Corruption of the PSL_RF
645	 * bit at worst causes one more or one less debugger trap, so
646	 * allowing it is fairly harmless.
647	 */
648	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
649    		return(EINVAL);
650	}
651
652	/*
653	 * Don't allow users to load a valid privileged %cs.  Let the
654	 * hardware check for invalid selectors, excess privilege in
655	 * other selectors, invalid %eip's and invalid %esp's.
656	 */
657#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
658	if (!CS_SECURE(context->sc_cs)) {
659		trapsignal(p, SIGBUS, T_PROTFLT);
660		return(EINVAL);
661	}
662
663	PROC_LOCK(p);
664	linux_to_bsd_sigset(&uc.uc_sigmask, &p->p_sigmask);
665	SIG_CANTMASK(p->p_sigmask);
666	signotify(p);
667	PROC_UNLOCK(p);
668
669	/*
670	 * Restore signal context
671	 */
672	/* %gs was restored by the trampoline. */
673	regs->tf_fs     = context->sc_fs;
674	regs->tf_es     = context->sc_es;
675	regs->tf_ds     = context->sc_ds;
676	regs->tf_edi    = context->sc_edi;
677	regs->tf_esi    = context->sc_esi;
678	regs->tf_ebp    = context->sc_ebp;
679	regs->tf_ebx    = context->sc_ebx;
680	regs->tf_edx    = context->sc_edx;
681	regs->tf_ecx    = context->sc_ecx;
682	regs->tf_eax    = context->sc_eax;
683	regs->tf_eip    = context->sc_eip;
684	regs->tf_cs     = context->sc_cs;
685	regs->tf_eflags = eflags;
686	regs->tf_esp    = context->sc_esp_at_signal;
687	regs->tf_ss     = context->sc_ss;
688
689	/*
690	 * call sigaltstack & ignore results..
691	 */
692	lss = &uc.uc_stack;
693	ss.ss_sp = lss->ss_sp;
694	ss.ss_size = lss->ss_size;
695	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
696
697#ifdef DEBUG
698	if (ldebug(rt_sigreturn))
699		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
700		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
701#endif
702	(void)kern_sigaltstack(td, &ss, NULL);
703
704	return (EJUSTRETURN);
705}
706
707/*
708 * MPSAFE
709 */
710static void
711linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
712{
713	args[0] = tf->tf_ebx;
714	args[1] = tf->tf_ecx;
715	args[2] = tf->tf_edx;
716	args[3] = tf->tf_esi;
717	args[4] = tf->tf_edi;
718	args[5] = tf->tf_ebp;	/* Unconfirmed */
719	*params = NULL;		/* no copyin */
720}
721
722
723
724/*
725 * Dump core, into a file named as described in the comments for
726 * expand_name(), unless the process was setuid/setgid.
727 */
728static int
729linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
730{
731	struct proc *p = td->td_proc;
732	struct ucred *cred = td->td_ucred;
733	struct vmspace *vm = p->p_vmspace;
734	char *tempuser;
735	int error;
736
737	if (ctob((uarea_pages + kstack_pages) +
738	    vm->vm_dsize + vm->vm_ssize) >= limit)
739		return (EFAULT);
740	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
741	    M_WAITOK | M_ZERO);
742	if (tempuser == NULL)
743		return (ENOMEM);
744	PROC_LOCK(p);
745	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
746	PROC_UNLOCK(p);
747	bcopy(p->p_uarea, tempuser, sizeof(struct user));
748	bcopy(td->td_frame,
749	    tempuser + ctob(uarea_pages) +
750	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
751	    sizeof(struct trapframe));
752	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
753	    ctob(uarea_pages + kstack_pages),
754	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
755	    (int *)NULL, td);
756	free(tempuser, M_TEMP);
757	if (error == 0)
758		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
759		    (int)ctob(vm->vm_dsize),
760		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
761		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
762	if (error == 0)
763		error = vn_rdwr_inchunks(UIO_WRITE, vp,
764		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
765		    round_page(ctob(vm->vm_ssize)),
766		    (off_t)ctob(uarea_pages + kstack_pages) +
767		        ctob(vm->vm_dsize), UIO_USERSPACE,
768		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
769	return (error);
770}
771/*
772 * If a linux binary is exec'ing something, try this image activator
773 * first.  We override standard shell script execution in order to
774 * be able to modify the interpreter path.  We only do this if a linux
775 * binary is doing the exec, so we do not create an EXEC module for it.
776 */
777static int	exec_linux_imgact_try(struct image_params *iparams);
778
779static int
780exec_linux_imgact_try(imgp)
781    struct image_params *imgp;
782{
783    const char *head = (const char *)imgp->image_header;
784    int error = -1;
785
786    /*
787     * The interpreter for shell scripts run from a linux binary needs
788     * to be located in /compat/linux if possible in order to recursively
789     * maintain linux path emulation.
790     */
791    if (((const short *)head)[0] == SHELLMAGIC) {
792	    /*
793	     * Run our normal shell image activator.  If it succeeds attempt
794	     * to use the alternate path for the interpreter.  If an alternate
795	     * path is found, use our stringspace to store it.
796	     */
797	    if ((error = exec_shell_imgact(imgp)) == 0) {
798		    char *rpath = NULL;
799
800		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
801			imgp->interpreter_name, &rpath, 0);
802		    if (rpath != imgp->interpreter_name) {
803			    int len = strlen(rpath) + 1;
804
805			    if (len <= MAXSHELLCMDLEN) {
806				    memcpy(imgp->interpreter_name, rpath, len);
807			    }
808			    free(rpath, M_TEMP);
809		    }
810	    }
811    }
812    return(error);
813}
814
815struct sysentvec linux_sysvec = {
816	LINUX_SYS_MAXSYSCALL,
817	linux_sysent,
818	0xff,
819	LINUX_SIGTBLSZ,
820	bsd_to_linux_signal,
821	ELAST + 1,
822	bsd_to_linux_errno,
823	translate_traps,
824	linux_fixup,
825	linux_sendsig,
826	linux_sigcode,
827	&linux_szsigcode,
828	linux_prepsyscall,
829	"Linux a.out",
830	linux_aout_coredump,
831	exec_linux_imgact_try,
832	LINUX_MINSIGSTKSZ,
833	PAGE_SIZE,
834	VM_MIN_ADDRESS,
835	VM_MAXUSER_ADDRESS,
836	USRSTACK,
837	PS_STRINGS,
838	VM_PROT_ALL,
839	exec_copyout_strings,
840	exec_setregs
841};
842
843struct sysentvec elf_linux_sysvec = {
844	LINUX_SYS_MAXSYSCALL,
845	linux_sysent,
846	0xff,
847	LINUX_SIGTBLSZ,
848	bsd_to_linux_signal,
849	ELAST + 1,
850	bsd_to_linux_errno,
851	translate_traps,
852	elf_linux_fixup,
853	linux_sendsig,
854	linux_sigcode,
855	&linux_szsigcode,
856	linux_prepsyscall,
857	"Linux ELF",
858	elf32_coredump,
859	exec_linux_imgact_try,
860	LINUX_MINSIGSTKSZ,
861	PAGE_SIZE,
862	VM_MIN_ADDRESS,
863	VM_MAXUSER_ADDRESS,
864	USRSTACK,
865	PS_STRINGS,
866	VM_PROT_ALL,
867	exec_copyout_strings,
868	exec_setregs
869};
870
871static Elf32_Brandinfo linux_brand = {
872					ELFOSABI_LINUX,
873					EM_386,
874					"Linux",
875					"/compat/linux",
876					"/lib/ld-linux.so.1",
877					&elf_linux_sysvec
878				 };
879
880static Elf32_Brandinfo linux_glibc2brand = {
881					ELFOSABI_LINUX,
882					EM_386,
883					"Linux",
884					"/compat/linux",
885					"/lib/ld-linux.so.2",
886					&elf_linux_sysvec
887				 };
888
889Elf32_Brandinfo *linux_brandlist[] = {
890					&linux_brand,
891					&linux_glibc2brand,
892					NULL
893				};
894
895static int
896linux_elf_modevent(module_t mod, int type, void *data)
897{
898	Elf32_Brandinfo **brandinfo;
899	int error;
900	struct linux_ioctl_handler **lihp;
901
902	error = 0;
903
904	switch(type) {
905	case MOD_LOAD:
906		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
907		     ++brandinfo)
908			if (elf32_insert_brand_entry(*brandinfo) < 0)
909				error = EINVAL;
910		if (error == 0) {
911			SET_FOREACH(lihp, linux_ioctl_handler_set)
912				linux_ioctl_register_handler(*lihp);
913			if (bootverbose)
914				printf("Linux ELF exec handler installed\n");
915		} else
916			printf("cannot insert Linux ELF brand handler\n");
917		break;
918	case MOD_UNLOAD:
919		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
920		     ++brandinfo)
921			if (elf32_brand_inuse(*brandinfo))
922				error = EBUSY;
923		if (error == 0) {
924			for (brandinfo = &linux_brandlist[0];
925			     *brandinfo != NULL; ++brandinfo)
926				if (elf32_remove_brand_entry(*brandinfo) < 0)
927					error = EINVAL;
928		}
929		if (error == 0) {
930			SET_FOREACH(lihp, linux_ioctl_handler_set)
931				linux_ioctl_unregister_handler(*lihp);
932			if (bootverbose)
933				printf("Linux ELF exec handler removed\n");
934		} else
935			printf("Could not deinstall ELF interpreter entry\n");
936		break;
937	default:
938		break;
939	}
940	return error;
941}
942
943static moduledata_t linux_elf_mod = {
944	"linuxelf",
945	linux_elf_modevent,
946	0
947};
948
949DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
950