linux_sysvec.c revision 105441
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_sysvec.c 105441 2002-10-19 11:57:38Z markm $
29 */
30
31/* XXX we use functions that might not exist. */
32#include "opt_compat.h"
33
34#ifndef COMPAT_43
35#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
36#endif
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/imgact.h>
41#include <sys/imgact_aout.h>
42#include <sys/imgact_elf.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/signalvar.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysent.h>
50#include <sys/sysproto.h>
51#include <sys/user.h>
52#include <sys/vnode.h>
53
54#include <vm/vm.h>
55#include <vm/vm_param.h>
56#include <vm/vm_page.h>
57#include <vm/vm_extern.h>
58#include <sys/exec.h>
59#include <sys/kernel.h>
60#include <sys/module.h>
61#include <machine/cpu.h>
62#include <machine/md_var.h>
63#include <sys/mutex.h>
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_object.h>
70
71#include <i386/linux/linux.h>
72#include <i386/linux/linux_proto.h>
73#include <compat/linux/linux_signal.h>
74#include <compat/linux/linux_util.h>
75
/* Module version, plus declared dependencies on the three SysV IPC modules. */
76MODULE_VERSION(linux, 1);
77MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
78MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
79MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
80
/* Defines the M_LINUX malloc type; it is not referenced elsewhere in this file. */
81MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
82
/*
 * The bytes '#' (0x23) and '!' (0x21) as they read when loaded as a single
 * 16-bit value in host byte order.  exec_linux_imgact_try() compares this
 * against the first short of the image header.
 */
83#if BYTE_ORDER == LITTLE_ENDIAN
84#define SHELLMAGIC      0x2123 /* #! */
85#else
86#define SHELLMAGIC      0x2321
87#endif
88
89/*
90 * Allow the sendsig functions to use the ldebug() facility
91 * even though they are not syscalls themselves. Map them
92 * to syscall 0. This is slightly less bogus than using
93 * ldebug(sigreturn).
94 */
95#define	LINUX_SYS_linux_rt_sendsig	0
96#define	LINUX_SYS_linux_sendsig		0
97
/*
 * Signal code (presumably the user-mode trampoline) and its size, both
 * defined outside this file and installed in the two sysentvecs below.
 */
98extern char linux_sigcode[];
99extern int linux_szsigcode;
100
/* Linux syscall table, defined outside this file; installed in both sysentvecs. */
101extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
102
/* Linker set of ioctl handlers that linux_elf_modevent() registers/unregisters. */
103SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
104
/* Forward declarations of static routines referenced by the sysentvecs. */
105static int	linux_fixup(register_t **stack_base,
106		    struct image_params *iparams);
107static int	elf_linux_fixup(register_t **stack_base,
108		    struct image_params *iparams);
109static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
110		    caddr_t *params);
111static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
112		    u_long code);
112		    u_long code);
113
114/*
115 * Linux syscalls return negative errno's, we do positive and map them
116 */
/*
 * ELAST + 1 entries, presumably indexed by the native errno value
 * (0..ELAST); each entry is the already-negated Linux errno.  The table is
 * installed in both sysentvecs below.  The trailing comments give the index
 * range covered by each row.
 */
117static int bsd_to_linux_errno[ELAST + 1] = {
118  	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,	/*  0 -  9 */
119 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,	/* 10 - 19 */
120 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,	/* 20 - 29 */
121 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,	/* 30 - 39 */
122 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,	/* 40 - 49 */
123	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,	/* 50 - 59 */
124	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,	/* 60 - 69 */
125	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,	/* 70 - 79 */
126  	-6, -6, -43, -42, -75, -6, -84				/* 80 - 86 */
127};
128
/*
 * Native-to-Linux signal number table with LINUX_SIGTBLSZ entries.  It is
 * installed in both sysentvecs below (presumably as sv_sigtbl), and the
 * sendsig routines index sv_sigtbl with _SIG_IDX(sig).  A 0 entry marks a
 * slot for which no Linux signal is listed.
 */
129int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
130	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
131	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
132	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, 0,
133	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
134	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
135	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
136	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
137	0, LINUX_SIGUSR1, LINUX_SIGUSR2
138};
139
/*
 * Linux-to-native signal number table with LINUX_SIGTBLSZ entries; it is
 * not referenced within this file (presumably indexed the same way as
 * bsd_to_linux_signal, i.e. by signal number minus one -- TODO confirm
 * against its users).  SIGBUS and SIGURG each appear twice, so the mapping
 * is many-to-one; the final entry is 0.
 */
140int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
141	SIGHUP, SIGINT, SIGQUIT, SIGILL,
142	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
143	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
144	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
145	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
146	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
147	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
148	SIGIO, SIGURG, 0
149};
150
/* Value reported as the Linux trap number when no translation is listed. */
151#define LINUX_T_UNKNOWN  255
/*
 * Native trap code (array index) to Linux trap number.  The per-entry
 * comments give the index and, where one is listed, the native T_* name.
 * Use the bsd_to_linux_trapcode() macro below rather than indexing directly.
 */
152static int _bsd_to_linux_trapcode[] = {
153	LINUX_T_UNKNOWN,	/* 0 */
154	6,			/* 1  T_PRIVINFLT */
155	LINUX_T_UNKNOWN,	/* 2 */
156	3,			/* 3  T_BPTFLT */
157	LINUX_T_UNKNOWN,	/* 4 */
158	LINUX_T_UNKNOWN,	/* 5 */
159	16,			/* 6  T_ARITHTRAP */
160	254,			/* 7  T_ASTFLT */
161	LINUX_T_UNKNOWN,	/* 8 */
162	13,			/* 9  T_PROTFLT */
163	1,			/* 10 T_TRCTRAP */
164	LINUX_T_UNKNOWN,	/* 11 */
165	14,			/* 12 T_PAGEFLT */
166	LINUX_T_UNKNOWN,	/* 13 */
167	17,			/* 14 T_ALIGNFLT */
168	LINUX_T_UNKNOWN,	/* 15 */
169	LINUX_T_UNKNOWN,	/* 16 */
170	LINUX_T_UNKNOWN,	/* 17 */
171	0,			/* 18 T_DIVIDE */
172	2,			/* 19 T_NMI */
173	4,			/* 20 T_OFLOW */
174	5,			/* 21 T_BOUND */
175	7,			/* 22 T_DNA */
176	8,			/* 23 T_DOUBLEFLT */
177	9,			/* 24 T_FPOPFLT */
178	10,			/* 25 T_TSSFLT */
179	11,			/* 26 T_SEGNPFLT */
180	12,			/* 27 T_STKFLT */
181	18,			/* 28 T_MCHK */
182	19,			/* 29 T_XMMFLT */
183	15			/* 30 T_RESERVED */
184};
/*
 * Bounds-checked lookup: yields the table entry when `code' is below the
 * number of table entries, LINUX_T_UNKNOWN otherwise.  Note that `code' is
 * expanded twice, so it must not have side effects.
 */
185#define bsd_to_linux_trapcode(code) \
186    ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
187     _bsd_to_linux_trapcode[(code)]: \
188     LINUX_T_UNKNOWN)
189
190/*
191 * If FreeBSD & Linux have a difference of opinion about what a trap
192 * means, deal with it here.
193 *
194 * MPSAFE
195 */
196static int
197translate_traps(int signal, int trap_code)
198{
199	if (signal != SIGBUS)
200		return signal;
201	switch (trap_code) {
202	case T_PROTFLT:
203	case T_TSSFLT:
204	case T_DOUBLEFLT:
205	case T_PAGEFLT:
206		return SIGSEGV;
207	default:
208		return signal;
209	}
210}
211
212static int
213linux_fixup(register_t **stack_base, struct image_params *imgp)
214{
215	register_t *argv, *envp;
216
217	argv = *stack_base;
218	envp = *stack_base + (imgp->argc + 1);
219	(*stack_base)--;
220	**stack_base = (intptr_t)(void *)envp;
221	(*stack_base)--;
222	**stack_base = (intptr_t)(void *)argv;
223	(*stack_base)--;
224	**stack_base = imgp->argc;
225	return 0;
226}
227
228static int
229elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
230{
231	Elf32_Auxargs *args = (Elf32_Auxargs *)imgp->auxargs;
232	register_t *pos;
233
234	pos = *stack_base + (imgp->argc + imgp->envc + 2);
235
236	if (args->trace)
237		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
238	if (args->execfd != -1)
239		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
240	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
241	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
242	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
243	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
244	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
245	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
246	AUXARGS_ENTRY(pos, AT_BASE, args->base);
247	PROC_LOCK(imgp->proc);
248	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
249	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
250	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
251	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
252	PROC_UNLOCK(imgp->proc);
253	AUXARGS_ENTRY(pos, AT_NULL, 0);
254
255	free(imgp->auxargs, M_TEMP);
256	imgp->auxargs = NULL;
257
258	(*stack_base)--;
259	**stack_base = (long)imgp->argc;
260	return 0;
261}
262
263extern int _ucodesel, _udatasel;
264extern unsigned long linux_sznonrtsigcode;
265
266static void
267linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
268{
269	register struct thread *td = curthread;
270	register struct proc *p = td->td_proc;
271	register struct trapframe *regs;
272	struct l_rt_sigframe *fp, frame;
273	int oonstack;
274
275	PROC_LOCK_ASSERT(p, MA_OWNED);
276	regs = td->td_frame;
277	oonstack = sigonstack(regs->tf_esp);
278
279#ifdef DEBUG
280	if (ldebug(rt_sendsig))
281		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
282		    catcher, sig, (void*)mask, code);
283#endif
284	/*
285	 * Allocate space for the signal handler context.
286	 */
287	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
288	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
289		fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp +
290		    p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe));
291	} else
292		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
293	PROC_UNLOCK(p);
294
295	/*
296	 * Build the argument list for the signal handler.
297	 */
298	if (p->p_sysent->sv_sigtbl)
299		if (sig <= p->p_sysent->sv_sigsize)
300			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
301
302	frame.sf_handler = catcher;
303	frame.sf_sig = sig;
304	frame.sf_siginfo = &fp->sf_si;
305	frame.sf_ucontext = &fp->sf_sc;
306
307	/* Fill in POSIX parts */
308	frame.sf_si.lsi_signo = sig;
309	frame.sf_si.lsi_code = code;
310	frame.sf_si.lsi_addr = (void *)regs->tf_err;
311
312	/*
313	 * Build the signal context to be used by sigreturn.
314	 */
315	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
316	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
317
318	PROC_LOCK(p);
319	frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp;
320	frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size;
321	frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
322	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
323	PROC_UNLOCK(p);
324
325	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
326
327	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
328	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
329	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
330	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
331	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
332	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
333	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
334	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
335	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
336	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
337	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
338	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
339	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
340	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
341	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
342	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
343	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
344	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
345	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
346
347#ifdef DEBUG
348	if (ldebug(rt_sendsig))
349		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
350		    frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp,
351		    p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
352#endif
353
354	if (copyout(&frame, fp, sizeof(frame)) != 0) {
355		/*
356		 * Process has trashed its stack; give it an illegal
357		 * instruction to halt it in its tracks.
358		 */
359#ifdef DEBUG
360		if (ldebug(rt_sendsig))
361			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
362			    fp, oonstack);
363#endif
364		PROC_LOCK(p);
365		sigexit(td, SIGILL);
366	}
367
368	/*
369	 * Build context to run handler in.
370	 */
371	regs->tf_esp = (int)fp;
372	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
373	    linux_sznonrtsigcode;
374	regs->tf_eflags &= ~(PSL_T | PSL_VM);
375	regs->tf_cs = _ucodesel;
376	regs->tf_ds = _udatasel;
377	regs->tf_es = _udatasel;
378	regs->tf_fs = _udatasel;
379	regs->tf_ss = _udatasel;
380	PROC_LOCK(p);
381}
382
383
384/*
385 * Send an interrupt to process.
386 *
387 * Stack is set up to allow sigcode stored
388 * in u. to call routine, followed by kcall
389 * to sigreturn routine below.  After sigreturn
390 * resets the signal mask, the stack, and the
391 * frame pointer, it returns to the user
392 * specified pc, psl.
393 */
394
395static void
396linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
397{
398	register struct thread *td = curthread;
399	register struct proc *p = td->td_proc;
400	register struct trapframe *regs;
401	struct l_sigframe *fp, frame;
402	l_sigset_t lmask;
403	int oonstack, i;
404
405	PROC_LOCK_ASSERT(p, MA_OWNED);
406	if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) {
407		/* Signal handler installed with SA_SIGINFO. */
408		linux_rt_sendsig(catcher, sig, mask, code);
409		return;
410	}
411
412	regs = td->td_frame;
413	oonstack = sigonstack(regs->tf_esp);
414
415#ifdef DEBUG
416	if (ldebug(sendsig))
417		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
418		    catcher, sig, (void*)mask, code);
419#endif
420
421	/*
422	 * Allocate space for the signal handler context.
423	 */
424	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
425	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
426		fp = (struct l_sigframe *)(p->p_sigstk.ss_sp +
427		    p->p_sigstk.ss_size - sizeof(struct l_sigframe));
428	} else
429		fp = (struct l_sigframe *)regs->tf_esp - 1;
430	PROC_UNLOCK(p);
431
432	/*
433	 * Build the argument list for the signal handler.
434	 */
435	if (p->p_sysent->sv_sigtbl)
436		if (sig <= p->p_sysent->sv_sigsize)
437			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
438
439	frame.sf_handler = catcher;
440	frame.sf_sig = sig;
441
442	bsd_to_linux_sigset(mask, &lmask);
443
444	/*
445	 * Build the signal context to be used by sigreturn.
446	 */
447	frame.sf_sc.sc_mask   = lmask.__bits[0];
448	frame.sf_sc.sc_gs     = rgs();
449	frame.sf_sc.sc_fs     = regs->tf_fs;
450	frame.sf_sc.sc_es     = regs->tf_es;
451	frame.sf_sc.sc_ds     = regs->tf_ds;
452	frame.sf_sc.sc_edi    = regs->tf_edi;
453	frame.sf_sc.sc_esi    = regs->tf_esi;
454	frame.sf_sc.sc_ebp    = regs->tf_ebp;
455	frame.sf_sc.sc_ebx    = regs->tf_ebx;
456	frame.sf_sc.sc_edx    = regs->tf_edx;
457	frame.sf_sc.sc_ecx    = regs->tf_ecx;
458	frame.sf_sc.sc_eax    = regs->tf_eax;
459	frame.sf_sc.sc_eip    = regs->tf_eip;
460	frame.sf_sc.sc_cs     = regs->tf_cs;
461	frame.sf_sc.sc_eflags = regs->tf_eflags;
462	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
463	frame.sf_sc.sc_ss     = regs->tf_ss;
464	frame.sf_sc.sc_err    = regs->tf_err;
465	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
466
467	bzero(&frame.sf_fpstate, sizeof(struct l_fpstate));
468
469	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
470		frame.sf_extramask[i] = lmask.__bits[i+1];
471
472	if (copyout(&frame, fp, sizeof(frame)) != 0) {
473		/*
474		 * Process has trashed its stack; give it an illegal
475		 * instruction to halt it in its tracks.
476		 */
477		PROC_LOCK(p);
478		sigexit(td, SIGILL);
479	}
480
481	/*
482	 * Build context to run handler in.
483	 */
484	regs->tf_esp = (int)fp;
485	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
486	regs->tf_eflags &= ~(PSL_T | PSL_VM);
487	regs->tf_cs = _ucodesel;
488	regs->tf_ds = _udatasel;
489	regs->tf_es = _udatasel;
490	regs->tf_fs = _udatasel;
491	regs->tf_ss = _udatasel;
492	PROC_LOCK(p);
493}
494
495/*
496 * System call to cleanup state after a signal
497 * has been taken.  Reset signal mask and
498 * stack state from context left by sendsig (above).
499 * Return to previous pc and psl as specified by
500 * context left by sendsig. Check carefully to
501 * make sure that the user has not modified the
502 * psl to gain improper privileges or to cause
503 * a machine fault.
504 */
505int
506linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
507{
508	struct proc *p = td->td_proc;
509	struct l_sigframe frame;
510	register struct trapframe *regs;
511	l_sigset_t lmask;
512	int eflags, i;
513
514	regs = td->td_frame;
515
516#ifdef DEBUG
517	if (ldebug(sigreturn))
518		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
519#endif
520	/*
521	 * The trampoline code hands us the sigframe.
522	 * It is unsafe to keep track of it ourselves, in the event that a
523	 * program jumps out of a signal handler.
524	 */
525	if (copyin((caddr_t)args->sfp, &frame, sizeof(frame)) != 0)
526		return (EFAULT);
527
528	/*
529	 * Check for security violations.
530	 */
531#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
532	eflags = frame.sf_sc.sc_eflags;
533	/*
534	 * XXX do allow users to change the privileged flag PSL_RF.  The
535	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
536	 * sometimes set it there too.  tf_eflags is kept in the signal
537	 * context during signal handling and there is no other place
538	 * to remember it, so the PSL_RF bit may be corrupted by the
539	 * signal handler without us knowing.  Corruption of the PSL_RF
540	 * bit at worst causes one more or one less debugger trap, so
541	 * allowing it is fairly harmless.
542	 */
543	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
544    		return(EINVAL);
545
546	/*
547	 * Don't allow users to load a valid privileged %cs.  Let the
548	 * hardware check for invalid selectors, excess privilege in
549	 * other selectors, invalid %eip's and invalid %esp's.
550	 */
551#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
552	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
553		trapsignal(p, SIGBUS, T_PROTFLT);
554		return(EINVAL);
555	}
556
557	lmask.__bits[0] = frame.sf_sc.sc_mask;
558	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
559		lmask.__bits[i+1] = frame.sf_extramask[i];
560	PROC_LOCK(p);
561	linux_to_bsd_sigset(&lmask, &p->p_sigmask);
562	SIG_CANTMASK(p->p_sigmask);
563	signotify(p);
564	PROC_UNLOCK(p);
565
566	/*
567	 * Restore signal context.
568	 */
569	/* %gs was restored by the trampoline. */
570	regs->tf_fs     = frame.sf_sc.sc_fs;
571	regs->tf_es     = frame.sf_sc.sc_es;
572	regs->tf_ds     = frame.sf_sc.sc_ds;
573	regs->tf_edi    = frame.sf_sc.sc_edi;
574	regs->tf_esi    = frame.sf_sc.sc_esi;
575	regs->tf_ebp    = frame.sf_sc.sc_ebp;
576	regs->tf_ebx    = frame.sf_sc.sc_ebx;
577	regs->tf_edx    = frame.sf_sc.sc_edx;
578	regs->tf_ecx    = frame.sf_sc.sc_ecx;
579	regs->tf_eax    = frame.sf_sc.sc_eax;
580	regs->tf_eip    = frame.sf_sc.sc_eip;
581	regs->tf_cs     = frame.sf_sc.sc_cs;
582	regs->tf_eflags = eflags;
583	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
584	regs->tf_ss     = frame.sf_sc.sc_ss;
585
586	return (EJUSTRETURN);
587}
588
589/*
590 * System call to cleanup state after a signal
591 * has been taken.  Reset signal mask and
592 * stack state from context left by rt_sendsig (above).
593 * Return to previous pc and psl as specified by
594 * context left by sendsig. Check carefully to
595 * make sure that the user has not modified the
596 * psl to gain improper privileges or to cause
597 * a machine fault.
598 */
599int
600linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
601{
602	struct proc *p = td->td_proc;
603	struct l_ucontext uc;
604	struct l_sigcontext *context;
605	l_stack_t *lss;
606	stack_t ss;
607	register struct trapframe *regs;
608	int eflags;
609
610	regs = td->td_frame;
611
612#ifdef DEBUG
613	if (ldebug(rt_sigreturn))
614		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
615#endif
616	/*
617	 * The trampoline code hands us the ucontext.
618	 * It is unsafe to keep track of it ourselves, in the event that a
619	 * program jumps out of a signal handler.
620	 */
621	if (copyin((caddr_t)args->ucp, &uc, sizeof(uc)) != 0)
622		return (EFAULT);
623
624	context = &uc.uc_mcontext;
625
626	/*
627	 * Check for security violations.
628	 */
629#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
630	eflags = context->sc_eflags;
631	/*
632	 * XXX do allow users to change the privileged flag PSL_RF.  The
633	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
634	 * sometimes set it there too.  tf_eflags is kept in the signal
635	 * context during signal handling and there is no other place
636	 * to remember it, so the PSL_RF bit may be corrupted by the
637	 * signal handler without us knowing.  Corruption of the PSL_RF
638	 * bit at worst causes one more or one less debugger trap, so
639	 * allowing it is fairly harmless.
640	 */
641	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
642    		return(EINVAL);
643
644	/*
645	 * Don't allow users to load a valid privileged %cs.  Let the
646	 * hardware check for invalid selectors, excess privilege in
647	 * other selectors, invalid %eip's and invalid %esp's.
648	 */
649#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
650	if (!CS_SECURE(context->sc_cs)) {
651		trapsignal(p, SIGBUS, T_PROTFLT);
652		return(EINVAL);
653	}
654
655	PROC_LOCK(p);
656	linux_to_bsd_sigset(&uc.uc_sigmask, &p->p_sigmask);
657	SIG_CANTMASK(p->p_sigmask);
658	signotify(p);
659	PROC_UNLOCK(p);
660
661	/*
662	 * Restore signal context
663	 */
664	/* %gs was restored by the trampoline. */
665	regs->tf_fs     = context->sc_fs;
666	regs->tf_es     = context->sc_es;
667	regs->tf_ds     = context->sc_ds;
668	regs->tf_edi    = context->sc_edi;
669	regs->tf_esi    = context->sc_esi;
670	regs->tf_ebp    = context->sc_ebp;
671	regs->tf_ebx    = context->sc_ebx;
672	regs->tf_edx    = context->sc_edx;
673	regs->tf_ecx    = context->sc_ecx;
674	regs->tf_eax    = context->sc_eax;
675	regs->tf_eip    = context->sc_eip;
676	regs->tf_cs     = context->sc_cs;
677	regs->tf_eflags = eflags;
678	regs->tf_esp    = context->sc_esp_at_signal;
679	regs->tf_ss     = context->sc_ss;
680
681	/*
682	 * call sigaltstack & ignore results..
683	 */
684	lss = &uc.uc_stack;
685	ss.ss_sp = lss->ss_sp;
686	ss.ss_size = lss->ss_size;
687	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
688
689#ifdef DEBUG
690	if (ldebug(rt_sigreturn))
691		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
692		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
693#endif
694	(void)kern_sigaltstack(td, &ss, NULL);
695
696	return (EJUSTRETURN);
697}
698
699/*
700 * MPSAFE
701 */
702static void
703linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
704{
705	args[0] = tf->tf_ebx;
706	args[1] = tf->tf_ecx;
707	args[2] = tf->tf_edx;
708	args[3] = tf->tf_esi;
709	args[4] = tf->tf_edi;
710	args[5] = tf->tf_ebp;	/* Unconfirmed */
711	*params = NULL;		/* no copyin */
712}
713
714
715
716/*
717 * Dump core, into a file named as described in the comments for
718 * expand_name(), unless the process was setuid/setgid.
719 */
720static int
721linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
722{
723	struct proc *p = td->td_proc;
724	struct ucred *cred = td->td_ucred;
725	struct vmspace *vm = p->p_vmspace;
726	char *tempuser;
727	int error;
728
729	if (ctob((uarea_pages + kstack_pages) +
730	    vm->vm_dsize + vm->vm_ssize) >= limit)
731		return (EFAULT);
732	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
733	    M_WAITOK | M_ZERO);
734	if (tempuser == NULL)
735		return (ENOMEM);
736	PROC_LOCK(p);
737	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
738	PROC_UNLOCK(p);
739	bcopy(p->p_uarea, tempuser, sizeof(struct user));
740	bcopy(td->td_frame,
741	    tempuser + ctob(uarea_pages) +
742	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
743	    sizeof(struct trapframe));
744	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
745	    ctob(uarea_pages + kstack_pages),
746	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
747	    (int *)NULL, td);
748	free(tempuser, M_TEMP);
749	if (error == 0)
750		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
751		    (int)ctob(vm->vm_dsize),
752		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
753		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
754	if (error == 0)
755		error = vn_rdwr_inchunks(UIO_WRITE, vp,
756		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
757		    round_page(ctob(vm->vm_ssize)),
758		    (off_t)ctob(uarea_pages + kstack_pages) +
759		        ctob(vm->vm_dsize), UIO_USERSPACE,
760		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
761	return (error);
762}
763/*
764 * If a linux binary is exec'ing something, try this image activator
765 * first.  We override standard shell script execution in order to
766 * be able to modify the interpreter path.  We only do this if a linux
767 * binary is doing the exec, so we do not create an EXEC module for it.
768 */
769static int	exec_linux_imgact_try(struct image_params *iparams);
770
771static int
772exec_linux_imgact_try(struct image_params *imgp)
773{
774    const char *head = (const char *)imgp->image_header;
775    int error = -1;
776
777    /*
778     * The interpreter for shell scripts run from a linux binary needs
779     * to be located in /compat/linux if possible in order to recursively
780     * maintain linux path emulation.
781     */
782    if (((const short *)head)[0] == SHELLMAGIC) {
783	    /*
784	     * Run our normal shell image activator.  If it succeeds attempt
785	     * to use the alternate path for the interpreter.  If an alternate
786	     * path is found, use our stringspace to store it.
787	     */
788	    if ((error = exec_shell_imgact(imgp)) == 0) {
789		    char *rpath = NULL;
790
791		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
792			imgp->interpreter_name, &rpath, 0);
793		    if (rpath != imgp->interpreter_name) {
794			    int len = strlen(rpath) + 1;
795
796			    if (len <= MAXSHELLCMDLEN) {
797				    memcpy(imgp->interpreter_name, rpath, len);
798			    }
799			    free(rpath, M_TEMP);
800		    }
801	    }
802    }
803    return(error);
804}
805
/*
 * System entry vector for Linux a.out binaries.  The initializer is
 * positional, so the order of the entries must match the field order of
 * struct sysentvec (declared outside this file).  It differs from
 * elf_linux_sysvec only in the fixup routine, the name string and the
 * core dump routine.
 */
806struct sysentvec linux_sysvec = {
807	LINUX_SYS_MAXSYSCALL,
808	linux_sysent,
809	0xff,
810	LINUX_SIGTBLSZ,
811	bsd_to_linux_signal,
812	ELAST + 1,
813	bsd_to_linux_errno,
814	translate_traps,
815	linux_fixup,		/* a.out stack fixup */
816	linux_sendsig,
817	linux_sigcode,
818	&linux_szsigcode,
819	linux_prepsyscall,
820	"Linux a.out",
821	linux_aout_coredump,	/* a.out core dump writer */
822	exec_linux_imgact_try,
823	LINUX_MINSIGSTKSZ,
824	PAGE_SIZE,
825	VM_MIN_ADDRESS,
826	VM_MAXUSER_ADDRESS,
827	USRSTACK,
828	PS_STRINGS,
829	VM_PROT_ALL,
830	exec_copyout_strings,
831	exec_setregs
832};
833
/*
 * System entry vector for Linux ELF binaries; both Linux ELF brands below
 * point at it.  The initializer is positional, so the order of the entries
 * must match the field order of struct sysentvec (declared outside this
 * file).  It differs from linux_sysvec only in the fixup routine, the name
 * string and the core dump routine.
 */
834struct sysentvec elf_linux_sysvec = {
835	LINUX_SYS_MAXSYSCALL,
836	linux_sysent,
837	0xff,
838	LINUX_SIGTBLSZ,
839	bsd_to_linux_signal,
840	ELAST + 1,
841	bsd_to_linux_errno,
842	translate_traps,
843	elf_linux_fixup,	/* ELF stack fixup (writes the aux vector) */
844	linux_sendsig,
845	linux_sigcode,
846	&linux_szsigcode,
847	linux_prepsyscall,
848	"Linux ELF",
849	elf32_coredump,		/* core dump routine defined outside this file */
850	exec_linux_imgact_try,
851	LINUX_MINSIGSTKSZ,
852	PAGE_SIZE,
853	VM_MIN_ADDRESS,
854	VM_MAXUSER_ADDRESS,
855	USRSTACK,
856	PS_STRINGS,
857	VM_PROT_ALL,
858	exec_copyout_strings,
859	exec_setregs
860};
861
/*
 * ELF brand entry naming /lib/ld-linux.so.1 and pointing at
 * elf_linux_sysvec.  Identical to linux_glibc2brand except for that path
 * (presumably the interpreter path field -- the field names are declared
 * outside this file).
 */
862static Elf32_Brandinfo linux_brand = {
863					ELFOSABI_LINUX,
864					EM_386,
865					"Linux",
866					"/compat/linux",
867					"/lib/ld-linux.so.1",
868					&elf_linux_sysvec
869				 };
870
/*
 * ELF brand entry naming /lib/ld-linux.so.2 and pointing at
 * elf_linux_sysvec.  Identical to linux_brand except for that path
 * (presumably the interpreter path field -- the field names are declared
 * outside this file).
 */
871static Elf32_Brandinfo linux_glibc2brand = {
872					ELFOSABI_LINUX,
873					EM_386,
874					"Linux",
875					"/compat/linux",
876					"/lib/ld-linux.so.2",
877					&elf_linux_sysvec
878				 };
879
/*
 * NULL-terminated list of the brands that linux_elf_modevent() inserts on
 * module load and removes on module unload.
 */
880Elf32_Brandinfo *linux_brandlist[] = {
881					&linux_brand,
882					&linux_glibc2brand,
883					NULL
884				};
885
886static int
887linux_elf_modevent(module_t mod, int type, void *data)
888{
889	Elf32_Brandinfo **brandinfo;
890	int error;
891	struct linux_ioctl_handler **lihp;
892
893	error = 0;
894
895	switch(type) {
896	case MOD_LOAD:
897		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
898		     ++brandinfo)
899			if (elf32_insert_brand_entry(*brandinfo) < 0)
900				error = EINVAL;
901		if (error == 0) {
902			SET_FOREACH(lihp, linux_ioctl_handler_set)
903				linux_ioctl_register_handler(*lihp);
904			if (bootverbose)
905				printf("Linux ELF exec handler installed\n");
906		} else
907			printf("cannot insert Linux ELF brand handler\n");
908		break;
909	case MOD_UNLOAD:
910		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
911		     ++brandinfo)
912			if (elf32_brand_inuse(*brandinfo))
913				error = EBUSY;
914		if (error == 0) {
915			for (brandinfo = &linux_brandlist[0];
916			     *brandinfo != NULL; ++brandinfo)
917				if (elf32_remove_brand_entry(*brandinfo) < 0)
918					error = EINVAL;
919		}
920		if (error == 0) {
921			SET_FOREACH(lihp, linux_ioctl_handler_set)
922				linux_ioctl_unregister_handler(*lihp);
923			if (bootverbose)
924				printf("Linux ELF exec handler removed\n");
925		} else
926			printf("Could not deinstall ELF interpreter entry\n");
927		break;
928	default:
929		break;
930	}
931	return error;
932}
933
/*
 * Module descriptor: module name "linuxelf", event handler
 * linux_elf_modevent(), and 0 for the remaining field.
 */
934static moduledata_t linux_elf_mod = {
935	"linuxelf",
936	linux_elf_modevent,
937	0
938};
939
/* Declare the module with subsystem SI_SUB_EXEC and order SI_ORDER_ANY. */
940DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
941