linux_sysvec.c revision 112682
1/*-
2 * Copyright (c) 1994-1996 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_sysvec.c 112682 2003-03-26 18:29:44Z jhb $
29 */
30
31/* XXX we use functions that might not exist. */
32#include "opt_compat.h"
33
34#ifndef COMPAT_43
35#error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
36#endif
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/imgact.h>
41#include <sys/imgact_aout.h>
42#include <sys/imgact_elf.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/signalvar.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysent.h>
50#include <sys/sysproto.h>
51#include <sys/user.h>
52#include <sys/vnode.h>
53
54#include <vm/vm.h>
55#include <vm/vm_param.h>
56#include <vm/vm_page.h>
57#include <vm/vm_extern.h>
58#include <sys/exec.h>
59#include <sys/kernel.h>
60#include <sys/module.h>
61#include <machine/cpu.h>
62#include <machine/md_var.h>
63#include <sys/mutex.h>
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_object.h>
70
71#include <i386/linux/linux.h>
72#include <i386/linux/linux_proto.h>
73#include <compat/linux/linux_signal.h>
74#include <compat/linux/linux_util.h>
75
/*
 * Module metadata: declare version 1 of the "linux" module and its
 * dependencies on the sysvmsg, sysvsem and sysvshm modules (all three
 * version arguments are 1 for each dependency).
 */
76MODULE_VERSION(linux, 1);
77MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
78MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
79MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
80
/*
 * Define the M_LINUX malloc type.  It is not referenced in the visible
 * part of this file; presumably other Linux emulation sources allocate
 * with it.
 */
81MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
82
/*
 * The two characters "#!" ('#' is 0x23, '!' is 0x21) as they read when the
 * first two bytes of a file are loaded as one 16-bit integer; the value
 * therefore depends on the byte order.  exec_linux_imgact_try() compares
 * the start of the image header against it.
 */
83#if BYTE_ORDER == LITTLE_ENDIAN
84#define SHELLMAGIC      0x2123 /* #! */
85#else
86#define SHELLMAGIC      0x2321
87#endif
88
89/*
90 * Allow the sendsig functions to use the ldebug() facility
91 * even though they are not syscalls themselves. Map them
92 * to syscall 0. This is slightly less bogus than using
93 * ldebug(sigreturn).
 *
 * NOTE(review): ldebug() and ARGS() are defined outside this file;
 * presumably they resolve their argument to LINUX_SYS_linux_<name>, which
 * is why the two aliases below are needed -- confirm against the header
 * that defines ldebug().
94 */
95#define	LINUX_SYS_linux_rt_sendsig	0
96#define	LINUX_SYS_linux_sendsig		0
97
/*
 * Signal trampoline image and its size.  Both are defined outside this
 * file and are installed in the sysentvec initializers below.
 */
98extern char linux_sigcode[];
99extern int linux_szsigcode;
100
/* Linux system call table, defined outside this file. */
101extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
102
/*
 * Linker set of ioctl handlers; linux_elf_modevent() walks it to register
 * and unregister every handler.
 */
103SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
104
/*
 * Forward declarations of the static callbacks that the sysentvec
 * initializers below refer to.  linux_rt_sendsig() has no prototype here;
 * it is defined before its only caller, linux_sendsig().
 */
105static int	linux_fixup(register_t **stack_base,
106		    struct image_params *iparams);
107static int	elf_linux_fixup(register_t **stack_base,
108		    struct image_params *iparams);
109static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
110		    caddr_t *params);
111static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
112		    u_long code);
113
114/*
115 * Linux syscalls return negative errno's, we do positive and map them
 *
 * The table has ELAST + 1 slots and is installed, together with that
 * size, in both sysentvec initializers later in this file.  Judging by
 * its name and size it is indexed by the native errno value; every entry
 * is a negated Linux errno.
 *
 * NOTE(review): 87 initializers are listed (indices 0..86).  If ELAST is
 * larger than 86, the remaining slots are implicitly 0 and those native
 * errors would translate to 0 -- confirm against <sys/errno.h>.
116 */
117static int bsd_to_linux_errno[ELAST + 1] = {
118	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
119	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
120	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
121	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
122	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
123	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
124	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
125	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
126	-6, -6, -43, -42, -75, -6, -84
127};
128
/*
 * Native-to-Linux signal number translation.  This table is installed as
 * the signal table of both sysentvecs below, and the sendsig routines
 * index it with _SIG_IDX(sig) after checking sig against the table size.
 *
 * 31 initializers are listed.  Two of them (indices 6 and 28) are 0;
 * presumably these are native signals without a Linux counterpart --
 * verify against the native signal numbering.
 */
129int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
130	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
131	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
132	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
133	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
134	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
135	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
136	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
137	0, LINUX_SIGUSR1, LINUX_SIGUSR2
138};
139
/*
 * Linux-to-native signal number translation, the counterpart of
 * bsd_to_linux_signal[].  It has external linkage and is not referenced
 * in this file; presumably the Linux signal syscalls index it the same
 * way (signal number minus one) -- confirm against its users.
 *
 * Note that the mapping is not one-to-one: SIGBUS and SIGURG each appear
 * twice.
 */
140int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
141	SIGHUP, SIGINT, SIGQUIT, SIGILL,
142	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
143	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
144	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
145	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
146	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
147	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
148	SIGIO, SIGURG, SIGSYS
149};
150
#define LINUX_T_UNKNOWN  255

/*
 * Translation table from native trap codes to the value stored in the
 * sc_trapno field of the Linux signal context.  The index is the native
 * trap code; the T_* name it corresponds to is given in each entry's
 * comment.  Native codes without a counterpart map to LINUX_T_UNKNOWN.
 *
 * The table is never modified, so it is const.
 */
static const int bsd_to_linux_trapcode_tbl[] = {
	LINUX_T_UNKNOWN,	/* 0 */
	6,			/* 1  T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/* 2 */
	3,			/* 3  T_BPTFLT */
	LINUX_T_UNKNOWN,	/* 4 */
	LINUX_T_UNKNOWN,	/* 5 */
	16,			/* 6  T_ARITHTRAP */
	254,			/* 7  T_ASTFLT */
	LINUX_T_UNKNOWN,	/* 8 */
	13,			/* 9  T_PROTFLT */
	1,			/* 10 T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12 T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14 T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18 T_DIVIDE */
	2,			/* 19 T_NMI */
	4,			/* 20 T_OFLOW */
	5,			/* 21 T_BOUND */
	7,			/* 22 T_DNA */
	8,			/* 23 T_DOUBLEFLT */
	9,			/* 24 T_FPOPFLT */
	10,			/* 25 T_TSSFLT */
	11,			/* 26 T_SEGNPFLT */
	12,			/* 27 T_STKFLT */
	18,			/* 28 T_MCHK */
	19,			/* 29 T_XMMFLT */
	15			/* 30 T_RESERVED */
};

/*
 * Return the Linux trap number for the native trap code `code'.  Codes
 * beyond the end of the table yield LINUX_T_UNKNOWN.
 *
 * This is a function rather than a macro so that the argument is
 * evaluated exactly once and is type-checked.
 */
static __inline int
bsd_to_linux_trapcode(unsigned long code)
{
	if (code >= sizeof(bsd_to_linux_trapcode_tbl) /
	    sizeof(bsd_to_linux_trapcode_tbl[0]))
		return (LINUX_T_UNKNOWN);
	return (bsd_to_linux_trapcode_tbl[code]);
}
189
190/*
191 * If FreeBSD & Linux have a difference of opinion about what a trap
192 * means, deal with it here.
193 *
194 * MPSAFE
195 */
196static int
197translate_traps(int signal, int trap_code)
198{
199	if (signal != SIGBUS)
200		return signal;
201	switch (trap_code) {
202	case T_PROTFLT:
203	case T_TSSFLT:
204	case T_DOUBLEFLT:
205	case T_PAGEFLT:
206		return SIGSEGV;
207	default:
208		return signal;
209	}
210}
211
212static int
213linux_fixup(register_t **stack_base, struct image_params *imgp)
214{
215	register_t *argv, *envp;
216
217	argv = *stack_base;
218	envp = *stack_base + (imgp->argc + 1);
219	(*stack_base)--;
220	**stack_base = (intptr_t)(void *)envp;
221	(*stack_base)--;
222	**stack_base = (intptr_t)(void *)argv;
223	(*stack_base)--;
224	**stack_base = imgp->argc;
225	return 0;
226}
227
228static int
229elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
230{
231	Elf32_Auxargs *args;
232	register_t *pos;
233
234	KASSERT(curthread->td_proc == imgp->proc &&
235	    (curthread->td_proc->p_flag & P_THREADED) == 0,
236	    ("unsafe elf_linux_fixup(), should be curproc"));
237	args = (Elf32_Auxargs *)imgp->auxargs;
238	pos = *stack_base + (imgp->argc + imgp->envc + 2);
239
240	if (args->trace)
241		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
242	if (args->execfd != -1)
243		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
244	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
245	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
246	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
247	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
248	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
249	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
250	AUXARGS_ENTRY(pos, AT_BASE, args->base);
251	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
252	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
253	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
254	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
255	AUXARGS_ENTRY(pos, AT_NULL, 0);
256
257	free(imgp->auxargs, M_TEMP);
258	imgp->auxargs = NULL;
259
260	(*stack_base)--;
261	**stack_base = (register_t)imgp->argc;
262	return 0;
263}
264
/*
 * Segment selectors loaded into %cs and into %ds/%es/%fs/%ss when the
 * sendsig routines below redirect a thread into its signal handler.
 */
265extern int _ucodesel, _udatasel;
/*
 * Offset that linux_rt_sendsig() adds to the trampoline base to obtain the
 * entry point for rt handlers; presumably the size of the non-rt part of
 * the trampoline -- confirm where it is defined.
 */
266extern unsigned long linux_sznonrtsigcode;
267
268static void
269linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
270{
271	struct thread *td = curthread;
272	struct proc *p = td->td_proc;
273	struct trapframe *regs;
274	struct l_rt_sigframe *fp, frame;
275	int oonstack;
276
277	PROC_LOCK_ASSERT(p, MA_OWNED);
278	regs = td->td_frame;
279	oonstack = sigonstack(regs->tf_esp);
280
281#ifdef DEBUG
282	if (ldebug(rt_sendsig))
283		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
284		    catcher, sig, (void*)mask, code);
285#endif
286	/*
287	 * Allocate space for the signal handler context.
288	 */
289	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
290	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
291		fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp +
292		    p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe));
293	} else
294		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
295	PROC_UNLOCK(p);
296
297	/*
298	 * Build the argument list for the signal handler.
299	 */
300	if (p->p_sysent->sv_sigtbl)
301		if (sig <= p->p_sysent->sv_sigsize)
302			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
303
304	bzero(&frame, sizeof(frame));
305
306	frame.sf_handler = catcher;
307	frame.sf_sig = sig;
308	frame.sf_siginfo = &fp->sf_si;
309	frame.sf_ucontext = &fp->sf_sc;
310
311	/* Fill in POSIX parts */
312	frame.sf_si.lsi_signo = sig;
313	frame.sf_si.lsi_code = code;
314	frame.sf_si.lsi_addr = (void *)regs->tf_err;
315
316	/*
317	 * Build the signal context to be used by sigreturn.
318	 */
319	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
320	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
321
322	PROC_LOCK(p);
323	frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp;
324	frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size;
325	frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
326	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
327	PROC_UNLOCK(p);
328
329	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
330
331	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
332	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
333	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
334	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
335	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
336	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
337	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
338	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
339	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
340	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
341	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
342	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
343	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
344	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
345	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
346	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
347	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
348	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
349	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
350
351#ifdef DEBUG
352	if (ldebug(rt_sendsig))
353		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
354		    frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp,
355		    p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
356#endif
357
358	if (copyout(&frame, fp, sizeof(frame)) != 0) {
359		/*
360		 * Process has trashed its stack; give it an illegal
361		 * instruction to halt it in its tracks.
362		 */
363#ifdef DEBUG
364		if (ldebug(rt_sendsig))
365			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
366			    fp, oonstack);
367#endif
368		PROC_LOCK(p);
369		sigexit(td, SIGILL);
370	}
371
372	/*
373	 * Build context to run handler in.
374	 */
375	regs->tf_esp = (int)fp;
376	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
377	    linux_sznonrtsigcode;
378	regs->tf_eflags &= ~(PSL_T | PSL_VM);
379	regs->tf_cs = _ucodesel;
380	regs->tf_ds = _udatasel;
381	regs->tf_es = _udatasel;
382	regs->tf_fs = _udatasel;
383	regs->tf_ss = _udatasel;
384	PROC_LOCK(p);
385}
386
387
388/*
389 * Send an interrupt to process.
390 *
391 * Stack is set up to allow sigcode stored
392 * in u. to call routine, followed by kcall
393 * to sigreturn routine below.  After sigreturn
394 * resets the signal mask, the stack, and the
395 * frame pointer, it returns to the user
396 * specified pc, psl.
397 */
398static void
399linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
400{
401	struct thread *td = curthread;
402	struct proc *p = td->td_proc;
403	struct trapframe *regs;
404	struct l_sigframe *fp, frame;
405	l_sigset_t lmask;
406	int oonstack, i;
407
408	PROC_LOCK_ASSERT(p, MA_OWNED);
409	if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) {
410		/* Signal handler installed with SA_SIGINFO. */
411		linux_rt_sendsig(catcher, sig, mask, code);
412		return;
413	}
414
415	regs = td->td_frame;
416	oonstack = sigonstack(regs->tf_esp);
417
418#ifdef DEBUG
419	if (ldebug(sendsig))
420		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
421		    catcher, sig, (void*)mask, code);
422#endif
423
424	/*
425	 * Allocate space for the signal handler context.
426	 */
427	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
428	    SIGISMEMBER(p->p_sigacts->ps_sigonstack, sig)) {
429		fp = (struct l_sigframe *)(p->p_sigstk.ss_sp +
430		    p->p_sigstk.ss_size - sizeof(struct l_sigframe));
431	} else
432		fp = (struct l_sigframe *)regs->tf_esp - 1;
433	PROC_UNLOCK(p);
434
435	/*
436	 * Build the argument list for the signal handler.
437	 */
438	if (p->p_sysent->sv_sigtbl)
439		if (sig <= p->p_sysent->sv_sigsize)
440			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
441
442	bzero(&frame, sizeof(frame));
443
444	frame.sf_handler = catcher;
445	frame.sf_sig = sig;
446
447	bsd_to_linux_sigset(mask, &lmask);
448
449	/*
450	 * Build the signal context to be used by sigreturn.
451	 */
452	frame.sf_sc.sc_mask   = lmask.__bits[0];
453	frame.sf_sc.sc_gs     = rgs();
454	frame.sf_sc.sc_fs     = regs->tf_fs;
455	frame.sf_sc.sc_es     = regs->tf_es;
456	frame.sf_sc.sc_ds     = regs->tf_ds;
457	frame.sf_sc.sc_edi    = regs->tf_edi;
458	frame.sf_sc.sc_esi    = regs->tf_esi;
459	frame.sf_sc.sc_ebp    = regs->tf_ebp;
460	frame.sf_sc.sc_ebx    = regs->tf_ebx;
461	frame.sf_sc.sc_edx    = regs->tf_edx;
462	frame.sf_sc.sc_ecx    = regs->tf_ecx;
463	frame.sf_sc.sc_eax    = regs->tf_eax;
464	frame.sf_sc.sc_eip    = regs->tf_eip;
465	frame.sf_sc.sc_cs     = regs->tf_cs;
466	frame.sf_sc.sc_eflags = regs->tf_eflags;
467	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
468	frame.sf_sc.sc_ss     = regs->tf_ss;
469	frame.sf_sc.sc_err    = regs->tf_err;
470	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
471
472	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
473		frame.sf_extramask[i] = lmask.__bits[i+1];
474
475	if (copyout(&frame, fp, sizeof(frame)) != 0) {
476		/*
477		 * Process has trashed its stack; give it an illegal
478		 * instruction to halt it in its tracks.
479		 */
480		PROC_LOCK(p);
481		sigexit(td, SIGILL);
482	}
483
484	/*
485	 * Build context to run handler in.
486	 */
487	regs->tf_esp = (int)fp;
488	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
489	regs->tf_eflags &= ~(PSL_T | PSL_VM);
490	regs->tf_cs = _ucodesel;
491	regs->tf_ds = _udatasel;
492	regs->tf_es = _udatasel;
493	regs->tf_fs = _udatasel;
494	regs->tf_ss = _udatasel;
495	PROC_LOCK(p);
496}
497
498/*
499 * System call to cleanup state after a signal
500 * has been taken.  Reset signal mask and
501 * stack state from context left by sendsig (above).
502 * Return to previous pc and psl as specified by
503 * context left by sendsig. Check carefully to
504 * make sure that the user has not modified the
505 * psl to gain improper privileges or to cause
506 * a machine fault.
507 */
508int
509linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
510{
511	struct proc *p = td->td_proc;
512	struct l_sigframe frame;
513	struct trapframe *regs;
514	l_sigset_t lmask;
515	int eflags, i;
516
517	regs = td->td_frame;
518
519#ifdef DEBUG
520	if (ldebug(sigreturn))
521		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
522#endif
523	/*
524	 * The trampoline code hands us the sigframe.
525	 * It is unsafe to keep track of it ourselves, in the event that a
526	 * program jumps out of a signal handler.
527	 */
528	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
529		return (EFAULT);
530
531	/*
532	 * Check for security violations.
533	 */
534#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
535	eflags = frame.sf_sc.sc_eflags;
536	/*
537	 * XXX do allow users to change the privileged flag PSL_RF.  The
538	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
539	 * sometimes set it there too.  tf_eflags is kept in the signal
540	 * context during signal handling and there is no other place
541	 * to remember it, so the PSL_RF bit may be corrupted by the
542	 * signal handler without us knowing.  Corruption of the PSL_RF
543	 * bit at worst causes one more or one less debugger trap, so
544	 * allowing it is fairly harmless.
545	 */
546	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
547		return(EINVAL);
548
549	/*
550	 * Don't allow users to load a valid privileged %cs.  Let the
551	 * hardware check for invalid selectors, excess privilege in
552	 * other selectors, invalid %eip's and invalid %esp's.
553	 */
554#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
555	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
556		trapsignal(p, SIGBUS, T_PROTFLT);
557		return(EINVAL);
558	}
559
560	lmask.__bits[0] = frame.sf_sc.sc_mask;
561	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
562		lmask.__bits[i+1] = frame.sf_extramask[i];
563	PROC_LOCK(p);
564	linux_to_bsd_sigset(&lmask, &p->p_sigmask);
565	SIG_CANTMASK(p->p_sigmask);
566	signotify(p);
567	PROC_UNLOCK(p);
568
569	/*
570	 * Restore signal context.
571	 */
572	/* %gs was restored by the trampoline. */
573	regs->tf_fs     = frame.sf_sc.sc_fs;
574	regs->tf_es     = frame.sf_sc.sc_es;
575	regs->tf_ds     = frame.sf_sc.sc_ds;
576	regs->tf_edi    = frame.sf_sc.sc_edi;
577	regs->tf_esi    = frame.sf_sc.sc_esi;
578	regs->tf_ebp    = frame.sf_sc.sc_ebp;
579	regs->tf_ebx    = frame.sf_sc.sc_ebx;
580	regs->tf_edx    = frame.sf_sc.sc_edx;
581	regs->tf_ecx    = frame.sf_sc.sc_ecx;
582	regs->tf_eax    = frame.sf_sc.sc_eax;
583	regs->tf_eip    = frame.sf_sc.sc_eip;
584	regs->tf_cs     = frame.sf_sc.sc_cs;
585	regs->tf_eflags = eflags;
586	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
587	regs->tf_ss     = frame.sf_sc.sc_ss;
588
589	return (EJUSTRETURN);
590}
591
592/*
593 * System call to cleanup state after a signal
594 * has been taken.  Reset signal mask and
595 * stack state from context left by rt_sendsig (above).
596 * Return to previous pc and psl as specified by
597 * context left by sendsig. Check carefully to
598 * make sure that the user has not modified the
599 * psl to gain improper privileges or to cause
600 * a machine fault.
601 */
602int
603linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
604{
605	struct proc *p = td->td_proc;
606	struct l_ucontext uc;
607	struct l_sigcontext *context;
608	l_stack_t *lss;
609	stack_t ss;
610	struct trapframe *regs;
611	int eflags;
612
613	regs = td->td_frame;
614
615#ifdef DEBUG
616	if (ldebug(rt_sigreturn))
617		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
618#endif
619	/*
620	 * The trampoline code hands us the ucontext.
621	 * It is unsafe to keep track of it ourselves, in the event that a
622	 * program jumps out of a signal handler.
623	 */
624	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
625		return (EFAULT);
626
627	context = &uc.uc_mcontext;
628
629	/*
630	 * Check for security violations.
631	 */
632#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
633	eflags = context->sc_eflags;
634	/*
635	 * XXX do allow users to change the privileged flag PSL_RF.  The
636	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
637	 * sometimes set it there too.  tf_eflags is kept in the signal
638	 * context during signal handling and there is no other place
639	 * to remember it, so the PSL_RF bit may be corrupted by the
640	 * signal handler without us knowing.  Corruption of the PSL_RF
641	 * bit at worst causes one more or one less debugger trap, so
642	 * allowing it is fairly harmless.
643	 */
644	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF))
645		return(EINVAL);
646
647	/*
648	 * Don't allow users to load a valid privileged %cs.  Let the
649	 * hardware check for invalid selectors, excess privilege in
650	 * other selectors, invalid %eip's and invalid %esp's.
651	 */
652#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
653	if (!CS_SECURE(context->sc_cs)) {
654		trapsignal(p, SIGBUS, T_PROTFLT);
655		return(EINVAL);
656	}
657
658	PROC_LOCK(p);
659	linux_to_bsd_sigset(&uc.uc_sigmask, &p->p_sigmask);
660	SIG_CANTMASK(p->p_sigmask);
661	signotify(p);
662	PROC_UNLOCK(p);
663
664	/*
665	 * Restore signal context
666	 */
667	/* %gs was restored by the trampoline. */
668	regs->tf_fs     = context->sc_fs;
669	regs->tf_es     = context->sc_es;
670	regs->tf_ds     = context->sc_ds;
671	regs->tf_edi    = context->sc_edi;
672	regs->tf_esi    = context->sc_esi;
673	regs->tf_ebp    = context->sc_ebp;
674	regs->tf_ebx    = context->sc_ebx;
675	regs->tf_edx    = context->sc_edx;
676	regs->tf_ecx    = context->sc_ecx;
677	regs->tf_eax    = context->sc_eax;
678	regs->tf_eip    = context->sc_eip;
679	regs->tf_cs     = context->sc_cs;
680	regs->tf_eflags = eflags;
681	regs->tf_esp    = context->sc_esp_at_signal;
682	regs->tf_ss     = context->sc_ss;
683
684	/*
685	 * call sigaltstack & ignore results..
686	 */
687	lss = &uc.uc_stack;
688	ss.ss_sp = lss->ss_sp;
689	ss.ss_size = lss->ss_size;
690	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
691
692#ifdef DEBUG
693	if (ldebug(rt_sigreturn))
694		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
695		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
696#endif
697	(void)kern_sigaltstack(td, &ss, NULL);
698
699	return (EJUSTRETURN);
700}
701
702/*
703 * MPSAFE
704 */
705static void
706linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
707{
708	args[0] = tf->tf_ebx;
709	args[1] = tf->tf_ecx;
710	args[2] = tf->tf_edx;
711	args[3] = tf->tf_esi;
712	args[4] = tf->tf_edi;
713	args[5] = tf->tf_ebp;	/* Unconfirmed */
714	*params = NULL;		/* no copyin */
715}
716
717
718
719/*
720 * Dump core, into a file named as described in the comments for
721 * expand_name(), unless the process was setuid/setgid.
722 */
723static int
724linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit)
725{
726	struct proc *p = td->td_proc;
727	struct ucred *cred = td->td_ucred;
728	struct vmspace *vm = p->p_vmspace;
729	char *tempuser;
730	int error;
731
732	if (ctob((uarea_pages + kstack_pages) +
733	    vm->vm_dsize + vm->vm_ssize) >= limit)
734		return (EFAULT);
735	tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
736	    M_WAITOK | M_ZERO);
737	if (tempuser == NULL)
738		return (ENOMEM);
739	PROC_LOCK(p);
740	fill_kinfo_proc(p, &p->p_uarea->u_kproc);
741	PROC_UNLOCK(p);
742	bcopy(p->p_uarea, tempuser, sizeof(struct user));
743	bcopy(td->td_frame,
744	    tempuser + ctob(uarea_pages) +
745	    ((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
746	    sizeof(struct trapframe));
747	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
748	    ctob(uarea_pages + kstack_pages),
749	    (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
750	    (int *)NULL, td);
751	free(tempuser, M_TEMP);
752	if (error == 0)
753		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
754		    (int)ctob(vm->vm_dsize),
755		    (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
756		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
757	if (error == 0)
758		error = vn_rdwr_inchunks(UIO_WRITE, vp,
759		    (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
760		    round_page(ctob(vm->vm_ssize)),
761		    (off_t)ctob(uarea_pages + kstack_pages) +
762			ctob(vm->vm_dsize), UIO_USERSPACE,
763		    IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
764	return (error);
765}
766/*
767 * If a linux binary is exec'ing something, try this image activator
768 * first.  We override standard shell script execution in order to
769 * be able to modify the interpreter path.  We only do this if a linux
770 * binary is doing the exec, so we do not create an EXEC module for it.
771 */
772static int	exec_linux_imgact_try(struct image_params *iparams);
773
774static int
775exec_linux_imgact_try(struct image_params *imgp)
776{
777    const char *head = (const char *)imgp->image_header;
778    int error = -1;
779
780    /*
781     * The interpreter for shell scripts run from a linux binary needs
782     * to be located in /compat/linux if possible in order to recursively
783     * maintain linux path emulation.
784     */
785    if (((const short *)head)[0] == SHELLMAGIC) {
786	    /*
787	     * Run our normal shell image activator.  If it succeeds attempt
788	     * to use the alternate path for the interpreter.  If an alternate
789	     * path is found, use our stringspace to store it.
790	     */
791	    if ((error = exec_shell_imgact(imgp)) == 0) {
792		    char *rpath = NULL;
793
794		    linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL,
795			imgp->interpreter_name, &rpath, 0);
796		    if (rpath != imgp->interpreter_name) {
797			    int len = strlen(rpath) + 1;
798
799			    if (len <= MAXSHELLCMDLEN) {
800				    memcpy(imgp->interpreter_name, rpath, len);
801			    }
802			    free(rpath, M_TEMP);
803		    }
804	    }
805    }
806    return(error);
807}
808
/*
 * ABI description for Linux a.out binaries (see the name string).
 *
 * This is a positional initializer, so the order of the entries has to
 * match the declaration of struct sysentvec, which lives outside this
 * file.  It ties together the pieces defined above: the syscall table and
 * its size, the signal and errno translation tables with their sizes,
 * translate_traps(), linux_fixup(), linux_sendsig(), the signal
 * trampoline and a pointer to its size, linux_prepsyscall(),
 * linux_aout_coredump() and exec_linux_imgact_try(), followed by stack
 * and address space parameters and the generic exec helpers.
 *
 * The sendsig routines read the sv_sigtbl, sv_sigsize and sv_szsigcode
 * members back through p->p_sysent; presumably those are the members
 * initialized by bsd_to_linux_signal, LINUX_SIGTBLSZ and &linux_szsigcode.
 *
 * Not referenced elsewhere in this file.
 */
809struct sysentvec linux_sysvec = {
810	LINUX_SYS_MAXSYSCALL,
811	linux_sysent,
812	0xff,
813	LINUX_SIGTBLSZ,
814	bsd_to_linux_signal,
815	ELAST + 1,
816	bsd_to_linux_errno,
817	translate_traps,
818	linux_fixup,
819	linux_sendsig,
820	linux_sigcode,
821	&linux_szsigcode,
822	linux_prepsyscall,
823	"Linux a.out",
824	linux_aout_coredump,
825	exec_linux_imgact_try,
826	LINUX_MINSIGSTKSZ,
827	PAGE_SIZE,
828	VM_MIN_ADDRESS,
829	VM_MAXUSER_ADDRESS,
830	USRSTACK,
831	PS_STRINGS,
832	VM_PROT_ALL,
833	exec_copyout_strings,
834	exec_setregs
835};
836
/*
 * ABI description for Linux ELF binaries (see the name string); both ELF
 * brand descriptors below point at it.
 *
 * Positional initializer whose entry order has to match struct sysentvec
 * (declared outside this file).  It differs from the a.out vector in
 * exactly three entries: the stack fixup routine is elf_linux_fixup(),
 * the name is "Linux ELF" and core dumps are written by elf32_coredump.
 */
837struct sysentvec elf_linux_sysvec = {
838	LINUX_SYS_MAXSYSCALL,
839	linux_sysent,
840	0xff,
841	LINUX_SIGTBLSZ,
842	bsd_to_linux_signal,
843	ELAST + 1,
844	bsd_to_linux_errno,
845	translate_traps,
846	elf_linux_fixup,
847	linux_sendsig,
848	linux_sigcode,
849	&linux_szsigcode,
850	linux_prepsyscall,
851	"Linux ELF",
852	elf32_coredump,
853	exec_linux_imgact_try,
854	LINUX_MINSIGSTKSZ,
855	PAGE_SIZE,
856	VM_MIN_ADDRESS,
857	VM_MAXUSER_ADDRESS,
858	USRSTACK,
859	PS_STRINGS,
860	VM_PROT_ALL,
861	exec_copyout_strings,
862	exec_setregs
863};
864
/*
 * ELF brand descriptors (positional initializers; the member order is
 * given by Elf32_Brandinfo, declared outside this file).  The two brands
 * are identical except for the dynamic linker path: "/lib/ld-linux.so.1"
 * here and "/lib/ld-linux.so.2" in linux_glibc2brand.  Both select
 * elf_linux_sysvec and name "/compat/linux", presumably the root of the
 * emulated Linux file tree.
 */
865static Elf32_Brandinfo linux_brand = {
866					ELFOSABI_LINUX,
867					EM_386,
868					"Linux",
869					"/compat/linux",
870					"/lib/ld-linux.so.1",
871					&elf_linux_sysvec
872				 };
873
874static Elf32_Brandinfo linux_glibc2brand = {
875					ELFOSABI_LINUX,
876					EM_386,
877					"Linux",
878					"/compat/linux",
879					"/lib/ld-linux.so.2",
880					&elf_linux_sysvec
881				 };
882
/*
 * NULL-terminated list of all brands; linux_elf_modevent() walks it to
 * insert the brands on load and to check and remove them on unload.
 */
883Elf32_Brandinfo *linux_brandlist[] = {
884					&linux_brand,
885					&linux_glibc2brand,
886					NULL
887				};
888
889static int
890linux_elf_modevent(module_t mod, int type, void *data)
891{
892	Elf32_Brandinfo **brandinfo;
893	int error;
894	struct linux_ioctl_handler **lihp;
895
896	error = 0;
897
898	switch(type) {
899	case MOD_LOAD:
900		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
901		     ++brandinfo)
902			if (elf32_insert_brand_entry(*brandinfo) < 0)
903				error = EINVAL;
904		if (error == 0) {
905			SET_FOREACH(lihp, linux_ioctl_handler_set)
906				linux_ioctl_register_handler(*lihp);
907			if (bootverbose)
908				printf("Linux ELF exec handler installed\n");
909		} else
910			printf("cannot insert Linux ELF brand handler\n");
911		break;
912	case MOD_UNLOAD:
913		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
914		     ++brandinfo)
915			if (elf32_brand_inuse(*brandinfo))
916				error = EBUSY;
917		if (error == 0) {
918			for (brandinfo = &linux_brandlist[0];
919			     *brandinfo != NULL; ++brandinfo)
920				if (elf32_remove_brand_entry(*brandinfo) < 0)
921					error = EINVAL;
922		}
923		if (error == 0) {
924			SET_FOREACH(lihp, linux_ioctl_handler_set)
925				linux_ioctl_unregister_handler(*lihp);
926			if (bootverbose)
927				printf("Linux ELF exec handler removed\n");
928			linux_mib_destroy();
929		} else
930			printf("Could not deinstall ELF interpreter entry\n");
931		break;
932	default:
933		break;
934	}
935	return error;
936}
937
/*
 * Module description: name "linuxelf", event handler linux_elf_modevent()
 * and a third entry of 0 (presumably unused extra data passed to the
 * handler -- confirm against moduledata_t).
 */
938static moduledata_t linux_elf_mod = {
939	"linuxelf",
940	linux_elf_modevent,
941	0
942};
943
/* Register the module at the SI_SUB_EXEC stage with SI_ORDER_ANY. */
944DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
945