1133819Stjr/*-
2133819Stjr * Copyright (c) 2004 Tim J. Robbins
3133819Stjr * Copyright (c) 2003 Peter Wemm
4133819Stjr * Copyright (c) 2002 Doug Rabson
5133819Stjr * Copyright (c) 1998-1999 Andrew Gallatin
 * Copyright (c) 1994-1996 Søren Schmidt
7133819Stjr * All rights reserved.
8133819Stjr *
9133819Stjr * Redistribution and use in source and binary forms, with or without
10133819Stjr * modification, are permitted provided that the following conditions
11133819Stjr * are met:
12133819Stjr * 1. Redistributions of source code must retain the above copyright
13133819Stjr *    notice, this list of conditions and the following disclaimer
14133819Stjr *    in this position and unchanged.
15133819Stjr * 2. Redistributions in binary form must reproduce the above copyright
16133819Stjr *    notice, this list of conditions and the following disclaimer in the
17133819Stjr *    documentation and/or other materials provided with the distribution.
18133819Stjr * 3. The name of the author may not be used to endorse or promote products
19133819Stjr *    derived from this software without specific prior written permission
20133819Stjr *
21133819Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22133819Stjr * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23133819Stjr * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24133819Stjr * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25133819Stjr * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26133819Stjr * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27133819Stjr * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28133819Stjr * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29133819Stjr * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30133819Stjr * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31133819Stjr */
32133819Stjr
33133819Stjr#include <sys/cdefs.h>
34133819Stjr__FBSDID("$FreeBSD: releng/10.3/sys/amd64/linux32/linux32_sysvec.c 294901 2016-01-27 07:28:55Z delphij $");
35156874Sru#include "opt_compat.h"
36133819Stjr
37205014Snwhitehorn#ifndef COMPAT_FREEBSD32
38205014Snwhitehorn#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39133819Stjr#endif
40133819Stjr
41133819Stjr#define	__ELF_WORD_SIZE	32
42133819Stjr
43133819Stjr#include <sys/param.h>
44133819Stjr#include <sys/systm.h>
45133819Stjr#include <sys/exec.h>
46177997Skib#include <sys/fcntl.h>
47133819Stjr#include <sys/imgact.h>
48133819Stjr#include <sys/imgact_elf.h>
49133819Stjr#include <sys/kernel.h>
50133819Stjr#include <sys/lock.h>
51133819Stjr#include <sys/malloc.h>
52133819Stjr#include <sys/module.h>
53133819Stjr#include <sys/mutex.h>
54133819Stjr#include <sys/proc.h>
55138129Sdas#include <sys/resourcevar.h>
56133819Stjr#include <sys/signalvar.h>
57133819Stjr#include <sys/sysctl.h>
58133819Stjr#include <sys/syscallsubr.h>
59133819Stjr#include <sys/sysent.h>
60133819Stjr#include <sys/sysproto.h>
61133819Stjr#include <sys/vnode.h>
62161315Snetchild#include <sys/eventhandler.h>
63133819Stjr
64133819Stjr#include <vm/vm.h>
65133819Stjr#include <vm/pmap.h>
66133819Stjr#include <vm/vm_extern.h>
67133819Stjr#include <vm/vm_map.h>
68133819Stjr#include <vm/vm_object.h>
69133819Stjr#include <vm/vm_page.h>
70133819Stjr#include <vm/vm_param.h>
71133819Stjr
72133819Stjr#include <machine/cpu.h>
73133819Stjr#include <machine/md_var.h>
74138129Sdas#include <machine/pcb.h>
75133819Stjr#include <machine/specialreg.h>
76133819Stjr
77133819Stjr#include <amd64/linux32/linux.h>
78133819Stjr#include <amd64/linux32/linux32_proto.h>
79218658Sdchagin#include <compat/linux/linux_emul.h>
80191741Sdchagin#include <compat/linux/linux_futex.h>
81246085Sjhb#include <compat/linux/linux_ioctl.h>
82133819Stjr#include <compat/linux/linux_mib.h>
83189362Sdchagin#include <compat/linux/linux_misc.h>
84133819Stjr#include <compat/linux/linux_signal.h>
85133819Stjr#include <compat/linux/linux_util.h>
86293514Sdchagin#include <compat/linux/linux_vdso.h>
87133819Stjr
88133819StjrMODULE_VERSION(linux, 1);
89133819Stjr
/*
 * Store one auxiliary-vector entry as two consecutive 32-bit user-space
 * words: first the id, then the value.  `pos' is advanced twice, so it must
 * be a modifiable lvalue; the results of the suword32() stores are not
 * checked.  Arguments are parenthesized so that arbitrary expressions can
 * be passed safely.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
95133819Stjr
/*
 * The two characters "#!" as they appear when the first two bytes of an
 * image header are read as one 16-bit quantity in host byte order.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define	SHELLMAGIC	0x2123	/* '#' is the low byte, '!' the high byte */
#else
#define	SHELLMAGIC	0x2321
#endif
101133819Stjr
/*
 * The sendsig functions are not system calls, but they still want to use
 * the ldebug() facility.  Give them pseudo syscall numbers that map to
 * syscall 0, which is slightly less bogus than borrowing
 * ldebug(sigreturn).
 */
#define	LINUX32_SYS_linux_rt_sendsig	0
#define	LINUX32_SYS_linux_sendsig	0
110133819Stjr
111293516Sdchaginconst char *linux_kplatform;
112293514Sdchaginstatic int linux_szsigcode;
113293514Sdchaginstatic vm_object_t linux_shared_page_obj;
114293514Sdchaginstatic char *linux_shared_page_mapping;
115293514Sdchaginextern char _binary_linux32_locore_o_start;
116293514Sdchaginextern char _binary_linux32_locore_o_end;
117133819Stjr
118294368Sjhbextern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL];
119133819Stjr
120133819StjrSET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
121133819Stjr
122133819Stjrstatic int	elf_linux_fixup(register_t **stack_base,
123133819Stjr		    struct image_params *iparams);
124133819Stjrstatic register_t *linux_copyout_strings(struct image_params *imgp);
125151316Sdavidxustatic void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
126205642Snwhitehornstatic void	exec_linux_setregs(struct thread *td,
127205642Snwhitehorn				   struct image_params *imgp, u_long stack);
128169565Sjhbstatic void	linux32_fixlimit(struct rlimit *rl, int which);
129196512Sbzstatic boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
130293514Sdchaginstatic void	linux_vdso_install(void *param);
131293514Sdchaginstatic void	linux_vdso_deinstall(void *param);
132133819Stjr
133133819Stjr/*
134133819Stjr * Linux syscalls return negative errno's, we do positive and map them
135161204Snetchild * Reference:
136161204Snetchild *   FreeBSD: src/sys/sys/errno.h
137161204Snetchild *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
138161204Snetchild *            linux-2.6.17.8/include/asm-generic/errno.h
139133819Stjr */
140133819Stjrstatic int bsd_to_linux_errno[ELAST + 1] = {
141133819Stjr	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
142133819Stjr	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143133819Stjr	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144133819Stjr	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145133819Stjr	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146133819Stjr	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147133819Stjr	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148133819Stjr	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
149161204Snetchild	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
150161204Snetchild	 -72, -67, -71
151133819Stjr};
152133819Stjr
/* Linux trap number reported when a FreeBSD trap code has no mapping. */
#define	LINUX_T_UNKNOWN	255

/*
 * FreeBSD trap code -> Linux trap number, indexed by the FreeBSD code.
 * The trailing comments name the FreeBSD trap that owns each slot.
 */
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0: (none) */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2: (none) */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4: (none) */
	LINUX_T_UNKNOWN,	/*  5: (none) */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8: (none) */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11: (none) */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13: (none) */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15: (none) */
	LINUX_T_UNKNOWN,	/* 16: (none) */
	LINUX_T_UNKNOWN,	/* 17: (none) */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/*
 * Bounds-checked lookup in the table above; codes past the end of the
 * table yield LINUX_T_UNKNOWN.  The argument is evaluated more than once.
 */
#define	bsd_to_linux_trapcode(code)					\
	((code) <							\
	    sizeof(_bsd_to_linux_trapcode) / sizeof(*_bsd_to_linux_trapcode) ? \
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
191133819Stjr
192133819Stjrstruct linux32_ps_strings {
193133819Stjr	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
194144011Sdas	u_int ps_nargvstr;	/* the number of argument strings */
195133819Stjr	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
196144011Sdas	u_int ps_nenvstr;	/* the number of environment strings */
197133819Stjr};
198133819Stjr
/*
 * Symbols declared through the linux_vdso.h helper macros.  In this file
 * the sigcode symbols are loaded into %rip when a signal handler is
 * entered, and linux32_vsyscall and linux_platform are exported through
 * the ELF auxiliary vector.
 * NOTE(review): presumably these are resolved against the VDSO image when
 * it is installed -- confirm in linux_vdso.h.
 */
LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
LINUX_VDSO_SYM_CHAR(linux_platform);
203293514Sdchagin
204133819Stjr/*
205133819Stjr * If FreeBSD & Linux have a difference of opinion about what a trap
206133819Stjr * means, deal with it here.
207133819Stjr *
208133819Stjr * MPSAFE
209133819Stjr */
210133819Stjrstatic int
211133819Stjrtranslate_traps(int signal, int trap_code)
212133819Stjr{
213133819Stjr	if (signal != SIGBUS)
214133819Stjr		return signal;
215133819Stjr	switch (trap_code) {
216133819Stjr	case T_PROTFLT:
217133819Stjr	case T_TSSFLT:
218133819Stjr	case T_DOUBLEFLT:
219133819Stjr	case T_PAGEFLT:
220133819Stjr		return SIGSEGV;
221133819Stjr	default:
222133819Stjr		return signal;
223133819Stjr	}
224133819Stjr}
225133819Stjr
226133819Stjrstatic int
227133819Stjrelf_linux_fixup(register_t **stack_base, struct image_params *imgp)
228133819Stjr{
229133819Stjr	Elf32_Auxargs *args;
230133819Stjr	Elf32_Addr *base;
231293516Sdchagin	Elf32_Addr *pos;
232189362Sdchagin	struct linux32_ps_strings *arginfo;
233294901Sdelphij	int issetugid;
234133819Stjr
235189362Sdchagin	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
236189362Sdchagin
237177091Sjeff	KASSERT(curthread->td_proc == imgp->proc,
238133819Stjr	    ("unsafe elf_linux_fixup(), should be curproc"));
239133819Stjr	base = (Elf32_Addr *)*stack_base;
240133819Stjr	args = (Elf32_Auxargs *)imgp->auxargs;
241140992Ssobomax	pos = base + (imgp->args->argc + imgp->args->envc + 2);
242133819Stjr
243294901Sdelphij	issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0;
244293514Sdchagin	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
245293514Sdchagin	    imgp->proc->p_sysent->sv_shared_page_base);
246293514Sdchagin	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
247189362Sdchagin	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
248191973Sdchagin
249191973Sdchagin	/*
250191973Sdchagin	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
251191973Sdchagin	 * as it has appeared in the 2.4.0-rc7 first time.
252191973Sdchagin	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
253191973Sdchagin	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
254191973Sdchagin	 * is not present.
255191973Sdchagin	 * Also see linux_times() implementation.
256191973Sdchagin	 */
257191973Sdchagin	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
258191973Sdchagin		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
259133819Stjr	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
260133819Stjr	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
261133819Stjr	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
262133819Stjr	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
263133819Stjr	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
264133819Stjr	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
265133819Stjr	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
266294901Sdelphij	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, issetugid);
267133819Stjr	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
268133819Stjr	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
269133819Stjr	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
270133819Stjr	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
271293516Sdchagin	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
272293535Sdchagin	AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary));
273293535Sdchagin	if (imgp->execpathp != 0)
274293535Sdchagin		AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp));
275189362Sdchagin	if (args->execfd != -1)
276189362Sdchagin		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
277133819Stjr	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
278133819Stjr
279133819Stjr	free(imgp->auxargs, M_TEMP);
280133819Stjr	imgp->auxargs = NULL;
281133819Stjr
282133819Stjr	base--;
283140992Ssobomax	suword32(base, (uint32_t)imgp->args->argc);
284133819Stjr	*stack_base = (register_t *)base;
285293495Sdchagin	return (0);
286133819Stjr}
287133819Stjr
288133819Stjrstatic void
289151316Sdavidxulinux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
290133819Stjr{
291133819Stjr	struct thread *td = curthread;
292133819Stjr	struct proc *p = td->td_proc;
293133819Stjr	struct sigacts *psp;
294133819Stjr	struct trapframe *regs;
295133819Stjr	struct l_rt_sigframe *fp, frame;
296133819Stjr	int oonstack;
297151316Sdavidxu	int sig;
298151316Sdavidxu	int code;
299151316Sdavidxu
300151316Sdavidxu	sig = ksi->ksi_signo;
301151316Sdavidxu	code = ksi->ksi_code;
302133819Stjr	PROC_LOCK_ASSERT(p, MA_OWNED);
303133819Stjr	psp = p->p_sigacts;
304133819Stjr	mtx_assert(&psp->ps_mtx, MA_OWNED);
305133819Stjr	regs = td->td_frame;
306133819Stjr	oonstack = sigonstack(regs->tf_rsp);
307133819Stjr
308133819Stjr#ifdef DEBUG
309133819Stjr	if (ldebug(rt_sendsig))
310151343Sjhb		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
311133819Stjr		    catcher, sig, (void*)mask, code);
312133819Stjr#endif
313133819Stjr	/*
314133819Stjr	 * Allocate space for the signal handler context.
315133819Stjr	 */
316133819Stjr	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
317133819Stjr	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
318133819Stjr		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
319133819Stjr		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
320133819Stjr	} else
321133819Stjr		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
322133819Stjr	mtx_unlock(&psp->ps_mtx);
323133819Stjr
324133819Stjr	/*
325133819Stjr	 * Build the argument list for the signal handler.
326133819Stjr	 */
327293575Sdchagin	sig = bsd_to_linux_signal(sig);
328133819Stjr
329133819Stjr	bzero(&frame, sizeof(frame));
330133819Stjr
331133819Stjr	frame.sf_handler = PTROUT(catcher);
332133819Stjr	frame.sf_sig = sig;
333133819Stjr	frame.sf_siginfo = PTROUT(&fp->sf_si);
334133819Stjr	frame.sf_ucontext = PTROUT(&fp->sf_sc);
335133819Stjr
336133819Stjr	/* Fill in POSIX parts */
337184058Skib	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
338133819Stjr
339133819Stjr	/*
340293514Sdchagin	 * Build the signal context to be used by sigreturn
341293514Sdchagin	 * and libgcc unwind.
342133819Stjr	 */
343133819Stjr	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
344133819Stjr	frame.sf_sc.uc_link = 0;		/* XXX ??? */
345133819Stjr
346133819Stjr	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
347133819Stjr	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
348133819Stjr	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
349133819Stjr	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
350133819Stjr	PROC_UNLOCK(p);
351133819Stjr
352133819Stjr	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
353133819Stjr
354293575Sdchagin	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__mask;
355133819Stjr	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
356133819Stjr	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
357133819Stjr	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
358133819Stjr	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
359293514Sdchagin	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
360133819Stjr	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
361133819Stjr	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
362133819Stjr	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
363133819Stjr	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
364133819Stjr	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
365190620Skib	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
366190620Skib	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
367190620Skib	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
368190620Skib	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
369133819Stjr	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
370133819Stjr	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
371133819Stjr	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
372133819Stjr	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
373172255Skib	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
374133819Stjr	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
375133819Stjr
376133819Stjr#ifdef DEBUG
377133819Stjr	if (ldebug(rt_sendsig))
378133844Sobrien		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
379133819Stjr		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
380133819Stjr		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
381133819Stjr#endif
382133819Stjr
383133819Stjr	if (copyout(&frame, fp, sizeof(frame)) != 0) {
384133819Stjr		/*
385133819Stjr		 * Process has trashed its stack; give it an illegal
386133819Stjr		 * instruction to halt it in its tracks.
387133819Stjr		 */
388133819Stjr#ifdef DEBUG
389133819Stjr		if (ldebug(rt_sendsig))
390133819Stjr			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
391133819Stjr			    fp, oonstack);
392133819Stjr#endif
393133819Stjr		PROC_LOCK(p);
394133819Stjr		sigexit(td, SIGILL);
395133819Stjr	}
396133819Stjr
397133819Stjr	/*
398133819Stjr	 * Build context to run handler in.
399133819Stjr	 */
400133819Stjr	regs->tf_rsp = PTROUT(fp);
401293514Sdchagin	regs->tf_rip = linux32_rt_sigcode;
402177145Skib	regs->tf_rflags &= ~(PSL_T | PSL_D);
403133819Stjr	regs->tf_cs = _ucode32sel;
404133819Stjr	regs->tf_ss = _udatasel;
405190620Skib	regs->tf_ds = _udatasel;
406190620Skib	regs->tf_es = _udatasel;
407190620Skib	regs->tf_fs = _ufssel;
408190620Skib	regs->tf_gs = _ugssel;
409190620Skib	regs->tf_flags = TF_HASSEGS;
410216634Sjkim	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
411133819Stjr	PROC_LOCK(p);
412133819Stjr	mtx_lock(&psp->ps_mtx);
413133819Stjr}
414133819Stjr
415133819Stjr
416133819Stjr/*
417133819Stjr * Send an interrupt to process.
418133819Stjr *
419133819Stjr * Stack is set up to allow sigcode stored
420133819Stjr * in u. to call routine, followed by kcall
421133819Stjr * to sigreturn routine below.  After sigreturn
422133819Stjr * resets the signal mask, the stack, and the
423133819Stjr * frame pointer, it returns to the user
424133819Stjr * specified pc, psl.
425133819Stjr */
426133819Stjrstatic void
427151316Sdavidxulinux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
428133819Stjr{
429133819Stjr	struct thread *td = curthread;
430133819Stjr	struct proc *p = td->td_proc;
431133819Stjr	struct sigacts *psp;
432133819Stjr	struct trapframe *regs;
433133819Stjr	struct l_sigframe *fp, frame;
434133819Stjr	l_sigset_t lmask;
435293575Sdchagin	int oonstack;
436151316Sdavidxu	int sig, code;
437133819Stjr
438151316Sdavidxu	sig = ksi->ksi_signo;
439151316Sdavidxu	code = ksi->ksi_code;
440133819Stjr	PROC_LOCK_ASSERT(p, MA_OWNED);
441133819Stjr	psp = p->p_sigacts;
442133819Stjr	mtx_assert(&psp->ps_mtx, MA_OWNED);
443133819Stjr	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
444133819Stjr		/* Signal handler installed with SA_SIGINFO. */
445151316Sdavidxu		linux_rt_sendsig(catcher, ksi, mask);
446133819Stjr		return;
447133819Stjr	}
448133819Stjr
449133819Stjr	regs = td->td_frame;
450133819Stjr	oonstack = sigonstack(regs->tf_rsp);
451133819Stjr
452133819Stjr#ifdef DEBUG
453133819Stjr	if (ldebug(sendsig))
454151343Sjhb		printf(ARGS(sendsig, "%p, %d, %p, %u"),
455133819Stjr		    catcher, sig, (void*)mask, code);
456133819Stjr#endif
457133819Stjr
458133819Stjr	/*
459133819Stjr	 * Allocate space for the signal handler context.
460133819Stjr	 */
461133819Stjr	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
462133819Stjr	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
463133819Stjr		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
464133819Stjr		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
465133819Stjr	} else
466133819Stjr		fp = (struct l_sigframe *)regs->tf_rsp - 1;
467133819Stjr	mtx_unlock(&psp->ps_mtx);
468133819Stjr	PROC_UNLOCK(p);
469133819Stjr
470133819Stjr	/*
471133819Stjr	 * Build the argument list for the signal handler.
472133819Stjr	 */
473293575Sdchagin	sig = bsd_to_linux_signal(sig);
474133819Stjr
475133819Stjr	bzero(&frame, sizeof(frame));
476133819Stjr
477133819Stjr	frame.sf_handler = PTROUT(catcher);
478133819Stjr	frame.sf_sig = sig;
479133819Stjr
480133819Stjr	bsd_to_linux_sigset(mask, &lmask);
481133819Stjr
482133819Stjr	/*
483133819Stjr	 * Build the signal context to be used by sigreturn.
484133819Stjr	 */
485293575Sdchagin	frame.sf_sc.sc_mask   = lmask.__mask;
486190620Skib	frame.sf_sc.sc_gs     = regs->tf_gs;
487190620Skib	frame.sf_sc.sc_fs     = regs->tf_fs;
488190620Skib	frame.sf_sc.sc_es     = regs->tf_es;
489190620Skib	frame.sf_sc.sc_ds     = regs->tf_ds;
490133819Stjr	frame.sf_sc.sc_edi    = regs->tf_rdi;
491133819Stjr	frame.sf_sc.sc_esi    = regs->tf_rsi;
492133819Stjr	frame.sf_sc.sc_ebp    = regs->tf_rbp;
493133819Stjr	frame.sf_sc.sc_ebx    = regs->tf_rbx;
494293514Sdchagin	frame.sf_sc.sc_esp    = regs->tf_rsp;
495133819Stjr	frame.sf_sc.sc_edx    = regs->tf_rdx;
496133819Stjr	frame.sf_sc.sc_ecx    = regs->tf_rcx;
497133819Stjr	frame.sf_sc.sc_eax    = regs->tf_rax;
498133819Stjr	frame.sf_sc.sc_eip    = regs->tf_rip;
499133819Stjr	frame.sf_sc.sc_cs     = regs->tf_cs;
500133819Stjr	frame.sf_sc.sc_eflags = regs->tf_rflags;
501133819Stjr	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
502133819Stjr	frame.sf_sc.sc_ss     = regs->tf_ss;
503133819Stjr	frame.sf_sc.sc_err    = regs->tf_err;
504172255Skib	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
505133819Stjr	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
506133819Stjr
507293575Sdchagin	frame.sf_extramask[0] = lmask.__mask;
508133819Stjr
509133819Stjr	if (copyout(&frame, fp, sizeof(frame)) != 0) {
510133819Stjr		/*
511133819Stjr		 * Process has trashed its stack; give it an illegal
512133819Stjr		 * instruction to halt it in its tracks.
513133819Stjr		 */
514133819Stjr		PROC_LOCK(p);
515133819Stjr		sigexit(td, SIGILL);
516133819Stjr	}
517133819Stjr
518133819Stjr	/*
519133819Stjr	 * Build context to run handler in.
520133819Stjr	 */
521133819Stjr	regs->tf_rsp = PTROUT(fp);
522293514Sdchagin	regs->tf_rip = linux32_sigcode;
523177145Skib	regs->tf_rflags &= ~(PSL_T | PSL_D);
524133819Stjr	regs->tf_cs = _ucode32sel;
525133819Stjr	regs->tf_ss = _udatasel;
526190620Skib	regs->tf_ds = _udatasel;
527190620Skib	regs->tf_es = _udatasel;
528190620Skib	regs->tf_fs = _ufssel;
529190620Skib	regs->tf_gs = _ugssel;
530190620Skib	regs->tf_flags = TF_HASSEGS;
531216634Sjkim	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
532133819Stjr	PROC_LOCK(p);
533133819Stjr	mtx_lock(&psp->ps_mtx);
534133819Stjr}
535133819Stjr
536133819Stjr/*
537133819Stjr * System call to cleanup state after a signal
538133819Stjr * has been taken.  Reset signal mask and
539133819Stjr * stack state from context left by sendsig (above).
540133819Stjr * Return to previous pc and psl as specified by
541133819Stjr * context left by sendsig. Check carefully to
542133819Stjr * make sure that the user has not modified the
543133819Stjr * psl to gain improper privileges or to cause
544133819Stjr * a machine fault.
545133819Stjr */
546133819Stjrint
547133819Stjrlinux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
548133819Stjr{
549133819Stjr	struct l_sigframe frame;
550133819Stjr	struct trapframe *regs;
551198507Skib	sigset_t bmask;
552133819Stjr	l_sigset_t lmask;
553293575Sdchagin	int eflags;
554151316Sdavidxu	ksiginfo_t ksi;
555133819Stjr
556133819Stjr	regs = td->td_frame;
557133819Stjr
558133819Stjr#ifdef DEBUG
559133819Stjr	if (ldebug(sigreturn))
560133819Stjr		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
561133819Stjr#endif
562133819Stjr	/*
563133819Stjr	 * The trampoline code hands us the sigframe.
564133819Stjr	 * It is unsafe to keep track of it ourselves, in the event that a
565133819Stjr	 * program jumps out of a signal handler.
566133819Stjr	 */
567133819Stjr	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
568133819Stjr		return (EFAULT);
569133819Stjr
570133819Stjr	/*
571133819Stjr	 * Check for security violations.
572133819Stjr	 */
573133819Stjr#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
574133819Stjr	eflags = frame.sf_sc.sc_eflags;
575258559Semaste	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
576133819Stjr		return(EINVAL);
577133819Stjr
578133819Stjr	/*
579133819Stjr	 * Don't allow users to load a valid privileged %cs.  Let the
580133819Stjr	 * hardware check for invalid selectors, excess privilege in
581133819Stjr	 * other selectors, invalid %eip's and invalid %esp's.
582133819Stjr	 */
583133819Stjr#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
584133819Stjr	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
585151316Sdavidxu		ksiginfo_init_trap(&ksi);
586151316Sdavidxu		ksi.ksi_signo = SIGBUS;
587151316Sdavidxu		ksi.ksi_code = BUS_OBJERR;
588151316Sdavidxu		ksi.ksi_trapno = T_PROTFLT;
589151316Sdavidxu		ksi.ksi_addr = (void *)regs->tf_rip;
590151316Sdavidxu		trapsignal(td, &ksi);
591133819Stjr		return(EINVAL);
592133819Stjr	}
593133819Stjr
594293575Sdchagin	lmask.__mask = frame.sf_sc.sc_mask;
595293575Sdchagin	lmask.__mask = frame.sf_extramask[0];
596198507Skib	linux_to_bsd_sigset(&lmask, &bmask);
597198507Skib	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
598133819Stjr
599133819Stjr	/*
600133819Stjr	 * Restore signal context.
601133819Stjr	 */
602133819Stjr	regs->tf_rdi    = frame.sf_sc.sc_edi;
603133819Stjr	regs->tf_rsi    = frame.sf_sc.sc_esi;
604133819Stjr	regs->tf_rbp    = frame.sf_sc.sc_ebp;
605133819Stjr	regs->tf_rbx    = frame.sf_sc.sc_ebx;
606133819Stjr	regs->tf_rdx    = frame.sf_sc.sc_edx;
607133819Stjr	regs->tf_rcx    = frame.sf_sc.sc_ecx;
608133819Stjr	regs->tf_rax    = frame.sf_sc.sc_eax;
609133819Stjr	regs->tf_rip    = frame.sf_sc.sc_eip;
610133819Stjr	regs->tf_cs     = frame.sf_sc.sc_cs;
611190620Skib	regs->tf_ds     = frame.sf_sc.sc_ds;
612190620Skib	regs->tf_es     = frame.sf_sc.sc_es;
613190620Skib	regs->tf_fs     = frame.sf_sc.sc_fs;
614190620Skib	regs->tf_gs     = frame.sf_sc.sc_gs;
615133819Stjr	regs->tf_rflags = eflags;
616133819Stjr	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
617133819Stjr	regs->tf_ss     = frame.sf_sc.sc_ss;
618216634Sjkim	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
619133819Stjr
620133819Stjr	return (EJUSTRETURN);
621133819Stjr}
622133819Stjr
623133819Stjr/*
624133819Stjr * System call to cleanup state after a signal
625133819Stjr * has been taken.  Reset signal mask and
626133819Stjr * stack state from context left by rt_sendsig (above).
627133819Stjr * Return to previous pc and psl as specified by
628133819Stjr * context left by sendsig. Check carefully to
629133819Stjr * make sure that the user has not modified the
630133819Stjr * psl to gain improper privileges or to cause
631133819Stjr * a machine fault.
632133819Stjr */
633133819Stjrint
634133819Stjrlinux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
635133819Stjr{
636133819Stjr	struct l_ucontext uc;
637133819Stjr	struct l_sigcontext *context;
638198507Skib	sigset_t bmask;
639133819Stjr	l_stack_t *lss;
640133819Stjr	stack_t ss;
641133819Stjr	struct trapframe *regs;
642133819Stjr	int eflags;
643151316Sdavidxu	ksiginfo_t ksi;
644133819Stjr
645133819Stjr	regs = td->td_frame;
646133819Stjr
647133819Stjr#ifdef DEBUG
648133819Stjr	if (ldebug(rt_sigreturn))
649133819Stjr		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
650133819Stjr#endif
651133819Stjr	/*
652133819Stjr	 * The trampoline code hands us the ucontext.
653133819Stjr	 * It is unsafe to keep track of it ourselves, in the event that a
654133819Stjr	 * program jumps out of a signal handler.
655133819Stjr	 */
656133819Stjr	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
657133819Stjr		return (EFAULT);
658133819Stjr
659133819Stjr	context = &uc.uc_mcontext;
660133819Stjr
661133819Stjr	/*
662133819Stjr	 * Check for security violations.
663133819Stjr	 */
664133819Stjr#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
665133819Stjr	eflags = context->sc_eflags;
666258559Semaste	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
667133819Stjr		return(EINVAL);
668133819Stjr
669133819Stjr	/*
670133819Stjr	 * Don't allow users to load a valid privileged %cs.  Let the
671133819Stjr	 * hardware check for invalid selectors, excess privilege in
672133819Stjr	 * other selectors, invalid %eip's and invalid %esp's.
673133819Stjr	 */
674133819Stjr#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
675133819Stjr	if (!CS_SECURE(context->sc_cs)) {
676151316Sdavidxu		ksiginfo_init_trap(&ksi);
677151316Sdavidxu		ksi.ksi_signo = SIGBUS;
678151316Sdavidxu		ksi.ksi_code = BUS_OBJERR;
679151316Sdavidxu		ksi.ksi_trapno = T_PROTFLT;
680151316Sdavidxu		ksi.ksi_addr = (void *)regs->tf_rip;
681151316Sdavidxu		trapsignal(td, &ksi);
682133819Stjr		return(EINVAL);
683133819Stjr	}
684133819Stjr
685198507Skib	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
686198507Skib	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
687133819Stjr
688133819Stjr	/*
689133819Stjr	 * Restore signal context
690133819Stjr	 */
691190620Skib	regs->tf_gs	= context->sc_gs;
692190620Skib	regs->tf_fs	= context->sc_fs;
693190620Skib	regs->tf_es	= context->sc_es;
694190620Skib	regs->tf_ds	= context->sc_ds;
695133819Stjr	regs->tf_rdi    = context->sc_edi;
696133819Stjr	regs->tf_rsi    = context->sc_esi;
697133819Stjr	regs->tf_rbp    = context->sc_ebp;
698133819Stjr	regs->tf_rbx    = context->sc_ebx;
699133819Stjr	regs->tf_rdx    = context->sc_edx;
700133819Stjr	regs->tf_rcx    = context->sc_ecx;
701133819Stjr	regs->tf_rax    = context->sc_eax;
702133819Stjr	regs->tf_rip    = context->sc_eip;
703133819Stjr	regs->tf_cs     = context->sc_cs;
704133819Stjr	regs->tf_rflags = eflags;
705133819Stjr	regs->tf_rsp    = context->sc_esp_at_signal;
706133819Stjr	regs->tf_ss     = context->sc_ss;
707216634Sjkim	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
708133819Stjr
709133819Stjr	/*
710133819Stjr	 * call sigaltstack & ignore results..
711133819Stjr	 */
712133819Stjr	lss = &uc.uc_stack;
713133819Stjr	ss.ss_sp = PTRIN(lss->ss_sp);
714133819Stjr	ss.ss_size = lss->ss_size;
715133819Stjr	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
716133819Stjr
717133819Stjr#ifdef DEBUG
718133819Stjr	if (ldebug(rt_sigreturn))
719133844Sobrien		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
720133819Stjr		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
721133819Stjr#endif
722133819Stjr	(void)kern_sigaltstack(td, &ss, NULL);
723133819Stjr
724133819Stjr	return (EJUSTRETURN);
725133819Stjr}
726133819Stjr
727208453Skibstatic int
728208453Skiblinux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
729133819Stjr{
730208453Skib	struct proc *p;
731208453Skib	struct trapframe *frame;
732208453Skib
733208453Skib	p = td->td_proc;
734208453Skib	frame = td->td_frame;
735208453Skib
736208453Skib	sa->args[0] = frame->tf_rbx;
737208453Skib	sa->args[1] = frame->tf_rcx;
738208453Skib	sa->args[2] = frame->tf_rdx;
739208453Skib	sa->args[3] = frame->tf_rsi;
740208453Skib	sa->args[4] = frame->tf_rdi;
741208453Skib	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
742208453Skib	sa->code = frame->tf_rax;
743208453Skib
744208453Skib	if (sa->code >= p->p_sysent->sv_size)
745293569Sdchagin		/* nosys */
746293609Sdchagin		sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1];
747208453Skib	else
748208453Skib		sa->callp = &p->p_sysent->sv_table[sa->code];
749208453Skib	sa->narg = sa->callp->sy_narg;
750208453Skib
751208453Skib	td->td_retval[0] = 0;
752208453Skib	td->td_retval[1] = frame->tf_rdx;
753208453Skib
754208453Skib	return (0);
755133819Stjr}
756133819Stjr
757133819Stjr/*
758133819Stjr * If a linux binary is exec'ing something, try this image activator
759133819Stjr * first.  We override standard shell script execution in order to
760133819Stjr * be able to modify the interpreter path.  We only do this if a linux
761133819Stjr * binary is doing the exec, so we do not create an EXEC module for it.
762133819Stjr */
763133819Stjrstatic int	exec_linux_imgact_try(struct image_params *iparams);
764133819Stjr
765133819Stjrstatic int
766133819Stjrexec_linux_imgact_try(struct image_params *imgp)
767133819Stjr{
768187964Sobrien	const char *head = (const char *)imgp->image_header;
769187964Sobrien	char *rpath;
770210555Salc	int error = -1;
771133819Stjr
772187964Sobrien	/*
773187964Sobrien	* The interpreter for shell scripts run from a linux binary needs
774187964Sobrien	* to be located in /compat/linux if possible in order to recursively
775187964Sobrien	* maintain linux path emulation.
776187964Sobrien	*/
777187964Sobrien	if (((const short *)head)[0] == SHELLMAGIC) {
778187964Sobrien		/*
779187964Sobrien		* Run our normal shell image activator.  If it succeeds attempt
780187964Sobrien		* to use the alternate path for the interpreter.  If an
781187964Sobrien		* alternate * path is found, use our stringspace to store it.
782187964Sobrien		*/
783187964Sobrien		if ((error = exec_shell_imgact(imgp)) == 0) {
784187964Sobrien			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
785187964Sobrien			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
786187964Sobrien			    AT_FDCWD);
787210555Salc			if (rpath != NULL)
788210555Salc				imgp->args->fname_buf =
789210555Salc				    imgp->interpreter_name = rpath;
790187964Sobrien		}
791187964Sobrien	}
792210555Salc	return (error);
793133819Stjr}
794133819Stjr
795133819Stjr/*
796133819Stjr * Clear registers on exec
797133819Stjr * XXX copied from ia32_signal.c.
798133819Stjr */
799133819Stjrstatic void
800205642Snwhitehornexec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
801133819Stjr{
802133819Stjr	struct trapframe *regs = td->td_frame;
803133819Stjr	struct pcb *pcb = td->td_pcb;
804133819Stjr
805190620Skib	mtx_lock(&dt_lock);
806190620Skib	if (td->td_proc->p_md.md_ldt != NULL)
807190620Skib		user_ldt_free(td);
808190620Skib	else
809190620Skib		mtx_unlock(&dt_lock);
810190620Skib
811168035Sjkim	critical_enter();
812133819Stjr	wrmsr(MSR_FSBASE, 0);
813133819Stjr	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
814133819Stjr	pcb->pcb_fsbase = 0;
815133819Stjr	pcb->pcb_gsbase = 0;
816168035Sjkim	critical_exit();
817189423Sjhb	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
818133819Stjr
819133819Stjr	bzero((char *)regs, sizeof(struct trapframe));
820205642Snwhitehorn	regs->tf_rip = imgp->entry_addr;
821133819Stjr	regs->tf_rsp = stack;
822133819Stjr	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
823190620Skib	regs->tf_gs = _ugssel;
824190620Skib	regs->tf_fs = _ufssel;
825190620Skib	regs->tf_es = _udatasel;
826190620Skib	regs->tf_ds = _udatasel;
827133819Stjr	regs->tf_ss = _udatasel;
828190620Skib	regs->tf_flags = TF_HASSEGS;
829133819Stjr	regs->tf_cs = _ucode32sel;
830205642Snwhitehorn	regs->tf_rbx = imgp->ps_strings;
831217424Sjkim
832150473Sups	fpstate_drop(td);
833133819Stjr
834216255Skib	/* Do full restore on return so that we can change to a different %cs */
835216634Sjkim	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
836133819Stjr	td->td_retval[1] = 0;
837133819Stjr}
838133819Stjr
839133819Stjr/*
840133819Stjr * XXX copied from ia32_sysvec.c.
841133819Stjr */
842133819Stjrstatic register_t *
843133819Stjrlinux_copyout_strings(struct image_params *imgp)
844133819Stjr{
845133819Stjr	int argc, envc;
846133819Stjr	u_int32_t *vectp;
847133819Stjr	char *stringp, *destp;
848133819Stjr	u_int32_t *stack_base;
849133819Stjr	struct linux32_ps_strings *arginfo;
850293535Sdchagin	char canary[LINUX_AT_RANDOM_LEN];
851293535Sdchagin	size_t execpath_len;
852133819Stjr
853133819Stjr	/*
854133819Stjr	 * Calculate string base and vector table pointers.
855133819Stjr	 */
856293535Sdchagin	if (imgp->execpath != NULL && imgp->auxargs != NULL)
857293535Sdchagin		execpath_len = strlen(imgp->execpath) + 1;
858293535Sdchagin	else
859293535Sdchagin		execpath_len = 0;
860293535Sdchagin
861133819Stjr	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
862293516Sdchagin	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
863293535Sdchagin	    roundup(sizeof(canary), sizeof(char *)) -
864293535Sdchagin	    roundup(execpath_len, sizeof(char *)) -
865293516Sdchagin	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
866133819Stjr
867293535Sdchagin	if (execpath_len != 0) {
868293535Sdchagin		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
869293535Sdchagin		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
870293535Sdchagin	}
871293535Sdchagin
872133819Stjr	/*
873293535Sdchagin	 * Prepare the canary for SSP.
874293535Sdchagin	 */
875293535Sdchagin	arc4rand(canary, sizeof(canary), 0);
876293535Sdchagin	imgp->canary = (uintptr_t)arginfo -
877293535Sdchagin	    roundup(execpath_len, sizeof(char *)) -
878293535Sdchagin	    roundup(sizeof(canary), sizeof(char *));
879293535Sdchagin	copyout(canary, (void *)imgp->canary, sizeof(canary));
880293535Sdchagin
881293535Sdchagin	/*
882133819Stjr	 * If we have a valid auxargs ptr, prepare some room
883133819Stjr	 * on the stack.
884133819Stjr	 */
885133819Stjr	if (imgp->auxargs) {
886133819Stjr		/*
887133819Stjr		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
888133819Stjr		 * lower compatibility.
889133819Stjr		 */
890187964Sobrien		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
891189362Sdchagin		    (LINUX_AT_COUNT * 2);
892133819Stjr		/*
893133819Stjr		 * The '+ 2' is for the null pointers at the end of each of
894133819Stjr		 * the arg and env vector sets,and imgp->auxarg_size is room
895133819Stjr		 * for argument of Runtime loader.
896133819Stjr		 */
897187964Sobrien		vectp = (u_int32_t *) (destp - (imgp->args->argc +
898187964Sobrien		    imgp->args->envc + 2 + imgp->auxarg_size) *
899187964Sobrien		    sizeof(u_int32_t));
900133819Stjr
901133819Stjr	} else
902133819Stjr		/*
903133819Stjr		 * The '+ 2' is for the null pointers at the end of each of
904133819Stjr		 * the arg and env vector sets
905133819Stjr		 */
906187964Sobrien		vectp = (u_int32_t *)(destp - (imgp->args->argc +
907187964Sobrien		    imgp->args->envc + 2) * sizeof(u_int32_t));
908133819Stjr
909133819Stjr	/*
910133819Stjr	 * vectp also becomes our initial stack base
911133819Stjr	 */
912133819Stjr	stack_base = vectp;
913133819Stjr
914140992Ssobomax	stringp = imgp->args->begin_argv;
915140992Ssobomax	argc = imgp->args->argc;
916140992Ssobomax	envc = imgp->args->envc;
917133819Stjr	/*
918133819Stjr	 * Copy out strings - arguments and environment.
919133819Stjr	 */
920140992Ssobomax	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
921133819Stjr
922133819Stjr	/*
923133819Stjr	 * Fill in "ps_strings" struct for ps, w, etc.
924133819Stjr	 */
925189362Sdchagin	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
926133819Stjr	suword32(&arginfo->ps_nargvstr, argc);
927133819Stjr
928133819Stjr	/*
929133819Stjr	 * Fill in argument portion of vector table.
930133819Stjr	 */
931133819Stjr	for (; argc > 0; --argc) {
932189362Sdchagin		suword32(vectp++, (uint32_t)(intptr_t)destp);
933133819Stjr		while (*stringp++ != 0)
934133819Stjr			destp++;
935133819Stjr		destp++;
936133819Stjr	}
937133819Stjr
938133819Stjr	/* a null vector table pointer separates the argp's from the envp's */
939133819Stjr	suword32(vectp++, 0);
940133819Stjr
941189362Sdchagin	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
942133819Stjr	suword32(&arginfo->ps_nenvstr, envc);
943133819Stjr
944133819Stjr	/*
945133819Stjr	 * Fill in environment portion of vector table.
946133819Stjr	 */
947133819Stjr	for (; envc > 0; --envc) {
948189362Sdchagin		suword32(vectp++, (uint32_t)(intptr_t)destp);
949133819Stjr		while (*stringp++ != 0)
950133819Stjr			destp++;
951133819Stjr		destp++;
952133819Stjr	}
953133819Stjr
954133819Stjr	/* end of vector table is a null pointer */
955133819Stjr	suword32(vectp, 0);
956133819Stjr
957133819Stjr	return ((register_t *)stack_base);
958133819Stjr}
959133819Stjr
960227309Sedstatic SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
961133819Stjr    "32-bit Linux emulation");
962133819Stjr
963133819Stjrstatic u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
964133819StjrSYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
965133819Stjr    &linux32_maxdsiz, 0, "");
966133819Stjrstatic u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
967133819StjrSYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
968133819Stjr    &linux32_maxssiz, 0, "");
969133819Stjrstatic u_long	linux32_maxvmem = LINUX32_MAXVMEM;
970133819StjrSYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
971133819Stjr    &linux32_maxvmem, 0, "");
972133819Stjr
973293527Sdchagin#if defined(DEBUG)
974293527SdchaginSYSCTL_PROC(_compat_linux32, OID_AUTO, debug,
975293527Sdchagin            CTLTYPE_STRING | CTLFLAG_RW,
976293527Sdchagin            0, 0, linux_sysctl_debug, "A",
977293527Sdchagin            "Linux debugging control");
978293527Sdchagin#endif
979293527Sdchagin
980133819Stjrstatic void
981169565Sjhblinux32_fixlimit(struct rlimit *rl, int which)
982133819Stjr{
983133819Stjr
984169565Sjhb	switch (which) {
985169565Sjhb	case RLIMIT_DATA:
986187964Sobrien		if (linux32_maxdsiz != 0) {
987169565Sjhb			if (rl->rlim_cur > linux32_maxdsiz)
988169565Sjhb				rl->rlim_cur = linux32_maxdsiz;
989169565Sjhb			if (rl->rlim_max > linux32_maxdsiz)
990169565Sjhb				rl->rlim_max = linux32_maxdsiz;
991169565Sjhb		}
992169565Sjhb		break;
993169565Sjhb	case RLIMIT_STACK:
994169565Sjhb		if (linux32_maxssiz != 0) {
995169565Sjhb			if (rl->rlim_cur > linux32_maxssiz)
996169565Sjhb				rl->rlim_cur = linux32_maxssiz;
997169565Sjhb			if (rl->rlim_max > linux32_maxssiz)
998169565Sjhb				rl->rlim_max = linux32_maxssiz;
999169565Sjhb		}
1000169565Sjhb		break;
1001169565Sjhb	case RLIMIT_VMEM:
1002169565Sjhb		if (linux32_maxvmem != 0) {
1003169565Sjhb			if (rl->rlim_cur > linux32_maxvmem)
1004169565Sjhb				rl->rlim_cur = linux32_maxvmem;
1005169565Sjhb			if (rl->rlim_max > linux32_maxvmem)
1006169565Sjhb				rl->rlim_max = linux32_maxvmem;
1007169565Sjhb		}
1008169565Sjhb		break;
1009133819Stjr	}
1010133819Stjr}
1011133819Stjr
1012133819Stjrstruct sysentvec elf_linux_sysvec = {
1013294368Sjhb	.sv_size	= LINUX32_SYS_MAXSYSCALL,
1014294368Sjhb	.sv_table	= linux32_sysent,
1015183322Skib	.sv_mask	= 0,
1016293575Sdchagin	.sv_sigsize	= 0,
1017293575Sdchagin	.sv_sigtbl	= NULL,
1018183322Skib	.sv_errsize	= ELAST + 1,
1019183322Skib	.sv_errtbl	= bsd_to_linux_errno,
1020183322Skib	.sv_transtrap	= translate_traps,
1021183322Skib	.sv_fixup	= elf_linux_fixup,
1022183322Skib	.sv_sendsig	= linux_sendsig,
1023293514Sdchagin	.sv_sigcode	= &_binary_linux32_locore_o_start,
1024183322Skib	.sv_szsigcode	= &linux_szsigcode,
1025208453Skib	.sv_prepsyscall	= NULL,
1026183322Skib	.sv_name	= "Linux ELF32",
1027183322Skib	.sv_coredump	= elf32_coredump,
1028183322Skib	.sv_imgact_try	= exec_linux_imgact_try,
1029183322Skib	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1030183322Skib	.sv_pagesize	= PAGE_SIZE,
1031183322Skib	.sv_minuser	= VM_MIN_ADDRESS,
1032219609Sdchagin	.sv_maxuser	= LINUX32_MAXUSER,
1033183322Skib	.sv_usrstack	= LINUX32_USRSTACK,
1034183322Skib	.sv_psstrings	= LINUX32_PS_STRINGS,
1035183322Skib	.sv_stackprot	= VM_PROT_ALL,
1036183322Skib	.sv_copyout_strings = linux_copyout_strings,
1037183322Skib	.sv_setregs	= exec_linux_setregs,
1038183322Skib	.sv_fixlimit	= linux32_fixlimit,
1039183322Skib	.sv_maxssiz	= &linux32_maxssiz,
1040219609Sdchagin	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1041208453Skib	.sv_set_syscall_retval = cpu_set_syscall_retval,
1042208453Skib	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1043208453Skib	.sv_syscallnames = NULL,
1044219609Sdchagin	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1045219609Sdchagin	.sv_shared_page_len = PAGE_SIZE,
1046219405Sdchagin	.sv_schedtail	= linux_schedtail,
1047293493Sdchagin	.sv_thread_detach = linux_thread_detach,
1048294136Sdchagin	.sv_trap	= NULL,
1049133819Stjr};
1050133819Stjr
1051293514Sdchaginstatic void
1052293514Sdchaginlinux_vdso_install(void *param)
1053293514Sdchagin{
1054293514Sdchagin
1055293514Sdchagin	linux_szsigcode = (&_binary_linux32_locore_o_end -
1056293514Sdchagin	    &_binary_linux32_locore_o_start);
1057293514Sdchagin
1058293514Sdchagin	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1059293514Sdchagin		panic("Linux invalid vdso size\n");
1060293514Sdchagin
1061293514Sdchagin	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1062293514Sdchagin
1063293514Sdchagin	linux_shared_page_obj = __elfN(linux_shared_page_init)
1064293514Sdchagin	    (&linux_shared_page_mapping);
1065293514Sdchagin
1066293514Sdchagin	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1067293514Sdchagin
1068293514Sdchagin	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1069293514Sdchagin	    linux_szsigcode);
1070293514Sdchagin	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1071293516Sdchagin
1072293516Sdchagin	linux_kplatform = linux_shared_page_mapping +
1073293516Sdchagin	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1074293514Sdchagin}
1075293514SdchaginSYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1076293514Sdchagin    (sysinit_cfunc_t)linux_vdso_install, NULL);
1077293514Sdchagin
1078293514Sdchaginstatic void
1079293514Sdchaginlinux_vdso_deinstall(void *param)
1080293514Sdchagin{
1081293514Sdchagin
1082293514Sdchagin	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1083293514Sdchagin};
1084293514SdchaginSYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1085293514Sdchagin    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1086293514Sdchagin
1087196512Sbzstatic char GNU_ABI_VENDOR[] = "GNU";
1088196512Sbzstatic int GNULINUX_ABI_DESC = 0;
1089189771Sdchagin
1090196512Sbzstatic boolean_t
1091196512Sbzlinux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1092196512Sbz{
1093196512Sbz	const Elf32_Word *desc;
1094196512Sbz	uintptr_t p;
1095196512Sbz
1096196512Sbz	p = (uintptr_t)(note + 1);
1097196512Sbz	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1098196512Sbz
1099196512Sbz	desc = (const Elf32_Word *)p;
1100196512Sbz	if (desc[0] != GNULINUX_ABI_DESC)
1101196512Sbz		return (FALSE);
1102196512Sbz
1103196512Sbz	/*
1104196512Sbz	 * For linux we encode osrel as follows (see linux_mib.c):
1105196512Sbz	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1106196512Sbz	 */
1107196512Sbz	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1108196512Sbz
1109196512Sbz	return (TRUE);
1110196512Sbz}
1111196512Sbz
1112189771Sdchaginstatic Elf_Brandnote linux32_brandnote = {
1113196512Sbz	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1114196512Sbz	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1115189771Sdchagin	.hdr.n_type	= 1,
1116196512Sbz	.vendor		= GNU_ABI_VENDOR,
1117196512Sbz	.flags		= BN_TRANSLATE_OSREL,
1118196512Sbz	.trans_osrel	= linux32_trans_osrel
1119189771Sdchagin};
1120189771Sdchagin
1121133819Stjrstatic Elf32_Brandinfo linux_brand = {
1122183322Skib	.brand		= ELFOSABI_LINUX,
1123183322Skib	.machine	= EM_386,
1124183322Skib	.compat_3_brand	= "Linux",
1125183322Skib	.emul_path	= "/compat/linux",
1126183322Skib	.interp_path	= "/lib/ld-linux.so.1",
1127183322Skib	.sysvec		= &elf_linux_sysvec,
1128183322Skib	.interp_newpath	= NULL,
1129189771Sdchagin	.brand_note	= &linux32_brandnote,
1130267561Sdchagin	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1131183322Skib};
1132133819Stjr
1133133819Stjrstatic Elf32_Brandinfo linux_glibc2brand = {
1134183322Skib	.brand		= ELFOSABI_LINUX,
1135183322Skib	.machine	= EM_386,
1136183322Skib	.compat_3_brand	= "Linux",
1137183322Skib	.emul_path	= "/compat/linux",
1138183322Skib	.interp_path	= "/lib/ld-linux.so.2",
1139183322Skib	.sysvec		= &elf_linux_sysvec,
1140183322Skib	.interp_newpath	= NULL,
1141189771Sdchagin	.brand_note	= &linux32_brandnote,
1142267561Sdchagin	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1143183322Skib};
1144133819Stjr
1145133819StjrElf32_Brandinfo *linux_brandlist[] = {
1146183322Skib	&linux_brand,
1147183322Skib	&linux_glibc2brand,
1148183322Skib	NULL
1149183322Skib};
1150133819Stjr
1151133819Stjrstatic int
1152133819Stjrlinux_elf_modevent(module_t mod, int type, void *data)
1153133819Stjr{
1154133819Stjr	Elf32_Brandinfo **brandinfo;
1155133819Stjr	int error;
1156133819Stjr	struct linux_ioctl_handler **lihp;
1157133819Stjr
1158133819Stjr	error = 0;
1159133819Stjr
1160133819Stjr	switch(type) {
1161133819Stjr	case MOD_LOAD:
1162133819Stjr		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1163133819Stjr		     ++brandinfo)
1164133819Stjr			if (elf32_insert_brand_entry(*brandinfo) < 0)
1165133819Stjr				error = EINVAL;
1166133819Stjr		if (error == 0) {
1167133819Stjr			SET_FOREACH(lihp, linux_ioctl_handler_set)
1168133819Stjr				linux_ioctl_register_handler(*lihp);
1169161315Snetchild			LIST_INIT(&futex_list);
1170191719Sdchagin			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1171191966Sdchagin			stclohz = (stathz ? stathz : hz);
1172133819Stjr			if (bootverbose)
1173133819Stjr				printf("Linux ELF exec handler installed\n");
1174133819Stjr		} else
1175133819Stjr			printf("cannot insert Linux ELF brand handler\n");
1176133819Stjr		break;
1177133819Stjr	case MOD_UNLOAD:
1178133819Stjr		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1179133819Stjr		     ++brandinfo)
1180133819Stjr			if (elf32_brand_inuse(*brandinfo))
1181133819Stjr				error = EBUSY;
1182133819Stjr		if (error == 0) {
1183133819Stjr			for (brandinfo = &linux_brandlist[0];
1184133819Stjr			     *brandinfo != NULL; ++brandinfo)
1185133819Stjr				if (elf32_remove_brand_entry(*brandinfo) < 0)
1186133819Stjr					error = EINVAL;
1187133819Stjr		}
1188133819Stjr		if (error == 0) {
1189133819Stjr			SET_FOREACH(lihp, linux_ioctl_handler_set)
1190133819Stjr				linux_ioctl_unregister_handler(*lihp);
1191191719Sdchagin			mtx_destroy(&futex_mtx);
1192133819Stjr			if (bootverbose)
1193133819Stjr				printf("Linux ELF exec handler removed\n");
1194133819Stjr		} else
1195133819Stjr			printf("Could not deinstall ELF interpreter entry\n");
1196133819Stjr		break;
1197133819Stjr	default:
1198293495Sdchagin		return (EOPNOTSUPP);
1199133819Stjr	}
1200293495Sdchagin	return (error);
1201133819Stjr}
1202133819Stjr
1203133819Stjrstatic moduledata_t linux_elf_mod = {
1204133819Stjr	"linuxelf",
1205133819Stjr	linux_elf_modevent,
1206241394Skevlo	0
1207133819Stjr};
1208133819Stjr
1209213716SkibDECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1210293527SdchaginMODULE_DEPEND(linuxelf, linux_common, 1, 1, 1);
1211