1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
 * Copyright (c) 1994-1996 Søren Schmidt
7 * All rights reserved.
8 * Copyright (c) 2013, 2021 Dmitry Chagin <dchagin@FreeBSD.org>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer
15 *    in this position and unchanged.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. The name of the author may not be used to endorse or promote products
20 *    derived from this software without specific prior written permission
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#define	__ELF_WORD_SIZE	64
35
36#include <sys/param.h>
37#include <sys/exec.h>
38#include <sys/imgact.h>
39#include <sys/imgact_elf.h>
40#include <sys/kernel.h>
41#include <sys/ktr.h>
42#include <sys/lock.h>
43#include <sys/module.h>
44#include <sys/mutex.h>
45#include <sys/proc.h>
46#include <sys/stddef.h>
47#include <sys/syscallsubr.h>
48#include <sys/sysctl.h>
49#include <sys/sysent.h>
50
51#include <vm/pmap.h>
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54
55#include <machine/md_var.h>
56#include <machine/trap.h>
57
58#include <x86/linux/linux_x86.h>
59#include <amd64/linux/linux.h>
60#include <amd64/linux/linux_proto.h>
61#include <compat/linux/linux_elf.h>
62#include <compat/linux/linux_emul.h>
63#include <compat/linux/linux_fork.h>
64#include <compat/linux/linux_ioctl.h>
65#include <compat/linux/linux_mib.h>
66#include <compat/linux/linux_misc.h>
67#include <compat/linux/linux_signal.h>
68#include <compat/linux/linux_util.h>
69#include <compat/linux/linux_vdso.h>
70
71#include <x86/linux/linux_x86_sigframe.h>
72
/*
 * The code in this file casts the mc_fpstate member of the native
 * mcontext_t to struct l_fpstate and copies it to and from user space,
 * so both layouts must have the same size.
 */
_Static_assert(sizeof(struct l_fpstate) ==
    sizeof(__typeof(((mcontext_t *)0)->mc_fpstate)),
    "fxsave area size incorrect");

/* Declare version 1 of the linux64 module. */
MODULE_VERSION(linux64, 1);
78
/*
 * Layout of the top of the Linux user address space (LA48), from the
 * highest address downwards:
 *   - the vDSO: LINUX_VDSOPAGE_SIZE bytes ending at
 *     VM_MAXUSER_ADDRESS_LA48;
 *   - the shared page: PAGE_SIZE bytes, the size of the native
 *     SHAREDPAGE;
 *   - the top of the user stack, with struct ps_strings placed
 *     immediately below it.
 *
 * Every macro body that is an expression is parenthesized so that it
 * expands safely inside any surrounding expression.
 */
#define	LINUX_VDSOPAGE_SIZE	(PAGE_SIZE * 2)
#define	LINUX_VDSOPAGE_LA48	(VM_MAXUSER_ADDRESS_LA48 - \
				    LINUX_VDSOPAGE_SIZE)
#define	LINUX_SHAREDPAGE_LA48	(LINUX_VDSOPAGE_LA48 - PAGE_SIZE)
#define	LINUX_USRSTACK_LA48	LINUX_SHAREDPAGE_LA48
#define	LINUX_PS_STRINGS_LA48	(LINUX_USRSTACK_LA48 - \
				    sizeof(struct ps_strings))
90
/* Size in bytes of the embedded vDSO image; set by linux_vdso_install(). */
static int linux_szsigcode;
/* VM object returned by the shared page initializer for the vDSO. */
static vm_object_t linux_vdso_obj;
/* Kernel mapping into which the vDSO image is copied and patched. */
static char *linux_vdso_mapping;
/* First and one-past-last bytes of the vDSO image linked into the kernel. */
extern char _binary_linux_vdso_so_o_start;
extern char _binary_linux_vdso_so_o_end;
/* User address at which the vDSO is mapped; set by linux_vdso_install(). */
static vm_offset_t linux_vdso_base;

/* Linux system call table and the matching table of names. */
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
extern const char *linux_syscallnames[];

/* Linker set of ioctl handlers (un)registered by the module event handler. */
SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);

/* Forward declarations of the static functions defined in this file. */
static void	linux_vdso_install(const void *param);
static void	linux_vdso_deinstall(const void *param);
static void	linux_vdso_reloc(char *mapping, Elf_Addr offset);
static void	linux_set_syscall_retval(struct thread *td, int error);
static int	linux_fetch_syscall_args(struct thread *td);
static void	linux_exec_setregs(struct thread *td, struct image_params *imgp,
		    uintptr_t stack);
static void	linux_exec_sysvec_init(void *param);
static int	linux_on_exec_vmspace(struct proc *p,
		    struct image_params *imgp);
static void	linux_set_fork_retval(struct thread *td);
static int	linux_vsyscall(struct thread *td);

/*
 * Symbols looked up in the vDSO image.  This file uses linux_rt_sigcode
 * as the signal trampoline entry point, linux_platform as the
 * AT_PLATFORM value, and the three kern_* symbols as locations inside
 * the vDSO that linux_exec_sysvec_init() fills in.
 */
LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
LINUX_VDSO_SYM_CHAR(linux_platform);
LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
LINUX_VDSO_SYM_INTPTR(kern_cpu_selector);
121
122/*
123 * According to the Intel x86 ISA 64-bit syscall
124 * saves %rip to %rcx and rflags to %r11. Registers on syscall entry:
125 * %rax  system call number
126 * %rcx  return address
127 * %r11  saved rflags
128 * %rdi  arg1
129 * %rsi  arg2
130 * %rdx  arg3
131 * %r10  arg4
132 * %r8   arg5
133 * %r9   arg6
134 *
135 * Then FreeBSD fast_syscall() move registers:
136 * %rcx -> trapframe.tf_rip
137 * %r10 -> trapframe.tf_rcx
138 */
139static int
140linux_fetch_syscall_args(struct thread *td)
141{
142	struct proc *p;
143	struct trapframe *frame;
144	struct syscall_args *sa;
145
146	p = td->td_proc;
147	frame = td->td_frame;
148	sa = &td->td_sa;
149
150	sa->args[0] = frame->tf_rdi;
151	sa->args[1] = frame->tf_rsi;
152	sa->args[2] = frame->tf_rdx;
153	sa->args[3] = frame->tf_rcx;
154	sa->args[4] = frame->tf_r8;
155	sa->args[5] = frame->tf_r9;
156	sa->code = frame->tf_rax;
157	sa->original_code = sa->code;
158
159	if (sa->code >= p->p_sysent->sv_size)
160		/* nosys */
161		sa->callp = &nosys_sysent;
162	else
163		sa->callp = &p->p_sysent->sv_table[sa->code];
164
165	/* Restore r10 earlier to avoid doing this multiply times. */
166	frame->tf_r10 = frame->tf_rcx;
167	/* Restore %rcx for machine context. */
168	frame->tf_rcx = frame->tf_rip;
169
170	td->td_retval[0] = 0;
171	return (0);
172}
173
174static void
175linux_set_syscall_retval(struct thread *td, int error)
176{
177	struct trapframe *frame;
178
179	frame = td->td_frame;
180
181	switch (error) {
182	case 0:
183		frame->tf_rax = td->td_retval[0];
184		break;
185
186	case ERESTART:
187		/*
188		 * Reconstruct pc, we know that 'syscall' is 2 bytes,
189		 * lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
190		 * We saved this in tf_err.
191		 *
192		 */
193		frame->tf_rip -= frame->tf_err;
194		break;
195
196	case EJUSTRETURN:
197		break;
198
199	default:
200		frame->tf_rax = bsd_to_linux_errno(error);
201		break;
202	}
203
204	/*
205	 * Differently from FreeBSD native ABI, on Linux only %rcx
206	 * and %r11 values are not preserved across the syscall.
207	 * Require full context restore to get all registers except
208	 * those two restored at return to usermode.
209	 */
210	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
211}
212
213static void
214linux_set_fork_retval(struct thread *td)
215{
216	struct trapframe *frame = td->td_frame;
217
218	frame->tf_rax = 0;
219}
220
221void
222linux64_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos)
223{
224
225	AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
226	AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, cpu_feature);
227	AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP2, linux_x86_elf_hwcap2());
228	AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform));
229}
230
231/*
232 * Reset registers to default values on exec.
233 */
234static void
235linux_exec_setregs(struct thread *td, struct image_params *imgp,
236    uintptr_t stack)
237{
238	struct trapframe *regs;
239	struct pcb *pcb;
240	register_t saved_rflags;
241
242	regs = td->td_frame;
243	pcb = td->td_pcb;
244
245	if (td->td_proc->p_md.md_ldt != NULL)
246		user_ldt_free(td);
247
248	pcb->pcb_fsbase = 0;
249	pcb->pcb_gsbase = 0;
250	clear_pcb_flags(pcb, PCB_32BIT);
251	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
252	set_pcb_flags(pcb, PCB_FULL_IRET);
253
254	saved_rflags = regs->tf_rflags & PSL_T;
255	bzero((char *)regs, sizeof(struct trapframe));
256	regs->tf_rip = imgp->entry_addr;
257	regs->tf_rsp = stack;
258	regs->tf_rflags = PSL_USER | saved_rflags;
259	regs->tf_ss = _udatasel;
260	regs->tf_cs = _ucodesel;
261	regs->tf_ds = _udatasel;
262	regs->tf_es = _udatasel;
263	regs->tf_fs = _ufssel;
264	regs->tf_gs = _ugssel;
265	regs->tf_flags = TF_HASSEGS;
266
267	x86_clear_dbregs(pcb);
268
269	/*
270	 * Drop the FP state if we hold it, so that the process gets a
271	 * clean FP state if it uses the FPU again.
272	 */
273	fpstate_drop(td);
274}
275
276static int
277linux_fxrstor(struct thread *td, mcontext_t *mcp, struct l_sigcontext *sc)
278{
279	struct savefpu *fp = (struct savefpu *)&mcp->mc_fpstate[0];
280	int error;
281
282	error = copyin(PTRIN(sc->sc_fpstate), fp, sizeof(mcp->mc_fpstate));
283	if (error != 0)
284		return (error);
285	bzero(&fp->sv_pad[0], sizeof(fp->sv_pad));
286	return (set_fpcontext(td, mcp, NULL, 0));
287}
288
289static int
290linux_xrstor(struct thread *td, mcontext_t *mcp, struct l_sigcontext *sc)
291{
292	struct savefpu *fp = (struct savefpu *)&mcp->mc_fpstate[0];
293	char *xfpustate;
294	struct proc *p;
295	uint32_t magic2;
296	int error;
297
298	p = td->td_proc;
299	mcp->mc_xfpustate_len = cpu_max_ext_state_size - sizeof(struct savefpu);
300
301	/* Legacy region of an xsave area. */
302	error = copyin(PTRIN(sc->sc_fpstate), fp, sizeof(mcp->mc_fpstate));
303	if (error != 0)
304		return (error);
305	bzero(&fp->sv_pad[0], sizeof(fp->sv_pad));
306
307	/* Extended region of an xsave area. */
308	sc->sc_fpstate += sizeof(mcp->mc_fpstate);
309	xfpustate = (char *)fpu_save_area_alloc();
310	error = copyin(PTRIN(sc->sc_fpstate), xfpustate, mcp->mc_xfpustate_len);
311	if (error != 0) {
312		fpu_save_area_free((struct savefpu *)xfpustate);
313		uprintf("pid %d (%s): linux xrstor failed\n", p->p_pid,
314		    td->td_name);
315		return (error);
316	}
317
318	/* Linux specific end of xsave area marker. */
319	sc->sc_fpstate += mcp->mc_xfpustate_len;
320	error = copyin(PTRIN(sc->sc_fpstate), &magic2, LINUX_FP_XSTATE_MAGIC2_SIZE);
321	if (error != 0 || magic2 != LINUX_FP_XSTATE_MAGIC2) {
322		fpu_save_area_free((struct savefpu *)xfpustate);
323		uprintf("pid %d (%s): sigreturn magic2 0x%x error %d\n",
324		    p->p_pid, td->td_name, magic2, error);
325		return (error);
326	}
327
328	error = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
329	fpu_save_area_free((struct savefpu *)xfpustate);
330	if (error != 0) {
331		uprintf("pid %d (%s): sigreturn set_fpcontext error %d\n",
332		    p->p_pid, td->td_name, error);
333	}
334	return (error);
335}
336
337static int
338linux_copyin_fpstate(struct thread *td, struct l_ucontext *uc)
339{
340	mcontext_t mc;
341
342	bzero(&mc, sizeof(mc));
343	mc.mc_ownedfp = _MC_FPOWNED_FPU;
344	mc.mc_fpformat = _MC_FPFMT_XMM;
345
346	if ((uc->uc_flags & LINUX_UC_FP_XSTATE) != 0)
347		return (linux_xrstor(td, &mc, &uc->uc_mcontext));
348	else
349		return (linux_fxrstor(td, &mc, &uc->uc_mcontext));
350}
351
352/*
353 * Copied from amd64/amd64/machdep.c
354 */
355int
356linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
357{
358	struct proc *p;
359	struct l_rt_sigframe sf;
360	struct l_sigcontext *context;
361	struct trapframe *regs;
362	unsigned long rflags;
363	sigset_t bmask;
364	int error;
365	ksiginfo_t ksi;
366
367	regs = td->td_frame;
368	error = copyin((void *)regs->tf_rbx, &sf, sizeof(sf));
369	if (error != 0)
370		return (error);
371
372	p = td->td_proc;
373	context = &sf.sf_uc.uc_mcontext;
374	rflags = context->sc_rflags;
375
376	/*
377	 * Don't allow users to change privileged or reserved flags.
378	 */
379	/*
380	 * XXX do allow users to change the privileged flag PSL_RF.
381	 * The cpu sets PSL_RF in tf_rflags for faults.  Debuggers
382	 * should sometimes set it there too.  tf_rflags is kept in
383	 * the signal context during signal handling and there is no
384	 * other place to remember it, so the PSL_RF bit may be
385	 * corrupted by the signal handler without us knowing.
386	 * Corruption of the PSL_RF bit at worst causes one more or
387	 * one less debugger trap, so allowing it is fairly harmless.
388	 */
389	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
390		uprintf("pid %d comm %s linux mangled rflags %#lx\n",
391		    p->p_pid, p->p_comm, rflags);
392		return (EINVAL);
393	}
394
395	/*
396	 * Don't allow users to load a valid privileged %cs.  Let the
397	 * hardware check for invalid selectors, excess privilege in
398	 * other selectors, invalid %eip's and invalid %esp's.
399	 */
400	if (!CS_SECURE(context->sc_cs)) {
401		uprintf("pid %d comm %s linux mangled cs %#x\n",
402		    p->p_pid, p->p_comm, context->sc_cs);
403		ksiginfo_init_trap(&ksi);
404		ksi.ksi_signo = SIGBUS;
405		ksi.ksi_code = BUS_OBJERR;
406		ksi.ksi_trapno = T_PROTFLT;
407		ksi.ksi_addr = (void *)regs->tf_rip;
408		trapsignal(td, &ksi);
409		return (EINVAL);
410	}
411
412	linux_to_bsd_sigset(&sf.sf_uc.uc_sigmask, &bmask);
413	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
414
415	regs->tf_rdi    = context->sc_rdi;
416	regs->tf_rsi    = context->sc_rsi;
417	regs->tf_rdx    = context->sc_rdx;
418	regs->tf_rbp    = context->sc_rbp;
419	regs->tf_rbx    = context->sc_rbx;
420	regs->tf_rcx    = context->sc_rcx;
421	regs->tf_rax    = context->sc_rax;
422	regs->tf_rip    = context->sc_rip;
423	regs->tf_rsp    = context->sc_rsp;
424	regs->tf_r8     = context->sc_r8;
425	regs->tf_r9     = context->sc_r9;
426	regs->tf_r10    = context->sc_r10;
427	regs->tf_r11    = context->sc_r11;
428	regs->tf_r12    = context->sc_r12;
429	regs->tf_r13    = context->sc_r13;
430	regs->tf_r14    = context->sc_r14;
431	regs->tf_r15    = context->sc_r15;
432	regs->tf_cs     = context->sc_cs;
433	regs->tf_err    = context->sc_err;
434	regs->tf_rflags = rflags;
435
436	error = linux_copyin_fpstate(td, &sf.sf_uc);
437	if (error != 0) {
438		uprintf("pid %d comm %s linux can't restore fpu state %d\n",
439		    p->p_pid, p->p_comm, error);
440		return (error);
441	}
442
443	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
444	return (EJUSTRETURN);
445}
446
447static int
448linux_fxsave(mcontext_t *mcp, void *ufp)
449{
450	struct l_fpstate *fx = (struct l_fpstate *)&mcp->mc_fpstate[0];
451
452	bzero(&fx->reserved2[0], sizeof(fx->reserved2));
453	return (copyout(fx, ufp, sizeof(*fx)));
454}
455
456static int
457linux_xsave(mcontext_t *mcp, char *xfpusave, char *ufp)
458{
459	struct l_fpstate *fx = (struct l_fpstate *)&mcp->mc_fpstate[0];
460	uint32_t magic2;
461	int error;
462
463	/* Legacy region of an xsave area. */
464	fx->sw_reserved.magic1 = LINUX_FP_XSTATE_MAGIC1;
465	fx->sw_reserved.xstate_size = mcp->mc_xfpustate_len + sizeof(*fx);
466	fx->sw_reserved.extended_size = fx->sw_reserved.xstate_size +
467	    LINUX_FP_XSTATE_MAGIC2_SIZE;
468	fx->sw_reserved.xfeatures = xsave_mask;
469
470	error = copyout(fx, ufp, sizeof(*fx));
471	if (error != 0)
472		return (error);
473	ufp += sizeof(*fx);
474
475	/* Extended region of an xsave area. */
476	error = copyout(xfpusave, ufp, mcp->mc_xfpustate_len);
477	if (error != 0)
478		return (error);
479
480	/* Linux specific end of xsave area marker. */
481	ufp += mcp->mc_xfpustate_len;
482	magic2 = LINUX_FP_XSTATE_MAGIC2;
483	return (copyout(&magic2, ufp, LINUX_FP_XSTATE_MAGIC2_SIZE));
484}
485
486static int
487linux_copyout_fpstate(struct thread *td, struct l_ucontext *uc, char **sp)
488{
489	size_t xfpusave_len;
490	char *xfpusave;
491	mcontext_t mc;
492	char *ufp = *sp;
493
494	get_fpcontext(td, &mc, &xfpusave, &xfpusave_len);
495	KASSERT(mc.mc_fpformat != _MC_FPFMT_NODEV, ("fpu not present"));
496
497	/* Room for fxsave area. */
498	ufp -= sizeof(struct l_fpstate);
499	if (xfpusave != NULL) {
500		/* Room for xsave area. */
501		ufp -= (xfpusave_len + LINUX_FP_XSTATE_MAGIC2_SIZE);
502		uc->uc_flags |= LINUX_UC_FP_XSTATE;
503	}
504	*sp = ufp = (char *)((unsigned long)ufp & ~0x3Ful);
505
506	if (xfpusave != NULL)
507		return (linux_xsave(&mc, xfpusave, ufp));
508	else
509		return (linux_fxsave(&mc, ufp));
510}
511
/*
 * copied from amd64/amd64/machdep.c
 *
 * Send an interrupt to process.
 *
 * Builds a Linux rt signal frame (FP state, saved registers, signal
 * mask and siginfo) on the user stack of the current thread and
 * rewrites the trap frame so that the thread resumes in the vDSO
 * signal trampoline.
 *
 * catcher: user handler address; it is passed to the trampoline in %rcx.
 * ksi:     description of the signal being delivered.
 * mask:    signal mask to record in the frame.
 *
 * The process lock and the sigacts mutex are asserted held on entry,
 * dropped around the copyouts, and held again on return.
 */
static void
linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct l_rt_sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	int sig, code;
	int oonstack, issiginfo;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno);
	psp = p->p_sigacts;
	issiginfo = SIGISMEMBER(psp->ps_siginfo, sig);
	code = ksi->ksi_code;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	/* Record whether the interrupted %rsp is on the alternate stack. */
	oonstack = sigonstack(regs->tf_rsp);

	LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u",
	    catcher, sig, mask, code);

	/* Start from a zeroed frame so no stale kernel data is copied out. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
	sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Start at the top of the alternate signal stack. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
	} else
		/*
		 * Stay on the current stack, skipping 128 bytes below
		 * the interrupted %rsp (presumably the x86-64 ABI red
		 * zone -- TODO confirm).
		 */
		sp = (char *)regs->tf_rsp - 128;

	/* Drop both locks before touching user memory. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/* Save the FP state first; sp is lowered to where it was written. */
	if (linux_copyout_fpstate(td, &sf.sf_uc, &sp) != 0) {
		uprintf("pid %d comm %s linux can't save fpu state, killing\n",
		    p->p_pid, p->p_comm);
		/* NOTE(review): sigexit() is presumably noreturn -- confirm. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}
	sf.sf_uc.uc_mcontext.sc_fpstate = (register_t)sp;

	/* Make room, keeping the stack aligned. */
	sp -= sizeof(struct l_rt_sigframe);
	sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul);

	/* Save user context. */
	bsd_to_linux_sigset(mask, &sf.sf_uc.uc_sigmask);
	sf.sf_uc.uc_mcontext.sc_mask   = sf.sf_uc.uc_sigmask;
	sf.sf_uc.uc_mcontext.sc_rdi    = regs->tf_rdi;
	sf.sf_uc.uc_mcontext.sc_rsi    = regs->tf_rsi;
	sf.sf_uc.uc_mcontext.sc_rdx    = regs->tf_rdx;
	sf.sf_uc.uc_mcontext.sc_rbp    = regs->tf_rbp;
	sf.sf_uc.uc_mcontext.sc_rbx    = regs->tf_rbx;
	sf.sf_uc.uc_mcontext.sc_rcx    = regs->tf_rcx;
	sf.sf_uc.uc_mcontext.sc_rax    = regs->tf_rax;
	sf.sf_uc.uc_mcontext.sc_rip    = regs->tf_rip;
	sf.sf_uc.uc_mcontext.sc_rsp    = regs->tf_rsp;
	sf.sf_uc.uc_mcontext.sc_r8     = regs->tf_r8;
	sf.sf_uc.uc_mcontext.sc_r9     = regs->tf_r9;
	sf.sf_uc.uc_mcontext.sc_r10    = regs->tf_r10;
	sf.sf_uc.uc_mcontext.sc_r11    = regs->tf_r11;
	sf.sf_uc.uc_mcontext.sc_r12    = regs->tf_r12;
	sf.sf_uc.uc_mcontext.sc_r13    = regs->tf_r13;
	sf.sf_uc.uc_mcontext.sc_r14    = regs->tf_r14;
	sf.sf_uc.uc_mcontext.sc_r15    = regs->tf_r15;
	sf.sf_uc.uc_mcontext.sc_cs     = regs->tf_cs;
	sf.sf_uc.uc_mcontext.sc_rflags = regs->tf_rflags;
	sf.sf_uc.uc_mcontext.sc_err    = regs->tf_err;
	sf.sf_uc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
	sf.sf_uc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;

	/* Translate the signal. */
	sig = bsd_to_linux_signal(sig);
	/* Fill in POSIX parts. */
	siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig);

	/* Copy the sigframe out to the user's stack. */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
		uprintf("pid %d comm %s has trashed its stack, killing\n",
		    p->p_pid, p->p_comm);
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	fpstate_drop(td);
	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rax = 0;
	if (issiginfo) {
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	} else {
		regs->tf_rsi = 0;
		regs->tf_rdx = 0;
	}
	/* Resume in the trampoline with the handler address in %rcx. */
	regs->tf_rcx = (register_t)catcher;
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = linux_rt_sigcode;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	/* Retake the locks the caller expects to be held on return. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
630
/*
 * Start address of the vsyscall area (0xffffffffff600000 with a 64-bit
 * unsigned long) and the size in bytes of one vsyscall slot.
 */
#define	LINUX_VSYSCALL_START		(-10UL << 20)
#define	LINUX_VSYSCALL_SZ		1024

/*
 * System call number for each vsyscall slot; linux_vsyscall() indexes
 * this table with the slot number derived from the faulting %rip.
 */
const unsigned long linux_vsyscall_vector[] = {
	LINUX_SYS_gettimeofday,
	LINUX_SYS_linux_time,
	LINUX_SYS_linux_getcpu,
};
639
640static int
641linux_vsyscall(struct thread *td)
642{
643	struct trapframe *frame;
644	uint64_t retqaddr;
645	int code, traced;
646	int error;
647
648	frame = td->td_frame;
649
650	/* Check %rip for vsyscall area. */
651	if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START))
652		return (EINVAL);
653	if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0)
654		return (EINVAL);
655	code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ;
656	if (code >= nitems(linux_vsyscall_vector))
657		return (EINVAL);
658
659	/*
660	 * vsyscall called as callq *(%rax), so we must
661	 * use return address from %rsp and also fixup %rsp.
662	 */
663	error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr));
664	if (error)
665		return (error);
666
667	frame->tf_rip = retqaddr;
668	frame->tf_rax = linux_vsyscall_vector[code];
669	frame->tf_rsp += 8;
670
671	traced = (frame->tf_flags & PSL_T);
672
673	amd64_syscall(td, traced);
674
675	return (0);
676}
677
/*
 * System entry vector for 64-bit Linux ELF binaries.  It ties the Linux
 * system call table, the signal delivery, exec and vsyscall hooks
 * defined in this file, and the LA48 address-space layout constants to
 * the brands registered in this file.
 */
struct sysentvec elf_linux_sysvec = {
	.sv_size	= LINUX_SYS_MAXSYSCALL,
	.sv_table	= linux_sysent,
	.sv_fixup	= __elfN(freebsd_fixup),
	.sv_sendsig	= linux_rt_sendsig,
	.sv_sigcode	= &_binary_linux_vdso_so_o_start,
	.sv_szsigcode	= &linux_szsigcode,
	.sv_name	= "Linux ELF64",
	.sv_coredump	= elf64_coredump,
	.sv_elf_core_osabi = ELFOSABI_NONE,
	.sv_elf_core_abi_vendor = LINUX_ABI_VENDOR,
	.sv_elf_core_prepare_notes = linux64_prepare_notes,
	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS_LA48,
	.sv_usrstack	= LINUX_USRSTACK_LA48,
	.sv_psstrings	= LINUX_PS_STRINGS_LA48,
	.sv_psstringssz	= sizeof(struct ps_strings),
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_auxargs = __linuxN(copyout_auxargs),
	.sv_copyout_strings = __linuxN(copyout_strings),
	.sv_setregs	= linux_exec_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN |
	    SV_SIG_WAITNDQ | SV_TIMEKEEP,
	.sv_set_syscall_retval = linux_set_syscall_retval,
	.sv_fetch_syscall_args = linux_fetch_syscall_args,
	.sv_syscallnames = linux_syscallnames,
	.sv_shared_page_base = LINUX_SHAREDPAGE_LA48,
	.sv_shared_page_len = PAGE_SIZE,
	.sv_schedtail	= linux_schedtail,
	.sv_thread_detach = linux_thread_detach,
	.sv_trap	= linux_vsyscall,
	.sv_hwcap	= NULL,
	.sv_hwcap2	= NULL,
	.sv_onexec	= linux_on_exec_vmspace,
	.sv_onexit	= linux_on_exit,
	.sv_ontdexit	= linux_thread_dtor,
	.sv_setid_allowed = &linux_setid_allowed_query,
	.sv_set_fork_retval = linux_set_fork_retval,
};
720
721static int
722linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
723{
724	int error;
725
726	error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
727	    LINUX_VDSOPAGE_SIZE, imgp);
728	if (error == 0)
729		error = linux_on_exec(p, imgp);
730	return (error);
731}
732
733/*
734 * linux_vdso_install() and linux_exec_sysvec_init() must be called
735 * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY).
736 */
737static void
738linux_exec_sysvec_init(void *param)
739{
740	l_uintptr_t *ktimekeep_base, *ktsc_selector;
741	struct sysentvec *sv;
742	ptrdiff_t tkoff;
743
744	sv = param;
745	amd64_lower_shared_page(sv);
746	/* Fill timekeep_base */
747	exec_sysvec_init(sv);
748
749	tkoff = kern_timekeep_base - linux_vdso_base;
750	ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
751	*ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset;
752
753	tkoff = kern_tsc_selector - linux_vdso_base;
754	ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
755	*ktsc_selector = linux_vdso_tsc_selector_idx();
756	if (bootverbose)
757		printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector);
758
759	tkoff = kern_cpu_selector - linux_vdso_base;
760	ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
761	*ktsc_selector = linux_vdso_cpu_selector_idx();
762	if (bootverbose)
763		printf("Linux x86-64 vDSO cpu_selector: %lu\n", *ktsc_selector);
764}
765SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY,
766    linux_exec_sysvec_init, &elf_linux_sysvec);
767
768static void
769linux_vdso_install(const void *param)
770{
771	char *vdso_start = &_binary_linux_vdso_so_o_start;
772	char *vdso_end = &_binary_linux_vdso_so_o_end;
773
774	linux_szsigcode = vdso_end - vdso_start;
775	MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
776
777	linux_vdso_base = LINUX_VDSOPAGE_LA48;
778	if (hw_lower_amd64_sharedpage != 0)
779		linux_vdso_base -= PAGE_SIZE;
780
781	__elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
782
783	linux_vdso_obj = __elfN(linux_shared_page_init)
784	    (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
785	bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
786
787	linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
788}
789SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST,
790    linux_vdso_install, NULL);
791
792static void
793linux_vdso_deinstall(const void *param)
794{
795
796	__elfN(linux_shared_page_fini)(linux_vdso_obj,
797	    linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
798}
799SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
800    linux_vdso_deinstall, NULL);
801
802static void
803linux_vdso_reloc(char *mapping, Elf_Addr offset)
804{
805	const Elf_Ehdr *ehdr;
806	const Elf_Shdr *shdr;
807	Elf64_Addr *where, val;
808	Elf_Size rtype, symidx;
809	const Elf_Rela *rela;
810	Elf_Addr addr, addend;
811	int relacnt;
812	int i, j;
813
814	MPASS(offset != 0);
815
816	relacnt = 0;
817	ehdr = (const Elf_Ehdr *)mapping;
818	shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
819	for (i = 0; i < ehdr->e_shnum; i++)
820	{
821		switch (shdr[i].sh_type) {
822		case SHT_REL:
823			printf("Linux x86_64 vDSO: unexpected Rel section\n");
824			break;
825		case SHT_RELA:
826			rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset);
827			relacnt = shdr[i].sh_size / sizeof(*rela);
828		}
829	}
830
831	for (j = 0; j < relacnt; j++, rela++) {
832		where = (Elf_Addr *)(mapping + rela->r_offset);
833		addend = rela->r_addend;
834		rtype = ELF_R_TYPE(rela->r_info);
835		symidx = ELF_R_SYM(rela->r_info);
836
837		switch (rtype) {
838		case R_X86_64_NONE:	/* none */
839			break;
840
841		case R_X86_64_RELATIVE:	/* B + A */
842			addr = (Elf_Addr)(offset + addend);
843			val = addr;
844			if (*where != val)
845				*where = val;
846			break;
847		case R_X86_64_IRELATIVE:
848			printf("Linux x86_64 vDSO: unexpected ifunc relocation, "
849			    "symbol index %ld\n", symidx);
850			break;
851		default:
852			printf("Linux x86_64 vDSO: unexpected relocation type %ld, "
853			    "symbol index %ld\n", rtype, symidx);
854		}
855	}
856}
857
/*
 * ELF note description shared by every brand below: a note from vendor
 * GNU_ABI_VENDOR of type 1 with a 16-byte descriptor.
 * BN_TRANSLATE_OSREL is set and linux_trans_osrel is supplied as the
 * translation callback.
 */
static Elf_Brandnote linux64_brandnote = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= linux_trans_osrel
};
866
/*
 * Brand for x86-64 Linux binaries whose interpreter path is
 * /lib64/ld-linux-x86-64.so.2; such binaries run under elf_linux_sysvec.
 */
static Elf64_Brandinfo linux_glibc2brand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_X86_64,
	.compat_3_brand	= "Linux",
	.interp_path	= "/lib64/ld-linux-x86-64.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux64_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
877
/*
 * Same as linux_glibc2brand, but for the interpreter path
 * /lib64/ld-linux.so.2.
 */
static Elf64_Brandinfo linux_glibc2brandshort = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_X86_64,
	.compat_3_brand	= "Linux",
	.interp_path	= "/lib64/ld-linux.so.2",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux64_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
888
/*
 * Brand for x86-64 Linux binaries whose interpreter path is
 * /lib/ld-musl-x86_64.so.1.  It differs from the other two brands in
 * also setting LINUX_BI_FUTEX_REQUEUE.
 */
static Elf64_Brandinfo linux_muslbrand = {
	.brand		= ELFOSABI_LINUX,
	.machine	= EM_X86_64,
	.compat_3_brand	= "Linux",
	.interp_path	= "/lib/ld-musl-x86_64.so.1",
	.sysvec		= &elf_linux_sysvec,
	.interp_newpath	= NULL,
	.brand_note	= &linux64_brandnote,
	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE |
			    LINUX_BI_FUTEX_REQUEUE
};
900
/*
 * NULL-terminated list of the brands that the module event handler
 * inserts on load and removes on unload.
 */
static Elf64_Brandinfo *linux_brandlist[] = {
	&linux_glibc2brand,
	&linux_glibc2brandshort,
	&linux_muslbrand,
	NULL
};
907
908static int
909linux64_elf_modevent(module_t mod, int type, void *data)
910{
911	Elf64_Brandinfo **brandinfo;
912	int error;
913	struct linux_ioctl_handler **lihp;
914
915	error = 0;
916
917	switch(type) {
918	case MOD_LOAD:
919		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
920		     ++brandinfo)
921			if (elf64_insert_brand_entry(*brandinfo) < 0)
922				error = EINVAL;
923		if (error == 0) {
924			SET_FOREACH(lihp, linux_ioctl_handler_set)
925				linux_ioctl_register_handler(*lihp);
926			stclohz = (stathz ? stathz : hz);
927			if (bootverbose)
928				printf("Linux x86-64 ELF exec handler installed\n");
929		} else
930			printf("cannot insert Linux x86-64 ELF brand handler\n");
931		break;
932	case MOD_UNLOAD:
933		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
934		     ++brandinfo)
935			if (elf64_brand_inuse(*brandinfo))
936				error = EBUSY;
937		if (error == 0) {
938			for (brandinfo = &linux_brandlist[0];
939			     *brandinfo != NULL; ++brandinfo)
940				if (elf64_remove_brand_entry(*brandinfo) < 0)
941					error = EINVAL;
942		}
943		if (error == 0) {
944			SET_FOREACH(lihp, linux_ioctl_handler_set)
945				linux_ioctl_unregister_handler(*lihp);
946			if (bootverbose)
947				printf("Linux x86_64 ELF exec handler removed\n");
948		} else
949			printf("Could not deinstall Linux x86_64 ELF interpreter entry\n");
950		break;
951	default:
952		return (EOPNOTSUPP);
953	}
954	return (error);
955}
956
/*
 * Module description: name "linux64elf", event handler
 * linux64_elf_modevent, no extra argument.
 */
static moduledata_t linux64_elf_mod = {
	"linux64elf",
	linux64_elf_modevent,
	0
};

/*
 * Declare the module at SI_SUB_EXEC / SI_ORDER_ANY, record its
 * dependency on version 1 of linux_common, and advertise the linux64
 * kernel feature.
 */
DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1);
FEATURE(linux64, "Linux 64bit support");
966