1/*	$NetBSD: linux_machdep.c,v 1.150 2011/03/04 22:25:31 joerg Exp $	*/
2
3/*-
4 * Copyright (c) 1995, 2000, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Frank van der Linden, and by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: linux_machdep.c,v 1.150 2011/03/04 22:25:31 joerg Exp $");
34
35#if defined(_KERNEL_OPT)
36#include "opt_vm86.h"
37#include "opt_user_ldt.h"
38#endif
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/signalvar.h>
43#include <sys/kernel.h>
44#include <sys/proc.h>
45#include <sys/buf.h>
46#include <sys/reboot.h>
47#include <sys/conf.h>
48#include <sys/exec.h>
49#include <sys/file.h>
50#include <sys/callout.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/msgbuf.h>
54#include <sys/mount.h>
55#include <sys/vnode.h>
56#include <sys/device.h>
57#include <sys/syscallargs.h>
58#include <sys/filedesc.h>
59#include <sys/exec_elf.h>
60#include <sys/disklabel.h>
61#include <sys/ioctl.h>
62#include <sys/wait.h>
63#include <sys/kauth.h>
64#include <sys/kmem.h>
65
66#include <miscfs/specfs/specdev.h>
67
68#include <compat/linux/common/linux_types.h>
69#include <compat/linux/common/linux_signal.h>
70#include <compat/linux/common/linux_util.h>
71#include <compat/linux/common/linux_ioctl.h>
72#include <compat/linux/common/linux_hdio.h>
73#include <compat/linux/common/linux_exec.h>
74#include <compat/linux/common/linux_machdep.h>
75#include <compat/linux/common/linux_errno.h>
76
77#include <compat/linux/linux_syscallargs.h>
78
79#include <sys/cpu.h>
80#include <machine/cpufunc.h>
81#include <machine/psl.h>
82#include <machine/reg.h>
83#include <machine/segments.h>
84#include <machine/specialreg.h>
85#include <machine/sysarch.h>
86#include <machine/vm86.h>
87#include <machine/vmparam.h>
88
89/*
90 * To see whether wscons is configured (for virtual console ioctl calls).
91 */
92#if defined(_KERNEL_OPT)
93#include "wsdisplay.h"
94#endif
95#if (NWSDISPLAY > 0)
96#include <dev/wscons/wsconsio.h>
97#include <dev/wscons/wsdisplay_usl_io.h>
98#if defined(_KERNEL_OPT)
99#include "opt_xserver.h"
100#endif
101#endif
102
103#ifdef DEBUG_LINUX
104#define DPRINTF(a) uprintf a
105#else
106#define DPRINTF(a)
107#endif
108
109static struct biosdisk_info *fd2biosinfo(struct proc *, struct file *);
110extern struct disklist *x86_alldisks;
111static void linux_save_ucontext(struct lwp *, struct trapframe *,
112    const sigset_t *, struct sigaltstack *, struct linux_ucontext *);
113static void linux_save_sigcontext(struct lwp *, struct trapframe *,
114    const sigset_t *, struct linux_sigcontext *);
115static int linux_restore_sigcontext(struct lwp *,
116    struct linux_sigcontext *, register_t *);
117static void linux_rt_sendsig(const ksiginfo_t *, const sigset_t *);
118static void linux_old_sendsig(const ksiginfo_t *, const sigset_t *);
119
120extern char linux_sigcode[], linux_rt_sigcode[];
121
122/*
123 * Deal with some i386-specific things in the Linux emulation code.
124 */
125
126void
127linux_setregs(struct lwp *l, struct exec_package *epp, vaddr_t stack)
128{
129	struct pcb *pcb = lwp_getpcb(l);
130	struct trapframe *tf;
131
132#if NNPX > 0
133	/* If we were using the FPU, forget about it. */
134	if (npxproc == l)
135		npxdrop();
136#endif
137
138#ifdef USER_LDT
139	pmap_ldt_cleanup(l);
140#endif
141
142	l->l_md.md_flags &= ~MDL_USEDFPU;
143
144	if (i386_use_fxsave) {
145		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __Linux_NPXCW__;
146		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
147	} else
148		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __Linux_NPXCW__;
149
150	tf = l->l_md.md_regs;
151	tf->tf_gs = 0;
152	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
153	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
154	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
155	tf->tf_edi = 0;
156	tf->tf_esi = 0;
157	tf->tf_ebp = 0;
158	tf->tf_ebx = l->l_proc->p_psstrp;
159	tf->tf_edx = 0;
160	tf->tf_ecx = 0;
161	tf->tf_eax = 0;
162	tf->tf_eip = epp->ep_entry;
163	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
164	tf->tf_eflags = PSL_USERSET;
165	tf->tf_esp = stack;
166	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
167}
168
169/*
170 * Send an interrupt to process.
171 *
172 * Stack is set up to allow sigcode stored
173 * in u. to call routine, followed by kcall
174 * to sigreturn routine below.  After sigreturn
175 * resets the signal mask, the stack, and the
176 * frame pointer, it returns to the user
177 * specified pc, psl.
178 */
179
180void
181linux_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
182{
183	if (SIGACTION(curproc, ksi->ksi_signo).sa_flags & SA_SIGINFO)
184		linux_rt_sendsig(ksi, mask);
185	else
186		linux_old_sendsig(ksi, mask);
187}
188
189
190static void
191linux_save_ucontext(struct lwp *l, struct trapframe *tf, const sigset_t *mask, struct sigaltstack *sas, struct linux_ucontext *uc)
192{
193	uc->uc_flags = 0;
194	uc->uc_link = NULL;
195	native_to_linux_sigaltstack(&uc->uc_stack, sas);
196	linux_save_sigcontext(l, tf, mask, &uc->uc_mcontext);
197	native_to_linux_sigset(&uc->uc_sigmask, mask);
198	(void)memset(&uc->uc_fpregs_mem, 0, sizeof(uc->uc_fpregs_mem));
199}
200
201static void
202linux_save_sigcontext(struct lwp *l, struct trapframe *tf,
203    const sigset_t *mask, struct linux_sigcontext *sc)
204{
205	struct pcb *pcb = lwp_getpcb(l);
206
207	/* Save register context. */
208#ifdef VM86
209	if (tf->tf_eflags & PSL_VM) {
210		sc->sc_gs = tf->tf_vm86_gs;
211		sc->sc_fs = tf->tf_vm86_fs;
212		sc->sc_es = tf->tf_vm86_es;
213		sc->sc_ds = tf->tf_vm86_ds;
214		sc->sc_eflags = get_vflags(l);
215	} else
216#endif
217	{
218		sc->sc_gs = tf->tf_gs;
219		sc->sc_fs = tf->tf_fs;
220		sc->sc_es = tf->tf_es;
221		sc->sc_ds = tf->tf_ds;
222		sc->sc_eflags = tf->tf_eflags;
223	}
224	sc->sc_edi = tf->tf_edi;
225	sc->sc_esi = tf->tf_esi;
226	sc->sc_esp = tf->tf_esp;
227	sc->sc_ebp = tf->tf_ebp;
228	sc->sc_ebx = tf->tf_ebx;
229	sc->sc_edx = tf->tf_edx;
230	sc->sc_ecx = tf->tf_ecx;
231	sc->sc_eax = tf->tf_eax;
232	sc->sc_eip = tf->tf_eip;
233	sc->sc_cs = tf->tf_cs;
234	sc->sc_esp_at_signal = tf->tf_esp;
235	sc->sc_ss = tf->tf_ss;
236	sc->sc_err = tf->tf_err;
237	sc->sc_trapno = tf->tf_trapno;
238	sc->sc_cr2 = pcb->pcb_cr2;
239	sc->sc_387 = NULL;
240
241	/* Save signal stack. */
242	/* Linux doesn't save the onstack flag in sigframe */
243
244	/* Save signal mask. */
245	native_to_linux_old_sigset(&sc->sc_mask, mask);
246}
247
248static void
249linux_rt_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
250{
251	struct lwp *l = curlwp;
252	struct proc *p = l->l_proc;
253	struct trapframe *tf;
254	struct linux_rt_sigframe *fp, frame;
255	int onstack, error;
256	int sig = ksi->ksi_signo;
257	sig_t catcher = SIGACTION(p, sig).sa_handler;
258	struct sigaltstack *sas = &l->l_sigstk;
259
260	tf = l->l_md.md_regs;
261	/* Do we need to jump onto the signal stack? */
262	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
263	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
264
265
266	/* Allocate space for the signal handler context. */
267	if (onstack)
268		fp = (struct linux_rt_sigframe *)((char *)sas->ss_sp +
269		    sas->ss_size);
270	else
271		fp = (struct linux_rt_sigframe *)tf->tf_esp;
272	fp--;
273
274	DPRINTF(("rt: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
275	    onstack, fp, sig, tf->tf_eip,
276	    ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
277
278	/* Build stack frame for signal trampoline. */
279	frame.sf_handler = catcher;
280	frame.sf_sig = native_to_linux_signo[sig];
281	frame.sf_sip = &fp->sf_si;
282	frame.sf_ucp = &fp->sf_uc;
283
284	/*
285	 * XXX: the following code assumes that the constants for
286	 * siginfo are the same between linux and NetBSD.
287	 */
288	native_to_linux_siginfo(&frame.sf_si, &ksi->ksi_info);
289
290	/* Save register context. */
291	linux_save_ucontext(l, tf, mask, sas, &frame.sf_uc);
292	sendsig_reset(l, sig);
293
294	mutex_exit(p->p_lock);
295	error = copyout(&frame, fp, sizeof(frame));
296	mutex_enter(p->p_lock);
297
298	if (error != 0) {
299		/*
300		 * Process has trashed its stack; give it an illegal
301		 * instruction to halt it in its tracks.
302		 */
303		sigexit(l, SIGILL);
304		/* NOTREACHED */
305	}
306
307	/*
308	 * Build context to run handler in.
309	 */
310	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
311	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
312	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
313	tf->tf_eip = ((int)p->p_sigctx.ps_sigcode) +
314	    (linux_rt_sigcode - linux_sigcode);
315	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
316	tf->tf_eflags &= ~PSL_CLEARSIG;
317	tf->tf_esp = (int)fp;
318	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
319
320	/* Remember that we're now on the signal stack. */
321	if (onstack)
322		sas->ss_flags |= SS_ONSTACK;
323}
324
325static void
326linux_old_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
327{
328	struct lwp *l = curlwp;
329	struct proc *p = l->l_proc;
330	struct trapframe *tf;
331	struct linux_sigframe *fp, frame;
332	int onstack, error;
333	int sig = ksi->ksi_signo;
334	sig_t catcher = SIGACTION(p, sig).sa_handler;
335	struct sigaltstack *sas = &l->l_sigstk;
336
337	tf = l->l_md.md_regs;
338
339	/* Do we need to jump onto the signal stack? */
340	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
341	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
342
343	/* Allocate space for the signal handler context. */
344	if (onstack)
345		fp = (struct linux_sigframe *) ((char *)sas->ss_sp +
346		    sas->ss_size);
347	else
348		fp = (struct linux_sigframe *)tf->tf_esp;
349	fp--;
350
351	DPRINTF(("old: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
352	    onstack, fp, sig, tf->tf_eip,
353	    ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
354
355	/* Build stack frame for signal trampoline. */
356	frame.sf_handler = catcher;
357	frame.sf_sig = native_to_linux_signo[sig];
358
359	linux_save_sigcontext(l, tf, mask, &frame.sf_sc);
360	sendsig_reset(l, sig);
361
362	mutex_exit(p->p_lock);
363	error = copyout(&frame, fp, sizeof(frame));
364	mutex_enter(p->p_lock);
365
366	if (error != 0) {
367		/*
368		 * Process has trashed its stack; give it an illegal
369		 * instruction to halt it in its tracks.
370		 */
371		sigexit(l, SIGILL);
372		/* NOTREACHED */
373	}
374
375	/*
376	 * Build context to run handler in.
377	 */
378	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
379	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
380	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
381	tf->tf_eip = (int)p->p_sigctx.ps_sigcode;
382	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
383	tf->tf_eflags &= ~PSL_CLEARSIG;
384	tf->tf_esp = (int)fp;
385	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
386
387	/* Remember that we're now on the signal stack. */
388	if (onstack)
389		sas->ss_flags |= SS_ONSTACK;
390}
391
392/*
393 * System call to cleanup state after a signal
394 * has been taken.  Reset signal mask and
395 * stack state from context left by sendsig (above).
396 * Return to previous pc and psl as specified by
397 * context left by sendsig. Check carefully to
398 * make sure that the user has not modified the
399 * psl to gain improper privileges or to cause
400 * a machine fault.
401 */
402int
403linux_sys_rt_sigreturn(struct lwp *l, const struct linux_sys_rt_sigreturn_args *uap, register_t *retval)
404{
405	/* {
406		syscallarg(struct linux_ucontext *) ucp;
407	} */
408	struct linux_ucontext context, *ucp = SCARG(uap, ucp);
409	int error;
410
411	/*
412	 * The trampoline code hands us the context.
413	 * It is unsafe to keep track of it ourselves, in the event that a
414	 * program jumps out of a signal handler.
415	 */
416	if ((error = copyin(ucp, &context, sizeof(*ucp))) != 0)
417		return error;
418
419	/* XXX XAX we can do better here by using more of the ucontext */
420	return linux_restore_sigcontext(l, &context.uc_mcontext, retval);
421}
422
423int
424linux_sys_sigreturn(struct lwp *l, const struct linux_sys_sigreturn_args *uap, register_t *retval)
425{
426	/* {
427		syscallarg(struct linux_sigcontext *) scp;
428	} */
429	struct linux_sigcontext context, *scp = SCARG(uap, scp);
430	int error;
431
432	/*
433	 * The trampoline code hands us the context.
434	 * It is unsafe to keep track of it ourselves, in the event that a
435	 * program jumps out of a signal handler.
436	 */
437	if ((error = copyin((void *)scp, &context, sizeof(*scp))) != 0)
438		return error;
439	return linux_restore_sigcontext(l, &context, retval);
440}
441
442static int
443linux_restore_sigcontext(struct lwp *l, struct linux_sigcontext *scp,
444    register_t *retval)
445{
446	struct proc *p = l->l_proc;
447	struct sigaltstack *sas = &l->l_sigstk;
448	struct trapframe *tf;
449	sigset_t mask;
450	ssize_t ss_gap;
451
452	/* Restore register context. */
453	tf = l->l_md.md_regs;
454	DPRINTF(("sigreturn enter esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
455
456#ifdef VM86
457	if (scp->sc_eflags & PSL_VM) {
458		void syscall_vm86(struct trapframe *);
459
460		tf->tf_vm86_gs = scp->sc_gs;
461		tf->tf_vm86_fs = scp->sc_fs;
462		tf->tf_vm86_es = scp->sc_es;
463		tf->tf_vm86_ds = scp->sc_ds;
464		set_vflags(l, scp->sc_eflags);
465		p->p_md.md_syscall = syscall_vm86;
466	} else
467#endif
468	{
469		/*
470		 * Check for security violations.  If we're returning to
471		 * protected mode, the CPU will validate the segment registers
472		 * automatically and generate a trap on violations.  We handle
473		 * the trap, rather than doing all of the checking here.
474		 */
475		if (((scp->sc_eflags ^ tf->tf_eflags) & PSL_USERSTATIC) != 0 ||
476		    !USERMODE(scp->sc_cs, scp->sc_eflags))
477			return EINVAL;
478
479		tf->tf_gs = scp->sc_gs;
480		tf->tf_fs = scp->sc_fs;
481		tf->tf_es = scp->sc_es;
482		tf->tf_ds = scp->sc_ds;
483#ifdef VM86
484		if (tf->tf_eflags & PSL_VM)
485			(*p->p_emul->e_syscall_intern)(p);
486#endif
487		tf->tf_eflags = scp->sc_eflags;
488	}
489	tf->tf_edi = scp->sc_edi;
490	tf->tf_esi = scp->sc_esi;
491	tf->tf_ebp = scp->sc_ebp;
492	tf->tf_ebx = scp->sc_ebx;
493	tf->tf_edx = scp->sc_edx;
494	tf->tf_ecx = scp->sc_ecx;
495	tf->tf_eax = scp->sc_eax;
496	tf->tf_eip = scp->sc_eip;
497	tf->tf_cs = scp->sc_cs;
498	tf->tf_esp = scp->sc_esp_at_signal;
499	tf->tf_ss = scp->sc_ss;
500
501	/* Restore signal stack. */
502	/*
503	 * Linux really does it this way; it doesn't have space in sigframe
504	 * to save the onstack flag.
505	 */
506	mutex_enter(p->p_lock);
507	ss_gap = (ssize_t)((char *)scp->sc_esp_at_signal - (char *)sas->ss_sp);
508	if (ss_gap >= 0 && ss_gap < sas->ss_size)
509		sas->ss_flags |= SS_ONSTACK;
510	else
511		sas->ss_flags &= ~SS_ONSTACK;
512
513	/* Restore signal mask. */
514	linux_old_to_native_sigset(&mask, &scp->sc_mask);
515	(void) sigprocmask1(l, SIG_SETMASK, &mask, 0);
516	mutex_exit(p->p_lock);
517
518	DPRINTF(("sigreturn exit esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
519	return EJUSTRETURN;
520}
521
522#ifdef USER_LDT
523
524static int
525linux_read_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
526    register_t *retval)
527{
528	struct x86_get_ldt_args gl;
529	int error;
530	union descriptor *ldt_buf;
531	size_t sz;
532
533	/*
534	 * I've checked the linux code - this function is asymetric with
535	 * linux_write_ldt, and returns raw ldt entries.
536	 * NB, the code I saw zerod the spare parts of the user buffer.
537	 */
538
539	DPRINTF(("linux_read_ldt!"));
540
541	sz = 8192 * sizeof(*ldt_buf);
542	ldt_buf = kmem_zalloc(sz, KM_SLEEP);
543	gl.start = 0;
544	gl.desc = NULL;
545	gl.num = SCARG(uap, bytecount) / sizeof(union descriptor);
546	error = x86_get_ldt1(l, &gl, ldt_buf);
547	/* NB gl.num might have changed */
548	if (error == 0) {
549		*retval = gl.num * sizeof *ldt;
550		error = copyout(ldt_buf, SCARG(uap, ptr),
551		    gl.num * sizeof *ldt_buf);
552	}
553	kmem_free(ldt_buf, sz);
554
555	return error;
556}
557
558struct linux_ldt_info {
559	u_int entry_number;
560	u_long base_addr;
561	u_int limit;
562	u_int seg_32bit:1;
563	u_int contents:2;
564	u_int read_exec_only:1;
565	u_int limit_in_pages:1;
566	u_int seg_not_present:1;
567	u_int useable:1;
568};
569
570static int
571linux_write_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
572    int oldmode)
573{
574	struct linux_ldt_info ldt_info;
575	union descriptor d;
576	struct x86_set_ldt_args sl;
577	int error;
578
579	DPRINTF(("linux_write_ldt %d\n", oldmode));
580	if (SCARG(uap, bytecount) != sizeof(ldt_info))
581		return (EINVAL);
582	if ((error = copyin(SCARG(uap, ptr), &ldt_info, sizeof(ldt_info))) != 0)
583		return error;
584	if (ldt_info.entry_number >= 8192)
585		return (EINVAL);
586	if (ldt_info.contents == 3) {
587		if (oldmode)
588			return (EINVAL);
589		if (ldt_info.seg_not_present)
590			return (EINVAL);
591	}
592
593	if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
594	    (oldmode || (ldt_info.contents == 0 &&
595	    ldt_info.read_exec_only == 1 && ldt_info.seg_32bit == 0 &&
596	    ldt_info.limit_in_pages == 0 && ldt_info.seg_not_present == 1 &&
597	    ldt_info.useable == 0))) {
598		/* this means you should zero the ldt */
599		(void)memset(&d, 0, sizeof(d));
600	} else {
601		d.sd.sd_lobase = ldt_info.base_addr & 0xffffff;
602		d.sd.sd_hibase = (ldt_info.base_addr >> 24) & 0xff;
603		d.sd.sd_lolimit = ldt_info.limit & 0xffff;
604		d.sd.sd_hilimit = (ldt_info.limit >> 16) & 0xf;
605		d.sd.sd_type = 16 | (ldt_info.contents << 2) |
606		    (!ldt_info.read_exec_only << 1);
607		d.sd.sd_dpl = SEL_UPL;
608		d.sd.sd_p = !ldt_info.seg_not_present;
609		d.sd.sd_def32 = ldt_info.seg_32bit;
610		d.sd.sd_gran = ldt_info.limit_in_pages;
611		if (!oldmode)
612			d.sd.sd_xx = ldt_info.useable;
613		else
614			d.sd.sd_xx = 0;
615	}
616	sl.start = ldt_info.entry_number;
617	sl.desc = NULL;
618	sl.num = 1;
619
620	DPRINTF(("linux_write_ldt: idx=%d, base=0x%lx, limit=0x%x\n",
621	    ldt_info.entry_number, ldt_info.base_addr, ldt_info.limit));
622
623	return x86_set_ldt1(l, &sl, &d);
624}
625
626#endif /* USER_LDT */
627
628int
629linux_sys_modify_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap, register_t *retval)
630{
631	/* {
632		syscallarg(int) func;
633		syscallarg(void *) ptr;
634		syscallarg(size_t) bytecount;
635	} */
636
637	switch (SCARG(uap, func)) {
638#ifdef USER_LDT
639	case 0:
640		return linux_read_ldt(l, (const void *)uap, retval);
641	case 1:
642		return linux_write_ldt(l, (const void *)uap, 1);
643	case 2:
644#ifdef notyet
645		return linux_read_default_ldt(l, (const void *)uap, retval);
646#else
647		return (ENOSYS);
648#endif
649	case 0x11:
650		return linux_write_ldt(l, (const void *)uap, 0);
651#endif /* USER_LDT */
652
653	default:
654		return (ENOSYS);
655	}
656}
657
658/*
659 * XXX Pathetic hack to make svgalib work. This will fake the major
660 * device number of an opened VT so that svgalib likes it. grmbl.
661 * Should probably do it 'wrong the right way' and use a mapping
662 * array for all major device numbers, and map linux_mknod too.
663 */
664dev_t
665linux_fakedev(dev_t dev, int raw)
666{
667	extern const struct cdevsw ptc_cdevsw, pts_cdevsw;
668	const struct cdevsw *cd = cdevsw_lookup(dev);
669
670	if (raw) {
671#if (NWSDISPLAY > 0)
672		extern const struct cdevsw wsdisplay_cdevsw;
673		if (cd == &wsdisplay_cdevsw)
674			return makedev(LINUX_CONS_MAJOR, (minor(dev) + 1));
675#endif
676	}
677
678	if (cd == &ptc_cdevsw)
679		return makedev(LINUX_PTC_MAJOR, minor(dev));
680	if (cd == &pts_cdevsw)
681		return makedev(LINUX_PTS_MAJOR, minor(dev));
682
683	return dev;
684}
685
#if (NWSDISPLAY > 0)
/*
 * Fixed key tables handed back by the LINUX_KDGKBENT ioctl.
 * That's not complete, but enough to get an X server running.
 *
 * Each table has NR_KEYS entries and is indexed by the kb_index field
 * of the request; the table itself is chosen by kb_table through
 * linux_keytabs[] below.
 */
#define NR_KEYS 128

/* Table 0: no modifier. */
static const u_short plain_map[NR_KEYS] = {
	0x0200,	0x001b,	0x0031,	0x0032,	0x0033,	0x0034,	0x0035,	0x0036,
	0x0037,	0x0038,	0x0039,	0x0030,	0x002d,	0x003d,	0x007f,	0x0009,
	0x0b71,	0x0b77,	0x0b65,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
	0x0b6f,	0x0b70,	0x005b,	0x005d,	0x0201,	0x0702,	0x0b61,	0x0b73,
	0x0b64,	0x0b66,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x003b,
	0x0027,	0x0060,	0x0700,	0x005c,	0x0b7a,	0x0b78,	0x0b63,	0x0b76,
	0x0b62,	0x0b6e,	0x0b6d,	0x002c,	0x002e,	0x002f,	0x0700,	0x030c,
	0x0703,	0x0020,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0209,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003c,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
};

/* Table 1: shift. */
static const u_short shift_map[NR_KEYS] = {
	0x0200,	0x001b,	0x0021,	0x0040,	0x0023,	0x0024,	0x0025,	0x005e,
	0x0026,	0x002a,	0x0028,	0x0029,	0x005f,	0x002b,	0x007f,	0x0009,
	0x0b51,	0x0b57,	0x0b45,	0x0b52,	0x0b54,	0x0b59,	0x0b55,	0x0b49,
	0x0b4f,	0x0b50,	0x007b,	0x007d,	0x0201,	0x0702,	0x0b41,	0x0b53,
	0x0b44,	0x0b46,	0x0b47,	0x0b48,	0x0b4a,	0x0b4b,	0x0b4c,	0x003a,
	0x0022,	0x007e,	0x0700,	0x007c,	0x0b5a,	0x0b58,	0x0b43,	0x0b56,
	0x0b42,	0x0b4e,	0x0b4d,	0x003c,	0x003e,	0x003f,	0x0700,	0x030c,
	0x0703,	0x0020,	0x0207,	0x010a,	0x010b,	0x010c,	0x010d,	0x010e,
	0x010f,	0x0110,	0x0111,	0x0112,	0x0113,	0x0213,	0x0203,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003e,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
	0x020b,	0x0601,	0x0602,	0x0117,	0x0600,	0x020a,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
};

/* Tables 2 and 3: altgr (the same table is listed twice below). */
static const u_short altgr_map[NR_KEYS] = {
	0x0200,	0x0200,	0x0200,	0x0040,	0x0200,	0x0024,	0x0200,	0x0200,
	0x007b,	0x005b,	0x005d,	0x007d,	0x005c,	0x0200,	0x0200,	0x0200,
	0x0b71,	0x0b77,	0x0918,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
	0x0b6f,	0x0b70,	0x0200,	0x007e,	0x0201,	0x0702,	0x0914,	0x0b73,
	0x0917,	0x0919,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x0200,
	0x0200,	0x0200,	0x0700,	0x0200,	0x0b7a,	0x0b78,	0x0916,	0x0b76,
	0x0915,	0x0b6e,	0x0b6d,	0x0200,	0x0200,	0x0200,	0x0700,	0x030c,
	0x0703,	0x0200,	0x0207,	0x050c,	0x050d,	0x050e,	0x050f,	0x0510,
	0x0511,	0x0512,	0x0513,	0x0514,	0x0515,	0x0208,	0x0202,	0x0911,
	0x0912,	0x0913,	0x030b,	0x090e,	0x090f,	0x0910,	0x030a,	0x090b,
	0x090c,	0x090d,	0x090a,	0x0310,	0x0206,	0x0200,	0x007c,	0x0516,
	0x0517,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
};

/* Table 4: control. */
static const u_short ctrl_map[NR_KEYS] = {
	0x0200,	0x0200,	0x0200,	0x0000,	0x001b,	0x001c,	0x001d,	0x001e,
	0x001f,	0x007f,	0x0200,	0x0200,	0x001f,	0x0200,	0x0008,	0x0200,
	0x0011,	0x0017,	0x0005,	0x0012,	0x0014,	0x0019,	0x0015,	0x0009,
	0x000f,	0x0010,	0x001b,	0x001d,	0x0201,	0x0702,	0x0001,	0x0013,
	0x0004,	0x0006,	0x0007,	0x0008,	0x000a,	0x000b,	0x000c,	0x0200,
	0x0007,	0x0000,	0x0700,	0x001c,	0x001a,	0x0018,	0x0003,	0x0016,
	0x0002,	0x000e,	0x000d,	0x0200,	0x020e,	0x007f,	0x0700,	0x030c,
	0x0703,	0x0000,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0204,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x0200,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
};

/* Table selector, indexed by kb_table. */
const u_short * const linux_keytabs[] = {
	plain_map,
	shift_map,
	altgr_map,
	altgr_map,
	ctrl_map
};
#endif
765
766static struct biosdisk_info *
767fd2biosinfo(struct proc *p, struct file *fp)
768{
769	struct vnode *vp;
770	const char *blkname;
771	char diskname[16];
772	int i;
773	struct nativedisk_info *nip;
774	struct disklist *dl = x86_alldisks;
775
776	if (fp->f_type != DTYPE_VNODE)
777		return NULL;
778	vp = (struct vnode *)fp->f_data;
779
780	if (vp->v_type != VBLK)
781		return NULL;
782
783	blkname = devsw_blk2name(major(vp->v_rdev));
784	snprintf(diskname, sizeof diskname, "%s%llu", blkname,
785	    (unsigned long long)DISKUNIT(vp->v_rdev));
786
787	for (i = 0; i < dl->dl_nnativedisks; i++) {
788		nip = &dl->dl_nativedisks[i];
789		if (strcmp(diskname, nip->ni_devname))
790			continue;
791		if (nip->ni_nmatches != 0)
792			return &dl->dl_biosdisks[nip->ni_biosmatches[0]];
793	}
794
795	return NULL;
796}
797
798
799/*
800 * We come here in a last attempt to satisfy a Linux ioctl() call
801 */
802int
803linux_machdepioctl(struct lwp *l, const struct linux_sys_ioctl_args *uap, register_t *retval)
804{
805	/* {
806		syscallarg(int) fd;
807		syscallarg(u_long) com;
808		syscallarg(void *) data;
809	} */
810	struct sys_ioctl_args bia;
811	u_long com;
812	int error, error1;
813#if (NWSDISPLAY > 0)
814	struct vt_mode lvt;
815	struct kbentry kbe;
816#endif
817	struct linux_hd_geometry hdg;
818	struct linux_hd_big_geometry hdg_big;
819	struct biosdisk_info *bip;
820	file_t *fp;
821	int fd;
822	struct disklabel label, *labp;
823	struct partinfo partp;
824	int (*ioctlf)(struct file *, u_long, void *);
825	u_long start, biostotal, realtotal;
826	u_char heads, sectors;
827	u_int cylinders;
828	struct ioctl_pt pt;
829
830	fd = SCARG(uap, fd);
831	SCARG(&bia, fd) = fd;
832	SCARG(&bia, data) = SCARG(uap, data);
833	com = SCARG(uap, com);
834
835	if ((fp = fd_getfile(fd)) == NULL)
836		return (EBADF);
837
838	switch (com) {
839#if (NWSDISPLAY > 0)
840	case LINUX_KDGKBMODE:
841		com = KDGKBMODE;
842		break;
843	case LINUX_KDSKBMODE:
844		com = KDSKBMODE;
845		if ((unsigned)SCARG(uap, data) == LINUX_K_MEDIUMRAW)
846			SCARG(&bia, data) = (void *)K_RAW;
847		break;
848	case LINUX_KIOCSOUND:
849		SCARG(&bia, data) =
850		    (void *)(((unsigned long)SCARG(&bia, data)) & 0xffff);
851		/* fall through */
852	case LINUX_KDMKTONE:
853		com = KDMKTONE;
854		break;
855	case LINUX_KDSETMODE:
856		com = KDSETMODE;
857		break;
858	case LINUX_KDGETMODE:
859		/* KD_* values are equal to the wscons numbers */
860		com = WSDISPLAYIO_GMODE;
861		break;
862	case LINUX_KDENABIO:
863		com = KDENABIO;
864		break;
865	case LINUX_KDDISABIO:
866		com = KDDISABIO;
867		break;
868	case LINUX_KDGETLED:
869		com = KDGETLED;
870		break;
871	case LINUX_KDSETLED:
872		com = KDSETLED;
873		break;
874	case LINUX_VT_OPENQRY:
875		com = VT_OPENQRY;
876		break;
877	case LINUX_VT_GETMODE:
878		error = fp->f_ops->fo_ioctl(fp, VT_GETMODE, &lvt);
879		if (error != 0)
880			goto out;
881		lvt.relsig = native_to_linux_signo[lvt.relsig];
882		lvt.acqsig = native_to_linux_signo[lvt.acqsig];
883		lvt.frsig = native_to_linux_signo[lvt.frsig];
884		error = copyout(&lvt, SCARG(uap, data), sizeof (lvt));
885		goto out;
886	case LINUX_VT_SETMODE:
887		error = copyin(SCARG(uap, data), &lvt, sizeof (lvt));
888		if (error != 0)
889			goto out;
890		lvt.relsig = linux_to_native_signo[lvt.relsig];
891		lvt.acqsig = linux_to_native_signo[lvt.acqsig];
892		lvt.frsig = linux_to_native_signo[lvt.frsig];
893		error = fp->f_ops->fo_ioctl(fp, VT_SETMODE, &lvt);
894		goto out;
895	case LINUX_VT_DISALLOCATE:
896		/* XXX should use WSDISPLAYIO_DELSCREEN */
897		error = 0;
898		goto out;
899	case LINUX_VT_RELDISP:
900		com = VT_RELDISP;
901		break;
902	case LINUX_VT_ACTIVATE:
903		com = VT_ACTIVATE;
904		break;
905	case LINUX_VT_WAITACTIVE:
906		com = VT_WAITACTIVE;
907		break;
908	case LINUX_VT_GETSTATE:
909		com = VT_GETSTATE;
910		break;
911	case LINUX_KDGKBTYPE:
912	    {
913		static const u_int8_t kb101 = KB_101;
914
915		/* This is what Linux does. */
916		error = copyout(&kb101, SCARG(uap, data), 1);
917		goto out;
918	    }
919	case LINUX_KDGKBENT:
920		/*
921		 * The Linux KDGKBENT ioctl is different from the
922		 * SYSV original. So we handle it in machdep code.
923		 * XXX We should use keyboard mapping information
924		 * from wsdisplay, but this would be expensive.
925		 */
926		if ((error = copyin(SCARG(uap, data), &kbe,
927				    sizeof(struct kbentry))))
928			goto out;
929		if (kbe.kb_table >= sizeof(linux_keytabs) / sizeof(u_short *)
930		    || kbe.kb_index >= NR_KEYS) {
931			error = EINVAL;
932			goto out;
933		}
934		kbe.kb_value = linux_keytabs[kbe.kb_table][kbe.kb_index];
935		error = copyout(&kbe, SCARG(uap, data),
936				sizeof(struct kbentry));
937		goto out;
938#endif
939	case LINUX_HDIO_GETGEO:
940	case LINUX_HDIO_GETGEO_BIG:
941		/*
942		 * Try to mimic Linux behaviour: return the BIOS geometry
943		 * if possible (extending its # of cylinders if it's beyond
944		 * the 1023 limit), fall back to the MI geometry (i.e.
945		 * the real geometry) if not found, by returning an
946		 * error. See common/linux_hdio.c
947		 */
948		bip = fd2biosinfo(curproc, fp);
949		ioctlf = fp->f_ops->fo_ioctl;
950		error = ioctlf(fp, DIOCGDEFLABEL, (void *)&label);
951		error1 = ioctlf(fp, DIOCGPART, (void *)&partp);
952		if (error != 0 && error1 != 0) {
953			error = error1;
954			goto out;
955		}
956		labp = error != 0 ? &label : partp.disklab;
957		start = error1 != 0 ? partp.part->p_offset : 0;
958		if (bip != NULL && bip->bi_head != 0 && bip->bi_sec != 0
959		    && bip->bi_cyl != 0) {
960			heads = bip->bi_head;
961			sectors = bip->bi_sec;
962			cylinders = bip->bi_cyl;
963			biostotal = heads * sectors * cylinders;
964			realtotal = labp->d_ntracks * labp->d_nsectors *
965			    labp->d_ncylinders;
966			if (realtotal > biostotal)
967				cylinders = realtotal / (heads * sectors);
968		} else {
969			heads = labp->d_ntracks;
970			cylinders = labp->d_ncylinders;
971			sectors = labp->d_nsectors;
972		}
973		if (com == LINUX_HDIO_GETGEO) {
974			hdg.start = start;
975			hdg.heads = heads;
976			hdg.cylinders = cylinders;
977			hdg.sectors = sectors;
978			error = copyout(&hdg, SCARG(uap, data), sizeof hdg);
979			goto out;
980		} else {
981			hdg_big.start = start;
982			hdg_big.heads = heads;
983			hdg_big.cylinders = cylinders;
984			hdg_big.sectors = sectors;
985			error = copyout(&hdg_big, SCARG(uap, data),
986			    sizeof hdg_big);
987			goto out;
988		}
989
990	default:
991		/*
992		 * Unknown to us. If it's on a device, just pass it through
993		 * using PTIOCLINUX, the device itself might be able to
994		 * make some sense of it.
995		 * XXX hack: if the function returns EJUSTRETURN,
996		 * it has stuffed a sysctl return value in pt.data.
997		 */
998		ioctlf = fp->f_ops->fo_ioctl;
999		pt.com = SCARG(uap, com);
1000		pt.data = SCARG(uap, data);
1001		error = ioctlf(fp, PTIOCLINUX, &pt);
1002		if (error == EJUSTRETURN) {
1003			retval[0] = (register_t)pt.data;
1004			error = 0;
1005		}
1006
1007		if (error == ENOTTY) {
1008			DPRINTF(("linux_machdepioctl: invalid ioctl %08lx\n",
1009			    com));
1010		}
1011		goto out;
1012	}
1013	SCARG(&bia, com) = com;
1014	error = sys_ioctl(curlwp, &bia, retval);
1015out:
1016	fd_putfile(fd);
1017	return error;
1018}
1019
1020/*
1021 * Set I/O permissions for a process. Just set the maximum level
1022 * right away (ignoring the argument), otherwise we would have
1023 * to rely on I/O permission maps, which are not implemented.
1024 */
1025int
1026linux_sys_iopl(struct lwp *l, const struct linux_sys_iopl_args *uap, register_t *retval)
1027{
1028	/* {
1029		syscallarg(int) level;
1030	} */
1031	struct trapframe *fp = l->l_md.md_regs;
1032
1033	if (kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_IOPL,
1034	    NULL, NULL, NULL, NULL) != 0)
1035		return EPERM;
1036	fp->tf_eflags |= PSL_IOPL;
1037	*retval = 0;
1038	return 0;
1039}
1040
1041/*
1042 * See above. If a root process tries to set access to an I/O port,
1043 * just let it have the whole range.
1044 */
1045int
1046linux_sys_ioperm(struct lwp *l, const struct linux_sys_ioperm_args *uap, register_t *retval)
1047{
1048	/* {
1049		syscallarg(unsigned int) lo;
1050		syscallarg(unsigned int) hi;
1051		syscallarg(int) val;
1052	} */
1053	struct trapframe *fp = l->l_md.md_regs;
1054
1055	if (kauth_authorize_machdep(l->l_cred, SCARG(uap, val) ?
1056	    KAUTH_MACHDEP_IOPERM_SET : KAUTH_MACHDEP_IOPERM_GET, NULL, NULL,
1057	    NULL, NULL) != 0)
1058		return EPERM;
1059	if (SCARG(uap, val))
1060		fp->tf_eflags |= PSL_IOPL;
1061	*retval = 0;
1062	return 0;
1063}
1064
1065int
1066linux_usertrap(struct lwp *l, vaddr_t trapaddr,
1067    void *arg)
1068{
1069	return 0;
1070}
1071
1072const char *
1073linux_get_uname_arch(void)
1074{
1075	static char uname_arch[5] = "i386";
1076
1077	if (uname_arch[1] == '3')
1078		uname_arch[1] += cpu_class;
1079	return uname_arch;
1080}
1081