/* sys_machdep.c revision 276084 */
174462Salfred/*-
274462Salfred * Copyright (c) 1990 The Regents of the University of California.
31901Swollman * All rights reserved.
41901Swollman *
51901Swollman * Redistribution and use in source and binary forms, with or without
61901Swollman * modification, are permitted provided that the following conditions
71901Swollman * are met:
81901Swollman * 1. Redistributions of source code must retain the above copyright
91901Swollman *    notice, this list of conditions and the following disclaimer.
108870Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111901Swollman *    notice, this list of conditions and the following disclaimer in the
121901Swollman *    documentation and/or other materials provided with the distribution.
131901Swollman * 4. Neither the name of the University nor the names of its contributors
148870Srgrimes *    may be used to endorse or promote products derived from this software
151901Swollman *    without specific prior written permission.
161901Swollman *
171901Swollman * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
188870Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
191901Swollman * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
201901Swollman * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
211901Swollman * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
228870Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
231901Swollman * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
241901Swollman * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
251901Swollman * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
268870Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
271901Swollman * SUCH DAMAGE.
281901Swollman *
291901Swollman *	from: @(#)sys_machdep.c	5.5 (Berkeley) 1/19/91
301901Swollman */
3174462Salfred
3274462Salfred#include <sys/cdefs.h>
3374462Salfred__FBSDID("$FreeBSD: stable/10/sys/i386/i386/sys_machdep.c 276084 2014-12-22 21:32:39Z jhb $");
341901Swollman
3574462Salfred#include "opt_capsicum.h"
3674462Salfred#include "opt_kstack_pages.h"
3774462Salfred
3874462Salfred#include <sys/param.h>
391901Swollman#include <sys/capability.h>
4092990Sobrien#include <sys/systm.h>
4192990Sobrien#include <sys/lock.h>
421901Swollman#include <sys/malloc.h>
431901Swollman#include <sys/mutex.h>
441901Swollman#include <sys/priv.h>
451901Swollman#include <sys/proc.h>
461901Swollman#include <sys/smp.h>
4785138Salfred#include <sys/sysproto.h>
481901Swollman
491901Swollman#include <vm/vm.h>
501901Swollman#include <vm/pmap.h>
5175094Siedowse#include <vm/vm_map.h>
5274462Salfred#include <vm/vm_extern.h>
531901Swollman
5474462Salfred#include <machine/cpu.h>
5574462Salfred#include <machine/pcb.h>
5611666Sphk#include <machine/pcb_ext.h>
5774462Salfred#include <machine/proc.h>
581901Swollman#include <machine/sysarch.h>
5974462Salfred
6074462Salfred#include <security/audit/audit.h>
6174462Salfred
6274462Salfred#ifdef XEN
631901Swollman#include <machine/xen/xenfunc.h>
641901Swollman
651901Swollmanvoid i386_reset_ldt(struct proc_ldt *pldt);
6674462Salfred
6774462Salfredvoid
681901Swollmani386_reset_ldt(struct proc_ldt *pldt)
691901Swollman{
701901Swollman        xen_set_ldt((vm_offset_t)pldt->ldt_base, pldt->ldt_len);
7174462Salfred}
721901Swollman#else
7374462Salfred#define i386_reset_ldt(x)
741901Swollman#endif
7592905Sobrien
7692905Sobrien#include <vm/vm_kern.h>		/* for kernel_map */
7792905Sobrien
7892905Sobrien#define MAX_LD 8192
7992905Sobrien#define LD_PER_PAGE 512
8092905Sobrien#define NEW_MAX_LD(num)  ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1))
8192905Sobrien#define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3)
8292905Sobrien#define	NULL_LDT_BASE	((caddr_t)NULL)
831901Swollman
8474462Salfred#ifdef SMP
8574462Salfredstatic void set_user_ldt_rv(struct vmspace *vmsp);
861901Swollman#endif
8774462Salfredstatic int i386_set_ldt_data(struct thread *, int start, int num,
881901Swollman	union descriptor *descs);
8974462Salfredstatic int i386_ldt_grow(struct thread *td, int len);
9074462Salfred
911901Swollman#ifndef _SYS_SYSPROTO_H_
9274462Salfredstruct sysarch_args {
9374462Salfred	int op;
9474462Salfred	char *parms;
9574462Salfred};
9674462Salfred#endif
9774462Salfred
9874462Salfredint
9974462Salfredsysarch(td, uap)
10074462Salfred	struct thread *td;
10174462Salfred	register struct sysarch_args *uap;
10274462Salfred{
10374462Salfred	int error;
1041901Swollman	union descriptor *lp;
10574462Salfred	union {
1061901Swollman		struct i386_ldt_args largs;
10774462Salfred		struct i386_ioperm_args iargs;
10874462Salfred		struct i386_get_xfpustate xfpu;
1091901Swollman	} kargs;
11074462Salfred	uint32_t base;
11174462Salfred	struct segment_descriptor sd, *sdp;
11274462Salfred
1131901Swollman	AUDIT_ARG_CMD(uap->op);
1141901Swollman
1151901Swollman#ifdef CAPABILITY_MODE
11674462Salfred	/*
1171901Swollman	 * When adding new operations, add a new case statement here to
11874462Salfred	 * explicitly indicate whether or not the operation is safe to
11974462Salfred	 * perform in capability mode.
1201901Swollman	 */
1211901Swollman	if (IN_CAPABILITY_MODE(td)) {
1221901Swollman		switch (uap->op) {
1231901Swollman		case I386_GET_LDT:
12474462Salfred		case I386_SET_LDT:
1251901Swollman		case I386_GET_IOPERM:
12674462Salfred		case I386_GET_FSBASE:
1271901Swollman		case I386_SET_FSBASE:
1281901Swollman		case I386_GET_GSBASE:
1291901Swollman		case I386_SET_GSBASE:
13074462Salfred		case I386_GET_XFPUSTATE:
13174462Salfred			break;
1321901Swollman
13374462Salfred		case I386_SET_IOPERM:
13474462Salfred		default:
13574462Salfred#ifdef KTRACE
13674462Salfred			if (KTRPOINT(td, KTR_CAPFAIL))
13774462Salfred				ktrcapfail(CAPFAIL_SYSCALL, NULL, NULL);
13874462Salfred#endif
13974462Salfred			return (ECAPMODE);
14074462Salfred		}
1411901Swollman	}
1421901Swollman#endif
14374462Salfred
14474462Salfred	switch (uap->op) {
14574462Salfred	case I386_GET_IOPERM:
14674462Salfred	case I386_SET_IOPERM:
1471901Swollman		if ((error = copyin(uap->parms, &kargs.iargs,
1481901Swollman		    sizeof(struct i386_ioperm_args))) != 0)
1491901Swollman			return (error);
15074462Salfred		break;
1511901Swollman	case I386_GET_LDT:
15274462Salfred	case I386_SET_LDT:
1531901Swollman		if ((error = copyin(uap->parms, &kargs.largs,
1541901Swollman		    sizeof(struct i386_ldt_args))) != 0)
1551901Swollman			return (error);
15674462Salfred		if (kargs.largs.num > MAX_LD || kargs.largs.num <= 0)
15774462Salfred			return (EINVAL);
1581901Swollman		break;
15974462Salfred	case I386_GET_XFPUSTATE:
16074462Salfred		if ((error = copyin(uap->parms, &kargs.xfpu,
16174462Salfred		    sizeof(struct i386_get_xfpustate))) != 0)
16274462Salfred			return (error);
1631901Swollman		break;
16474462Salfred	default:
16574462Salfred		break;
16674462Salfred	}
1671901Swollman
1681901Swollman	switch(uap->op) {
16974462Salfred	case I386_GET_LDT:
17074462Salfred		error = i386_get_ldt(td, &kargs.largs);
17174462Salfred		break;
17274462Salfred	case I386_SET_LDT:
17374462Salfred		if (kargs.largs.descs != NULL) {
1741901Swollman			lp = (union descriptor *)malloc(
1751901Swollman			    kargs.largs.num * sizeof(union descriptor),
1761901Swollman			    M_TEMP, M_WAITOK);
17774462Salfred			error = copyin(kargs.largs.descs, lp,
1781901Swollman			    kargs.largs.num * sizeof(union descriptor));
17974462Salfred			if (error == 0)
1801901Swollman				error = i386_set_ldt(td, &kargs.largs, lp);
1811901Swollman			free(lp, M_TEMP);
1821901Swollman		} else {
1831901Swollman			error = i386_set_ldt(td, &kargs.largs, NULL);
18474462Salfred		}
1851901Swollman		break;
18674462Salfred	case I386_GET_IOPERM:
18774462Salfred		error = i386_get_ioperm(td, &kargs.iargs);
18874462Salfred		if (error == 0)
18974462Salfred			error = copyout(&kargs.iargs, uap->parms,
1901901Swollman			    sizeof(struct i386_ioperm_args));
19174462Salfred		break;
19274462Salfred	case I386_SET_IOPERM:
19374462Salfred		error = i386_set_ioperm(td, &kargs.iargs);
1941901Swollman		break;
1951901Swollman	case I386_VM86:
19674462Salfred		error = vm86_sysarch(td, uap->parms);
1971901Swollman		break;
19874462Salfred	case I386_GET_FSBASE:
1991901Swollman		sdp = &td->td_pcb->pcb_fsd;
2001901Swollman		base = sdp->sd_hibase << 24 | sdp->sd_lobase;
2011901Swollman		error = copyout(&base, uap->parms, sizeof(base));
2028870Srgrimes		break;
20374462Salfred	case I386_SET_FSBASE:
20474462Salfred		error = copyin(uap->parms, &base, sizeof(base));
2051901Swollman		if (!error) {
20674462Salfred			/*
20774462Salfred			 * Construct a descriptor and store it in the pcb for
20874462Salfred			 * the next context switch.  Also store it in the gdt
20974462Salfred			 * so that the load of tf_fs into %fs will activate it
2101901Swollman			 * at return to userland.
21174462Salfred			 */
21274462Salfred			sd.sd_lobase = base & 0xffffff;
21374462Salfred			sd.sd_hibase = (base >> 24) & 0xff;
2141901Swollman#ifdef XEN
2151901Swollman			/* need to do nosegneg like Linux */
21674462Salfred			sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff;
2178870Srgrimes#else
2181901Swollman			sd.sd_lolimit = 0xffff;	/* 4GB limit, wraps around */
21974462Salfred#endif
2201901Swollman			sd.sd_hilimit = 0xf;
22174462Salfred			sd.sd_type  = SDT_MEMRWA;
22274462Salfred			sd.sd_dpl   = SEL_UPL;
2231901Swollman			sd.sd_p     = 1;
2241901Swollman			sd.sd_xx    = 0;
22574462Salfred			sd.sd_def32 = 1;
22674462Salfred			sd.sd_gran  = 1;
22774462Salfred			critical_enter();
22874462Salfred			td->td_pcb->pcb_fsd = sd;
22974462Salfred#ifdef XEN
23074462Salfred			HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[0]),
23174462Salfred			    *(uint64_t *)&sd);
23274462Salfred#else
23374462Salfred			PCPU_GET(fsgs_gdt)[0] = sd;
23474462Salfred#endif
23574462Salfred			critical_exit();
23674462Salfred			td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
23774462Salfred		}
23874462Salfred		break;
23974462Salfred	case I386_GET_GSBASE:
24074462Salfred		sdp = &td->td_pcb->pcb_gsd;
24174462Salfred		base = sdp->sd_hibase << 24 | sdp->sd_lobase;
24274462Salfred		error = copyout(&base, uap->parms, sizeof(base));
24374462Salfred		break;
24474462Salfred	case I386_SET_GSBASE:
24574462Salfred		error = copyin(uap->parms, &base, sizeof(base));
24674462Salfred		if (!error) {
24774462Salfred			/*
24874462Salfred			 * Construct a descriptor and store it in the pcb for
24974462Salfred			 * the next context switch.  Also store it in the gdt
25074462Salfred			 * because we have to do a load_gs() right now.
25174462Salfred			 */
25274462Salfred			sd.sd_lobase = base & 0xffffff;
25374462Salfred			sd.sd_hibase = (base >> 24) & 0xff;
25474462Salfred
25574462Salfred#ifdef XEN
25674462Salfred			/* need to do nosegneg like Linux */
25774462Salfred			sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff;
25874462Salfred#else
25974462Salfred			sd.sd_lolimit = 0xffff;	/* 4GB limit, wraps around */
260#endif
261			sd.sd_hilimit = 0xf;
262			sd.sd_type  = SDT_MEMRWA;
263			sd.sd_dpl   = SEL_UPL;
264			sd.sd_p     = 1;
265			sd.sd_xx    = 0;
266			sd.sd_def32 = 1;
267			sd.sd_gran  = 1;
268			critical_enter();
269			td->td_pcb->pcb_gsd = sd;
270#ifdef XEN
271			HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[1]),
272			    *(uint64_t *)&sd);
273#else
274			PCPU_GET(fsgs_gdt)[1] = sd;
275#endif
276			critical_exit();
277			load_gs(GSEL(GUGS_SEL, SEL_UPL));
278		}
279		break;
280	case I386_GET_XFPUSTATE:
281		if (kargs.xfpu.len > cpu_max_ext_state_size -
282		    sizeof(union savefpu))
283			return (EINVAL);
284		npxgetregs(td);
285		error = copyout((char *)(get_pcb_user_save_td(td) + 1),
286		    kargs.xfpu.addr, kargs.xfpu.len);
287		break;
288	default:
289		error = EINVAL;
290		break;
291	}
292	return (error);
293}
294
/*
 * Allocate a per-thread pcb extension holding a private TSS with an
 * i/o permission bitmap (and vm86 interrupt map), then switch this
 * CPU onto the new TSS.  Must be called on curthread.  Returns 0.
 */
int
i386_extend_pcb(struct thread *td)
{
	int i, offset;
	u_long *addr;
	struct pcb_ext *ext;
	struct soft_segment_descriptor ssd = {
		0,			/* segment base address (overwritten) */
		ctob(IOPAGES + 1) - 1,	/* length */
		SDT_SYS386TSS,		/* segment type */
		0,			/* priority level */
		1,			/* descriptor present */
		0, 0,
		0,			/* default 32 size */
		0			/* granularity */
	};

	ext = (struct pcb_ext *)kmem_malloc(kernel_arena, ctob(IOPAGES+1),
	    M_WAITOK | M_ZERO);
	/* -16 is so we can convert a trapframe into vm86trapframe inplace */
	ext->ext_tss.tss_esp0 = td->td_kstack + ctob(KSTACK_PAGES) -
	    sizeof(struct pcb) - 16;
	ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	/*
	 * The last byte of the i/o map must be followed by an 0xff byte.
	 * We arbitrarily allocate 16 bytes here, to keep the starting
	 * address on a doubleword boundary.
	 */
	offset = PAGE_SIZE - 16;
	/* ioopt holds the iomap offset relative to the TSS, in the high word. */
	ext->ext_tss.tss_ioopt =
	    (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16;
	ext->ext_iomap = (caddr_t)ext + offset;
	ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32;

	/* Default everything to "denied": all bits set in intmap + iomap. */
	addr = (u_long *)ext->ext_vm86.vm86_intmap;
	for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++)
		*addr++ = ~0;

	ssd.ssd_base = (unsigned)&ext->ext_tss;
	ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext);
	ssdtosd(&ssd, &ext->ext_tssd);

	KASSERT(td == curthread, ("giving TSS to !curthread"));
	KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!"));

	/* Switch to the new TSS. */
	critical_enter();
	td->td_pcb->pcb_ext = ext;
	PCPU_SET(private_tss, 1);
	*PCPU_GET(tss_gdt) = ext->ext_tssd;
	ltr(GSEL(GPROC0_SEL, SEL_KPL));
	critical_exit();

	return 0;
}
350
351int
352i386_set_ioperm(td, uap)
353	struct thread *td;
354	struct i386_ioperm_args *uap;
355{
356	int i, error;
357	char *iomap;
358
359	if ((error = priv_check(td, PRIV_IO)) != 0)
360		return (error);
361	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
362		return (error);
363	/*
364	 * XXX
365	 * While this is restricted to root, we should probably figure out
366	 * whether any other driver is using this i/o address, as so not to
367	 * cause confusion.  This probably requires a global 'usage registry'.
368	 */
369
370	if (td->td_pcb->pcb_ext == 0)
371		if ((error = i386_extend_pcb(td)) != 0)
372			return (error);
373	iomap = (char *)td->td_pcb->pcb_ext->ext_iomap;
374
375	if (uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
376		return (EINVAL);
377
378	for (i = uap->start; i < uap->start + uap->length; i++) {
379		if (uap->enable)
380			iomap[i >> 3] &= ~(1 << (i & 7));
381		else
382			iomap[i >> 3] |= (1 << (i & 7));
383	}
384	return (error);
385}
386
387int
388i386_get_ioperm(td, uap)
389	struct thread *td;
390	struct i386_ioperm_args *uap;
391{
392	int i, state;
393	char *iomap;
394
395	if (uap->start >= IOPAGES * PAGE_SIZE * NBBY)
396		return (EINVAL);
397
398	if (td->td_pcb->pcb_ext == 0) {
399		uap->length = 0;
400		goto done;
401	}
402
403	iomap = (char *)td->td_pcb->pcb_ext->ext_iomap;
404
405	i = uap->start;
406	state = (iomap[i >> 3] >> (i & 7)) & 1;
407	uap->enable = !state;
408	uap->length = 1;
409
410	for (i = uap->start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) {
411		if (state != ((iomap[i >> 3] >> (i & 7)) & 1))
412			break;
413		uap->length++;
414	}
415
416done:
417	return (0);
418}
419
/*
 * Update the GDT entry pointing to the LDT to point to the LDT of the
 * current process, then activate it (lldt on bare metal, xen_set_ldt
 * under XEN).  Acquires dt_lock itself unless the caller already
 * holds it, and restores the caller's lock state on return.
 */
void
set_user_ldt(struct mdproc *mdp)
{
	struct proc_ldt *pldt;
	int dtlocked;

	dtlocked = 0;
	if (!mtx_owned(&dt_lock)) {
		mtx_lock_spin(&dt_lock);
		dtlocked = 1;
	}

	pldt = mdp->md_ldt;
#ifdef XEN
	i386_reset_ldt(pldt);
	PCPU_SET(currentldt, (int)pldt);
#else
#ifdef SMP
	/* Each CPU has its own GDT slice; patch this CPU's user LDT slot. */
	gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd;
#else
	gdt[GUSERLDT_SEL].sd = pldt->ldt_sd;
#endif
	lldt(GSEL(GUSERLDT_SEL, SEL_KPL));
	PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL));
#endif /* XEN */
	if (dtlocked)
		mtx_unlock_spin(&dt_lock);
}
452
453#ifdef SMP
454static void
455set_user_ldt_rv(struct vmspace *vmsp)
456{
457	struct thread *td;
458
459	td = curthread;
460	if (vmsp != td->td_proc->p_vmspace)
461		return;
462
463	set_user_ldt(&td->td_proc->p_md);
464}
465#endif
466
467#ifdef XEN
468
/*
 * Allocate a new proc_ldt of at least 'len' slots (rounded up by
 * NEW_MAX_LD) seeded from the process's current LDT, or the default
 * ldt if it has none.  dt_lock must be held on entry; it is dropped
 * across the blocking allocations and reacquired, so md_ldt may have
 * changed by the time we return.  Returns with dt_lock held.
 */
struct proc_ldt *
user_ldt_alloc(struct mdproc *mdp, int len)
{
        struct proc_ldt *pldt, *new_ldt;

        mtx_assert(&dt_lock, MA_OWNED);
        mtx_unlock_spin(&dt_lock);
        new_ldt = malloc(sizeof(struct proc_ldt),
                M_SUBPROC, M_WAITOK);

        new_ldt->ldt_len = len = NEW_MAX_LD(len);
        new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
	    round_page(len * sizeof(union descriptor)), M_WAITOK);
        new_ldt->ldt_refcnt = 1;
        new_ldt->ldt_active = 0;

	mtx_lock_spin(&dt_lock);
        if ((pldt = mdp->md_ldt)) {
                if (len > pldt->ldt_len)
                        len = pldt->ldt_len;
                bcopy(pldt->ldt_base, new_ldt->ldt_base,
                    len * sizeof(union descriptor));
        } else {
                bcopy(ldt, new_ldt->ldt_base, PAGE_SIZE);
        }
        mtx_unlock_spin(&dt_lock);  /* XXX kill once pmap locking fixed. */
        /* Xen requires LDT pages to be mapped read-only. */
        pmap_map_readonly(kernel_pmap, (vm_offset_t)new_ldt->ldt_base,
                          new_ldt->ldt_len*sizeof(union descriptor));
        mtx_lock_spin(&dt_lock);  /* XXX kill once pmap locking fixed. */
        return (new_ldt);
}
503#else
/*
 * Allocate a new proc_ldt of at least 'len' slots (rounded up by
 * NEW_MAX_LD) seeded from the process's current LDT, or the default
 * ldt if it has none, and precompute its GDT segment descriptor.
 * dt_lock must be held on entry; it is dropped across the blocking
 * allocations and reacquired, so md_ldt may have changed by the time
 * we return.  Returns with dt_lock held.
 */
struct proc_ldt *
user_ldt_alloc(struct mdproc *mdp, int len)
{
	struct proc_ldt *pldt, *new_ldt;

	mtx_assert(&dt_lock, MA_OWNED);
	mtx_unlock_spin(&dt_lock);
	new_ldt = malloc(sizeof(struct proc_ldt),
		M_SUBPROC, M_WAITOK);

	new_ldt->ldt_len = len = NEW_MAX_LD(len);
	new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
	    len * sizeof(union descriptor), M_WAITOK);
	new_ldt->ldt_refcnt = 1;
	new_ldt->ldt_active = 0;

	mtx_lock_spin(&dt_lock);
	/* Build the segment descriptor the GDT user-LDT slot will use. */
	gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base;
	gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1;
	ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd);

	if ((pldt = mdp->md_ldt) != NULL) {
		if (len > pldt->ldt_len)
			len = pldt->ldt_len;
		bcopy(pldt->ldt_base, new_ldt->ldt_base,
		    len * sizeof(union descriptor));
	} else
		bcopy(ldt, new_ldt->ldt_base, sizeof(ldt));

	return (new_ldt);
}
538#endif /* !XEN */
539
/*
 * Detach the process's LDT and drop a reference to it, reverting the
 * CPU to the default LDT if td is running here.  Must be called with
 * dt_lock held.  Returns with dt_lock unheld (released either here or
 * by user_ldt_deref()).
 */
void
user_ldt_free(struct thread *td)
{
	struct mdproc *mdp = &td->td_proc->p_md;
	struct proc_ldt *pldt;

	mtx_assert(&dt_lock, MA_OWNED);
	if ((pldt = mdp->md_ldt) == NULL) {
		mtx_unlock_spin(&dt_lock);
		return;
	}

	if (td == curthread) {
		/* Stop using the per-process LDT before it can go away. */
#ifdef XEN
		i386_reset_ldt(&default_proc_ldt);
		PCPU_SET(currentldt, (int)&default_proc_ldt);
#else
		lldt(_default_ldt);
		PCPU_SET(currentldt, _default_ldt);
#endif
	}

	mdp->md_ldt = NULL;
	user_ldt_deref(pldt);
}
568
569void
570user_ldt_deref(struct proc_ldt *pldt)
571{
572
573	mtx_assert(&dt_lock, MA_OWNED);
574	if (--pldt->ldt_refcnt == 0) {
575		mtx_unlock_spin(&dt_lock);
576		kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
577			pldt->ldt_len * sizeof(union descriptor));
578		free(pldt, M_SUBPROC);
579	} else
580		mtx_unlock_spin(&dt_lock);
581}
582
/*
 * Note for the authors of compat layers (linux, etc): copyout() in
 * the function below is not a problem since it presents data in
 * arch-specific format (i.e. i386-specific in this case), not in
 * the OS-specific one.
 */
/*
 * I386_GET_LDT handler: copy up to uap->num descriptors starting at
 * slot uap->start out to uap->descs, from the process LDT or, absent
 * one, from the default ldt.  On success td_retval[0] holds the count
 * copied.
 */
int
i386_get_ldt(td, uap)
	struct thread *td;
	struct i386_ldt_args *uap;
{
	int error = 0;
	struct proc_ldt *pldt;
	int nldt, num;
	union descriptor *lp;

#ifdef	DEBUG
	printf("i386_get_ldt: start=%d num=%d descs=%p\n",
	    uap->start, uap->num, (void *)uap->descs);
#endif

	mtx_lock_spin(&dt_lock);
	if ((pldt = td->td_proc->p_md.md_ldt) != NULL) {
		nldt = pldt->ldt_len;
		lp = &((union descriptor *)(pldt->ldt_base))[uap->start];
		/*
		 * NOTE(review): dt_lock is dropped before the copyout below,
		 * so ldt_base could in principle be replaced concurrently
		 * by an ldt grow/free — looks racy; confirm against later
		 * FreeBSD revisions.
		 */
		mtx_unlock_spin(&dt_lock);
		num = min(uap->num, nldt);
	} else {
		mtx_unlock_spin(&dt_lock);
		nldt = sizeof(ldt)/sizeof(ldt[0]);
		num = min(uap->num, nldt);
		lp = &ldt[uap->start];
	}

	/* Unsigned compares also reject negative uap->start values. */
	if ((uap->start > (unsigned int)nldt) ||
	    ((unsigned int)num > (unsigned int)nldt) ||
	    ((unsigned int)(uap->start + num) > (unsigned int)nldt))
		return(EINVAL);

	error = copyout(lp, uap->descs, num * sizeof(union descriptor));
	if (!error)
		td->td_retval[0] = num;

	return(error);
}
628
/*
 * I386_SET_LDT handler.  Three modes:
 *  - descs == NULL: zero (free) the slot range [start, start+num);
 *    start==0 && num==0 means "all slots above NLDT".
 *  - start == LDT_AUTO_ALLOC && num == 1: install one descriptor in
 *    the first free slot, growing the LDT as needed; the slot index
 *    is returned in td_retval[0].
 *  - otherwise: validate and install num descriptors at start,
 *    growing the LDT as needed.
 * Every user-supplied descriptor is vetted: system-segment types are
 * rejected and present descriptors must be DPL 3.
 */
int
i386_set_ldt(td, uap, descs)
	struct thread *td;
	struct i386_ldt_args *uap;
	union descriptor *descs;
{
	int error = 0, i;
	int largest_ld;
	struct mdproc *mdp = &td->td_proc->p_md;
	struct proc_ldt *pldt;
	union descriptor *dp;

#ifdef	DEBUG
	printf("i386_set_ldt: start=%d num=%d descs=%p\n",
	    uap->start, uap->num, (void *)uap->descs);
#endif

	if (descs == NULL) {
		/* Free descriptors */
		if (uap->start == 0 && uap->num == 0) {
			/*
			 * Treat this as a special case, so userland needn't
			 * know magic number NLDT.
			 */
			uap->start = NLDT;
			uap->num = MAX_LD - NLDT;
		}
		if (uap->num == 0)
			return (EINVAL);
		mtx_lock_spin(&dt_lock);
		if ((pldt = mdp->md_ldt) == NULL ||
		    uap->start >= pldt->ldt_len) {
			/* Nothing allocated in that range; trivially done. */
			mtx_unlock_spin(&dt_lock);
			return (0);
		}
		largest_ld = uap->start + uap->num;
		if (largest_ld > pldt->ldt_len)
			largest_ld = pldt->ldt_len;
		i = largest_ld - uap->start;
		bzero(&((union descriptor *)(pldt->ldt_base))[uap->start],
		    sizeof(union descriptor) * i);
		mtx_unlock_spin(&dt_lock);
		return (0);
	}

	if (!(uap->start == LDT_AUTO_ALLOC && uap->num == 1)) {
		/* verify range of descriptors to modify */
		largest_ld = uap->start + uap->num;
		if (uap->start >= MAX_LD || largest_ld > MAX_LD) {
			return (EINVAL);
		}
	}

	/* Check descriptors for access violations */
	for (i = 0; i < uap->num; i++) {
		dp = &descs[i];

		switch (dp->sd.sd_type) {
		case SDT_SYSNULL:	/* system null */
			dp->sd.sd_p = 0;
			break;
		case SDT_SYS286TSS: /* system 286 TSS available */
		case SDT_SYSLDT:    /* system local descriptor table */
		case SDT_SYS286BSY: /* system 286 TSS busy */
		case SDT_SYSTASKGT: /* system task gate */
		case SDT_SYS286IGT: /* system 286 interrupt gate */
		case SDT_SYS286TGT: /* system 286 trap gate */
		case SDT_SYSNULL2:  /* undefined by Intel */
		case SDT_SYS386TSS: /* system 386 TSS available */
		case SDT_SYSNULL3:  /* undefined by Intel */
		case SDT_SYS386BSY: /* system 386 TSS busy */
		case SDT_SYSNULL4:  /* undefined by Intel */
		case SDT_SYS386IGT: /* system 386 interrupt gate */
		case SDT_SYS386TGT: /* system 386 trap gate */
		case SDT_SYS286CGT: /* system 286 call gate */
		case SDT_SYS386CGT: /* system 386 call gate */
			/* I can't think of any reason to allow a user proc
			 * to create a segment of these types.  They are
			 * for OS use only.
			 */
			return (EACCES);
			/*NOTREACHED*/

		/* memory segment types */
		case SDT_MEMEC:   /* memory execute only conforming */
		case SDT_MEMEAC:  /* memory execute only accessed conforming */
		case SDT_MEMERC:  /* memory execute read conforming */
		case SDT_MEMERAC: /* memory execute read accessed conforming */
			 /* Must be "present" if executable and conforming. */
			if (dp->sd.sd_p == 0)
				return (EACCES);
			break;
		case SDT_MEMRO:   /* memory read only */
		case SDT_MEMROA:  /* memory read only accessed */
		case SDT_MEMRW:   /* memory read write */
		case SDT_MEMRWA:  /* memory read write accessed */
		case SDT_MEMROD:  /* memory read only expand dwn limit */
		case SDT_MEMRODA: /* memory read only expand dwn lim accessed */
		case SDT_MEMRWD:  /* memory read write expand dwn limit */
		case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */
		case SDT_MEME:    /* memory execute only */
		case SDT_MEMEA:   /* memory execute only accessed */
		case SDT_MEMER:   /* memory execute read */
		case SDT_MEMERA:  /* memory execute read accessed */
			break;
		default:
			return(EINVAL);
			/*NOTREACHED*/
		}

		/* Only user (ring-3) descriptors may be present. */
		if ((dp->sd.sd_p != 0) && (dp->sd.sd_dpl != SEL_UPL))
			return (EACCES);
	}

	if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) {
		/* Allocate a free slot */
		mtx_lock_spin(&dt_lock);
		if ((pldt = mdp->md_ldt) == NULL) {
			if ((error = i386_ldt_grow(td, NLDT + 1))) {
				mtx_unlock_spin(&dt_lock);
				return (error);
			}
			pldt = mdp->md_ldt;
		}
again:
		/*
		 * start scanning a bit up to leave room for NVidia and
		 * Wine, which still use the "Blat" method of allocation.
		 */
		dp = &((union descriptor *)(pldt->ldt_base))[NLDT];
		for (i = NLDT; i < pldt->ldt_len; ++i) {
			if (dp->sd.sd_type == SDT_SYSNULL)
				break;
			dp++;
		}
		if (i >= pldt->ldt_len) {
			/* No free slot; grow the LDT and rescan. */
			if ((error = i386_ldt_grow(td, pldt->ldt_len+1))) {
				mtx_unlock_spin(&dt_lock);
				return (error);
			}
			goto again;
		}
		uap->start = i;
		error = i386_set_ldt_data(td, i, 1, descs);
		mtx_unlock_spin(&dt_lock);
	} else {
		largest_ld = uap->start + uap->num;
		mtx_lock_spin(&dt_lock);
		if (!(error = i386_ldt_grow(td, largest_ld))) {
			error = i386_set_ldt_data(td, uap->start, uap->num,
			    descs);
		}
		mtx_unlock_spin(&dt_lock);
	}
	if (error == 0)
		td->td_retval[0] = uap->start;
	return (error);
}
788#ifdef XEN
789static int
790i386_set_ldt_data(struct thread *td, int start, int num,
791	union descriptor *descs)
792{
793	struct mdproc *mdp = &td->td_proc->p_md;
794	struct proc_ldt *pldt = mdp->md_ldt;
795
796	mtx_assert(&dt_lock, MA_OWNED);
797
798	while (num) {
799		xen_update_descriptor(
800		    &((union descriptor *)(pldt->ldt_base))[start],
801		    descs);
802		num--;
803		start++;
804		descs++;
805	}
806	return (0);
807}
808#else
809static int
810i386_set_ldt_data(struct thread *td, int start, int num,
811	union descriptor *descs)
812{
813	struct mdproc *mdp = &td->td_proc->p_md;
814	struct proc_ldt *pldt = mdp->md_ldt;
815
816	mtx_assert(&dt_lock, MA_OWNED);
817
818	/* Fill in range */
819	bcopy(descs,
820	    &((union descriptor *)(pldt->ldt_base))[start],
821	    num * sizeof(union descriptor));
822	return (0);
823}
824#endif /* !XEN */
825
/*
 * Ensure the process LDT can hold at least 'len' slots (capped at
 * MAX_LD, floored at NLDT+1), allocating or enlarging it as needed
 * and telling other CPUs to reload.  Called with dt_lock held;
 * the lock is dropped and retaken internally (user_ldt_alloc blocks,
 * and the SMP rendezvous must run unlocked), but it is held again on
 * return.  Returns 0 or ENOMEM.
 */
static int
i386_ldt_grow(struct thread *td, int len)
{
	struct mdproc *mdp = &td->td_proc->p_md;
	struct proc_ldt *new_ldt, *pldt;
	caddr_t old_ldt_base = NULL_LDT_BASE;
	int old_ldt_len = 0;

	mtx_assert(&dt_lock, MA_OWNED);

	if (len > MAX_LD)
		return (ENOMEM);
	if (len < NLDT + 1)
		len = NLDT + 1;

	/* Allocate a user ldt. */
	if ((pldt = mdp->md_ldt) == NULL || len > pldt->ldt_len) {
		new_ldt = user_ldt_alloc(mdp, len);
		if (new_ldt == NULL)
			return (ENOMEM);
		/* user_ldt_alloc() dropped dt_lock; re-read md_ldt. */
		pldt = mdp->md_ldt;

		if (pldt != NULL) {
			if (new_ldt->ldt_len <= pldt->ldt_len) {
				/*
				 * We just lost the race for allocation, so
				 * free the new object and return.
				 */
				mtx_unlock_spin(&dt_lock);
				kmem_free(kernel_arena,
				   (vm_offset_t)new_ldt->ldt_base,
				   new_ldt->ldt_len * sizeof(union descriptor));
				free(new_ldt, M_SUBPROC);
				mtx_lock_spin(&dt_lock);
				return (0);
			}

			/*
			 * We have to substitute the current LDT entry for
			 * curproc with the new one since its size grew.
			 */
			old_ldt_base = pldt->ldt_base;
			old_ldt_len = pldt->ldt_len;
			pldt->ldt_sd = new_ldt->ldt_sd;
			pldt->ldt_base = new_ldt->ldt_base;
			pldt->ldt_len = new_ldt->ldt_len;
		} else
			mdp->md_ldt = pldt = new_ldt;
#ifdef SMP
		/*
		 * Signal other cpus to reload ldt.  We need to unlock dt_lock
		 * here because other CPU will contest on it since their
		 * curthreads won't hold the lock and will block when trying
		 * to acquire it.
		 */
		mtx_unlock_spin(&dt_lock);
		smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv,
		    NULL, td->td_proc->p_vmspace);
#else
		set_user_ldt(&td->td_proc->p_md);
		mtx_unlock_spin(&dt_lock);
#endif
		if (old_ldt_base != NULL_LDT_BASE) {
			/* The old backing store was swapped out above; free it
			 * along with the now-unused container we allocated. */
			kmem_free(kernel_arena, (vm_offset_t)old_ldt_base,
			    old_ldt_len * sizeof(union descriptor));
			free(new_ldt, M_SUBPROC);
		}
		mtx_lock_spin(&dt_lock);
	}
	return (0);
}
897