linux_machdep.c revision 161673
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 161673 2006-08-27 18:51:32Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/imgact.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mman.h>
38#include <sys/mutex.h>
39#include <sys/sx.h>
40#include <sys/proc.h>
41#include <sys/queue.h>
42#include <sys/resource.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysproto.h>
47#include <sys/unistd.h>
48#include <sys/wait.h>
49
50#include <machine/frame.h>
51#include <machine/psl.h>
52#include <machine/segments.h>
53#include <machine/sysarch.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_map.h>
58
59#include <i386/linux/linux.h>
60#include <i386/linux/linux_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
67
68#include "opt_posix.h"
69
70extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
71
/*
 * Descriptor layout copied in from user space by linux_modify_ldt() for
 * the write_ldt functions.  The fields are translated there into a native
 * segment descriptor.
 */
struct l_descriptor {
	l_uint		entry_number;	/* LDT slot; becomes ldt.start */
	l_ulong		base_addr;	/* base, split into sd_lobase/sd_hibase */
	l_uint		limit;		/* limit, split into sd_lolimit/sd_hilimit */
	l_uint		seg_32bit:1;	/* copied to sd_def32 */
	l_uint		contents:2;	/* shifted into bits 2-3 of sd_type */
	l_uint		read_exec_only:1; /* inverted into bit 1 of sd_type */
	l_uint		limit_in_pages:1; /* copied to sd_gran */
	l_uint		seg_not_present:1; /* inverted into sd_p */
	l_uint		useable:1;	/* not read by linux_modify_ldt() */
};
83
/*
 * Argument block for the old select entry point.  linux_old_select()
 * copies it in from user space and forwards each field to linux_select().
 */
struct l_old_select_argv {
	l_int		nfds;
	l_fd_set	*readfds;
	l_fd_set	*writefds;
	l_fd_set	*exceptfds;
	struct l_timeval	*timeout;
};
91
92int
93linux_to_bsd_sigaltstack(int lsa)
94{
95	int bsa = 0;
96
97	if (lsa & LINUX_SS_DISABLE)
98		bsa |= SS_DISABLE;
99	if (lsa & LINUX_SS_ONSTACK)
100		bsa |= SS_ONSTACK;
101	return (bsa);
102}
103
104int
105bsd_to_linux_sigaltstack(int bsa)
106{
107	int lsa = 0;
108
109	if (bsa & SS_DISABLE)
110		lsa |= LINUX_SS_DISABLE;
111	if (bsa & SS_ONSTACK)
112		lsa |= LINUX_SS_ONSTACK;
113	return (lsa);
114}
115
116int
117linux_execve(struct thread *td, struct linux_execve_args *args)
118{
119	int error;
120	char *newpath;
121	struct image_args eargs;
122
123	LCONVPATHEXIST(td, args->path, &newpath);
124
125#ifdef DEBUG
126	if (ldebug(execve))
127		printf(ARGS(execve, "%s"), newpath);
128#endif
129
130	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
131	    args->argp, args->envp);
132	free(newpath, M_TEMP);
133	if (error == 0)
134		error = kern_execve(td, &eargs, NULL);
135	if (error == 0)
136	   	/* linux process can exec fbsd one, dont attempt
137		 * to create emuldata for such process using
138		 * linux_proc_init, this leads to a panic on KASSERT
139		 * because such process has p->p_emuldata == NULL
140		 */
141	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
142   		   	error = linux_proc_init(td, 0, 0);
143	return (error);
144}
145
/*
 * Indirect argument pair for LINUX_MSGRCV.  linux_ipc() copies it in from
 * args->ptr when the upper 16 bits of args->what are zero.
 */
struct l_ipc_kludge {
	struct l_msgbuf *msgp;
	l_long msgtyp;
};
150
151int
152linux_ipc(struct thread *td, struct linux_ipc_args *args)
153{
154
155	switch (args->what & 0xFFFF) {
156	case LINUX_SEMOP: {
157		struct linux_semop_args a;
158
159		a.semid = args->arg1;
160		a.tsops = args->ptr;
161		a.nsops = args->arg2;
162		return (linux_semop(td, &a));
163	}
164	case LINUX_SEMGET: {
165		struct linux_semget_args a;
166
167		a.key = args->arg1;
168		a.nsems = args->arg2;
169		a.semflg = args->arg3;
170		return (linux_semget(td, &a));
171	}
172	case LINUX_SEMCTL: {
173		struct linux_semctl_args a;
174		int error;
175
176		a.semid = args->arg1;
177		a.semnum = args->arg2;
178		a.cmd = args->arg3;
179		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
180		if (error)
181			return (error);
182		return (linux_semctl(td, &a));
183	}
184	case LINUX_MSGSND: {
185		struct linux_msgsnd_args a;
186
187		a.msqid = args->arg1;
188		a.msgp = args->ptr;
189		a.msgsz = args->arg2;
190		a.msgflg = args->arg3;
191		return (linux_msgsnd(td, &a));
192	}
193	case LINUX_MSGRCV: {
194		struct linux_msgrcv_args a;
195
196		a.msqid = args->arg1;
197		a.msgsz = args->arg2;
198		a.msgflg = args->arg3;
199		if ((args->what >> 16) == 0) {
200			struct l_ipc_kludge tmp;
201			int error;
202
203			if (args->ptr == NULL)
204				return (EINVAL);
205			error = copyin(args->ptr, &tmp, sizeof(tmp));
206			if (error)
207				return (error);
208			a.msgp = tmp.msgp;
209			a.msgtyp = tmp.msgtyp;
210		} else {
211			a.msgp = args->ptr;
212			a.msgtyp = args->arg5;
213		}
214		return (linux_msgrcv(td, &a));
215	}
216	case LINUX_MSGGET: {
217		struct linux_msgget_args a;
218
219		a.key = args->arg1;
220		a.msgflg = args->arg2;
221		return (linux_msgget(td, &a));
222	}
223	case LINUX_MSGCTL: {
224		struct linux_msgctl_args a;
225
226		a.msqid = args->arg1;
227		a.cmd = args->arg2;
228		a.buf = args->ptr;
229		return (linux_msgctl(td, &a));
230	}
231	case LINUX_SHMAT: {
232		struct linux_shmat_args a;
233
234		a.shmid = args->arg1;
235		a.shmaddr = args->ptr;
236		a.shmflg = args->arg2;
237		a.raddr = (l_ulong *)args->arg3;
238		return (linux_shmat(td, &a));
239	}
240	case LINUX_SHMDT: {
241		struct linux_shmdt_args a;
242
243		a.shmaddr = args->ptr;
244		return (linux_shmdt(td, &a));
245	}
246	case LINUX_SHMGET: {
247		struct linux_shmget_args a;
248
249		a.key = args->arg1;
250		a.size = args->arg2;
251		a.shmflg = args->arg3;
252		return (linux_shmget(td, &a));
253	}
254	case LINUX_SHMCTL: {
255		struct linux_shmctl_args a;
256
257		a.shmid = args->arg1;
258		a.cmd = args->arg2;
259		a.buf = args->ptr;
260		return (linux_shmctl(td, &a));
261	}
262	default:
263		break;
264	}
265
266	return (EINVAL);
267}
268
269int
270linux_old_select(struct thread *td, struct linux_old_select_args *args)
271{
272	struct l_old_select_argv linux_args;
273	struct linux_select_args newsel;
274	int error;
275
276#ifdef DEBUG
277	if (ldebug(old_select))
278		printf(ARGS(old_select, "%p"), args->ptr);
279#endif
280
281	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
282	if (error)
283		return (error);
284
285	newsel.nfds = linux_args.nfds;
286	newsel.readfds = linux_args.readfds;
287	newsel.writefds = linux_args.writefds;
288	newsel.exceptfds = linux_args.exceptfds;
289	newsel.timeout = linux_args.timeout;
290	return (linux_select(td, &newsel));
291}
292
293int
294linux_fork(struct thread *td, struct linux_fork_args *args)
295{
296	int error;
297
298#ifdef DEBUG
299	if (ldebug(fork))
300		printf(ARGS(fork, ""));
301#endif
302
303	if ((error = fork(td, (struct fork_args *)args)) != 0)
304		return (error);
305
306	if (td->td_retval[1] == 1)
307		td->td_retval[0] = 0;
308	error = linux_proc_init(td, td->td_retval[0], 0);
309	if (error)
310		return (error);
311
312	return (0);
313}
314
315int
316linux_vfork(struct thread *td, struct linux_vfork_args *args)
317{
318	int error;
319	struct proc *p2;
320
321#ifdef DEBUG
322	if (ldebug(vfork))
323		printf(ARGS(vfork, ""));
324#endif
325
326	/* exclude RFPPWAIT */
327	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
328		return (error);
329	if (error == 0) {
330	   	td->td_retval[0] = p2->p_pid;
331		td->td_retval[1] = 0;
332	}
333	/* Are we the child? */
334	if (td->td_retval[1] == 1)
335		td->td_retval[0] = 0;
336	error = linux_proc_init(td, td->td_retval[0], 0);
337	if (error)
338		return (error);
339	/* wait for the children to exit, ie. emulate vfork */
340	PROC_LOCK(p2);
341	while (p2->p_flag & P_PPWAIT)
342	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
343	PROC_UNLOCK(p2);
344
345	return (0);
346}
347
348int
349linux_clone(struct thread *td, struct linux_clone_args *args)
350{
351	int error, ff = RFPROC | RFSTOPPED;
352	struct proc *p2;
353	struct thread *td2;
354	int exit_signal;
355	struct linux_emuldata *em;
356
357#ifdef DEBUG
358	if (ldebug(clone)) {
359   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
360		    (unsigned int)args->flags, (unsigned int)args->stack,
361		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
362	}
363#endif
364
365	exit_signal = args->flags & 0x000000ff;
366	if (exit_signal >= LINUX_NSIG)
367		return (EINVAL);
368
369	if (exit_signal <= LINUX_SIGTBLSZ)
370		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
371
372	if (args->flags & CLONE_VM)
373		ff |= RFMEM;
374	if (args->flags & CLONE_SIGHAND)
375		ff |= RFSIGSHARE;
376	if (!(args->flags & CLONE_FILES))
377		ff |= RFFDG;
378
379	/*
380	 * Attempt to detect when linux_clone(2) is used for creating
381	 * kernel threads. Unfortunately despite the existence of the
382	 * CLONE_THREAD flag, version of linuxthreads package used in
383	 * most popular distros as of beginning of 2005 doesn't make
384	 * any use of it. Therefore, this detection relay fully on
385	 * empirical observation that linuxthreads sets certain
386	 * combination of flags, so that we can make more or less
387	 * precise detection and notify the FreeBSD kernel that several
388	 * processes are in fact part of the same threading group, so
389	 * that special treatment is necessary for signal delivery
390	 * between those processes and fd locking.
391	 */
392	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
393		ff |= RFTHREAD;
394
395	error = fork1(td, ff, 0, &p2);
396	if (error)
397		return (error);
398
399	/* create the emuldata */
400	error = linux_proc_init(td, p2->p_pid, args->flags);
401	/* reference it - no need to check this */
402	em = em_find(p2, EMUL_UNLOCKED);
403	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
404	/* and adjust it */
405	if (args->flags & CLONE_PARENT_SETTID) {
406	   	if (args->parent_tidptr == NULL) {
407		   	EMUL_UNLOCK(&emul_lock);
408			return (EINVAL);
409		}
410		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
411		if (error) {
412		   	EMUL_UNLOCK(&emul_lock);
413			return (error);
414		}
415	}
416
417	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
418	   	sx_xlock(&proctree_lock);
419		PROC_LOCK(p2);
420		proc_reparent(p2, td->td_proc->p_pptr);
421		PROC_UNLOCK(p2);
422		sx_xunlock(&proctree_lock);
423	}
424
425	if (args->flags & CLONE_THREAD) {
426	   	/* XXX: linux mangles pgrp and pptr somehow
427		 * I think it might be this but I am not sure.
428		 */
429#ifdef notyet
430	   	PROC_LOCK(p2);
431	   	p2->p_pgrp = td->td_proc->p_pgrp;
432	   	PROC_UNLOCK(p2);
433#endif
434	 	exit_signal = 0;
435	}
436
437	if (args->flags & CLONE_CHILD_SETTID)
438		em->child_set_tid = args->child_tidptr;
439	else
440	   	em->child_set_tid = NULL;
441
442	if (args->flags & CLONE_CHILD_CLEARTID)
443		em->child_clear_tid = args->child_tidptr;
444	else
445	   	em->child_clear_tid = NULL;
446
447	EMUL_UNLOCK(&emul_lock);
448
449	PROC_LOCK(p2);
450	p2->p_sigparent = exit_signal;
451	PROC_UNLOCK(p2);
452	td2 = FIRST_THREAD_IN_PROC(p2);
453	/*
454	 * in a case of stack = NULL we are supposed to COW calling process stack
455	 * this is what normal fork() does so we just keep the tf_esp arg intact
456	 */
457	if (args->stack)
458   	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
459
460	if (args->flags & CLONE_SETTLS) {
461   	   	struct l_user_desc info;
462   	   	int idx;
463	   	int a[2];
464		struct segment_descriptor sd;
465
466	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
467		if (error)
468   		   	return (error);
469
470		idx = info.entry_number;
471
472		/*
473		 * looks like we're getting the idx we returned
474		 * in the set_thread_area() syscall
475		 */
476		if (idx != 6 && idx != 3)
477			return (EINVAL);
478
479		/* this doesnt happen in practice */
480		if (idx == 6) {
481		   	/* we might copy out the entry_number as 3 */
482		   	info.entry_number = 3;
483			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
484			if (error)
485	   		   	return (error);
486		}
487
488		a[0] = LDT_entry_a(&info);
489		a[1] = LDT_entry_b(&info);
490
491		memcpy(&sd, &a, sizeof(a));
492#ifdef DEBUG
493	if (ldebug(clone))
494	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
495			sd.sd_hibase,
496			sd.sd_lolimit,
497			sd.sd_hilimit,
498			sd.sd_type,
499			sd.sd_dpl,
500			sd.sd_p,
501			sd.sd_xx,
502			sd.sd_def32,
503			sd.sd_gran);
504#endif
505
506		/* set %gs */
507		td2->td_pcb->pcb_gsd = sd;
508		td2->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL);
509	}
510
511#ifdef DEBUG
512	if (ldebug(clone))
513		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
514		    (long)p2->p_pid, args->stack, exit_signal);
515#endif
516
517	/*
518	 * Make this runnable after we are finished with it.
519	 */
520	mtx_lock_spin(&sched_lock);
521	TD_SET_CAN_RUN(td2);
522	setrunqueue(td2, SRQ_BORING);
523	mtx_unlock_spin(&sched_lock);
524
525	td->td_retval[0] = p2->p_pid;
526	td->td_retval[1] = 0;
527	return (0);
528}
529
/*
 * XXX move
 *
 * Common mmap argument block.  linux_mmap() copies it in from user space
 * as-is; linux_mmap2() fills it from its own arguments.  Both then pass
 * it to linux_mmap_common().
 */
struct l_mmap_argv {
	l_caddr_t	addr;
	l_int		len;
	l_int		prot;
	l_int		flags;	/* LINUX_MAP_* bits */
	l_int		fd;	/* ignored when LINUX_MAP_ANON is set */
	l_int		pos;	/* linux_mmap2() stores pgoff * PAGE_SIZE */
};
539
/*
 * linux_mmap_common() maps a LINUX_MAP_GROWSDOWN region with a length of
 * at least (STACK_SIZE - GUARD_SIZE) bytes.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Shared back end of linux_mmap() and linux_mmap2(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
544
545int
546linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
547{
548	struct l_mmap_argv linux_args;
549
550#ifdef DEBUG
551	if (ldebug(mmap2))
552		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
553		    (void *)args->addr, args->len, args->prot,
554		    args->flags, args->fd, args->pgoff);
555#endif
556
557	linux_args.addr = (l_caddr_t)args->addr;
558	linux_args.len = args->len;
559	linux_args.prot = args->prot;
560	linux_args.flags = args->flags;
561	linux_args.fd = args->fd;
562	linux_args.pos = args->pgoff * PAGE_SIZE;
563
564	return (linux_mmap_common(td, &linux_args));
565}
566
567int
568linux_mmap(struct thread *td, struct linux_mmap_args *args)
569{
570	int error;
571	struct l_mmap_argv linux_args;
572
573	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
574	if (error)
575		return (error);
576
577#ifdef DEBUG
578	if (ldebug(mmap))
579		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
580		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
581		    linux_args.flags, linux_args.fd, linux_args.pos);
582#endif
583
584	return (linux_mmap_common(td, &linux_args));
585}
586
587static int
588linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
589{
590	struct proc *p = td->td_proc;
591	struct mmap_args /* {
592		caddr_t addr;
593		size_t len;
594		int prot;
595		int flags;
596		int fd;
597		long pad;
598		off_t pos;
599	} */ bsd_args;
600	int error;
601
602	error = 0;
603	bsd_args.flags = 0;
604	if (linux_args->flags & LINUX_MAP_SHARED)
605		bsd_args.flags |= MAP_SHARED;
606	if (linux_args->flags & LINUX_MAP_PRIVATE)
607		bsd_args.flags |= MAP_PRIVATE;
608	if (linux_args->flags & LINUX_MAP_FIXED)
609		bsd_args.flags |= MAP_FIXED;
610	if (linux_args->flags & LINUX_MAP_ANON)
611		bsd_args.flags |= MAP_ANON;
612	else
613		bsd_args.flags |= MAP_NOSYNC;
614	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
615		bsd_args.flags |= MAP_STACK;
616
617		/*
618		 * The linux MAP_GROWSDOWN option does not limit auto
619		 * growth of the region.  Linux mmap with this option
620		 * takes as addr the inital BOS, and as len, the initial
621		 * region size.  It can then grow down from addr without
622		 * limit.  However, linux threads has an implicit internal
623		 * limit to stack size of STACK_SIZE.  Its just not
624		 * enforced explicitly in linux.  But, here we impose
625		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
626		 * region, since we can do this with our mmap.
627		 *
628		 * Our mmap with MAP_STACK takes addr as the maximum
629		 * downsize limit on BOS, and as len the max size of
630		 * the region.  It them maps the top SGROWSIZ bytes,
631		 * and autgrows the region down, up to the limit
632		 * in addr.
633		 *
634		 * If we don't use the MAP_STACK option, the effect
635		 * of this code is to allocate a stack region of a
636		 * fixed size of (STACK_SIZE - GUARD_SIZE).
637		 */
638
639		/* This gives us TOS */
640		bsd_args.addr = linux_args->addr + linux_args->len;
641
642		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
643			/*
644			 * Some linux apps will attempt to mmap
645			 * thread stacks near the top of their
646			 * address space.  If their TOS is greater
647			 * than vm_maxsaddr, vm_map_growstack()
648			 * will confuse the thread stack with the
649			 * process stack and deliver a SEGV if they
650			 * attempt to grow the thread stack past their
651			 * current stacksize rlimit.  To avoid this,
652			 * adjust vm_maxsaddr upwards to reflect
653			 * the current stacksize rlimit rather
654			 * than the maximum possible stacksize.
655			 * It would be better to adjust the
656			 * mmap'ed region, but some apps do not check
657			 * mmap's return value.
658			 */
659			PROC_LOCK(p);
660			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
661			    lim_cur(p, RLIMIT_STACK);
662			PROC_UNLOCK(p);
663		}
664
665		/* This gives us our maximum stack size */
666		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
667			bsd_args.len = linux_args->len;
668		else
669			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
670
671		/*
672		 * This gives us a new BOS.  If we're using VM_STACK, then
673		 * mmap will just map the top SGROWSIZ bytes, and let
674		 * the stack grow down to the limit at BOS.  If we're
675		 * not using VM_STACK we map the full stack, since we
676		 * don't have a way to autogrow it.
677		 */
678		bsd_args.addr -= bsd_args.len;
679	} else {
680		bsd_args.addr = linux_args->addr;
681		bsd_args.len  = linux_args->len;
682	}
683
684	bsd_args.prot = linux_args->prot | PROT_READ;	/* always required */
685	if (linux_args->flags & LINUX_MAP_ANON)
686		bsd_args.fd = -1;
687	else
688		bsd_args.fd = linux_args->fd;
689	bsd_args.pos = linux_args->pos;
690	bsd_args.pad = 0;
691
692#ifdef DEBUG
693	if (ldebug(mmap))
694		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
695		    __func__,
696		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
697		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
698#endif
699	error = mmap(td, &bsd_args);
700#ifdef DEBUG
701	if (ldebug(mmap))
702		printf("-> %s() return: 0x%x (0x%08x)\n",
703			__func__, error, (u_int)td->td_retval[0]);
704#endif
705	return (error);
706}
707
708int
709linux_pipe(struct thread *td, struct linux_pipe_args *args)
710{
711	int error;
712	int reg_edx;
713
714#ifdef DEBUG
715	if (ldebug(pipe))
716		printf(ARGS(pipe, "*"));
717#endif
718
719	reg_edx = td->td_retval[1];
720	error = pipe(td, 0);
721	if (error) {
722		td->td_retval[1] = reg_edx;
723		return (error);
724	}
725
726	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
727	if (error) {
728		td->td_retval[1] = reg_edx;
729		return (error);
730	}
731
732	td->td_retval[1] = reg_edx;
733	td->td_retval[0] = 0;
734	return (0);
735}
736
737int
738linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
739{
740	int error;
741	struct i386_ioperm_args iia;
742
743	iia.start = args->start;
744	iia.length = args->length;
745	iia.enable = args->enable;
746	mtx_lock(&Giant);
747	error = i386_set_ioperm(td, &iia);
748	mtx_unlock(&Giant);
749	return (error);
750}
751
752int
753linux_iopl(struct thread *td, struct linux_iopl_args *args)
754{
755	int error;
756
757	if (args->level < 0 || args->level > 3)
758		return (EINVAL);
759	if ((error = suser(td)) != 0)
760		return (error);
761	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
762		return (error);
763	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
764	    (args->level * (PSL_IOPL / 3));
765	return (0);
766}
767
768int
769linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
770{
771	int error;
772	struct i386_ldt_args ldt;
773	struct l_descriptor ld;
774	union descriptor desc;
775
776	if (uap->ptr == NULL)
777		return (EINVAL);
778
779	switch (uap->func) {
780	case 0x00: /* read_ldt */
781		ldt.start = 0;
782		ldt.descs = uap->ptr;
783		ldt.num = uap->bytecount / sizeof(union descriptor);
784		mtx_lock(&Giant);
785		error = i386_get_ldt(td, &ldt);
786		td->td_retval[0] *= sizeof(union descriptor);
787		mtx_unlock(&Giant);
788		break;
789	case 0x01: /* write_ldt */
790	case 0x11: /* write_ldt */
791		if (uap->bytecount != sizeof(ld))
792			return (EINVAL);
793
794		error = copyin(uap->ptr, &ld, sizeof(ld));
795		if (error)
796			return (error);
797
798		ldt.start = ld.entry_number;
799		ldt.descs = &desc;
800		ldt.num = 1;
801		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
802		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
803		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
804		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
805		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
806			(ld.contents << 2);
807		desc.sd.sd_dpl = 3;
808		desc.sd.sd_p = (ld.seg_not_present ^ 1);
809		desc.sd.sd_xx = 0;
810		desc.sd.sd_def32 = ld.seg_32bit;
811		desc.sd.sd_gran = ld.limit_in_pages;
812		mtx_lock(&Giant);
813		error = i386_set_ldt(td, &ldt, &desc);
814		mtx_unlock(&Giant);
815		break;
816	default:
817		error = EINVAL;
818		break;
819	}
820
821	if (error == EOPNOTSUPP) {
822		printf("linux: modify_ldt needs kernel option USER_LDT\n");
823		error = ENOSYS;
824	}
825
826	return (error);
827}
828
829int
830linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
831{
832	l_osigaction_t osa;
833	l_sigaction_t act, oact;
834	int error;
835
836#ifdef DEBUG
837	if (ldebug(sigaction))
838		printf(ARGS(sigaction, "%d, %p, %p"),
839		    args->sig, (void *)args->nsa, (void *)args->osa);
840#endif
841
842	if (args->nsa != NULL) {
843		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
844		if (error)
845			return (error);
846		act.lsa_handler = osa.lsa_handler;
847		act.lsa_flags = osa.lsa_flags;
848		act.lsa_restorer = osa.lsa_restorer;
849		LINUX_SIGEMPTYSET(act.lsa_mask);
850		act.lsa_mask.__bits[0] = osa.lsa_mask;
851	}
852
853	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
854	    args->osa ? &oact : NULL);
855
856	if (args->osa != NULL && !error) {
857		osa.lsa_handler = oact.lsa_handler;
858		osa.lsa_flags = oact.lsa_flags;
859		osa.lsa_restorer = oact.lsa_restorer;
860		osa.lsa_mask = oact.lsa_mask.__bits[0];
861		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
862	}
863
864	return (error);
865}
866
867/*
868 * Linux has two extra args, restart and oldmask.  We dont use these,
869 * but it seems that "restart" is actually a context pointer that
870 * enables the signal to happen with a different register set.
871 */
872int
873linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
874{
875	sigset_t sigmask;
876	l_sigset_t mask;
877
878#ifdef DEBUG
879	if (ldebug(sigsuspend))
880		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
881#endif
882
883	LINUX_SIGEMPTYSET(mask);
884	mask.__bits[0] = args->mask;
885	linux_to_bsd_sigset(&mask, &sigmask);
886	return (kern_sigsuspend(td, sigmask));
887}
888
889int
890linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
891{
892	l_sigset_t lmask;
893	sigset_t sigmask;
894	int error;
895
896#ifdef DEBUG
897	if (ldebug(rt_sigsuspend))
898		printf(ARGS(rt_sigsuspend, "%p, %d"),
899		    (void *)uap->newset, uap->sigsetsize);
900#endif
901
902	if (uap->sigsetsize != sizeof(l_sigset_t))
903		return (EINVAL);
904
905	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
906	if (error)
907		return (error);
908
909	linux_to_bsd_sigset(&lmask, &sigmask);
910	return (kern_sigsuspend(td, sigmask));
911}
912
913int
914linux_pause(struct thread *td, struct linux_pause_args *args)
915{
916	struct proc *p = td->td_proc;
917	sigset_t sigmask;
918
919#ifdef DEBUG
920	if (ldebug(pause))
921		printf(ARGS(pause, ""));
922#endif
923
924	PROC_LOCK(p);
925	sigmask = td->td_sigmask;
926	PROC_UNLOCK(p);
927	return (kern_sigsuspend(td, sigmask));
928}
929
930int
931linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
932{
933	stack_t ss, oss;
934	l_stack_t lss;
935	int error;
936
937#ifdef DEBUG
938	if (ldebug(sigaltstack))
939		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
940#endif
941
942	if (uap->uss != NULL) {
943		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
944		if (error)
945			return (error);
946
947		ss.ss_sp = lss.ss_sp;
948		ss.ss_size = lss.ss_size;
949		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
950	}
951	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
952	    (uap->uoss != NULL) ? &oss : NULL);
953	if (!error && uap->uoss != NULL) {
954		lss.ss_sp = oss.ss_sp;
955		lss.ss_size = oss.ss_size;
956		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
957		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
958	}
959
960	return (error);
961}
962
963int
964linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
965{
966	struct ftruncate_args sa;
967
968#ifdef DEBUG
969	if (ldebug(ftruncate64))
970		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
971		    (intmax_t)args->length);
972#endif
973
974	sa.fd = args->fd;
975	sa.pad = 0;
976	sa.length = args->length;
977	return ftruncate(td, &sa);
978}
979
980int
981linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
982{
983	struct l_user_desc info;
984	int error;
985	int idx;
986	int a[2];
987	struct segment_descriptor sd;
988
989	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
990	if (error)
991		return (error);
992
993#ifdef DEBUG
994	if (ldebug(set_thread_area))
995	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
996		      info.entry_number,
997      		      info.base_addr,
998      		      info.limit,
999      		      info.seg_32bit,
1000		      info.contents,
1001      		      info.read_exec_only,
1002      		      info.limit_in_pages,
1003      		      info.seg_not_present,
1004      		      info.useable);
1005#endif
1006
1007	idx = info.entry_number;
1008	/*
1009	 * Semantics of linux version: every thread in the system has array
1010	 * of 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
1011	 * syscall loads one of the selected tls decriptors with a value
1012	 * and also loads GDT descriptors 6, 7 and 8 with the content of the per-thread
1013	 * descriptors.
1014	 *
1015	 * Semantics of fbsd version: I think we can ignore that linux has 3 per-thread
1016	 * descriptors and use just the 1st one. The tls_array[] is used only in
1017	 * set/get-thread_area() syscalls and for loading the GDT descriptors. In fbsd
1018	 * we use just one GDT descriptor for TLS so we will load just one.
1019	 * XXX: this doesnt work when user-space process tries to use more then 1 TLS segment
1020	 * comment in the linux sources says wine might do that.
1021	 */
1022
1023	/*
1024	 * we support just GLIBC TLS now
1025	 * we should let 3 proceed as well because we use this segment so
1026	 * if code does two subsequent calls it should succeed
1027	 */
1028	if (idx != 6 && idx != -1 && idx != 3)
1029		return (EINVAL);
1030
1031	/*
1032	 * we have to copy out the GDT entry we use
1033	 * FreeBSD uses GDT entry #3 for storing %gs so load that
1034	 * XXX: what if userspace program doesnt check this value and tries
1035	 * to use 6, 7 or 8?
1036	 */
1037	idx = info.entry_number = 3;
1038	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1039	if (error)
1040		return (error);
1041
1042	if (LDT_empty(&info)) {
1043		a[0] = 0;
1044		a[1] = 0;
1045	} else {
1046		a[0] = LDT_entry_a(&info);
1047		a[1] = LDT_entry_b(&info);
1048	}
1049
1050	memcpy(&sd, &a, sizeof(a));
1051#ifdef DEBUG
1052	if (ldebug(set_thread_area))
1053	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1054			sd.sd_hibase,
1055			sd.sd_lolimit,
1056			sd.sd_hilimit,
1057			sd.sd_type,
1058			sd.sd_dpl,
1059			sd.sd_p,
1060			sd.sd_xx,
1061			sd.sd_def32,
1062			sd.sd_gran);
1063#endif
1064
1065	/* this is taken from i386 version of cpu_set_user_tls() */
1066	critical_enter();
1067	/* set %gs */
1068	td->td_pcb->pcb_gsd = sd;
1069	PCPU_GET(fsgs_gdt)[1] = sd;
1070	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1071	critical_exit();
1072
1073	return (0);
1074}
1075
1076int
1077linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1078{
1079
1080	struct l_user_desc info;
1081	int error;
1082	int idx;
1083	struct l_desc_struct desc;
1084	struct segment_descriptor sd;
1085
1086#ifdef DEBUG
1087	if (ldebug(get_thread_area))
1088		printf(ARGS(get_thread_area, "%p"), args->desc);
1089#endif
1090
1091	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1092	if (error)
1093		return (error);
1094
1095	idx = info.entry_number;
1096	/* XXX: I am not sure if we want 3 to be allowed too. */
1097	if (idx != 6 && idx != 3)
1098		return (EINVAL);
1099
1100	idx = 3;
1101
1102	memset(&info, 0, sizeof(info));
1103
1104	sd = PCPU_GET(fsgs_gdt)[1];
1105
1106	memcpy(&desc, &sd, sizeof(desc));
1107
1108	info.entry_number = idx;
1109	info.base_addr = GET_BASE(&desc);
1110	info.limit = GET_LIMIT(&desc);
1111	info.seg_32bit = GET_32BIT(&desc);
1112	info.contents = GET_CONTENTS(&desc);
1113	info.read_exec_only = !GET_WRITABLE(&desc);
1114	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1115	info.seg_not_present = !GET_PRESENT(&desc);
1116	info.useable = GET_USEABLE(&desc);
1117
1118	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1119	if (error)
1120	   	return (EFAULT);
1121
1122	return (0);
1123}
1124
/*
 * copied from kern/kern_time.c
 *
 * timer_create: the Linux argument structure is handed to
 * ktimer_create() reinterpreted as the native one.
 */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	struct ktimer_create_args *bsd_args;

	bsd_args = (struct ktimer_create_args *)args;
	return (ktimer_create(td, bsd_args));
}
1131
/*
 * timer_settime: the Linux argument structure is handed to
 * ktimer_settime() reinterpreted as the native one.
 */
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	struct ktimer_settime_args *bsd_args;

	bsd_args = (struct ktimer_settime_args *)args;
	return (ktimer_settime(td, bsd_args));
}
1137
/*
 * timer_gettime: the Linux argument structure is handed to
 * ktimer_gettime() reinterpreted as the native one.
 */
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	struct ktimer_gettime_args *bsd_args;

	bsd_args = (struct ktimer_gettime_args *)args;
	return (ktimer_gettime(td, bsd_args));
}
1143
/*
 * timer_getoverrun: the Linux argument structure is handed to
 * ktimer_getoverrun() reinterpreted as the native one.
 */
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	struct ktimer_getoverrun_args *bsd_args;

	bsd_args = (struct ktimer_getoverrun_args *)args;
	return (ktimer_getoverrun(td, bsd_args));
}
1149
/*
 * timer_delete: the Linux argument structure is handed to
 * ktimer_delete() reinterpreted as the native one.
 */
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	struct ktimer_delete_args *bsd_args;

	bsd_args = (struct ktimer_delete_args *)args;
	return (ktimer_delete(td, bsd_args));
}
1155
1156/* XXX: this wont work with module - convert it */
1157int
1158linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
1159{
1160#ifdef P1003_1B_MQUEUE
1161   	return kmq_open(td, (struct kmq_open_args *) args);
1162#else
1163	return (ENOSYS);
1164#endif
1165}
1166
1167int
1168linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
1169{
1170#ifdef P1003_1B_MQUEUE
1171   	return kmq_unlink(td, (struct kmq_unlink_args *) args);
1172#else
1173	return (ENOSYS);
1174#endif
1175}
1176
1177int
1178linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
1179{
1180#ifdef P1003_1B_MQUEUE
1181   	return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
1182#else
1183	return (ENOSYS);
1184#endif
1185}
1186
1187int
1188linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
1189{
1190#ifdef P1003_1B_MQUEUE
1191   	return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
1192#else
1193	return (ENOSYS);
1194#endif
1195}
1196
1197int
1198linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
1199{
1200#ifdef P1003_1B_MQUEUE
1201	return kmq_notify(td, (struct kmq_notify_args *) args);
1202#else
1203	return (ENOSYS);
1204#endif
1205}
1206
1207int
1208linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
1209{
1210#ifdef P1003_1B_MQUEUE
1211   	return kmq_setattr(td, (struct kmq_setattr_args *) args);
1212#else
1213	return (ENOSYS);
1214#endif
1215}
1216
1217