/*-
 * Copyright (c) 2000 Marcel Moolenaar
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 161611 2006-08-25 11:59:56Z netchild $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/wait.h>

#include <machine/frame.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/sysarch.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#include <i386/linux/linux.h>
#include <i386/linux/linux_proto.h>
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>

#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */

#include "opt_posix.h"

extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */

struct l_descriptor {
	l_uint		entry_number;
	l_ulong		base_addr;
	l_uint		limit;
	l_uint		seg_32bit:1;
	l_uint		contents:2;
	l_uint		read_exec_only:1;
	l_uint		limit_in_pages:1;
	l_uint		seg_not_present:1;
	l_uint		useable:1;
};

struct l_old_select_argv {
	l_int		nfds;
	l_fd_set	*readfds;
	l_fd_set	*writefds;
	l_fd_set	*exceptfds;
	struct l_timeval	*timeout;
};

int
linux_to_bsd_sigaltstack(int lsa)
{
	int bsa = 0;

	if (lsa & LINUX_SS_DISABLE)
		bsa |= SS_DISABLE;
	if (lsa & LINUX_SS_ONSTACK)
		bsa |= SS_ONSTACK;
	return (bsa);
}

int
bsd_to_linux_sigaltstack(int bsa)
{
	int lsa = 0;

	if (bsa & SS_DISABLE)
		lsa |= LINUX_SS_DISABLE;
	if (bsa & SS_ONSTACK)
		lsa |= LINUX_SS_ONSTACK;
	return (lsa);
}

int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
	int error;
	char *newpath;
	struct image_args eargs;

	LCONVPATHEXIST(td, args->path, &newpath);

#ifdef DEBUG
	if (ldebug(execve))
		printf(ARGS(execve, "%s"), newpath);
#endif

	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
	    args->argp, args->envp);
	free(newpath, M_TEMP);
	if (error == 0)
		/*
		 * A Linux process can exec an FBSD one.  Don't attempt
		 * to create emuldata for such a process via
		 * linux_proc_init(); that leads to a KASSERT panic
		 * because such a process has p->p_emuldata == NULL.
		 */
		if (td->td_proc->p_sysent == &elf_linux_sysvec)
			error = linux_proc_init(td, 0, 0);
	return (error);
}

struct l_ipc_kludge {
	struct l_msgbuf *msgp;
	l_long msgtyp;
};

int
linux_ipc(struct thread *td, struct linux_ipc_args *args)
{

	switch (args->what & 0xFFFF) {
	case LINUX_SEMOP: {
		struct linux_semop_args a;

		a.semid = args->arg1;
		a.tsops = args->ptr;
		a.nsops = args->arg2;
		return (linux_semop(td, &a));
	}
	case LINUX_SEMGET: {
		struct linux_semget_args a;

		a.key = args->arg1;
		a.nsems = args->arg2;
		a.semflg = args->arg3;
		return (linux_semget(td, &a));
	}
	case LINUX_SEMCTL: {
		struct linux_semctl_args a;
		int error;

		a.semid = args->arg1;
		a.semnum = args->arg2;
		a.cmd = args->arg3;
		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
		if (error)
			return (error);
		return (linux_semctl(td, &a));
	}
	case LINUX_MSGSND: {
		struct linux_msgsnd_args a;

		a.msqid = args->arg1;
		a.msgp = args->ptr;
		a.msgsz = args->arg2;
		a.msgflg = args->arg3;
		return (linux_msgsnd(td, &a));
	}
	case LINUX_MSGRCV: {
		struct linux_msgrcv_args a;

		a.msqid = args->arg1;
		a.msgsz = args->arg2;
		a.msgflg = args->arg3;
		if ((args->what >> 16) == 0) {
			struct l_ipc_kludge tmp;
			int error;

			if (args->ptr == NULL)
				return (EINVAL);
			error = copyin(args->ptr, &tmp, sizeof(tmp));
			if (error)
				return (error);
			a.msgp = tmp.msgp;
			a.msgtyp = tmp.msgtyp;
		} else {
			a.msgp = args->ptr;
			a.msgtyp = args->arg5;
		}
		return (linux_msgrcv(td, &a));
	}
	case LINUX_MSGGET: {
		struct linux_msgget_args a;

		a.key = args->arg1;
		a.msgflg = args->arg2;
		return (linux_msgget(td, &a));
	}
	case LINUX_MSGCTL: {
		struct linux_msgctl_args a;

		a.msqid = args->arg1;
		a.cmd = args->arg2;
		a.buf = args->ptr;
		return (linux_msgctl(td, &a));
	}
	case LINUX_SHMAT: {
		struct linux_shmat_args a;

		a.shmid = args->arg1;
		a.shmaddr = args->ptr;
		a.shmflg = args->arg2;
		a.raddr = (l_ulong *)args->arg3;
		return (linux_shmat(td, &a));
	}
	case LINUX_SHMDT: {
		struct linux_shmdt_args a;

		a.shmaddr = args->ptr;
		return (linux_shmdt(td, &a));
	}
	case LINUX_SHMGET: {
		struct linux_shmget_args a;

		a.key = args->arg1;
		a.size = args->arg2;
		a.shmflg = args->arg3;
		return (linux_shmget(td, &a));
	}
	case LINUX_SHMCTL: {
		struct linux_shmctl_args a;

		a.shmid = args->arg1;
		a.cmd = args->arg2;
		a.buf = args->ptr;
		return (linux_shmctl(td, &a));
	}
	default:
		break;
	}

	return (EINVAL);
}

int
linux_old_select(struct thread *td, struct linux_old_select_args *args)
{
	struct l_old_select_argv linux_args;
	struct linux_select_args newsel;
	int error;

#ifdef DEBUG
	if (ldebug(old_select))
		printf(ARGS(old_select, "%p"), args->ptr);
#endif

	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
	if (error)
		return (error);

	newsel.nfds = linux_args.nfds;
	newsel.readfds = linux_args.readfds;
	newsel.writefds = linux_args.writefds;
	newsel.exceptfds = linux_args.exceptfds;
	newsel.timeout = linux_args.timeout;
	return (linux_select(td, &newsel));
}

int
linux_fork(struct thread *td, struct linux_fork_args *args)
{
	int error;

#ifdef DEBUG
	if (ldebug(fork))
		printf(ARGS(fork, ""));
#endif

	if ((error = fork(td, (struct fork_args *)args)) != 0)
		return (error);

	if (td->td_retval[1] == 1)
		td->td_retval[0] = 0;
	error = linux_proc_init(td, td->td_retval[0], 0);
	if (error)
		return (error);

	return (0);
}

int
linux_vfork(struct thread *td, struct linux_vfork_args *args)
{
	int error;
	struct proc *p2;

#ifdef DEBUG
	if (ldebug(vfork))
		printf(ARGS(vfork, ""));
#endif

	/* exclude RFPPWAIT */
	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
		return (error);
	if (error == 0) {
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	/* Are we the child? */
	if (td->td_retval[1] == 1)
		td->td_retval[0] = 0;
	error = linux_proc_init(td, td->td_retval[0], 0);
	if (error)
		return (error);
	/* wait for the child to exit, i.e. emulate vfork */
	PROC_LOCK(p2);
	while (p2->p_flag & P_PPWAIT)
		msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
	PROC_UNLOCK(p2);

	return (0);
}

int
linux_clone(struct thread *td, struct linux_clone_args *args)
{
	int error, ff = RFPROC | RFSTOPPED;
	struct proc *p2;
	struct thread *td2;
	int exit_signal;
	struct linux_emuldata *em;

#ifdef DEBUG
	if (ldebug(clone)) {
		printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
		    (unsigned int)args->flags, (unsigned int)args->stack,
		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
	}
#endif

	exit_signal = args->flags & 0x000000ff;
	if (exit_signal >= LINUX_NSIG)
		return (EINVAL);

	if (exit_signal <= LINUX_SIGTBLSZ)
		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];

	if (args->flags & CLONE_VM)
		ff |= RFMEM;
	if (args->flags & CLONE_SIGHAND)
		ff |= RFSIGSHARE;
	if (!(args->flags & CLONE_FILES))
		ff |= RFFDG;

	/*
	 * Attempt to detect when linux_clone(2) is used for creating
	 * kernel threads.  Unfortunately, despite the existence of the
	 * CLONE_THREAD flag, the version of the linuxthreads package
	 * used in the most popular distros as of the beginning of 2005
	 * doesn't make any use of it.  Therefore, this detection relies
	 * entirely on the empirical observation that linuxthreads sets
	 * a certain combination of flags, so that we can make a more or
	 * less precise detection and notify the FreeBSD kernel that
	 * several processes are in fact part of the same threading
	 * group, so that special treatment is necessary for signal
	 * delivery between those processes and for fd locking.
	 */
	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
		ff |= RFTHREAD;

	error = fork1(td, ff, 0, &p2);
	if (error)
		return (error);

	/* create the emuldata */
	error = linux_proc_init(td, p2->p_pid, args->flags);
	/* reference it - no need to check this */
	em = em_find(p2, EMUL_UNLOCKED);
	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
	/* and adjust it */
	if (args->flags & CLONE_PARENT_SETTID) {
		if (args->parent_tidptr == NULL) {
			EMUL_UNLOCK(&emul_lock);
			return (EINVAL);
		}
		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
		if (error) {
			EMUL_UNLOCK(&emul_lock);
			return (error);
		}
	}

	if (args->flags & CLONE_PARENT) {
#ifdef DEBUG
		printf("linux_clone: CLONE_PARENT\n");
#endif
	}

	if (args->flags & CLONE_THREAD) {
		/*
		 * XXX: Linux mangles pgrp and pptr somehow; I think it
		 * might be this, but I am not sure.
		 */
#ifdef notyet
		p2->p_pgrp = td->td_proc->p_pgrp;
		p2->p_pptr = td->td_proc->p_pptr;
#endif
		exit_signal = 0;
#ifdef DEBUG
		printf("linux_clone: CLONE_THREADS\n");
#endif
	}

	if (args->flags & CLONE_CHILD_SETTID)
		em->child_set_tid = args->child_tidptr;
	else
		em->child_set_tid = NULL;

	if (args->flags & CLONE_CHILD_CLEARTID)
		em->child_clear_tid = args->child_tidptr;
	else
		em->child_clear_tid = NULL;
	EMUL_UNLOCK(&emul_lock);

	PROC_LOCK(p2);
	p2->p_sigparent = exit_signal;
	PROC_UNLOCK(p2);
	td2 = FIRST_THREAD_IN_PROC(p2);
	/*
	 * In the case of stack == NULL we are supposed to COW the calling
	 * process's stack; this is what normal fork() does, so we just
	 * keep the tf_esp arg intact.
	 */
	if (args->stack)
		td2->td_frame->tf_esp = (unsigned int)args->stack;

	if (args->flags & CLONE_SETTLS) {
		struct l_user_desc info;
		int idx;
		int a[2];
		struct segment_descriptor sd;

		error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
		if (error)
			return (error);

		idx = info.entry_number;

		/*
		 * looks like we're getting the idx we returned
		 * in the set_thread_area() syscall
		 */
		if (idx != 6 && idx != 3)
			return (EINVAL);

		/* this doesn't happen in practice */
		if (idx == 6) {
			/* we might copy out the entry_number as 3 */
			info.entry_number = 3;
			error = copyout(&info, (void *)td->td_frame->tf_esi, sizeof(struct l_user_desc));
			if (error)
				return (error);
		}

		a[0] = LDT_entry_a(&info);
		a[1] = LDT_entry_b(&info);

		memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
		if (ldebug(clone))
			printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n",
			    sd.sd_lobase,
			    sd.sd_hibase,
			    sd.sd_lolimit,
			    sd.sd_hilimit,
			    sd.sd_type,
			    sd.sd_dpl,
			    sd.sd_p,
			    sd.sd_xx,
			    sd.sd_def32,
			    sd.sd_gran);
#endif

		/* this is taken from the i386 version of cpu_set_user_tls() */
		critical_enter();
		/* set %gs */
		td2->td_pcb->pcb_gsd = sd;
		PCPU_GET(fsgs_gdt)[1] = sd;
		load_gs(GSEL(GUGS_SEL, SEL_UPL));
		critical_exit();
	}

#ifdef DEBUG
	if (ldebug(clone))
		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
		    (long)p2->p_pid, args->stack, exit_signal);
#endif

	/*
	 * Make this runnable after we are finished with it.
	 */
	mtx_lock_spin(&sched_lock);
	TD_SET_CAN_RUN(td2);
	setrunqueue(td2, SRQ_BORING);
	mtx_unlock_spin(&sched_lock);

	td->td_retval[0] = p2->p_pid;
	td->td_retval[1] = 0;
	return (0);
}

/* XXX move */
struct l_mmap_argv {
	l_caddr_t	addr;
	l_int		len;
	l_int		prot;
	l_int		flags;
	l_int		fd;
	l_int		pos;
};

#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

static int linux_mmap_common(struct thread *, struct l_mmap_argv *);

int
linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
{
	struct l_mmap_argv linux_args;

#ifdef DEBUG
	if (ldebug(mmap2))
		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
		    (void *)args->addr, args->len, args->prot,
		    args->flags, args->fd, args->pgoff);
#endif

	linux_args.addr = (l_caddr_t)args->addr;
	linux_args.len = args->len;
	linux_args.prot = args->prot;
	linux_args.flags = args->flags;
	linux_args.fd = args->fd;
	linux_args.pos = args->pgoff * PAGE_SIZE;

	return (linux_mmap_common(td, &linux_args));
}

int
linux_mmap(struct thread *td, struct linux_mmap_args *args)
{
	int error;
	struct l_mmap_argv linux_args;

	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
	if (error)
		return (error);

#ifdef DEBUG
	if (ldebug(mmap))
		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
		    linux_args.flags, linux_args.fd, linux_args.pos);
#endif

	return (linux_mmap_common(td, &linux_args));
}

static int
linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
{
	struct proc *p = td->td_proc;
	struct mmap_args /* {
		caddr_t addr;
		size_t len;
		int prot;
		int flags;
		int fd;
		long pad;
		off_t pos;
	} */ bsd_args;
	int error;

	error = 0;
	bsd_args.flags = 0;
	if (linux_args->flags & LINUX_MAP_SHARED)
		bsd_args.flags |= MAP_SHARED;
	if (linux_args->flags & LINUX_MAP_PRIVATE)
		bsd_args.flags |= MAP_PRIVATE;
	if (linux_args->flags & LINUX_MAP_FIXED)
		bsd_args.flags |= MAP_FIXED;
	if (linux_args->flags & LINUX_MAP_ANON)
		bsd_args.flags |= MAP_ANON;
	else
		bsd_args.flags |= MAP_NOSYNC;
	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
		bsd_args.flags |= MAP_STACK;

		/*
		 * The Linux MAP_GROWSDOWN option does not limit auto
		 * growth of the region.  Linux mmap with this option
		 * takes as addr the initial BOS, and as len, the initial
		 * region size.  It can then grow down from addr without
		 * limit.  However, linuxthreads has an implicit internal
		 * limit on stack size of STACK_SIZE; it is just not
		 * enforced explicitly in Linux.  But here we impose
		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
		 * region, since we can do this with our mmap.
		 *
		 * Our mmap with MAP_STACK takes addr as the maximum
		 * downsize limit on BOS, and as len the max size of
		 * the region.  It then maps the top SGROWSIZ bytes,
		 * and autogrows the region down, up to the limit
		 * in addr.
		 *
		 * If we don't use the MAP_STACK option, the effect
		 * of this code is to allocate a stack region of a
		 * fixed size of (STACK_SIZE - GUARD_SIZE).
		 */

		/* This gives us TOS */
		bsd_args.addr = linux_args->addr + linux_args->len;

		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
			/*
			 * Some Linux apps will attempt to mmap
			 * thread stacks near the top of their
			 * address space.  If their TOS is greater
			 * than vm_maxsaddr, vm_map_growstack()
			 * will confuse the thread stack with the
			 * process stack and deliver a SEGV if they
			 * attempt to grow the thread stack past their
			 * current stacksize rlimit.  To avoid this,
			 * adjust vm_maxsaddr upwards to reflect
			 * the current stacksize rlimit rather
			 * than the maximum possible stacksize.
			 * It would be better to adjust the
			 * mmap'ed region, but some apps do not check
			 * mmap's return value.
			 */
			PROC_LOCK(p);
			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
			    lim_cur(p, RLIMIT_STACK);
			PROC_UNLOCK(p);
		}

		/* This gives us our maximum stack size */
		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
			bsd_args.len = linux_args->len;
		else
			bsd_args.len = STACK_SIZE - GUARD_SIZE;

		/*
		 * This gives us a new BOS.  If we're using VM_STACK, then
		 * mmap will just map the top SGROWSIZ bytes, and let
		 * the stack grow down to the limit at BOS.  If we're
		 * not using VM_STACK we map the full stack, since we
		 * don't have a way to autogrow it.
		 */
		bsd_args.addr -= bsd_args.len;
	} else {
		bsd_args.addr = linux_args->addr;
		bsd_args.len = linux_args->len;
	}

	bsd_args.prot = linux_args->prot | PROT_READ;	/* always required */
	if (linux_args->flags & LINUX_MAP_ANON)
		bsd_args.fd = -1;
	else
		bsd_args.fd = linux_args->fd;
	bsd_args.pos = linux_args->pos;
	bsd_args.pad = 0;

#ifdef DEBUG
	if (ldebug(mmap))
		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
		    __func__,
		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
	error = mmap(td, &bsd_args);
#ifdef DEBUG
	if (ldebug(mmap))
		printf("-> %s() return: 0x%x (0x%08x)\n",
		    __func__, error, (u_int)td->td_retval[0]);
#endif
	return (error);
}

int
linux_pipe(struct thread *td, struct linux_pipe_args *args)
{
	int error;
	int reg_edx;

#ifdef DEBUG
	if (ldebug(pipe))
		printf(ARGS(pipe, "*"));
#endif

	/*
	 * FreeBSD's pipe() returns the two descriptors in td_retval[0]
	 * and td_retval[1] (%eax/%edx), while Linux returns them through
	 * the user pointer and only uses %eax; save and restore the
	 * caller's %edx around the call.
	 */
	reg_edx = td->td_retval[1];
	error = pipe(td, 0);
	if (error) {
		td->td_retval[1] = reg_edx;
		return (error);
	}

	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
	if (error) {
		td->td_retval[1] = reg_edx;
		return (error);
	}

	td->td_retval[1] = reg_edx;
	td->td_retval[0] = 0;
	return (0);
}

int
linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
{
	int error;
	struct i386_ioperm_args iia;

	iia.start = args->start;
	iia.length = args->length;
	iia.enable = args->enable;
	mtx_lock(&Giant);
	error = i386_set_ioperm(td, &iia);
	mtx_unlock(&Giant);
	return (error);
}

int
linux_iopl(struct thread *td, struct linux_iopl_args *args)
{
	int error;

	if (args->level < 0 || args->level > 3)
		return (EINVAL);
	if ((error = suser(td)) != 0)
		return (error);
	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
		return (error);
	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
	    (args->level * (PSL_IOPL / 3));
	return (0);
}

int
linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
{
	int error;
	struct i386_ldt_args ldt;
	struct l_descriptor ld;
	union descriptor desc;

	if (uap->ptr == NULL)
		return (EINVAL);

	switch (uap->func) {
	case 0x00: /* read_ldt */
		ldt.start = 0;
		ldt.descs = uap->ptr;
		ldt.num = uap->bytecount / sizeof(union descriptor);
		mtx_lock(&Giant);
		error = i386_get_ldt(td, &ldt);
		td->td_retval[0] *= sizeof(union descriptor);
		mtx_unlock(&Giant);
		break;
	case 0x01: /* write_ldt */
	case 0x11: /* write_ldt */
		if (uap->bytecount != sizeof(ld))
			return (EINVAL);

		error = copyin(uap->ptr, &ld, sizeof(ld));
		if (error)
			return (error);

		ldt.start = ld.entry_number;
		ldt.descs = &desc;
		ldt.num = 1;
		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
			(ld.contents << 2);
		desc.sd.sd_dpl = 3;
		desc.sd.sd_p = (ld.seg_not_present ^ 1);
		desc.sd.sd_xx = 0;
		desc.sd.sd_def32 = ld.seg_32bit;
		desc.sd.sd_gran = ld.limit_in_pages;
		mtx_lock(&Giant);
		error = i386_set_ldt(td, &ldt, &desc);
		mtx_unlock(&Giant);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error == EOPNOTSUPP) {
		printf("linux: modify_ldt needs kernel option USER_LDT\n");
		error = ENOSYS;
	}

	return (error);
}

int
linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
{
	l_osigaction_t osa;
	l_sigaction_t act, oact;
	int error;

#ifdef DEBUG
	if (ldebug(sigaction))
		printf(ARGS(sigaction, "%d, %p, %p"),
		    args->sig, (void *)args->nsa, (void *)args->osa);
#endif

	if (args->nsa != NULL) {
		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
		if (error)
			return (error);
		act.lsa_handler = osa.lsa_handler;
		act.lsa_flags = osa.lsa_flags;
		act.lsa_restorer = osa.lsa_restorer;
		LINUX_SIGEMPTYSET(act.lsa_mask);
		act.lsa_mask.__bits[0] = osa.lsa_mask;
	}

	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
	    args->osa ? &oact : NULL);

	if (args->osa != NULL && !error) {
		osa.lsa_handler = oact.lsa_handler;
		osa.lsa_flags = oact.lsa_flags;
		osa.lsa_restorer = oact.lsa_restorer;
		osa.lsa_mask = oact.lsa_mask.__bits[0];
		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
	}

	return (error);
}

/*
 * Linux has two extra args, restart and oldmask.  We don't use these,
 * but it seems that "restart" is actually a context pointer that
 * enables the signal to happen with a different register set.
 */
int
linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
{
	sigset_t sigmask;
	l_sigset_t mask;

#ifdef DEBUG
	if (ldebug(sigsuspend))
		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
#endif

	LINUX_SIGEMPTYSET(mask);
	mask.__bits[0] = args->mask;
	linux_to_bsd_sigset(&mask, &sigmask);
	return (kern_sigsuspend(td, sigmask));
}

int
linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
{
	l_sigset_t lmask;
	sigset_t sigmask;
	int error;

#ifdef DEBUG
	if (ldebug(rt_sigsuspend))
		printf(ARGS(rt_sigsuspend, "%p, %d"),
		    (void *)uap->newset, uap->sigsetsize);
#endif

	if (uap->sigsetsize != sizeof(l_sigset_t))
		return (EINVAL);

	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
	if (error)
		return (error);

	linux_to_bsd_sigset(&lmask, &sigmask);
	return (kern_sigsuspend(td, sigmask));
}

int
linux_pause(struct thread *td, struct linux_pause_args *args)
{
	struct proc *p = td->td_proc;
	sigset_t sigmask;

#ifdef DEBUG
	if (ldebug(pause))
		printf(ARGS(pause, ""));
#endif

	PROC_LOCK(p);
	sigmask = td->td_sigmask;
	PROC_UNLOCK(p);
	return (kern_sigsuspend(td, sigmask));
}

int
linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
{
	stack_t ss, oss;
	l_stack_t lss;
	int error;

#ifdef DEBUG
	if (ldebug(sigaltstack))
		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
#endif

	if (uap->uss != NULL) {
		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
		if (error)
			return (error);

		ss.ss_sp = lss.ss_sp;
		ss.ss_size = lss.ss_size;
		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
	}
	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
	    (uap->uoss != NULL) ? &oss : NULL);
	if (!error && uap->uoss != NULL) {
		lss.ss_sp = oss.ss_sp;
		lss.ss_size = oss.ss_size;
		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
	}

	return (error);
}

int
linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
{
	struct ftruncate_args sa;

#ifdef DEBUG
	if (ldebug(ftruncate64))
		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
		    (intmax_t)args->length);
#endif

	sa.fd = args->fd;
	sa.pad = 0;
	sa.length = args->length;
	return ftruncate(td, &sa);
}

int
linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
{
	struct l_user_desc info;
	int error;
	int idx;
	int a[2];
	struct segment_descriptor sd;

	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
	if (error)
		return (error);

#ifdef DEBUG
	if (ldebug(set_thread_area))
		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
		    info.entry_number,
		    info.base_addr,
		    info.limit,
		    info.seg_32bit,
		    info.contents,
		    info.read_exec_only,
		    info.limit_in_pages,
		    info.seg_not_present,
		    info.useable);
#endif

	idx = info.entry_number;
	/*
	 * Semantics of the Linux version: every thread in the system has
	 * an array of 3 TLS descriptors.  The 1st is GLIBC TLS, the 2nd
	 * is WINE, the 3rd unknown.  This syscall loads one of the
	 * selected TLS descriptors with a value and also loads GDT
	 * descriptors 6, 7 and 8 with the content of the per-thread
	 * descriptors.
	 *
	 * Semantics of the FreeBSD version: I think we can ignore that
	 * Linux has 3 per-thread descriptors and use just the 1st one.
	 * The tls_array[] is used only in the set/get_thread_area()
	 * syscalls and for loading the GDT descriptors.  In FreeBSD we
	 * use just one GDT descriptor for TLS, so we will load just one.
	 *
	 * XXX: This doesn't work when a user-space process tries to use
	 * more than one TLS segment; a comment in the Linux sources says
	 * wine might do that.
	 */

	/*
	 * We support just GLIBC TLS now.  We should let 3 proceed as
	 * well, because we use this segment, so if code does two
	 * subsequent calls it should succeed.
	 */
	if (idx != 6 && idx != -1 && idx != 3)
		return (EINVAL);

	/*
	 * We have to copy out the GDT entry we use.  FreeBSD uses GDT
	 * entry #3 for storing %gs, so load that.
	 *
	 * XXX: What if a userspace program doesn't check this value and
	 * tries to use 6, 7 or 8?
	 */
	idx = info.entry_number = 3;
	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
	if (error)
		return (error);

	if (LDT_empty(&info)) {
		a[0] = 0;
		a[1] = 0;
	} else {
		a[0] = LDT_entry_a(&info);
		a[1] = LDT_entry_b(&info);
	}

	memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
	if (ldebug(set_thread_area))
		printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n",
		    sd.sd_lobase,
		    sd.sd_hibase,
		    sd.sd_lolimit,
		    sd.sd_hilimit,
		    sd.sd_type,
		    sd.sd_dpl,
		    sd.sd_p,
		    sd.sd_xx,
		    sd.sd_def32,
		    sd.sd_gran);
#endif

	/* this is taken from the i386 version of cpu_set_user_tls() */
	critical_enter();
	/* set %gs */
	td->td_pcb->pcb_gsd = sd;
	PCPU_GET(fsgs_gdt)[1] = sd;
	load_gs(GSEL(GUGS_SEL, SEL_UPL));
	critical_exit();

	return (0);
}

int
linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
{

	struct l_user_desc info;
	int error;
	int idx;
	struct l_desc_struct desc;
	struct segment_descriptor sd;

#ifdef DEBUG
	if (ldebug(get_thread_area))
		printf(ARGS(get_thread_area, "%p"), args->desc);
#endif

	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
	if (error)
		return (error);

	idx = info.entry_number;
	/* XXX: I am not sure if we want 3 to be allowed too. */
	if (idx != 6 && idx != 3)
		return (EINVAL);

	idx = 3;

	memset(&info, 0, sizeof(info));

	sd = PCPU_GET(fsgs_gdt)[1];

	memcpy(&desc, &sd, sizeof(desc));

	info.entry_number = idx;
	info.base_addr = GET_BASE(&desc);
	info.limit = GET_LIMIT(&desc);
	info.seg_32bit = GET_32BIT(&desc);
	info.contents = GET_CONTENTS(&desc);
	info.read_exec_only = !GET_WRITABLE(&desc);
	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
	info.seg_not_present = !GET_PRESENT(&desc);
	info.useable = GET_USEABLE(&desc);

	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
	if (error)
		return (EFAULT);

	return (0);
}

/* copied from kern/kern_time.c */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	return ktimer_create(td, (struct ktimer_create_args *) args);
}

int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	return ktimer_settime(td, (struct ktimer_settime_args *) args);
}

int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	return ktimer_gettime(td, (struct ktimer_gettime_args *) args);
}

int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	return ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
}

int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	return ktimer_delete(td, (struct ktimer_delete_args *) args);
}

/* XXX: this won't work with a module - convert it */
int
linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_open(td, (struct kmq_open_args *) args);
#else
	return (ENOSYS);
#endif
}

int
linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_unlink(td, (struct kmq_unlink_args *) args);
#else
	return (ENOSYS);
#endif
}

int
linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
#else
	return (ENOSYS);
#endif
}

int
linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
#else
	return (ENOSYS);
#endif
}

int
linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_notify(td, (struct kmq_notify_args *) args);
#else
	return (ENOSYS);
#endif
}

int
linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
{
#ifdef P1003_1B_MQUEUE
	return kmq_setattr(td, (struct kmq_setattr_args *) args);
#else
	return (ENOSYS);
#endif
}