1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 161365 2006-08-16 18:54:51Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/imgact.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mman.h>
38#include <sys/mutex.h>
39#include <sys/sx.h>
40#include <sys/proc.h>
41#include <sys/queue.h>
42#include <sys/resource.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysproto.h>
47#include <sys/unistd.h>
48#include <sys/wait.h>
49
50#include <machine/frame.h>
51#include <machine/psl.h>
52#include <machine/segments.h>
53#include <machine/sysarch.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_map.h>
58
59#include <i386/linux/linux.h>
60#include <i386/linux/linux_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
67
68#include "opt_posix.h"
69
70extern struct sx emul_shared_lock;
71extern struct sx emul_lock;
72
73extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
74
75struct l_descriptor {
76	l_uint		entry_number;
77	l_ulong		base_addr;
78	l_uint		limit;
79	l_uint		seg_32bit:1;
80	l_uint		contents:2;
81	l_uint		read_exec_only:1;
82	l_uint		limit_in_pages:1;
83	l_uint		seg_not_present:1;
84	l_uint		useable:1;
85};
86
87struct l_old_select_argv {
88	l_int		nfds;
89	l_fd_set	*readfds;
90	l_fd_set	*writefds;
91	l_fd_set	*exceptfds;
92	struct l_timeval	*timeout;
93};
94
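/*
 * Translate sigaltstack(2) ss_flags between the Linux and FreeBSD
 * encodings; only the SS_DISABLE and SS_ONSTACK bits are carried over.
 */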
95int
96linux_to_bsd_sigaltstack(int lsa)
97{
98	int bsa = 0;
99
100	if (lsa & LINUX_SS_DISABLE)
101		bsa |= SS_DISABLE;
102	if (lsa & LINUX_SS_ONSTACK)
103		bsa |= SS_ONSTACK;
104	return (bsa);
105}
106
107int
108bsd_to_linux_sigaltstack(int bsa)
109{
110	int lsa = 0;
111
112	if (bsa & SS_DISABLE)
113		lsa |= LINUX_SS_DISABLE;
114	if (bsa & SS_ONSTACK)
115		lsa |= LINUX_SS_ONSTACK;
116	return (lsa);
117}
118
119int
120linux_execve(struct thread *td, struct linux_execve_args *args)
121{
122	int error;
123	char *newpath;
124	struct image_args eargs;
125
126	LCONVPATHEXIST(td, args->path, &newpath);
127
128#ifdef DEBUG
129	if (ldebug(execve))
130		printf(ARGS(execve, "%s"), newpath);
131#endif
132
133	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
134	    args->argp, args->envp);
135	free(newpath, M_TEMP);
136	if (error == 0)
137		error = kern_execve(td, &eargs, NULL);
138	if (error == 0)
	   	/*
		 * A Linux process can exec a FreeBSD one; don't attempt
		 * to create emuldata for such a process using
		 * linux_proc_init, as this leads to a KASSERT panic
		 * because such a process has p->p_emuldata == NULL.
		 */
144	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
145   		   	error = linux_proc_init(td, 0, 0);
146	return (error);
147}
148
149struct l_ipc_kludge {
150	struct l_msgbuf *msgp;
151	l_long msgtyp;
152};
153
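/*
 * Linux funnels all System V IPC operations through a single ipc(2)
 * syscall.  The low 16 bits of args->what select the operation; for
 * LINUX_MSGRCV the upper 16 bits apparently encode an ABI version that
 * determines whether the message pointer and type arrive packed in a
 * struct l_ipc_kludge or as separate arguments.
 */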
154int
155linux_ipc(struct thread *td, struct linux_ipc_args *args)
156{
157
158	switch (args->what & 0xFFFF) {
159	case LINUX_SEMOP: {
160		struct linux_semop_args a;
161
162		a.semid = args->arg1;
163		a.tsops = args->ptr;
164		a.nsops = args->arg2;
165		return (linux_semop(td, &a));
166	}
167	case LINUX_SEMGET: {
168		struct linux_semget_args a;
169
170		a.key = args->arg1;
171		a.nsems = args->arg2;
172		a.semflg = args->arg3;
173		return (linux_semget(td, &a));
174	}
175	case LINUX_SEMCTL: {
176		struct linux_semctl_args a;
177		int error;
178
179		a.semid = args->arg1;
180		a.semnum = args->arg2;
181		a.cmd = args->arg3;
182		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
183		if (error)
184			return (error);
185		return (linux_semctl(td, &a));
186	}
187	case LINUX_MSGSND: {
188		struct linux_msgsnd_args a;
189
190		a.msqid = args->arg1;
191		a.msgp = args->ptr;
192		a.msgsz = args->arg2;
193		a.msgflg = args->arg3;
194		return (linux_msgsnd(td, &a));
195	}
196	case LINUX_MSGRCV: {
197		struct linux_msgrcv_args a;
198
199		a.msqid = args->arg1;
200		a.msgsz = args->arg2;
201		a.msgflg = args->arg3;
202		if ((args->what >> 16) == 0) {
203			struct l_ipc_kludge tmp;
204			int error;
205
206			if (args->ptr == NULL)
207				return (EINVAL);
208			error = copyin(args->ptr, &tmp, sizeof(tmp));
209			if (error)
210				return (error);
211			a.msgp = tmp.msgp;
212			a.msgtyp = tmp.msgtyp;
213		} else {
214			a.msgp = args->ptr;
215			a.msgtyp = args->arg5;
216		}
217		return (linux_msgrcv(td, &a));
218	}
219	case LINUX_MSGGET: {
220		struct linux_msgget_args a;
221
222		a.key = args->arg1;
223		a.msgflg = args->arg2;
224		return (linux_msgget(td, &a));
225	}
226	case LINUX_MSGCTL: {
227		struct linux_msgctl_args a;
228
229		a.msqid = args->arg1;
230		a.cmd = args->arg2;
231		a.buf = args->ptr;
232		return (linux_msgctl(td, &a));
233	}
234	case LINUX_SHMAT: {
235		struct linux_shmat_args a;
236
237		a.shmid = args->arg1;
238		a.shmaddr = args->ptr;
239		a.shmflg = args->arg2;
240		a.raddr = (l_ulong *)args->arg3;
241		return (linux_shmat(td, &a));
242	}
243	case LINUX_SHMDT: {
244		struct linux_shmdt_args a;
245
246		a.shmaddr = args->ptr;
247		return (linux_shmdt(td, &a));
248	}
249	case LINUX_SHMGET: {
250		struct linux_shmget_args a;
251
252		a.key = args->arg1;
253		a.size = args->arg2;
254		a.shmflg = args->arg3;
255		return (linux_shmget(td, &a));
256	}
257	case LINUX_SHMCTL: {
258		struct linux_shmctl_args a;
259
260		a.shmid = args->arg1;
261		a.cmd = args->arg2;
262		a.buf = args->ptr;
263		return (linux_shmctl(td, &a));
264	}
265	default:
266		break;
267	}
268
269	return (EINVAL);
270}
271
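/*
 * The old select(2) entry point receives a single pointer to a struct
 * holding all five arguments; unpack it and hand off to linux_select().
 */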
272int
273linux_old_select(struct thread *td, struct linux_old_select_args *args)
274{
275	struct l_old_select_argv linux_args;
276	struct linux_select_args newsel;
277	int error;
278
279#ifdef DEBUG
280	if (ldebug(old_select))
281		printf(ARGS(old_select, "%p"), args->ptr);
282#endif
283
284	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
285	if (error)
286		return (error);
287
288	newsel.nfds = linux_args.nfds;
289	newsel.readfds = linux_args.readfds;
290	newsel.writefds = linux_args.writefds;
291	newsel.exceptfds = linux_args.exceptfds;
292	newsel.timeout = linux_args.timeout;
293	return (linux_select(td, &newsel));
294}
295
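/*
 * fork(2): run the native fork and then attach Linux emuldata to the
 * new process.  td_retval[1] == 1 means we are returning in the child,
 * which must see a return value of 0.
 */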
296int
297linux_fork(struct thread *td, struct linux_fork_args *args)
298{
299	int error;
300
301#ifdef DEBUG
302	if (ldebug(fork))
303		printf(ARGS(fork, ""));
304#endif
305
306	if ((error = fork(td, (struct fork_args *)args)) != 0)
307		return (error);
308
309	if (td->td_retval[1] == 1)
310		td->td_retval[0] = 0;
311	error = linux_proc_init(td, td->td_retval[0], 0);
312	if (error)
313		return (error);
314
315	return (0);
316}
317
318int
319linux_vfork(struct thread *td, struct linux_vfork_args *args)
320{
321	int error;
322
323#ifdef DEBUG
324	if (ldebug(vfork))
325		printf(ARGS(vfork, ""));
326#endif
327
328	if ((error = vfork(td, (struct vfork_args *)args)) != 0)
329		return (error);
330	/* Are we the child? */
331	if (td->td_retval[1] == 1)
332		td->td_retval[0] = 0;
333	error = linux_proc_init(td, td->td_retval[0], 0);
334	if (error)
335		return (error);
336	return (0);
337}
338
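/*
 * clone(2): translate the Linux clone flags into rfork(2) flags, create
 * the new process with fork1() in a stopped state, set up its Linux
 * emuldata (child TID pointers), optionally install the caller-supplied
 * stack pointer and TLS segment, and only then make it runnable.
 */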
339int
340linux_clone(struct thread *td, struct linux_clone_args *args)
341{
342	int error, ff = RFPROC | RFSTOPPED;
343	struct proc *p2;
344	struct thread *td2;
345	int exit_signal;
346	struct linux_emuldata *em;
347
348#ifdef DEBUG
349	if (ldebug(clone)) {
350   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
351		    (unsigned int)args->flags, (unsigned int)args->stack,
352		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
353	}
354#endif
355
356	exit_signal = args->flags & 0x000000ff;
357	if (exit_signal >= LINUX_NSIG)
358		return (EINVAL);
359
360	if (exit_signal <= LINUX_SIGTBLSZ)
361		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
362
363	if (args->flags & CLONE_VM)
364		ff |= RFMEM;
365	if (args->flags & CLONE_SIGHAND)
366		ff |= RFSIGSHARE;
367	if (!(args->flags & CLONE_FILES))
368		ff |= RFFDG;
369
	/*
	 * Attempt to detect when linux_clone(2) is used for creating
	 * kernel threads. Unfortunately, despite the existence of the
	 * CLONE_THREAD flag, the version of the linuxthreads package
	 * used in the most popular distros as of the beginning of 2005
	 * doesn't make any use of it. Therefore, this detection relies
	 * entirely on the empirical observation that linuxthreads sets
	 * a certain combination of flags, so that we can make a more or
	 * less precise detection and notify the FreeBSD kernel that
	 * several processes are in fact part of the same threading
	 * group, so that special treatment is necessary for signal
	 * delivery between those processes and for fd locking.
	 */
383	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
384		ff |= RFTHREAD;
385
386	error = fork1(td, ff, 0, &p2);
387	if (error)
388		return (error);
389
390	/* create the emuldata */
391	error = linux_proc_init(td, p2->p_pid, args->flags);
392	/* reference it - no need to check this */
393	em = em_find(p2, EMUL_UNLOCKED);
394	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
395	/* and adjust it */
396	if (args->flags & CLONE_PARENT_SETTID) {
397	   	if (args->parent_tidptr == NULL) {
398		   	EMUL_UNLOCK(&emul_lock);
399			return (EINVAL);
400		}
401		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
402		if (error) {
403		   	EMUL_UNLOCK(&emul_lock);
404			return (error);
405		}
406	}
407
408	if (args->flags & CLONE_PARENT) {
409#ifdef DEBUG
410	   	printf("linux_clone: CLONE_PARENT\n");
411#endif
412	}
413
414	if (args->flags & CLONE_THREAD) {
	   	/*
		 * XXX: Linux mangles pgrp and pptr somehow; the code
		 * below might be what it does, but I am not sure.
		 */
418#ifdef notyet
419	   	p2->p_pgrp = td->td_proc->p_pgrp;
420	 	p2->p_pptr = td->td_proc->p_pptr;
421#endif
422	 	exit_signal = 0;
423#ifdef DEBUG
	   	printf("linux_clone: CLONE_THREAD\n");
425#endif
426	}
427
428	if (args->flags & CLONE_CHILD_SETTID)
429		em->child_set_tid = args->child_tidptr;
430	else
431	   	em->child_set_tid = NULL;
432
433	if (args->flags & CLONE_CHILD_CLEARTID)
434		em->child_clear_tid = args->child_tidptr;
435	else
436	   	em->child_clear_tid = NULL;
437	EMUL_UNLOCK(&emul_lock);
438
439	PROC_LOCK(p2);
440	p2->p_sigparent = exit_signal;
441	PROC_UNLOCK(p2);
442	td2 = FIRST_THREAD_IN_PROC(p2);
	/*
	 * If stack == NULL we are supposed to COW the calling process's
	 * stack; this is what a normal fork() does, so we just keep the
	 * tf_esp arg intact.
	 */
447	if (args->stack)
448   	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
449
450	if (args->flags & CLONE_SETTLS) {
451   	   	struct l_user_desc info;
452   	   	int idx;
453	   	int a[2];
454		struct segment_descriptor sd;
455
456	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
457		if (error)
458   		   	return (error);
459
460		idx = info.entry_number;
461
462		/*
463		 * looks like we're getting the idx we returned
464		 * in the set_thread_area() syscall
465		 */
466		if (idx != 6 && idx != 3)
467			return (EINVAL);
468
		/* this doesn't happen in practice */
470		if (idx == 6) {
471		   	/* we might copy out the entry_number as 3 */
472		   	info.entry_number = 3;
473			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
474			if (error)
475	   		   	return (error);
476		}
477
478		a[0] = LDT_entry_a(&info);
479		a[1] = LDT_entry_b(&info);
480
481		memcpy(&sd, &a, sizeof(a));
482#ifdef DEBUG
483	if (ldebug(clone))
484	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
485			sd.sd_hibase,
486			sd.sd_lolimit,
487			sd.sd_hilimit,
488			sd.sd_type,
489			sd.sd_dpl,
490			sd.sd_p,
491			sd.sd_xx,
492			sd.sd_def32,
493			sd.sd_gran);
494#endif
495
496		/* this is taken from i386 version of cpu_set_user_tls() */
497		critical_enter();
498		/* set %gs */
499		td2->td_pcb->pcb_gsd = sd;
500		PCPU_GET(fsgs_gdt)[1] = sd;
501		load_gs(GSEL(GUGS_SEL, SEL_UPL));
502		critical_exit();
503	}
504
505#ifdef DEBUG
506	if (ldebug(clone))
507		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
508		    (long)p2->p_pid, args->stack, exit_signal);
509#endif
510
511	/*
512	 * Make this runnable after we are finished with it.
513	 */
514	mtx_lock_spin(&sched_lock);
515	TD_SET_CAN_RUN(td2);
516	setrunqueue(td2, SRQ_BORING);
517	mtx_unlock_spin(&sched_lock);
518
519	td->td_retval[0] = p2->p_pid;
520	td->td_retval[1] = 0;
521	return (0);
522}
523
524/* XXX move */
525struct l_mmap_argv {
526	l_caddr_t	addr;
527	l_int		len;
528	l_int		prot;
529	l_int		flags;
530	l_int		fd;
531	l_int		pos;
532};
533
534#define STACK_SIZE  (2 * 1024 * 1024)
535#define GUARD_SIZE  (4 * PAGE_SIZE)
536
537static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
538
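/*
 * mmap2(2) passes its arguments in registers and expresses the file
 * offset in pages; convert them into the common argument structure and
 * let linux_mmap_common() do the real work.
 */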
539int
540linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
541{
542	struct l_mmap_argv linux_args;
543
544#ifdef DEBUG
545	if (ldebug(mmap2))
546		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
547		    (void *)args->addr, args->len, args->prot,
548		    args->flags, args->fd, args->pgoff);
549#endif
550
551	linux_args.addr = (l_caddr_t)args->addr;
552	linux_args.len = args->len;
553	linux_args.prot = args->prot;
554	linux_args.flags = args->flags;
555	linux_args.fd = args->fd;
556	linux_args.pos = args->pgoff * PAGE_SIZE;
557
558	return (linux_mmap_common(td, &linux_args));
559}
560
561int
562linux_mmap(struct thread *td, struct linux_mmap_args *args)
563{
564	int error;
565	struct l_mmap_argv linux_args;
566
567	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
568	if (error)
569		return (error);
570
571#ifdef DEBUG
572	if (ldebug(mmap))
573		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
574		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
575		    linux_args.flags, linux_args.fd, linux_args.pos);
576#endif
577
578	return (linux_mmap_common(td, &linux_args));
579}
580
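/*
 * Common back end for mmap(2) and mmap2(2): translate the Linux mapping
 * flags into their FreeBSD equivalents, give LINUX_MAP_GROWSDOWN regions
 * MAP_STACK treatment, and call the native mmap().
 */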
581static int
582linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
583{
584	struct proc *p = td->td_proc;
585	struct mmap_args /* {
586		caddr_t addr;
587		size_t len;
588		int prot;
589		int flags;
590		int fd;
591		long pad;
592		off_t pos;
593	} */ bsd_args;
594	int error;
595
596	error = 0;
597	bsd_args.flags = 0;
598	if (linux_args->flags & LINUX_MAP_SHARED)
599		bsd_args.flags |= MAP_SHARED;
600	if (linux_args->flags & LINUX_MAP_PRIVATE)
601		bsd_args.flags |= MAP_PRIVATE;
602	if (linux_args->flags & LINUX_MAP_FIXED)
603		bsd_args.flags |= MAP_FIXED;
604	if (linux_args->flags & LINUX_MAP_ANON)
605		bsd_args.flags |= MAP_ANON;
606	else
607		bsd_args.flags |= MAP_NOSYNC;
608	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
609		bsd_args.flags |= MAP_STACK;
610
		/*
		 * The Linux MAP_GROWSDOWN option does not limit auto
		 * growth of the region.  Linux mmap with this option
		 * takes as addr the initial BOS, and as len, the initial
		 * region size.  It can then grow down from addr without
		 * limit.  However, linuxthreads has an implicit internal
		 * limit to stack size of STACK_SIZE.  It's just not
		 * enforced explicitly in Linux.  But, here we impose
		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
		 * region, since we can do this with our mmap.
		 *
		 * Our mmap with MAP_STACK takes addr as the maximum
		 * downsize limit on BOS, and as len the max size of
		 * the region.  It then maps the top SGROWSIZ bytes,
		 * and autogrows the region down, up to the limit
		 * in addr.
		 *
		 * If we don't use the MAP_STACK option, the effect
		 * of this code is to allocate a stack region of a
		 * fixed size of (STACK_SIZE - GUARD_SIZE).
		 */
632
633		/* This gives us TOS */
634		bsd_args.addr = linux_args->addr + linux_args->len;
635
636		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
637			/*
638			 * Some linux apps will attempt to mmap
639			 * thread stacks near the top of their
640			 * address space.  If their TOS is greater
641			 * than vm_maxsaddr, vm_map_growstack()
642			 * will confuse the thread stack with the
643			 * process stack and deliver a SEGV if they
644			 * attempt to grow the thread stack past their
645			 * current stacksize rlimit.  To avoid this,
646			 * adjust vm_maxsaddr upwards to reflect
647			 * the current stacksize rlimit rather
648			 * than the maximum possible stacksize.
649			 * It would be better to adjust the
650			 * mmap'ed region, but some apps do not check
651			 * mmap's return value.
652			 */
653			PROC_LOCK(p);
654			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
655			    lim_cur(p, RLIMIT_STACK);
656			PROC_UNLOCK(p);
657		}
658
659		/* This gives us our maximum stack size */
660		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
661			bsd_args.len = linux_args->len;
662		else
663			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
664
665		/*
666		 * This gives us a new BOS.  If we're using VM_STACK, then
667		 * mmap will just map the top SGROWSIZ bytes, and let
668		 * the stack grow down to the limit at BOS.  If we're
669		 * not using VM_STACK we map the full stack, since we
670		 * don't have a way to autogrow it.
671		 */
672		bsd_args.addr -= bsd_args.len;
673	} else {
674		bsd_args.addr = linux_args->addr;
675		bsd_args.len  = linux_args->len;
676	}
677
678	bsd_args.prot = linux_args->prot | PROT_READ;	/* always required */
679	if (linux_args->flags & LINUX_MAP_ANON)
680		bsd_args.fd = -1;
681	else
682		bsd_args.fd = linux_args->fd;
683	bsd_args.pos = linux_args->pos;
684	bsd_args.pad = 0;
685
686#ifdef DEBUG
687	if (ldebug(mmap))
688		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
689		    __func__,
690		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
691		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
692#endif
693	error = mmap(td, &bsd_args);
694#ifdef DEBUG
695	if (ldebug(mmap))
696		printf("-> %s() return: 0x%x (0x%08x)\n",
697			__func__, error, (u_int)td->td_retval[0]);
698#endif
699	return (error);
700}
701
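/*
 * pipe(2): FreeBSD returns the two descriptors in td_retval[0] and
 * td_retval[1], while Linux copies them out to a user-supplied array and
 * returns 0; %edx (td_retval[1]) must be preserved across the call.
 */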
702int
703linux_pipe(struct thread *td, struct linux_pipe_args *args)
704{
705	int error;
706	int reg_edx;
707
708#ifdef DEBUG
709	if (ldebug(pipe))
710		printf(ARGS(pipe, "*"));
711#endif
712
713	reg_edx = td->td_retval[1];
714	error = pipe(td, 0);
715	if (error) {
716		td->td_retval[1] = reg_edx;
717		return (error);
718	}
719
720	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
721	if (error) {
722		td->td_retval[1] = reg_edx;
723		return (error);
724	}
725
726	td->td_retval[1] = reg_edx;
727	td->td_retval[0] = 0;
728	return (0);
729}
730
731int
732linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
733{
734	int error;
735	struct i386_ioperm_args iia;
736
737	iia.start = args->start;
738	iia.length = args->length;
739	iia.enable = args->enable;
740	mtx_lock(&Giant);
741	error = i386_set_ioperm(td, &iia);
742	mtx_unlock(&Giant);
743	return (error);
744}
745
746int
747linux_iopl(struct thread *td, struct linux_iopl_args *args)
748{
749	int error;
750
751	if (args->level < 0 || args->level > 3)
752		return (EINVAL);
753	if ((error = suser(td)) != 0)
754		return (error);
755	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
756		return (error);
757	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
758	    (args->level * (PSL_IOPL / 3));
759	return (0);
760}
761
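/*
 * modify_ldt(2): func 0x00 reads the LDT into the user buffer; funcs
 * 0x01 and 0x11 write a single l_descriptor, which is converted into a
 * native segment descriptor and installed with i386_set_ldt().
 */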
762int
763linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
764{
765	int error;
766	struct i386_ldt_args ldt;
767	struct l_descriptor ld;
768	union descriptor desc;
769
770	if (uap->ptr == NULL)
771		return (EINVAL);
772
773	switch (uap->func) {
774	case 0x00: /* read_ldt */
775		ldt.start = 0;
776		ldt.descs = uap->ptr;
777		ldt.num = uap->bytecount / sizeof(union descriptor);
778		mtx_lock(&Giant);
779		error = i386_get_ldt(td, &ldt);
780		td->td_retval[0] *= sizeof(union descriptor);
781		mtx_unlock(&Giant);
782		break;
783	case 0x01: /* write_ldt */
784	case 0x11: /* write_ldt */
785		if (uap->bytecount != sizeof(ld))
786			return (EINVAL);
787
788		error = copyin(uap->ptr, &ld, sizeof(ld));
789		if (error)
790			return (error);
791
792		ldt.start = ld.entry_number;
793		ldt.descs = &desc;
794		ldt.num = 1;
795		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
796		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
797		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
798		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
799		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
800			(ld.contents << 2);
801		desc.sd.sd_dpl = 3;
802		desc.sd.sd_p = (ld.seg_not_present ^ 1);
803		desc.sd.sd_xx = 0;
804		desc.sd.sd_def32 = ld.seg_32bit;
805		desc.sd.sd_gran = ld.limit_in_pages;
806		mtx_lock(&Giant);
807		error = i386_set_ldt(td, &ldt, &desc);
808		mtx_unlock(&Giant);
809		break;
810	default:
811		error = EINVAL;
812		break;
813	}
814
815	if (error == EOPNOTSUPP) {
816		printf("linux: modify_ldt needs kernel option USER_LDT\n");
817		error = ENOSYS;
818	}
819
820	return (error);
821}
822
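/*
 * Old-style sigaction(2), which uses the single-word signal mask of
 * l_osigaction_t; widen the mask to a full l_sigset_t for
 * linux_do_sigaction() and narrow the previous action on the way out.
 */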
823int
824linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
825{
826	l_osigaction_t osa;
827	l_sigaction_t act, oact;
828	int error;
829
830#ifdef DEBUG
831	if (ldebug(sigaction))
832		printf(ARGS(sigaction, "%d, %p, %p"),
833		    args->sig, (void *)args->nsa, (void *)args->osa);
834#endif
835
836	if (args->nsa != NULL) {
837		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
838		if (error)
839			return (error);
840		act.lsa_handler = osa.lsa_handler;
841		act.lsa_flags = osa.lsa_flags;
842		act.lsa_restorer = osa.lsa_restorer;
843		LINUX_SIGEMPTYSET(act.lsa_mask);
844		act.lsa_mask.__bits[0] = osa.lsa_mask;
845	}
846
847	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
848	    args->osa ? &oact : NULL);
849
850	if (args->osa != NULL && !error) {
851		osa.lsa_handler = oact.lsa_handler;
852		osa.lsa_flags = oact.lsa_flags;
853		osa.lsa_restorer = oact.lsa_restorer;
854		osa.lsa_mask = oact.lsa_mask.__bits[0];
855		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
856	}
857
858	return (error);
859}
860
861/*
 * Linux has two extra args, restart and oldmask.  We don't use these,
863 * but it seems that "restart" is actually a context pointer that
864 * enables the signal to happen with a different register set.
865 */
866int
867linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
868{
869	sigset_t sigmask;
870	l_sigset_t mask;
871
872#ifdef DEBUG
873	if (ldebug(sigsuspend))
874		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
875#endif
876
877	LINUX_SIGEMPTYSET(mask);
878	mask.__bits[0] = args->mask;
879	linux_to_bsd_sigset(&mask, &sigmask);
880	return (kern_sigsuspend(td, sigmask));
881}
882
883int
884linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
885{
886	l_sigset_t lmask;
887	sigset_t sigmask;
888	int error;
889
890#ifdef DEBUG
891	if (ldebug(rt_sigsuspend))
892		printf(ARGS(rt_sigsuspend, "%p, %d"),
893		    (void *)uap->newset, uap->sigsetsize);
894#endif
895
896	if (uap->sigsetsize != sizeof(l_sigset_t))
897		return (EINVAL);
898
899	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
900	if (error)
901		return (error);
902
903	linux_to_bsd_sigset(&lmask, &sigmask);
904	return (kern_sigsuspend(td, sigmask));
905}
906
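/*
 * pause(2) is emulated as a sigsuspend() with the thread's current
 * signal mask.
 */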
907int
908linux_pause(struct thread *td, struct linux_pause_args *args)
909{
910	struct proc *p = td->td_proc;
911	sigset_t sigmask;
912
913#ifdef DEBUG
914	if (ldebug(pause))
915		printf(ARGS(pause, ""));
916#endif
917
918	PROC_LOCK(p);
919	sigmask = td->td_sigmask;
920	PROC_UNLOCK(p);
921	return (kern_sigsuspend(td, sigmask));
922}
923
924int
925linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
926{
927	stack_t ss, oss;
928	l_stack_t lss;
929	int error;
930
931#ifdef DEBUG
932	if (ldebug(sigaltstack))
933		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
934#endif
935
936	if (uap->uss != NULL) {
937		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
938		if (error)
939			return (error);
940
941		ss.ss_sp = lss.ss_sp;
942		ss.ss_size = lss.ss_size;
943		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
944	}
945	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
946	    (uap->uoss != NULL) ? &oss : NULL);
947	if (!error && uap->uoss != NULL) {
948		lss.ss_sp = oss.ss_sp;
949		lss.ss_size = oss.ss_size;
950		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
951		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
952	}
953
954	return (error);
955}
956
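/*
 * ftruncate64(2) maps directly onto the native ftruncate(), whose length
 * argument is already 64 bits wide.
 */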
957int
958linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
959{
960	struct ftruncate_args sa;
961
962#ifdef DEBUG
963	if (ldebug(ftruncate64))
964		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
965		    (intmax_t)args->length);
966#endif
967
968	sa.fd = args->fd;
969	sa.pad = 0;
970	sa.length = args->length;
971	return ftruncate(td, &sa);
972}
973
974int
975linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
976{
977	struct l_user_desc info;
978	int error;
979	int idx;
980	int a[2];
981	struct segment_descriptor sd;
982
983	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
984	if (error)
985		return (error);
986
987#ifdef DEBUG
988	if (ldebug(set_thread_area))
989	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
990		      info.entry_number,
991      		      info.base_addr,
992      		      info.limit,
993      		      info.seg_32bit,
994		      info.contents,
995      		      info.read_exec_only,
996      		      info.limit_in_pages,
997      		      info.seg_not_present,
998      		      info.useable);
999#endif
1000
1001	idx = info.entry_number;
	/*
	 * Semantics of the Linux version: every thread in the system has
	 * an array of 3 TLS descriptors.  The 1st is GLIBC TLS, the 2nd is
	 * WINE, the 3rd is unknown.  This syscall loads the selected TLS
	 * descriptor with a value and also loads GDT descriptors 6, 7 and
	 * 8 with the content of the per-thread descriptors.
	 *
	 * Semantics of the FreeBSD version: I think we can ignore that
	 * Linux has 3 per-thread descriptors and use just the 1st one.
	 * The tls_array[] is used only in the set/get_thread_area()
	 * syscalls and for loading the GDT descriptors.  In FreeBSD we use
	 * just one GDT descriptor for TLS, so we will load just one.
	 * XXX: this doesn't work when a user-space process tries to use
	 * more than 1 TLS segment; a comment in the Linux sources says
	 * wine might do that.
	 */
1016
	/*
	 * We support just the GLIBC TLS slot for now.  Index 3 must be
	 * allowed to proceed as well, because that is the segment we
	 * actually use, so two subsequent calls will both succeed.
	 */
1022	if (idx != 6 && idx != -1 && idx != 3)
1023		return (EINVAL);
1024
	/*
	 * We have to copy out the GDT entry we use.  FreeBSD uses GDT
	 * entry #3 for storing %gs, so load that one.
	 * XXX: what if a userspace program doesn't check this value and
	 * tries to use 6, 7 or 8?
	 */
1031	idx = info.entry_number = 3;
1032	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1033	if (error)
1034		return (error);
1035
1036	if (LDT_empty(&info)) {
1037		a[0] = 0;
1038		a[1] = 0;
1039	} else {
1040		a[0] = LDT_entry_a(&info);
1041		a[1] = LDT_entry_b(&info);
1042	}
1043
1044	memcpy(&sd, &a, sizeof(a));
1045#ifdef DEBUG
1046	if (ldebug(set_thread_area))
1047	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1048			sd.sd_hibase,
1049			sd.sd_lolimit,
1050			sd.sd_hilimit,
1051			sd.sd_type,
1052			sd.sd_dpl,
1053			sd.sd_p,
1054			sd.sd_xx,
1055			sd.sd_def32,
1056			sd.sd_gran);
1057#endif
1058
1059	/* this is taken from i386 version of cpu_set_user_tls() */
1060	critical_enter();
1061	/* set %gs */
1062	td->td_pcb->pcb_gsd = sd;
1063	PCPU_GET(fsgs_gdt)[1] = sd;
1064	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1065	critical_exit();
1066
1067	return (0);
1068}
1069
1070int
1071linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1072{
1073
1074	struct l_user_desc info;
1075	int error;
1076	int idx;
1077	struct l_desc_struct desc;
1078	struct segment_descriptor sd;
1079
1080#ifdef DEBUG
1081	if (ldebug(get_thread_area))
1082		printf(ARGS(get_thread_area, "%p"), args->desc);
1083#endif
1084
1085	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1086	if (error)
1087		return (error);
1088
1089	idx = info.entry_number;
1090	/* XXX: I am not sure if we want 3 to be allowed too. */
1091	if (idx != 6 && idx != 3)
1092		return (EINVAL);
1093
1094	idx = 3;
1095
1096	memset(&info, 0, sizeof(info));
1097
1098	sd = PCPU_GET(fsgs_gdt)[1];
1099
1100	memcpy(&desc, &sd, sizeof(desc));
1101
1102	info.entry_number = idx;
1103	info.base_addr = GET_BASE(&desc);
1104	info.limit = GET_LIMIT(&desc);
1105	info.seg_32bit = GET_32BIT(&desc);
1106	info.contents = GET_CONTENTS(&desc);
1107	info.read_exec_only = !GET_WRITABLE(&desc);
1108	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1109	info.seg_not_present = !GET_PRESENT(&desc);
1110	info.useable = GET_USEABLE(&desc);
1111
1112	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1113	if (error)
1114	   	return (EFAULT);
1115
1116	return (0);
1117}
1118
1119/* copied from kern/kern_time.c */
1120int
1121linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
1122{
1123   	return ktimer_create(td, (struct ktimer_create_args *) args);
1124}
1125
1126int
1127linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
1128{
1129   	return ktimer_settime(td, (struct ktimer_settime_args *) args);
1130}
1131
1132int
1133linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
1134{
1135   	return ktimer_gettime(td, (struct ktimer_gettime_args *) args);
1136}
1137
1138int
1139linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
1140{
1141   	return ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
1142}
1143
1144int
1145linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
1146{
1147   	return ktimer_delete(td, (struct ktimer_delete_args *) args);
1148}
1149
/* XXX: this won't work with the module - convert it */
1151int
1152linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
1153{
1154#ifdef P1003_1B_MQUEUE
1155   	return kmq_open(td, (struct kmq_open_args *) args);
1156#else
1157	return (ENOSYS);
1158#endif
1159}
1160
1161int
1162linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
1163{
1164#ifdef P1003_1B_MQUEUE
1165   	return kmq_unlink(td, (struct kmq_unlink_args *) args);
1166#else
1167	return (ENOSYS);
1168#endif
1169}
1170
1171int
1172linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
1173{
1174#ifdef P1003_1B_MQUEUE
1175   	return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
1176#else
1177	return (ENOSYS);
1178#endif
1179}
1180
1181int
1182linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
1183{
1184#ifdef P1003_1B_MQUEUE
1185   	return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
1186#else
1187	return (ENOSYS);
1188#endif
1189}
1190
1191int
1192linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
1193{
1194#ifdef P1003_1B_MQUEUE
1195	return kmq_notify(td, (struct kmq_notify_args *) args);
1196#else
1197	return (ENOSYS);
1198#endif
1199}
1200
1201int
1202linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
1203{
1204#ifdef P1003_1B_MQUEUE
1205   	return kmq_setattr(td, (struct kmq_setattr_args *) args);
1206#else
1207	return (ENOSYS);
1208#endif
1209}
1210
1211