linux_machdep.c revision 161419
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 161419 2006-08-17 21:06:48Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/imgact.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mman.h>
38#include <sys/mutex.h>
39#include <sys/sx.h>
40#include <sys/proc.h>
41#include <sys/queue.h>
42#include <sys/resource.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysproto.h>
47#include <sys/unistd.h>
48#include <sys/wait.h>
49
50#include <machine/frame.h>
51#include <machine/psl.h>
52#include <machine/segments.h>
53#include <machine/sysarch.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_map.h>
58
59#include <i386/linux/linux.h>
60#include <i386/linux/linux_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
67
68#include "opt_posix.h"
69
70extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
71
/*
 * Linux-side LDT entry description.  linux_modify_ldt() copies one of
 * these in from user space for the write_ldt functions and converts it
 * into a native segment descriptor.
 */
struct l_descriptor {
	l_uint		entry_number;	/* LDT slot; becomes ldt.start */
	l_ulong		base_addr;	/* split into sd_lobase / sd_hibase */
	l_uint		limit;		/* low 20 bits go to sd_lolimit / sd_hilimit */
	l_uint		seg_32bit:1;	/* copied to sd_def32 */
	l_uint		contents:2;	/* shifted into bits 2-3 of sd_type */
	l_uint		read_exec_only:1;	/* inverted into bit 1 of sd_type */
	l_uint		limit_in_pages:1;	/* copied to sd_gran */
	l_uint		seg_not_present:1;	/* inverted into sd_p */
	l_uint		useable:1;	/* not consulted by linux_modify_ldt() */
};
83
/*
 * Argument block that linux_old_select() copies in from user space; each
 * field is forwarded unchanged to the matching linux_select() argument.
 */
struct l_old_select_argv {
	l_int		nfds;
	l_fd_set	*readfds;
	l_fd_set	*writefds;
	l_fd_set	*exceptfds;
	struct l_timeval	*timeout;
};
91
92int
93linux_to_bsd_sigaltstack(int lsa)
94{
95	int bsa = 0;
96
97	if (lsa & LINUX_SS_DISABLE)
98		bsa |= SS_DISABLE;
99	if (lsa & LINUX_SS_ONSTACK)
100		bsa |= SS_ONSTACK;
101	return (bsa);
102}
103
104int
105bsd_to_linux_sigaltstack(int bsa)
106{
107	int lsa = 0;
108
109	if (bsa & SS_DISABLE)
110		lsa |= LINUX_SS_DISABLE;
111	if (bsa & SS_ONSTACK)
112		lsa |= LINUX_SS_ONSTACK;
113	return (lsa);
114}
115
116int
117linux_execve(struct thread *td, struct linux_execve_args *args)
118{
119	int error;
120	char *newpath;
121	struct image_args eargs;
122
123	LCONVPATHEXIST(td, args->path, &newpath);
124
125#ifdef DEBUG
126	if (ldebug(execve))
127		printf(ARGS(execve, "%s"), newpath);
128#endif
129
130	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
131	    args->argp, args->envp);
132	free(newpath, M_TEMP);
133	if (error == 0)
134		error = kern_execve(td, &eargs, NULL);
135	if (error == 0)
136	   	/* linux process can exec fbsd one, dont attempt
137		 * to create emuldata for such process using
138		 * linux_proc_init, this leads to a panic on KASSERT
139		 * because such process has p->p_emuldata == NULL
140		 */
141	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
142   		   	error = linux_proc_init(td, 0, 0);
143	return (error);
144}
145
/*
 * Indirect argument block for LINUX_MSGRCV: when the upper 16 bits of
 * the ipc "what" argument are zero, linux_ipc() copies this structure in
 * from args->ptr to obtain the message buffer and message type.
 */
struct l_ipc_kludge {
	struct l_msgbuf *msgp;
	l_long msgtyp;
};
150
151int
152linux_ipc(struct thread *td, struct linux_ipc_args *args)
153{
154
155	switch (args->what & 0xFFFF) {
156	case LINUX_SEMOP: {
157		struct linux_semop_args a;
158
159		a.semid = args->arg1;
160		a.tsops = args->ptr;
161		a.nsops = args->arg2;
162		return (linux_semop(td, &a));
163	}
164	case LINUX_SEMGET: {
165		struct linux_semget_args a;
166
167		a.key = args->arg1;
168		a.nsems = args->arg2;
169		a.semflg = args->arg3;
170		return (linux_semget(td, &a));
171	}
172	case LINUX_SEMCTL: {
173		struct linux_semctl_args a;
174		int error;
175
176		a.semid = args->arg1;
177		a.semnum = args->arg2;
178		a.cmd = args->arg3;
179		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
180		if (error)
181			return (error);
182		return (linux_semctl(td, &a));
183	}
184	case LINUX_MSGSND: {
185		struct linux_msgsnd_args a;
186
187		a.msqid = args->arg1;
188		a.msgp = args->ptr;
189		a.msgsz = args->arg2;
190		a.msgflg = args->arg3;
191		return (linux_msgsnd(td, &a));
192	}
193	case LINUX_MSGRCV: {
194		struct linux_msgrcv_args a;
195
196		a.msqid = args->arg1;
197		a.msgsz = args->arg2;
198		a.msgflg = args->arg3;
199		if ((args->what >> 16) == 0) {
200			struct l_ipc_kludge tmp;
201			int error;
202
203			if (args->ptr == NULL)
204				return (EINVAL);
205			error = copyin(args->ptr, &tmp, sizeof(tmp));
206			if (error)
207				return (error);
208			a.msgp = tmp.msgp;
209			a.msgtyp = tmp.msgtyp;
210		} else {
211			a.msgp = args->ptr;
212			a.msgtyp = args->arg5;
213		}
214		return (linux_msgrcv(td, &a));
215	}
216	case LINUX_MSGGET: {
217		struct linux_msgget_args a;
218
219		a.key = args->arg1;
220		a.msgflg = args->arg2;
221		return (linux_msgget(td, &a));
222	}
223	case LINUX_MSGCTL: {
224		struct linux_msgctl_args a;
225
226		a.msqid = args->arg1;
227		a.cmd = args->arg2;
228		a.buf = args->ptr;
229		return (linux_msgctl(td, &a));
230	}
231	case LINUX_SHMAT: {
232		struct linux_shmat_args a;
233
234		a.shmid = args->arg1;
235		a.shmaddr = args->ptr;
236		a.shmflg = args->arg2;
237		a.raddr = (l_ulong *)args->arg3;
238		return (linux_shmat(td, &a));
239	}
240	case LINUX_SHMDT: {
241		struct linux_shmdt_args a;
242
243		a.shmaddr = args->ptr;
244		return (linux_shmdt(td, &a));
245	}
246	case LINUX_SHMGET: {
247		struct linux_shmget_args a;
248
249		a.key = args->arg1;
250		a.size = args->arg2;
251		a.shmflg = args->arg3;
252		return (linux_shmget(td, &a));
253	}
254	case LINUX_SHMCTL: {
255		struct linux_shmctl_args a;
256
257		a.shmid = args->arg1;
258		a.cmd = args->arg2;
259		a.buf = args->ptr;
260		return (linux_shmctl(td, &a));
261	}
262	default:
263		break;
264	}
265
266	return (EINVAL);
267}
268
269int
270linux_old_select(struct thread *td, struct linux_old_select_args *args)
271{
272	struct l_old_select_argv linux_args;
273	struct linux_select_args newsel;
274	int error;
275
276#ifdef DEBUG
277	if (ldebug(old_select))
278		printf(ARGS(old_select, "%p"), args->ptr);
279#endif
280
281	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
282	if (error)
283		return (error);
284
285	newsel.nfds = linux_args.nfds;
286	newsel.readfds = linux_args.readfds;
287	newsel.writefds = linux_args.writefds;
288	newsel.exceptfds = linux_args.exceptfds;
289	newsel.timeout = linux_args.timeout;
290	return (linux_select(td, &newsel));
291}
292
293int
294linux_fork(struct thread *td, struct linux_fork_args *args)
295{
296	int error;
297
298#ifdef DEBUG
299	if (ldebug(fork))
300		printf(ARGS(fork, ""));
301#endif
302
303	if ((error = fork(td, (struct fork_args *)args)) != 0)
304		return (error);
305
306	if (td->td_retval[1] == 1)
307		td->td_retval[0] = 0;
308	error = linux_proc_init(td, td->td_retval[0], 0);
309	if (error)
310		return (error);
311
312	return (0);
313}
314
315int
316linux_vfork(struct thread *td, struct linux_vfork_args *args)
317{
318	int error;
319
320#ifdef DEBUG
321	if (ldebug(vfork))
322		printf(ARGS(vfork, ""));
323#endif
324
325	if ((error = vfork(td, (struct vfork_args *)args)) != 0)
326		return (error);
327	/* Are we the child? */
328	if (td->td_retval[1] == 1)
329		td->td_retval[0] = 0;
330	error = linux_proc_init(td, td->td_retval[0], 0);
331	if (error)
332		return (error);
333	return (0);
334}
335
336int
337linux_clone(struct thread *td, struct linux_clone_args *args)
338{
339	int error, ff = RFPROC | RFSTOPPED;
340	struct proc *p2;
341	struct thread *td2;
342	int exit_signal;
343	struct linux_emuldata *em;
344
345#ifdef DEBUG
346	if (ldebug(clone)) {
347   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
348		    (unsigned int)args->flags, (unsigned int)args->stack,
349		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
350	}
351#endif
352
353	exit_signal = args->flags & 0x000000ff;
354	if (exit_signal >= LINUX_NSIG)
355		return (EINVAL);
356
357	if (exit_signal <= LINUX_SIGTBLSZ)
358		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
359
360	if (args->flags & CLONE_VM)
361		ff |= RFMEM;
362	if (args->flags & CLONE_SIGHAND)
363		ff |= RFSIGSHARE;
364	if (!(args->flags & CLONE_FILES))
365		ff |= RFFDG;
366
367	/*
368	 * Attempt to detect when linux_clone(2) is used for creating
369	 * kernel threads. Unfortunately despite the existence of the
370	 * CLONE_THREAD flag, version of linuxthreads package used in
371	 * most popular distros as of beginning of 2005 doesn't make
372	 * any use of it. Therefore, this detection relay fully on
373	 * empirical observation that linuxthreads sets certain
374	 * combination of flags, so that we can make more or less
375	 * precise detection and notify the FreeBSD kernel that several
376	 * processes are in fact part of the same threading group, so
377	 * that special treatment is necessary for signal delivery
378	 * between those processes and fd locking.
379	 */
380	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
381		ff |= RFTHREAD;
382
383	error = fork1(td, ff, 0, &p2);
384	if (error)
385		return (error);
386
387	/* create the emuldata */
388	error = linux_proc_init(td, p2->p_pid, args->flags);
389	/* reference it - no need to check this */
390	em = em_find(p2, EMUL_UNLOCKED);
391	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
392	/* and adjust it */
393	if (args->flags & CLONE_PARENT_SETTID) {
394	   	if (args->parent_tidptr == NULL) {
395		   	EMUL_UNLOCK(&emul_lock);
396			return (EINVAL);
397		}
398		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
399		if (error) {
400		   	EMUL_UNLOCK(&emul_lock);
401			return (error);
402		}
403	}
404
405	if (args->flags & CLONE_PARENT) {
406#ifdef DEBUG
407	   	printf("linux_clone: CLONE_PARENT\n");
408#endif
409	}
410
411	if (args->flags & CLONE_THREAD) {
412	   	/* XXX: linux mangles pgrp and pptr somehow
413		 * I think it might be this but I am not sure.
414		 */
415#ifdef notyet
416	   	p2->p_pgrp = td->td_proc->p_pgrp;
417	 	p2->p_pptr = td->td_proc->p_pptr;
418#endif
419	 	exit_signal = 0;
420#ifdef DEBUG
421	   	printf("linux_clone: CLONE_THREADS\n");
422#endif
423	}
424
425	if (args->flags & CLONE_CHILD_SETTID)
426		em->child_set_tid = args->child_tidptr;
427	else
428	   	em->child_set_tid = NULL;
429
430	if (args->flags & CLONE_CHILD_CLEARTID)
431		em->child_clear_tid = args->child_tidptr;
432	else
433	   	em->child_clear_tid = NULL;
434	EMUL_UNLOCK(&emul_lock);
435
436	PROC_LOCK(p2);
437	p2->p_sigparent = exit_signal;
438	PROC_UNLOCK(p2);
439	td2 = FIRST_THREAD_IN_PROC(p2);
440	/*
441	 * in a case of stack = NULL we are supposed to COW calling process stack
442	 * this is what normal fork() does so we just keep the tf_esp arg intact
443	 */
444	if (args->stack)
445   	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
446
447	if (args->flags & CLONE_SETTLS) {
448   	   	struct l_user_desc info;
449   	   	int idx;
450	   	int a[2];
451		struct segment_descriptor sd;
452
453	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
454		if (error)
455   		   	return (error);
456
457		idx = info.entry_number;
458
459		/*
460		 * looks like we're getting the idx we returned
461		 * in the set_thread_area() syscall
462		 */
463		if (idx != 6 && idx != 3)
464			return (EINVAL);
465
466		/* this doesnt happen in practice */
467		if (idx == 6) {
468		   	/* we might copy out the entry_number as 3 */
469		   	info.entry_number = 3;
470			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
471			if (error)
472	   		   	return (error);
473		}
474
475		a[0] = LDT_entry_a(&info);
476		a[1] = LDT_entry_b(&info);
477
478		memcpy(&sd, &a, sizeof(a));
479#ifdef DEBUG
480	if (ldebug(clone))
481	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
482			sd.sd_hibase,
483			sd.sd_lolimit,
484			sd.sd_hilimit,
485			sd.sd_type,
486			sd.sd_dpl,
487			sd.sd_p,
488			sd.sd_xx,
489			sd.sd_def32,
490			sd.sd_gran);
491#endif
492
493		/* this is taken from i386 version of cpu_set_user_tls() */
494		critical_enter();
495		/* set %gs */
496		td2->td_pcb->pcb_gsd = sd;
497		PCPU_GET(fsgs_gdt)[1] = sd;
498		load_gs(GSEL(GUGS_SEL, SEL_UPL));
499		critical_exit();
500	}
501
502#ifdef DEBUG
503	if (ldebug(clone))
504		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
505		    (long)p2->p_pid, args->stack, exit_signal);
506#endif
507
508	/*
509	 * Make this runnable after we are finished with it.
510	 */
511	mtx_lock_spin(&sched_lock);
512	TD_SET_CAN_RUN(td2);
513	setrunqueue(td2, SRQ_BORING);
514	mtx_unlock_spin(&sched_lock);
515
516	td->td_retval[0] = p2->p_pid;
517	td->td_retval[1] = 0;
518	return (0);
519}
520
/* XXX move */
/*
 * Linux mmap argument block.  linux_mmap() copies it in from user space
 * and linux_mmap2() fills it from its own arguments; both hand it to
 * linux_mmap_common().
 */
struct l_mmap_argv {
	l_caddr_t	addr;
	l_int		len;
	l_int		prot;
	l_int		flags;
	l_int		fd;
	l_int		pos;	/* byte offset; linux_mmap2() stores pgoff * PAGE_SIZE here */
};
530
/*
 * Sizing constants used by linux_mmap_common() for LINUX_MAP_GROWSDOWN
 * mappings: such a region is given at least STACK_SIZE - GUARD_SIZE bytes.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Shared back end of linux_mmap() and linux_mmap2(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
535
536int
537linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
538{
539	struct l_mmap_argv linux_args;
540
541#ifdef DEBUG
542	if (ldebug(mmap2))
543		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
544		    (void *)args->addr, args->len, args->prot,
545		    args->flags, args->fd, args->pgoff);
546#endif
547
548	linux_args.addr = (l_caddr_t)args->addr;
549	linux_args.len = args->len;
550	linux_args.prot = args->prot;
551	linux_args.flags = args->flags;
552	linux_args.fd = args->fd;
553	linux_args.pos = args->pgoff * PAGE_SIZE;
554
555	return (linux_mmap_common(td, &linux_args));
556}
557
558int
559linux_mmap(struct thread *td, struct linux_mmap_args *args)
560{
561	int error;
562	struct l_mmap_argv linux_args;
563
564	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
565	if (error)
566		return (error);
567
568#ifdef DEBUG
569	if (ldebug(mmap))
570		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
571		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
572		    linux_args.flags, linux_args.fd, linux_args.pos);
573#endif
574
575	return (linux_mmap_common(td, &linux_args));
576}
577
/*
 * Common implementation of the Linux mmap calls.
 *
 * Translates the Linux mapping flags to native ones, adjusts address and
 * length for LINUX_MAP_GROWSDOWN (stack) mappings, and calls the native
 * mmap().  File-backed mappings additionally get MAP_NOSYNC, anonymous
 * mappings use fd -1, and PROT_READ is always added to the requested
 * protection.  Returns the result of mmap().
 */
static int
linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
{
	struct proc *p = td->td_proc;
	struct mmap_args /* {
		caddr_t addr;
		size_t len;
		int prot;
		int flags;
		int fd;
		long pad;
		off_t pos;
	} */ bsd_args;
	int error;

	error = 0;
	bsd_args.flags = 0;
	if (linux_args->flags & LINUX_MAP_SHARED)
		bsd_args.flags |= MAP_SHARED;
	if (linux_args->flags & LINUX_MAP_PRIVATE)
		bsd_args.flags |= MAP_PRIVATE;
	if (linux_args->flags & LINUX_MAP_FIXED)
		bsd_args.flags |= MAP_FIXED;
	if (linux_args->flags & LINUX_MAP_ANON)
		bsd_args.flags |= MAP_ANON;
	else
		bsd_args.flags |= MAP_NOSYNC;
	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
		bsd_args.flags |= MAP_STACK;

		/*
		 * The linux MAP_GROWSDOWN option does not limit auto
		 * growth of the region.  Linux mmap with this option
		 * takes as addr the initial BOS, and as len, the initial
		 * region size.  It can then grow down from addr without
		 * limit.  However, linux threads has an implicit internal
		 * limit to stack size of STACK_SIZE.  It's just not
		 * enforced explicitly in linux.  But, here we impose
		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
		 * region, since we can do this with our mmap.
		 *
		 * Our mmap with MAP_STACK takes addr as the maximum
		 * downsize limit on BOS, and as len the max size of
		 * the region.  It then maps the top SGROWSIZ bytes,
		 * and autogrows the region down, up to the limit
		 * in addr.
		 *
		 * If we don't use the MAP_STACK option, the effect
		 * of this code is to allocate a stack region of a
		 * fixed size of (STACK_SIZE - GUARD_SIZE).
		 */

		/* This gives us TOS */
		bsd_args.addr = linux_args->addr + linux_args->len;

		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
			/*
			 * Some linux apps will attempt to mmap
			 * thread stacks near the top of their
			 * address space.  If their TOS is greater
			 * than vm_maxsaddr, vm_map_growstack()
			 * will confuse the thread stack with the
			 * process stack and deliver a SEGV if they
			 * attempt to grow the thread stack past their
			 * current stacksize rlimit.  To avoid this,
			 * adjust vm_maxsaddr upwards to reflect
			 * the current stacksize rlimit rather
			 * than the maximum possible stacksize.
			 * It would be better to adjust the
			 * mmap'ed region, but some apps do not check
			 * mmap's return value.
			 */
			PROC_LOCK(p);
			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
			    lim_cur(p, RLIMIT_STACK);
			PROC_UNLOCK(p);
		}

		/* This gives us our maximum stack size */
		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
			bsd_args.len = linux_args->len;
		else
			bsd_args.len  = STACK_SIZE - GUARD_SIZE;

		/*
		 * This gives us a new BOS.  If we're using VM_STACK, then
		 * mmap will just map the top SGROWSIZ bytes, and let
		 * the stack grow down to the limit at BOS.  If we're
		 * not using VM_STACK we map the full stack, since we
		 * don't have a way to autogrow it.
		 */
		bsd_args.addr -= bsd_args.len;
	} else {
		/* Ordinary mapping: address and length pass through unchanged. */
		bsd_args.addr = linux_args->addr;
		bsd_args.len  = linux_args->len;
	}

	bsd_args.prot = linux_args->prot | PROT_READ;	/* always required */
	/* Anonymous mappings ignore the caller's descriptor. */
	if (linux_args->flags & LINUX_MAP_ANON)
		bsd_args.fd = -1;
	else
		bsd_args.fd = linux_args->fd;
	bsd_args.pos = linux_args->pos;
	bsd_args.pad = 0;

#ifdef DEBUG
	if (ldebug(mmap))
		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
		    __func__,
		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
	error = mmap(td, &bsd_args);
#ifdef DEBUG
	if (ldebug(mmap))
		printf("-> %s() return: 0x%x (0x%08x)\n",
			__func__, error, (u_int)td->td_retval[0]);
#endif
	return (error);
}
698
699int
700linux_pipe(struct thread *td, struct linux_pipe_args *args)
701{
702	int error;
703	int reg_edx;
704
705#ifdef DEBUG
706	if (ldebug(pipe))
707		printf(ARGS(pipe, "*"));
708#endif
709
710	reg_edx = td->td_retval[1];
711	error = pipe(td, 0);
712	if (error) {
713		td->td_retval[1] = reg_edx;
714		return (error);
715	}
716
717	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
718	if (error) {
719		td->td_retval[1] = reg_edx;
720		return (error);
721	}
722
723	td->td_retval[1] = reg_edx;
724	td->td_retval[0] = 0;
725	return (0);
726}
727
728int
729linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
730{
731	int error;
732	struct i386_ioperm_args iia;
733
734	iia.start = args->start;
735	iia.length = args->length;
736	iia.enable = args->enable;
737	mtx_lock(&Giant);
738	error = i386_set_ioperm(td, &iia);
739	mtx_unlock(&Giant);
740	return (error);
741}
742
743int
744linux_iopl(struct thread *td, struct linux_iopl_args *args)
745{
746	int error;
747
748	if (args->level < 0 || args->level > 3)
749		return (EINVAL);
750	if ((error = suser(td)) != 0)
751		return (error);
752	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
753		return (error);
754	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
755	    (args->level * (PSL_IOPL / 3));
756	return (0);
757}
758
759int
760linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
761{
762	int error;
763	struct i386_ldt_args ldt;
764	struct l_descriptor ld;
765	union descriptor desc;
766
767	if (uap->ptr == NULL)
768		return (EINVAL);
769
770	switch (uap->func) {
771	case 0x00: /* read_ldt */
772		ldt.start = 0;
773		ldt.descs = uap->ptr;
774		ldt.num = uap->bytecount / sizeof(union descriptor);
775		mtx_lock(&Giant);
776		error = i386_get_ldt(td, &ldt);
777		td->td_retval[0] *= sizeof(union descriptor);
778		mtx_unlock(&Giant);
779		break;
780	case 0x01: /* write_ldt */
781	case 0x11: /* write_ldt */
782		if (uap->bytecount != sizeof(ld))
783			return (EINVAL);
784
785		error = copyin(uap->ptr, &ld, sizeof(ld));
786		if (error)
787			return (error);
788
789		ldt.start = ld.entry_number;
790		ldt.descs = &desc;
791		ldt.num = 1;
792		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
793		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
794		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
795		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
796		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
797			(ld.contents << 2);
798		desc.sd.sd_dpl = 3;
799		desc.sd.sd_p = (ld.seg_not_present ^ 1);
800		desc.sd.sd_xx = 0;
801		desc.sd.sd_def32 = ld.seg_32bit;
802		desc.sd.sd_gran = ld.limit_in_pages;
803		mtx_lock(&Giant);
804		error = i386_set_ldt(td, &ldt, &desc);
805		mtx_unlock(&Giant);
806		break;
807	default:
808		error = EINVAL;
809		break;
810	}
811
812	if (error == EOPNOTSUPP) {
813		printf("linux: modify_ldt needs kernel option USER_LDT\n");
814		error = ENOSYS;
815	}
816
817	return (error);
818}
819
820int
821linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
822{
823	l_osigaction_t osa;
824	l_sigaction_t act, oact;
825	int error;
826
827#ifdef DEBUG
828	if (ldebug(sigaction))
829		printf(ARGS(sigaction, "%d, %p, %p"),
830		    args->sig, (void *)args->nsa, (void *)args->osa);
831#endif
832
833	if (args->nsa != NULL) {
834		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
835		if (error)
836			return (error);
837		act.lsa_handler = osa.lsa_handler;
838		act.lsa_flags = osa.lsa_flags;
839		act.lsa_restorer = osa.lsa_restorer;
840		LINUX_SIGEMPTYSET(act.lsa_mask);
841		act.lsa_mask.__bits[0] = osa.lsa_mask;
842	}
843
844	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
845	    args->osa ? &oact : NULL);
846
847	if (args->osa != NULL && !error) {
848		osa.lsa_handler = oact.lsa_handler;
849		osa.lsa_flags = oact.lsa_flags;
850		osa.lsa_restorer = oact.lsa_restorer;
851		osa.lsa_mask = oact.lsa_mask.__bits[0];
852		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
853	}
854
855	return (error);
856}
857
858/*
859 * Linux has two extra args, restart and oldmask.  We dont use these,
860 * but it seems that "restart" is actually a context pointer that
861 * enables the signal to happen with a different register set.
862 */
863int
864linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
865{
866	sigset_t sigmask;
867	l_sigset_t mask;
868
869#ifdef DEBUG
870	if (ldebug(sigsuspend))
871		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
872#endif
873
874	LINUX_SIGEMPTYSET(mask);
875	mask.__bits[0] = args->mask;
876	linux_to_bsd_sigset(&mask, &sigmask);
877	return (kern_sigsuspend(td, sigmask));
878}
879
880int
881linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
882{
883	l_sigset_t lmask;
884	sigset_t sigmask;
885	int error;
886
887#ifdef DEBUG
888	if (ldebug(rt_sigsuspend))
889		printf(ARGS(rt_sigsuspend, "%p, %d"),
890		    (void *)uap->newset, uap->sigsetsize);
891#endif
892
893	if (uap->sigsetsize != sizeof(l_sigset_t))
894		return (EINVAL);
895
896	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
897	if (error)
898		return (error);
899
900	linux_to_bsd_sigset(&lmask, &sigmask);
901	return (kern_sigsuspend(td, sigmask));
902}
903
904int
905linux_pause(struct thread *td, struct linux_pause_args *args)
906{
907	struct proc *p = td->td_proc;
908	sigset_t sigmask;
909
910#ifdef DEBUG
911	if (ldebug(pause))
912		printf(ARGS(pause, ""));
913#endif
914
915	PROC_LOCK(p);
916	sigmask = td->td_sigmask;
917	PROC_UNLOCK(p);
918	return (kern_sigsuspend(td, sigmask));
919}
920
921int
922linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
923{
924	stack_t ss, oss;
925	l_stack_t lss;
926	int error;
927
928#ifdef DEBUG
929	if (ldebug(sigaltstack))
930		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
931#endif
932
933	if (uap->uss != NULL) {
934		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
935		if (error)
936			return (error);
937
938		ss.ss_sp = lss.ss_sp;
939		ss.ss_size = lss.ss_size;
940		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
941	}
942	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
943	    (uap->uoss != NULL) ? &oss : NULL);
944	if (!error && uap->uoss != NULL) {
945		lss.ss_sp = oss.ss_sp;
946		lss.ss_size = oss.ss_size;
947		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
948		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
949	}
950
951	return (error);
952}
953
954int
955linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
956{
957	struct ftruncate_args sa;
958
959#ifdef DEBUG
960	if (ldebug(ftruncate64))
961		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
962		    (intmax_t)args->length);
963#endif
964
965	sa.fd = args->fd;
966	sa.pad = 0;
967	sa.length = args->length;
968	return ftruncate(td, &sa);
969}
970
971int
972linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
973{
974	struct l_user_desc info;
975	int error;
976	int idx;
977	int a[2];
978	struct segment_descriptor sd;
979
980	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
981	if (error)
982		return (error);
983
984#ifdef DEBUG
985	if (ldebug(set_thread_area))
986	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
987		      info.entry_number,
988      		      info.base_addr,
989      		      info.limit,
990      		      info.seg_32bit,
991		      info.contents,
992      		      info.read_exec_only,
993      		      info.limit_in_pages,
994      		      info.seg_not_present,
995      		      info.useable);
996#endif
997
998	idx = info.entry_number;
999	/*
1000	 * Semantics of linux version: every thread in the system has array
1001	 * of 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
1002	 * syscall loads one of the selected tls decriptors with a value
1003	 * and also loads GDT descriptors 6, 7 and 8 with the content of the per-thread
1004	 * descriptors.
1005	 *
1006	 * Semantics of fbsd version: I think we can ignore that linux has 3 per-thread
1007	 * descriptors and use just the 1st one. The tls_array[] is used only in
1008	 * set/get-thread_area() syscalls and for loading the GDT descriptors. In fbsd
1009	 * we use just one GDT descriptor for TLS so we will load just one.
1010	 * XXX: this doesnt work when user-space process tries to use more then 1 TLS segment
1011	 * comment in the linux sources says wine might do that.
1012	 */
1013
1014	/*
1015	 * we support just GLIBC TLS now
1016	 * we should let 3 proceed as well because we use this segment so
1017	 * if code does two subsequent calls it should succeed
1018	 */
1019	if (idx != 6 && idx != -1 && idx != 3)
1020		return (EINVAL);
1021
1022	/*
1023	 * we have to copy out the GDT entry we use
1024	 * FreeBSD uses GDT entry #3 for storing %gs so load that
1025	 * XXX: what if userspace program doesnt check this value and tries
1026	 * to use 6, 7 or 8?
1027	 */
1028	idx = info.entry_number = 3;
1029	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1030	if (error)
1031		return (error);
1032
1033	if (LDT_empty(&info)) {
1034		a[0] = 0;
1035		a[1] = 0;
1036	} else {
1037		a[0] = LDT_entry_a(&info);
1038		a[1] = LDT_entry_b(&info);
1039	}
1040
1041	memcpy(&sd, &a, sizeof(a));
1042#ifdef DEBUG
1043	if (ldebug(set_thread_area))
1044	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1045			sd.sd_hibase,
1046			sd.sd_lolimit,
1047			sd.sd_hilimit,
1048			sd.sd_type,
1049			sd.sd_dpl,
1050			sd.sd_p,
1051			sd.sd_xx,
1052			sd.sd_def32,
1053			sd.sd_gran);
1054#endif
1055
1056	/* this is taken from i386 version of cpu_set_user_tls() */
1057	critical_enter();
1058	/* set %gs */
1059	td->td_pcb->pcb_gsd = sd;
1060	PCPU_GET(fsgs_gdt)[1] = sd;
1061	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1062	critical_exit();
1063
1064	return (0);
1065}
1066
1067int
1068linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1069{
1070
1071	struct l_user_desc info;
1072	int error;
1073	int idx;
1074	struct l_desc_struct desc;
1075	struct segment_descriptor sd;
1076
1077#ifdef DEBUG
1078	if (ldebug(get_thread_area))
1079		printf(ARGS(get_thread_area, "%p"), args->desc);
1080#endif
1081
1082	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1083	if (error)
1084		return (error);
1085
1086	idx = info.entry_number;
1087	/* XXX: I am not sure if we want 3 to be allowed too. */
1088	if (idx != 6 && idx != 3)
1089		return (EINVAL);
1090
1091	idx = 3;
1092
1093	memset(&info, 0, sizeof(info));
1094
1095	sd = PCPU_GET(fsgs_gdt)[1];
1096
1097	memcpy(&desc, &sd, sizeof(desc));
1098
1099	info.entry_number = idx;
1100	info.base_addr = GET_BASE(&desc);
1101	info.limit = GET_LIMIT(&desc);
1102	info.seg_32bit = GET_32BIT(&desc);
1103	info.contents = GET_CONTENTS(&desc);
1104	info.read_exec_only = !GET_WRITABLE(&desc);
1105	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1106	info.seg_not_present = !GET_PRESENT(&desc);
1107	info.useable = GET_USEABLE(&desc);
1108
1109	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1110	if (error)
1111	   	return (EFAULT);
1112
1113	return (0);
1114}
1115
1116/* copied from kern/kern_time.c */
/*
 * Linux timer_create(2): forwarded to ktimer_create() with the Linux
 * argument structure reinterpreted as the native one.
 */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	struct ktimer_create_args *kargs;

	kargs = (struct ktimer_create_args *)args;
	return (ktimer_create(td, kargs));
}
1122
/*
 * Linux timer_settime(2): forwarded to ktimer_settime() with the Linux
 * argument structure reinterpreted as the native one.
 */
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	struct ktimer_settime_args *kargs;

	kargs = (struct ktimer_settime_args *)args;
	return (ktimer_settime(td, kargs));
}
1128
/*
 * Linux timer_gettime(2): forwarded to ktimer_gettime() with the Linux
 * argument structure reinterpreted as the native one.
 */
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	struct ktimer_gettime_args *kargs;

	kargs = (struct ktimer_gettime_args *)args;
	return (ktimer_gettime(td, kargs));
}
1134
/*
 * Linux timer_getoverrun(2): forwarded to ktimer_getoverrun() with the
 * Linux argument structure reinterpreted as the native one.
 */
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	struct ktimer_getoverrun_args *kargs;

	kargs = (struct ktimer_getoverrun_args *)args;
	return (ktimer_getoverrun(td, kargs));
}
1140
/*
 * Linux timer_delete(2): forwarded to ktimer_delete() with the Linux
 * argument structure reinterpreted as the native one.
 */
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	struct ktimer_delete_args *kargs;

	kargs = (struct ktimer_delete_args *)args;
	return (ktimer_delete(td, kargs));
}
1146
/* XXX: this won't work when built as a module - convert it */
1148int
1149linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
1150{
1151#ifdef P1003_1B_MQUEUE
1152   	return kmq_open(td, (struct kmq_open_args *) args);
1153#else
1154	return (ENOSYS);
1155#endif
1156}
1157
1158int
1159linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
1160{
1161#ifdef P1003_1B_MQUEUE
1162   	return kmq_unlink(td, (struct kmq_unlink_args *) args);
1163#else
1164	return (ENOSYS);
1165#endif
1166}
1167
1168int
1169linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
1170{
1171#ifdef P1003_1B_MQUEUE
1172   	return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
1173#else
1174	return (ENOSYS);
1175#endif
1176}
1177
1178int
1179linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
1180{
1181#ifdef P1003_1B_MQUEUE
1182   	return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
1183#else
1184	return (ENOSYS);
1185#endif
1186}
1187
1188int
1189linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
1190{
1191#ifdef P1003_1B_MQUEUE
1192	return kmq_notify(td, (struct kmq_notify_args *) args);
1193#else
1194	return (ENOSYS);
1195#endif
1196}
1197
1198int
1199linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
1200{
1201#ifdef P1003_1B_MQUEUE
1202   	return kmq_setattr(td, (struct kmq_setattr_args *) args);
1203#else
1204	return (ENOSYS);
1205#endif
1206}
1207
1208