linux_machdep.c revision 161310
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 161310 2006-08-15 12:54:30Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/imgact.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mman.h>
38#include <sys/mutex.h>
39#include <sys/sx.h>
40#include <sys/proc.h>
41#include <sys/queue.h>
42#include <sys/resource.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/sysproto.h>
47#include <sys/unistd.h>
48#include <sys/wait.h>
49
50#include <machine/frame.h>
51#include <machine/psl.h>
52#include <machine/segments.h>
53#include <machine/sysarch.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_map.h>
58
59#include <i386/linux/linux.h>
60#include <i386/linux/linux_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
67
68#include "opt_posix.h"
69
70extern struct sx emul_shared_lock;
71extern struct sx emul_lock;
72
73extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
74
75struct l_descriptor {
76	l_uint		entry_number;
77	l_ulong		base_addr;
78	l_uint		limit;
79	l_uint		seg_32bit:1;
80	l_uint		contents:2;
81	l_uint		read_exec_only:1;
82	l_uint		limit_in_pages:1;
83	l_uint		seg_not_present:1;
84	l_uint		useable:1;
85};
86
87struct l_old_select_argv {
88	l_int		nfds;
89	l_fd_set	*readfds;
90	l_fd_set	*writefds;
91	l_fd_set	*exceptfds;
92	struct l_timeval	*timeout;
93};
94
95int
96linux_to_bsd_sigaltstack(int lsa)
97{
98	int bsa = 0;
99
100	if (lsa & LINUX_SS_DISABLE)
101		bsa |= SS_DISABLE;
102	if (lsa & LINUX_SS_ONSTACK)
103		bsa |= SS_ONSTACK;
104	return (bsa);
105}
106
107int
108bsd_to_linux_sigaltstack(int bsa)
109{
110	int lsa = 0;
111
112	if (bsa & SS_DISABLE)
113		lsa |= LINUX_SS_DISABLE;
114	if (bsa & SS_ONSTACK)
115		lsa |= LINUX_SS_ONSTACK;
116	return (lsa);
117}
118
119int
120linux_execve(struct thread *td, struct linux_execve_args *args)
121{
122	int error;
123	char *newpath;
124	struct image_args eargs;
125
126	LCONVPATHEXIST(td, args->path, &newpath);
127
128#ifdef DEBUG
129	if (ldebug(execve))
130		printf(ARGS(execve, "%s"), newpath);
131#endif
132
133	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
134	    args->argp, args->envp);
135	free(newpath, M_TEMP);
136	if (error == 0)
137		error = kern_execve(td, &eargs, NULL);
138	if (error == 0)
139	   	/* linux process can exec fbsd one, dont attempt
140		 * to create emuldata for such process using
141		 * linux_proc_init, this leads to a panic on KASSERT
142		 * because such process has p->p_emuldata == NULL
143		 */
144	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
145   		   	error = linux_proc_init(td, 0, 0);
146	return (error);
147}
148
149struct l_ipc_kludge {
150	struct l_msgbuf *msgp;
151	l_long msgtyp;
152};
153
154int
155linux_ipc(struct thread *td, struct linux_ipc_args *args)
156{
157
158	switch (args->what & 0xFFFF) {
159	case LINUX_SEMOP: {
160		struct linux_semop_args a;
161
162		a.semid = args->arg1;
163		a.tsops = args->ptr;
164		a.nsops = args->arg2;
165		return (linux_semop(td, &a));
166	}
167	case LINUX_SEMGET: {
168		struct linux_semget_args a;
169
170		a.key = args->arg1;
171		a.nsems = args->arg2;
172		a.semflg = args->arg3;
173		return (linux_semget(td, &a));
174	}
175	case LINUX_SEMCTL: {
176		struct linux_semctl_args a;
177		int error;
178
179		a.semid = args->arg1;
180		a.semnum = args->arg2;
181		a.cmd = args->arg3;
182		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
183		if (error)
184			return (error);
185		return (linux_semctl(td, &a));
186	}
187	case LINUX_MSGSND: {
188		struct linux_msgsnd_args a;
189
190		a.msqid = args->arg1;
191		a.msgp = args->ptr;
192		a.msgsz = args->arg2;
193		a.msgflg = args->arg3;
194		return (linux_msgsnd(td, &a));
195	}
196	case LINUX_MSGRCV: {
197		struct linux_msgrcv_args a;
198
199		a.msqid = args->arg1;
200		a.msgsz = args->arg2;
201		a.msgflg = args->arg3;
202		if ((args->what >> 16) == 0) {
203			struct l_ipc_kludge tmp;
204			int error;
205
206			if (args->ptr == NULL)
207				return (EINVAL);
208			error = copyin(args->ptr, &tmp, sizeof(tmp));
209			if (error)
210				return (error);
211			a.msgp = tmp.msgp;
212			a.msgtyp = tmp.msgtyp;
213		} else {
214			a.msgp = args->ptr;
215			a.msgtyp = args->arg5;
216		}
217		return (linux_msgrcv(td, &a));
218	}
219	case LINUX_MSGGET: {
220		struct linux_msgget_args a;
221
222		a.key = args->arg1;
223		a.msgflg = args->arg2;
224		return (linux_msgget(td, &a));
225	}
226	case LINUX_MSGCTL: {
227		struct linux_msgctl_args a;
228
229		a.msqid = args->arg1;
230		a.cmd = args->arg2;
231		a.buf = args->ptr;
232		return (linux_msgctl(td, &a));
233	}
234	case LINUX_SHMAT: {
235		struct linux_shmat_args a;
236
237		a.shmid = args->arg1;
238		a.shmaddr = args->ptr;
239		a.shmflg = args->arg2;
240		a.raddr = (l_ulong *)args->arg3;
241		return (linux_shmat(td, &a));
242	}
243	case LINUX_SHMDT: {
244		struct linux_shmdt_args a;
245
246		a.shmaddr = args->ptr;
247		return (linux_shmdt(td, &a));
248	}
249	case LINUX_SHMGET: {
250		struct linux_shmget_args a;
251
252		a.key = args->arg1;
253		a.size = args->arg2;
254		a.shmflg = args->arg3;
255		return (linux_shmget(td, &a));
256	}
257	case LINUX_SHMCTL: {
258		struct linux_shmctl_args a;
259
260		a.shmid = args->arg1;
261		a.cmd = args->arg2;
262		a.buf = args->ptr;
263		return (linux_shmctl(td, &a));
264	}
265	default:
266		break;
267	}
268
269	return (EINVAL);
270}
271
272int
273linux_old_select(struct thread *td, struct linux_old_select_args *args)
274{
275	struct l_old_select_argv linux_args;
276	struct linux_select_args newsel;
277	int error;
278
279#ifdef DEBUG
280	if (ldebug(old_select))
281		printf(ARGS(old_select, "%p"), args->ptr);
282#endif
283
284	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
285	if (error)
286		return (error);
287
288	newsel.nfds = linux_args.nfds;
289	newsel.readfds = linux_args.readfds;
290	newsel.writefds = linux_args.writefds;
291	newsel.exceptfds = linux_args.exceptfds;
292	newsel.timeout = linux_args.timeout;
293	return (linux_select(td, &newsel));
294}
295
296int
297linux_fork(struct thread *td, struct linux_fork_args *args)
298{
299	int error;
300
301#ifdef DEBUG
302	if (ldebug(fork))
303		printf(ARGS(fork, ""));
304#endif
305
306	if ((error = fork(td, (struct fork_args *)args)) != 0)
307		return (error);
308
309	if (td->td_retval[1] == 1)
310		td->td_retval[0] = 0;
311	error = linux_proc_init(td, td->td_retval[0], 0);
312	if (error)
313		return (error);
314
315	return (0);
316}
317
318int
319linux_vfork(struct thread *td, struct linux_vfork_args *args)
320{
321	int error;
322
323#ifdef DEBUG
324	if (ldebug(vfork))
325		printf(ARGS(vfork, ""));
326#endif
327
328	if ((error = vfork(td, (struct vfork_args *)args)) != 0)
329		return (error);
330	/* Are we the child? */
331	if (td->td_retval[1] == 1)
332		td->td_retval[0] = 0;
333	error = linux_proc_init(td, td->td_retval[0], 0);
334	if (error)
335		return (error);
336	return (0);
337}
338
339int
340linux_clone(struct thread *td, struct linux_clone_args *args)
341{
342	int error, ff = RFPROC | RFSTOPPED;
343	struct proc *p2;
344	struct thread *td2;
345	int exit_signal;
346	struct linux_emuldata *em;
347
348#ifdef DEBUG
349	if (ldebug(clone)) {
350   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
351		    (unsigned int)args->flags, (unsigned int)args->stack,
352		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
353	}
354#endif
355
356	exit_signal = args->flags & 0x000000ff;
357	if (exit_signal >= LINUX_NSIG)
358		return (EINVAL);
359
360	if (exit_signal <= LINUX_SIGTBLSZ)
361		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
362
363	if (args->flags & CLONE_VM)
364		ff |= RFMEM;
365	if (args->flags & CLONE_SIGHAND)
366		ff |= RFSIGSHARE;
367	if (!(args->flags & CLONE_FILES))
368		ff |= RFFDG;
369
370	/*
371	 * Attempt to detect when linux_clone(2) is used for creating
372	 * kernel threads. Unfortunately despite the existence of the
373	 * CLONE_THREAD flag, version of linuxthreads package used in
374	 * most popular distros as of beginning of 2005 doesn't make
375	 * any use of it. Therefore, this detection relay fully on
376	 * empirical observation that linuxthreads sets certain
377	 * combination of flags, so that we can make more or less
378	 * precise detection and notify the FreeBSD kernel that several
379	 * processes are in fact part of the same threading group, so
380	 * that special treatment is necessary for signal delivery
381	 * between those processes and fd locking.
382	 */
383	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
384		ff |= RFTHREAD;
385
386	error = fork1(td, ff, 0, &p2);
387	if (error)
388		return (error);
389
390	/* create the emuldata */
391	error = linux_proc_init(td, p2->p_pid, args->flags);
392	/* reference it - no need to check this */
393	em = em_find(p2, EMUL_UNLOCKED);
394	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
395	/* and adjust it */
396	if (args->flags & CLONE_PARENT_SETTID) {
397	   	if (args->parent_tidptr == NULL) {
398		   	EMUL_UNLOCK(&emul_lock);
399			return (EINVAL);
400		}
401		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
402		if (error) {
403		   	EMUL_UNLOCK(&emul_lock);
404			return (error);
405		}
406	}
407
408	if (args->flags & CLONE_PARENT) {
409#ifdef DEBUG
410	   	printf("linux_clone: CLONE_PARENT\n");
411#endif
412	}
413
414	if (args->flags & CLONE_THREAD) {
415	   	/* XXX: linux mangles pgrp and pptr somehow
416		 * I think it might be this but I am not sure.
417		 */
418#ifdef notyet
419	   	p2->p_pgrp = td->td_proc->p_pgrp;
420	 	p2->p_pptr = td->td_proc->p_pptr;
421#endif
422	 	exit_signal = 0;
423#ifdef DEBUG
424	   	printf("linux_clone: CLONE_THREADS\n");
425#endif
426	}
427
428	if (args->flags & CLONE_CHILD_SETTID)
429		em->child_set_tid = args->child_tidptr;
430	else
431	   	em->child_set_tid = NULL;
432
433	if (args->flags & CLONE_CHILD_CLEARTID)
434		em->child_clear_tid = args->child_tidptr;
435	else
436	   	em->child_clear_tid = NULL;
437	EMUL_UNLOCK(&emul_lock);
438
439	PROC_LOCK(p2);
440	p2->p_sigparent = exit_signal;
441	PROC_UNLOCK(p2);
442	td2 = FIRST_THREAD_IN_PROC(p2);
443	/* in a case of stack = NULL we are supposed to COW calling process stack
444	 * this is what normal fork() does so we just keep the tf_esp arg intact
445	 */
446	if (args->stack)
447   	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
448
449	if (args->flags & CLONE_SETTLS) {
450   	   	struct l_user_desc info;
451   	   	int idx;
452	   	int a[2];
453		struct segment_descriptor sd;
454
455	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
456		if (error)
457   		   	return (error);
458
459		idx = info.entry_number;
460
461		/* looks like we're getting the idx we returned
462		 * in the set_thread_area() syscall
463		 */
464		if (idx != 6 && idx != 3)
465			return (EINVAL);
466
467		/* this doesnt happen in practice */
468		if (idx == 6) {
469		   	/* we might copy out the entry_number as 3 */
470		   	info.entry_number = 3;
471			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
472			if (error)
473	   		   	return (error);
474		}
475
476		a[0] = LDT_entry_a(&info);
477		a[1] = LDT_entry_b(&info);
478
479		memcpy(&sd, &a, sizeof(a));
480#ifdef DEBUG
481	if (ldebug(clone))
482	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
483			sd.sd_hibase,
484			sd.sd_lolimit,
485			sd.sd_hilimit,
486			sd.sd_type,
487			sd.sd_dpl,
488			sd.sd_p,
489			sd.sd_xx,
490			sd.sd_def32,
491			sd.sd_gran);
492#endif
493
494		/* this is taken from i386 version of cpu_set_user_tls() */
495		critical_enter();
496		/* set %gs */
497		td2->td_pcb->pcb_gsd = sd;
498		PCPU_GET(fsgs_gdt)[1] = sd;
499		load_gs(GSEL(GUGS_SEL, SEL_UPL));
500		critical_exit();
501	}
502
503#ifdef DEBUG
504	if (ldebug(clone))
505		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
506		    (long)p2->p_pid, args->stack, exit_signal);
507#endif
508
509	/*
510	 * Make this runnable after we are finished with it.
511	 */
512	mtx_lock_spin(&sched_lock);
513	TD_SET_CAN_RUN(td2);
514	setrunqueue(td2, SRQ_BORING);
515	mtx_unlock_spin(&sched_lock);
516
517	td->td_retval[0] = p2->p_pid;
518	td->td_retval[1] = 0;
519	return (0);
520}
521
522/* XXX move */
523struct l_mmap_argv {
524	l_caddr_t	addr;
525	l_int		len;
526	l_int		prot;
527	l_int		flags;
528	l_int		fd;
529	l_int		pos;
530};
531
/*
 * Sizes used by linux_mmap_common() for LINUX_MAP_GROWSDOWN mappings: the
 * region is given a length of at least (STACK_SIZE - GUARD_SIZE).
 */
#define	STACK_SIZE	(2 * 1024 * 1024)
#define	GUARD_SIZE	(4 * PAGE_SIZE)

/* Shared back end of linux_mmap() and linux_mmap2(). */
static int	linux_mmap_common(struct thread *td,
		    struct l_mmap_argv *linux_args);
536
537int
538linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
539{
540	struct l_mmap_argv linux_args;
541
542#ifdef DEBUG
543	if (ldebug(mmap2))
544		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
545		    (void *)args->addr, args->len, args->prot,
546		    args->flags, args->fd, args->pgoff);
547#endif
548
549	linux_args.addr = (l_caddr_t)args->addr;
550	linux_args.len = args->len;
551	linux_args.prot = args->prot;
552	linux_args.flags = args->flags;
553	linux_args.fd = args->fd;
554	linux_args.pos = args->pgoff * PAGE_SIZE;
555
556	return (linux_mmap_common(td, &linux_args));
557}
558
559int
560linux_mmap(struct thread *td, struct linux_mmap_args *args)
561{
562	int error;
563	struct l_mmap_argv linux_args;
564
565	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
566	if (error)
567		return (error);
568
569#ifdef DEBUG
570	if (ldebug(mmap))
571		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
572		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
573		    linux_args.flags, linux_args.fd, linux_args.pos);
574#endif
575
576	return (linux_mmap_common(td, &linux_args));
577}
578
579static int
580linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
581{
582	struct proc *p = td->td_proc;
583	struct mmap_args /* {
584		caddr_t addr;
585		size_t len;
586		int prot;
587		int flags;
588		int fd;
589		long pad;
590		off_t pos;
591	} */ bsd_args;
592	int error;
593
594	error = 0;
595	bsd_args.flags = 0;
596	if (linux_args->flags & LINUX_MAP_SHARED)
597		bsd_args.flags |= MAP_SHARED;
598	if (linux_args->flags & LINUX_MAP_PRIVATE)
599		bsd_args.flags |= MAP_PRIVATE;
600	if (linux_args->flags & LINUX_MAP_FIXED)
601		bsd_args.flags |= MAP_FIXED;
602	if (linux_args->flags & LINUX_MAP_ANON)
603		bsd_args.flags |= MAP_ANON;
604	else
605		bsd_args.flags |= MAP_NOSYNC;
606	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
607		bsd_args.flags |= MAP_STACK;
608
609		/* The linux MAP_GROWSDOWN option does not limit auto
610		 * growth of the region.  Linux mmap with this option
611		 * takes as addr the inital BOS, and as len, the initial
612		 * region size.  It can then grow down from addr without
613		 * limit.  However, linux threads has an implicit internal
614		 * limit to stack size of STACK_SIZE.  Its just not
615		 * enforced explicitly in linux.  But, here we impose
616		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
617		 * region, since we can do this with our mmap.
618		 *
619		 * Our mmap with MAP_STACK takes addr as the maximum
620		 * downsize limit on BOS, and as len the max size of
621		 * the region.  It them maps the top SGROWSIZ bytes,
622		 * and autgrows the region down, up to the limit
623		 * in addr.
624		 *
625		 * If we don't use the MAP_STACK option, the effect
626		 * of this code is to allocate a stack region of a
627		 * fixed size of (STACK_SIZE - GUARD_SIZE).
628		 */
629
630		/* This gives us TOS */
631		bsd_args.addr = linux_args->addr + linux_args->len;
632
633		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
634			/* Some linux apps will attempt to mmap
635			 * thread stacks near the top of their
636			 * address space.  If their TOS is greater
637			 * than vm_maxsaddr, vm_map_growstack()
638			 * will confuse the thread stack with the
639			 * process stack and deliver a SEGV if they
640			 * attempt to grow the thread stack past their
641			 * current stacksize rlimit.  To avoid this,
642			 * adjust vm_maxsaddr upwards to reflect
643			 * the current stacksize rlimit rather
644			 * than the maximum possible stacksize.
645			 * It would be better to adjust the
646			 * mmap'ed region, but some apps do not check
647			 * mmap's return value.
648			 */
649			PROC_LOCK(p);
650			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
651			    lim_cur(p, RLIMIT_STACK);
652			PROC_UNLOCK(p);
653		}
654
655		/* This gives us our maximum stack size */
656		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
657			bsd_args.len = linux_args->len;
658		else
659			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
660
661		/* This gives us a new BOS.  If we're using VM_STACK, then
662		 * mmap will just map the top SGROWSIZ bytes, and let
663		 * the stack grow down to the limit at BOS.  If we're
664		 * not using VM_STACK we map the full stack, since we
665		 * don't have a way to autogrow it.
666		 */
667		bsd_args.addr -= bsd_args.len;
668	} else {
669		bsd_args.addr = linux_args->addr;
670		bsd_args.len  = linux_args->len;
671	}
672
673	bsd_args.prot = linux_args->prot | PROT_READ;	/* always required */
674	if (linux_args->flags & LINUX_MAP_ANON)
675		bsd_args.fd = -1;
676	else
677		bsd_args.fd = linux_args->fd;
678	bsd_args.pos = linux_args->pos;
679	bsd_args.pad = 0;
680
681#ifdef DEBUG
682	if (ldebug(mmap))
683		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
684		    __func__,
685		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
686		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
687#endif
688	error = mmap(td, &bsd_args);
689#ifdef DEBUG
690	if (ldebug(mmap))
691		printf("-> %s() return: 0x%x (0x%08x)\n",
692			__func__, error, (u_int)td->td_retval[0]);
693#endif
694	return (error);
695}
696
697int
698linux_pipe(struct thread *td, struct linux_pipe_args *args)
699{
700	int error;
701	int reg_edx;
702
703#ifdef DEBUG
704	if (ldebug(pipe))
705		printf(ARGS(pipe, "*"));
706#endif
707
708	reg_edx = td->td_retval[1];
709	error = pipe(td, 0);
710	if (error) {
711		td->td_retval[1] = reg_edx;
712		return (error);
713	}
714
715	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
716	if (error) {
717		td->td_retval[1] = reg_edx;
718		return (error);
719	}
720
721	td->td_retval[1] = reg_edx;
722	td->td_retval[0] = 0;
723	return (0);
724}
725
726int
727linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
728{
729	int error;
730	struct i386_ioperm_args iia;
731
732	iia.start = args->start;
733	iia.length = args->length;
734	iia.enable = args->enable;
735	mtx_lock(&Giant);
736	error = i386_set_ioperm(td, &iia);
737	mtx_unlock(&Giant);
738	return (error);
739}
740
741int
742linux_iopl(struct thread *td, struct linux_iopl_args *args)
743{
744	int error;
745
746	if (args->level < 0 || args->level > 3)
747		return (EINVAL);
748	if ((error = suser(td)) != 0)
749		return (error);
750	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
751		return (error);
752	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
753	    (args->level * (PSL_IOPL / 3));
754	return (0);
755}
756
757int
758linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
759{
760	int error;
761	struct i386_ldt_args ldt;
762	struct l_descriptor ld;
763	union descriptor desc;
764
765	if (uap->ptr == NULL)
766		return (EINVAL);
767
768	switch (uap->func) {
769	case 0x00: /* read_ldt */
770		ldt.start = 0;
771		ldt.descs = uap->ptr;
772		ldt.num = uap->bytecount / sizeof(union descriptor);
773		mtx_lock(&Giant);
774		error = i386_get_ldt(td, &ldt);
775		td->td_retval[0] *= sizeof(union descriptor);
776		mtx_unlock(&Giant);
777		break;
778	case 0x01: /* write_ldt */
779	case 0x11: /* write_ldt */
780		if (uap->bytecount != sizeof(ld))
781			return (EINVAL);
782
783		error = copyin(uap->ptr, &ld, sizeof(ld));
784		if (error)
785			return (error);
786
787		ldt.start = ld.entry_number;
788		ldt.descs = &desc;
789		ldt.num = 1;
790		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
791		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
792		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
793		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
794		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
795			(ld.contents << 2);
796		desc.sd.sd_dpl = 3;
797		desc.sd.sd_p = (ld.seg_not_present ^ 1);
798		desc.sd.sd_xx = 0;
799		desc.sd.sd_def32 = ld.seg_32bit;
800		desc.sd.sd_gran = ld.limit_in_pages;
801		mtx_lock(&Giant);
802		error = i386_set_ldt(td, &ldt, &desc);
803		mtx_unlock(&Giant);
804		break;
805	default:
806		error = EINVAL;
807		break;
808	}
809
810	if (error == EOPNOTSUPP) {
811		printf("linux: modify_ldt needs kernel option USER_LDT\n");
812		error = ENOSYS;
813	}
814
815	return (error);
816}
817
818int
819linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
820{
821	l_osigaction_t osa;
822	l_sigaction_t act, oact;
823	int error;
824
825#ifdef DEBUG
826	if (ldebug(sigaction))
827		printf(ARGS(sigaction, "%d, %p, %p"),
828		    args->sig, (void *)args->nsa, (void *)args->osa);
829#endif
830
831	if (args->nsa != NULL) {
832		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
833		if (error)
834			return (error);
835		act.lsa_handler = osa.lsa_handler;
836		act.lsa_flags = osa.lsa_flags;
837		act.lsa_restorer = osa.lsa_restorer;
838		LINUX_SIGEMPTYSET(act.lsa_mask);
839		act.lsa_mask.__bits[0] = osa.lsa_mask;
840	}
841
842	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
843	    args->osa ? &oact : NULL);
844
845	if (args->osa != NULL && !error) {
846		osa.lsa_handler = oact.lsa_handler;
847		osa.lsa_flags = oact.lsa_flags;
848		osa.lsa_restorer = oact.lsa_restorer;
849		osa.lsa_mask = oact.lsa_mask.__bits[0];
850		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
851	}
852
853	return (error);
854}
855
856/*
857 * Linux has two extra args, restart and oldmask.  We dont use these,
858 * but it seems that "restart" is actually a context pointer that
859 * enables the signal to happen with a different register set.
860 */
861int
862linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
863{
864	sigset_t sigmask;
865	l_sigset_t mask;
866
867#ifdef DEBUG
868	if (ldebug(sigsuspend))
869		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
870#endif
871
872	LINUX_SIGEMPTYSET(mask);
873	mask.__bits[0] = args->mask;
874	linux_to_bsd_sigset(&mask, &sigmask);
875	return (kern_sigsuspend(td, sigmask));
876}
877
878int
879linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
880{
881	l_sigset_t lmask;
882	sigset_t sigmask;
883	int error;
884
885#ifdef DEBUG
886	if (ldebug(rt_sigsuspend))
887		printf(ARGS(rt_sigsuspend, "%p, %d"),
888		    (void *)uap->newset, uap->sigsetsize);
889#endif
890
891	if (uap->sigsetsize != sizeof(l_sigset_t))
892		return (EINVAL);
893
894	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
895	if (error)
896		return (error);
897
898	linux_to_bsd_sigset(&lmask, &sigmask);
899	return (kern_sigsuspend(td, sigmask));
900}
901
902int
903linux_pause(struct thread *td, struct linux_pause_args *args)
904{
905	struct proc *p = td->td_proc;
906	sigset_t sigmask;
907
908#ifdef DEBUG
909	if (ldebug(pause))
910		printf(ARGS(pause, ""));
911#endif
912
913	PROC_LOCK(p);
914	sigmask = td->td_sigmask;
915	PROC_UNLOCK(p);
916	return (kern_sigsuspend(td, sigmask));
917}
918
919int
920linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
921{
922	stack_t ss, oss;
923	l_stack_t lss;
924	int error;
925
926#ifdef DEBUG
927	if (ldebug(sigaltstack))
928		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
929#endif
930
931	if (uap->uss != NULL) {
932		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
933		if (error)
934			return (error);
935
936		ss.ss_sp = lss.ss_sp;
937		ss.ss_size = lss.ss_size;
938		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
939	}
940	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
941	    (uap->uoss != NULL) ? &oss : NULL);
942	if (!error && uap->uoss != NULL) {
943		lss.ss_sp = oss.ss_sp;
944		lss.ss_size = oss.ss_size;
945		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
946		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
947	}
948
949	return (error);
950}
951
952int
953linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
954{
955	struct ftruncate_args sa;
956
957#ifdef DEBUG
958	if (ldebug(ftruncate64))
959		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
960		    (intmax_t)args->length);
961#endif
962
963	sa.fd = args->fd;
964	sa.pad = 0;
965	sa.length = args->length;
966	return ftruncate(td, &sa);
967}
968
969int
970linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
971{
972	struct l_user_desc info;
973	int error;
974	int idx;
975	int a[2];
976	struct segment_descriptor sd;
977
978	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
979	if (error)
980		return (error);
981
982#ifdef DEBUG
983	if (ldebug(set_thread_area))
984	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
985		      info.entry_number,
986      		      info.base_addr,
987      		      info.limit,
988      		      info.seg_32bit,
989		      info.contents,
990      		      info.read_exec_only,
991      		      info.limit_in_pages,
992      		      info.seg_not_present,
993      		      info.useable);
994#endif
995
996	idx = info.entry_number;
997	/* Semantics of linux version: every thread in the system has array
998	 * of 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
999	 * syscall loads one of the selected tls decriptors with a value
1000	 * and also loads GDT descriptors 6, 7 and 8 with the content of the per-thread
1001	 * descriptors.
1002	 *
1003	 * Semantics of fbsd version: I think we can ignore that linux has 3 per-thread
1004	 * descriptors and use just the 1st one. The tls_array[] is used only in
1005	 * set/get-thread_area() syscalls and for loading the GDT descriptors. In fbsd
1006	 * we use just one GDT descriptor for TLS so we will load just one.
1007	 * XXX: this doesnt work when user-space process tries to use more then 1 TLS segment
1008	 * comment in the linux sources says wine might do that.
1009	 */
1010
1011	/* we support just GLIBC TLS now
1012	 * we should let 3 proceed as well because we use this segment so
1013	 * if code does two subsequent calls it should succeed
1014	 */
1015	if (idx != 6 && idx != -1 && idx != 3)
1016		return (EINVAL);
1017
1018	/* we have to copy out the GDT entry we use
1019	 * FreeBSD uses GDT entry #3 for storing %gs so load that
1020	 * XXX: what if userspace program doesnt check this value and tries
1021	 * to use 6, 7 or 8?
1022	 */
1023	idx = info.entry_number = 3;
1024	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1025	if (error)
1026		return (error);
1027
1028	if (LDT_empty(&info)) {
1029		a[0] = 0;
1030		a[1] = 0;
1031	} else {
1032		a[0] = LDT_entry_a(&info);
1033		a[1] = LDT_entry_b(&info);
1034	}
1035
1036	memcpy(&sd, &a, sizeof(a));
1037#ifdef DEBUG
1038	if (ldebug(set_thread_area))
1039	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1040			sd.sd_hibase,
1041			sd.sd_lolimit,
1042			sd.sd_hilimit,
1043			sd.sd_type,
1044			sd.sd_dpl,
1045			sd.sd_p,
1046			sd.sd_xx,
1047			sd.sd_def32,
1048			sd.sd_gran);
1049#endif
1050
1051	/* this is taken from i386 version of cpu_set_user_tls() */
1052	critical_enter();
1053	/* set %gs */
1054	td->td_pcb->pcb_gsd = sd;
1055	PCPU_GET(fsgs_gdt)[1] = sd;
1056	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1057	critical_exit();
1058
1059	return (0);
1060}
1061
1062int
1063linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1064{
1065
1066	struct l_user_desc info;
1067	int error;
1068	int idx;
1069	struct l_desc_struct desc;
1070	struct segment_descriptor sd;
1071
1072#ifdef DEBUG
1073	if (ldebug(get_thread_area))
1074		printf(ARGS(get_thread_area, "%p"), args->desc);
1075#endif
1076
1077	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1078	if (error)
1079		return (error);
1080
1081	idx = info.entry_number;
1082	/* XXX: I am not sure if we want 3 to be allowed too. */
1083	if (idx != 6 && idx != 3)
1084		return (EINVAL);
1085
1086	idx = 3;
1087
1088	memset(&info, 0, sizeof(info));
1089
1090	sd = PCPU_GET(fsgs_gdt)[1];
1091
1092	memcpy(&desc, &sd, sizeof(desc));
1093
1094	info.entry_number = idx;
1095	info.base_addr = GET_BASE(&desc);
1096	info.limit = GET_LIMIT(&desc);
1097	info.seg_32bit = GET_32BIT(&desc);
1098	info.contents = GET_CONTENTS(&desc);
1099	info.read_exec_only = !GET_WRITABLE(&desc);
1100	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1101	info.seg_not_present = !GET_PRESENT(&desc);
1102	info.useable = GET_USEABLE(&desc);
1103
1104	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1105	if (error)
1106	   	return (EFAULT);
1107
1108	return (0);
1109}
1110
/*
 * copied from kern/kern_time.c
 *
 * Linux timer_create(2): the argument structure is passed straight to
 * ktimer_create() through a cast.
 */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	struct ktimer_create_args *kargs;

	kargs = (struct ktimer_create_args *)args;
	return (ktimer_create(td, kargs));
}
1117
/*
 * Linux timer_settime(2): the argument structure is passed straight to
 * ktimer_settime() through a cast.
 */
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	struct ktimer_settime_args *kargs;

	kargs = (struct ktimer_settime_args *)args;
	return (ktimer_settime(td, kargs));
}
1123
/*
 * Linux timer_gettime(2): the argument structure is passed straight to
 * ktimer_gettime() through a cast.
 */
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	struct ktimer_gettime_args *kargs;

	kargs = (struct ktimer_gettime_args *)args;
	return (ktimer_gettime(td, kargs));
}
1129
/*
 * Linux timer_getoverrun(2): the argument structure is passed straight to
 * ktimer_getoverrun() through a cast.
 */
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	struct ktimer_getoverrun_args *kargs;

	kargs = (struct ktimer_getoverrun_args *)args;
	return (ktimer_getoverrun(td, kargs));
}
1135
/*
 * Linux timer_delete(2): the argument structure is passed straight to
 * ktimer_delete() through a cast.
 */
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	struct ktimer_delete_args *kargs;

	kargs = (struct ktimer_delete_args *)args;
	return (ktimer_delete(td, kargs));
}
1141
1142/* XXX: this wont work with module - convert it */
1143int
1144linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
1145{
1146#ifdef P1003_1B_MQUEUE
1147   	return kmq_open(td, (struct kmq_open_args *) args);
1148#else
1149	return (ENOSYS);
1150#endif
1151}
1152
1153int
1154linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
1155{
1156#ifdef P1003_1B_MQUEUE
1157   	return kmq_unlink(td, (struct kmq_unlink_args *) args);
1158#else
1159	return (ENOSYS);
1160#endif
1161}
1162
1163int
1164linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
1165{
1166#ifdef P1003_1B_MQUEUE
1167   	return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
1168#else
1169	return (ENOSYS);
1170#endif
1171}
1172
1173int
1174linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
1175{
1176#ifdef P1003_1B_MQUEUE
1177   	return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
1178#else
1179	return (ENOSYS);
1180#endif
1181}
1182
1183int
1184linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
1185{
1186#ifdef P1003_1B_MQUEUE
1187	return kmq_notify(td, (struct kmq_notify_args *) args);
1188#else
1189	return (ENOSYS);
1190#endif
1191}
1192
1193int
1194linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
1195{
1196#ifdef P1003_1B_MQUEUE
1197   	return kmq_setattr(td, (struct kmq_setattr_args *) args);
1198#else
1199	return (ENOSYS);
1200#endif
1201}
1202
1203