linux_machdep.c revision 166150
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/i386/linux/linux_machdep.c 166150 2007-01-20 14:58:59Z netchild $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/file.h>
35#include <sys/fcntl.h>
36#include <sys/imgact.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mman.h>
40#include <sys/mutex.h>
41#include <sys/sx.h>
42#include <sys/priv.h>
43#include <sys/proc.h>
44#include <sys/queue.h>
45#include <sys/resource.h>
46#include <sys/resourcevar.h>
47#include <sys/signalvar.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysproto.h>
50#include <sys/unistd.h>
51#include <sys/wait.h>
52
53#include <machine/frame.h>
54#include <machine/psl.h>
55#include <machine/segments.h>
56#include <machine/sysarch.h>
57
58#include <vm/vm.h>
59#include <vm/pmap.h>
60#include <vm/vm_map.h>
61
62#include <i386/linux/linux.h>
63#include <i386/linux/linux_proto.h>
64#include <compat/linux/linux_ipc.h>
65#include <compat/linux/linux_signal.h>
66#include <compat/linux/linux_util.h>
67#include <compat/linux/linux_emul.h>
68
69#include <i386/include/pcb.h>			/* needed for pcb definition in linux_set_thread_area */
70
71#include "opt_posix.h"
72
73extern struct sysentvec elf32_freebsd_sysvec;	/* defined in i386/i386/elf_machdep.c */
74
/*
 * Segment description in the layout that linux_modify_ldt() copies in
 * from user space for its write_ldt functions.  The layout must not be
 * changed: the structure is filled by a raw copyin() and its size is
 * compared against the caller-supplied byte count.
 */
struct l_descriptor {
	l_uint		entry_number;		/* becomes the first LDT slot to set */
	l_ulong		base_addr;		/* split into sd_lobase / sd_hibase */
	l_uint		limit;			/* low 20 bits split into sd_lolimit / sd_hilimit */
	l_uint		seg_32bit:1;		/* copied to sd_def32 */
	l_uint		contents:2;		/* shifted left by 2 into sd_type */
	l_uint		read_exec_only:1;	/* inverted, shifted left by 1 into sd_type */
	l_uint		limit_in_pages:1;	/* copied to sd_gran */
	l_uint		seg_not_present:1;	/* inverted to form sd_p */
	l_uint		useable:1;		/* not read by linux_modify_ldt() */
};
86
/*
 * Argument block that linux_old_select() copies in from user space.  Each
 * member is forwarded unchanged to the member of the same name in
 * struct linux_select_args.
 */
struct l_old_select_argv {
	l_int		nfds;
	l_fd_set	*readfds;
	l_fd_set	*writefds;
	l_fd_set	*exceptfds;
	struct l_timeval	*timeout;
};
94
95int
96linux_to_bsd_sigaltstack(int lsa)
97{
98	int bsa = 0;
99
100	if (lsa & LINUX_SS_DISABLE)
101		bsa |= SS_DISABLE;
102	if (lsa & LINUX_SS_ONSTACK)
103		bsa |= SS_ONSTACK;
104	return (bsa);
105}
106
107int
108bsd_to_linux_sigaltstack(int bsa)
109{
110	int lsa = 0;
111
112	if (bsa & SS_DISABLE)
113		lsa |= LINUX_SS_DISABLE;
114	if (bsa & SS_ONSTACK)
115		lsa |= LINUX_SS_ONSTACK;
116	return (lsa);
117}
118
119int
120linux_execve(struct thread *td, struct linux_execve_args *args)
121{
122	int error;
123	char *newpath;
124	struct image_args eargs;
125
126	LCONVPATHEXIST(td, args->path, &newpath);
127
128#ifdef DEBUG
129	if (ldebug(execve))
130		printf(ARGS(execve, "%s"), newpath);
131#endif
132
133	error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
134	    args->argp, args->envp);
135	free(newpath, M_TEMP);
136	if (error == 0)
137		error = kern_execve(td, &eargs, NULL);
138	if (error == 0)
139	   	/* linux process can exec fbsd one, dont attempt
140		 * to create emuldata for such process using
141		 * linux_proc_init, this leads to a panic on KASSERT
142		 * because such process has p->p_emuldata == NULL
143		 */
144	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
145   		   	error = linux_proc_init(td, 0, 0);
146	return (error);
147}
148
/*
 * Indirect argument block for the LINUX_MSGRCV case of linux_ipc().  When
 * the upper 16 bits of the "what" argument are zero, the user pointer
 * refers to one of these, which carries the real message buffer pointer
 * and message type.
 */
struct l_ipc_kludge {
	struct l_msgbuf *msgp;
	l_long msgtyp;
};
153
154int
155linux_ipc(struct thread *td, struct linux_ipc_args *args)
156{
157
158	switch (args->what & 0xFFFF) {
159	case LINUX_SEMOP: {
160		struct linux_semop_args a;
161
162		a.semid = args->arg1;
163		a.tsops = args->ptr;
164		a.nsops = args->arg2;
165		return (linux_semop(td, &a));
166	}
167	case LINUX_SEMGET: {
168		struct linux_semget_args a;
169
170		a.key = args->arg1;
171		a.nsems = args->arg2;
172		a.semflg = args->arg3;
173		return (linux_semget(td, &a));
174	}
175	case LINUX_SEMCTL: {
176		struct linux_semctl_args a;
177		int error;
178
179		a.semid = args->arg1;
180		a.semnum = args->arg2;
181		a.cmd = args->arg3;
182		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
183		if (error)
184			return (error);
185		return (linux_semctl(td, &a));
186	}
187	case LINUX_MSGSND: {
188		struct linux_msgsnd_args a;
189
190		a.msqid = args->arg1;
191		a.msgp = args->ptr;
192		a.msgsz = args->arg2;
193		a.msgflg = args->arg3;
194		return (linux_msgsnd(td, &a));
195	}
196	case LINUX_MSGRCV: {
197		struct linux_msgrcv_args a;
198
199		a.msqid = args->arg1;
200		a.msgsz = args->arg2;
201		a.msgflg = args->arg3;
202		if ((args->what >> 16) == 0) {
203			struct l_ipc_kludge tmp;
204			int error;
205
206			if (args->ptr == NULL)
207				return (EINVAL);
208			error = copyin(args->ptr, &tmp, sizeof(tmp));
209			if (error)
210				return (error);
211			a.msgp = tmp.msgp;
212			a.msgtyp = tmp.msgtyp;
213		} else {
214			a.msgp = args->ptr;
215			a.msgtyp = args->arg5;
216		}
217		return (linux_msgrcv(td, &a));
218	}
219	case LINUX_MSGGET: {
220		struct linux_msgget_args a;
221
222		a.key = args->arg1;
223		a.msgflg = args->arg2;
224		return (linux_msgget(td, &a));
225	}
226	case LINUX_MSGCTL: {
227		struct linux_msgctl_args a;
228
229		a.msqid = args->arg1;
230		a.cmd = args->arg2;
231		a.buf = args->ptr;
232		return (linux_msgctl(td, &a));
233	}
234	case LINUX_SHMAT: {
235		struct linux_shmat_args a;
236
237		a.shmid = args->arg1;
238		a.shmaddr = args->ptr;
239		a.shmflg = args->arg2;
240		a.raddr = (l_ulong *)args->arg3;
241		return (linux_shmat(td, &a));
242	}
243	case LINUX_SHMDT: {
244		struct linux_shmdt_args a;
245
246		a.shmaddr = args->ptr;
247		return (linux_shmdt(td, &a));
248	}
249	case LINUX_SHMGET: {
250		struct linux_shmget_args a;
251
252		a.key = args->arg1;
253		a.size = args->arg2;
254		a.shmflg = args->arg3;
255		return (linux_shmget(td, &a));
256	}
257	case LINUX_SHMCTL: {
258		struct linux_shmctl_args a;
259
260		a.shmid = args->arg1;
261		a.cmd = args->arg2;
262		a.buf = args->ptr;
263		return (linux_shmctl(td, &a));
264	}
265	default:
266		break;
267	}
268
269	return (EINVAL);
270}
271
272int
273linux_old_select(struct thread *td, struct linux_old_select_args *args)
274{
275	struct l_old_select_argv linux_args;
276	struct linux_select_args newsel;
277	int error;
278
279#ifdef DEBUG
280	if (ldebug(old_select))
281		printf(ARGS(old_select, "%p"), args->ptr);
282#endif
283
284	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
285	if (error)
286		return (error);
287
288	newsel.nfds = linux_args.nfds;
289	newsel.readfds = linux_args.readfds;
290	newsel.writefds = linux_args.writefds;
291	newsel.exceptfds = linux_args.exceptfds;
292	newsel.timeout = linux_args.timeout;
293	return (linux_select(td, &newsel));
294}
295
296int
297linux_fork(struct thread *td, struct linux_fork_args *args)
298{
299	int error;
300	struct proc *p2;
301	struct thread *td2;
302
303#ifdef DEBUG
304	if (ldebug(fork))
305		printf(ARGS(fork, ""));
306#endif
307
308	if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0)
309		return (error);
310
311	if (error == 0) {
312		td->td_retval[0] = p2->p_pid;
313		td->td_retval[1] = 0;
314	}
315
316	if (td->td_retval[1] == 1)
317		td->td_retval[0] = 0;
318	error = linux_proc_init(td, td->td_retval[0], 0);
319	if (error)
320		return (error);
321
322	td2 = FIRST_THREAD_IN_PROC(p2);
323
324	/*
325	 * Make this runnable after we are finished with it.
326	 */
327	mtx_lock_spin(&sched_lock);
328	TD_SET_CAN_RUN(td2);
329	setrunqueue(td2, SRQ_BORING);
330	mtx_unlock_spin(&sched_lock);
331
332	return (0);
333}
334
335int
336linux_vfork(struct thread *td, struct linux_vfork_args *args)
337{
338	int error;
339	struct proc *p2;
340	struct thread *td2;
341
342#ifdef DEBUG
343	if (ldebug(vfork))
344		printf(ARGS(vfork, ""));
345#endif
346
347	/* exclude RFPPWAIT */
348	if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0)
349		return (error);
350	if (error == 0) {
351		td->td_retval[0] = p2->p_pid;
352		td->td_retval[1] = 0;
353	}
354	/* Are we the child? */
355	if (td->td_retval[1] == 1)
356		td->td_retval[0] = 0;
357	error = linux_proc_init(td, td->td_retval[0], 0);
358	if (error)
359		return (error);
360
361	PROC_LOCK(p2);
362	p2->p_flag |= P_PPWAIT;
363	PROC_UNLOCK(p2);
364
365	td2 = FIRST_THREAD_IN_PROC(p2);
366
367	/*
368	 * Make this runnable after we are finished with it.
369	 */
370	mtx_lock_spin(&sched_lock);
371	TD_SET_CAN_RUN(td2);
372	setrunqueue(td2, SRQ_BORING);
373	mtx_unlock_spin(&sched_lock);
374
375	/* wait for the children to exit, ie. emulate vfork */
376	PROC_LOCK(p2);
377	while (p2->p_flag & P_PPWAIT)
378	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
379	PROC_UNLOCK(p2);
380
381	return (0);
382}
383
384int
385linux_clone(struct thread *td, struct linux_clone_args *args)
386{
387	int error, ff = RFPROC | RFSTOPPED;
388	struct proc *p2;
389	struct thread *td2;
390	int exit_signal;
391	struct linux_emuldata *em;
392
393#ifdef DEBUG
394	if (ldebug(clone)) {
395   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
396		    (unsigned int)args->flags, (unsigned int)args->stack,
397		    (unsigned int)args->parent_tidptr, (unsigned int)args->child_tidptr);
398	}
399#endif
400
401	exit_signal = args->flags & 0x000000ff;
402	if (!LINUX_SIG_VALID(exit_signal) && exit_signal != 0)
403		return (EINVAL);
404
405	if (exit_signal <= LINUX_SIGTBLSZ)
406		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
407
408	if (args->flags & CLONE_VM)
409		ff |= RFMEM;
410	if (args->flags & CLONE_SIGHAND)
411		ff |= RFSIGSHARE;
412	/*
413	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
414	 * and open files is independant. in fbsd its in one
415	 * structure but in reality it doesnt make any problems
416	 * because both this flags are set at once usually.
417	 */
418	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
419		ff |= RFFDG;
420
421	/*
422	 * Attempt to detect when linux_clone(2) is used for creating
423	 * kernel threads. Unfortunately despite the existence of the
424	 * CLONE_THREAD flag, version of linuxthreads package used in
425	 * most popular distros as of beginning of 2005 doesn't make
426	 * any use of it. Therefore, this detection relay fully on
427	 * empirical observation that linuxthreads sets certain
428	 * combination of flags, so that we can make more or less
429	 * precise detection and notify the FreeBSD kernel that several
430	 * processes are in fact part of the same threading group, so
431	 * that special treatment is necessary for signal delivery
432	 * between those processes and fd locking.
433	 */
434	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
435		ff |= RFTHREAD;
436
437	error = fork1(td, ff, 0, &p2);
438	if (error)
439		return (error);
440
441	/* create the emuldata */
442	error = linux_proc_init(td, p2->p_pid, args->flags);
443	/* reference it - no need to check this */
444	em = em_find(p2, EMUL_DOLOCK);
445	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
446	/* and adjust it */
447	if (args->flags & CLONE_PARENT_SETTID) {
448	   	if (args->parent_tidptr == NULL) {
449		   	EMUL_UNLOCK(&emul_lock);
450			return (EINVAL);
451		}
452		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
453		if (error) {
454		   	EMUL_UNLOCK(&emul_lock);
455			return (error);
456		}
457	}
458
459	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
460	   	sx_xlock(&proctree_lock);
461		PROC_LOCK(p2);
462		proc_reparent(p2, td->td_proc->p_pptr);
463		PROC_UNLOCK(p2);
464		sx_xunlock(&proctree_lock);
465	}
466
467	if (args->flags & CLONE_THREAD) {
468	   	/* XXX: linux mangles pgrp and pptr somehow
469		 * I think it might be this but I am not sure.
470		 */
471#ifdef notyet
472	   	PROC_LOCK(p2);
473	   	p2->p_pgrp = td->td_proc->p_pgrp;
474	   	PROC_UNLOCK(p2);
475#endif
476	 	exit_signal = 0;
477	}
478
479	if (args->flags & CLONE_CHILD_SETTID)
480		em->child_set_tid = args->child_tidptr;
481	else
482	   	em->child_set_tid = NULL;
483
484	if (args->flags & CLONE_CHILD_CLEARTID)
485		em->child_clear_tid = args->child_tidptr;
486	else
487	   	em->child_clear_tid = NULL;
488
489	EMUL_UNLOCK(&emul_lock);
490
491	PROC_LOCK(p2);
492	p2->p_sigparent = exit_signal;
493	PROC_UNLOCK(p2);
494	td2 = FIRST_THREAD_IN_PROC(p2);
495	/*
496	 * in a case of stack = NULL we are supposed to COW calling process stack
497	 * this is what normal fork() does so we just keep the tf_esp arg intact
498	 */
499	if (args->stack)
500   	   	td2->td_frame->tf_esp = (unsigned int)args->stack;
501
502	if (args->flags & CLONE_SETTLS) {
503   	   	struct l_user_desc info;
504   	   	int idx;
505	   	int a[2];
506		struct segment_descriptor sd;
507
508	   	error = copyin((void *)td->td_frame->tf_esi, &info, sizeof(struct l_user_desc));
509		if (error)
510   		   	return (error);
511
512		idx = info.entry_number;
513
514		/*
515		 * looks like we're getting the idx we returned
516		 * in the set_thread_area() syscall
517		 */
518		if (idx != 6 && idx != 3)
519			return (EINVAL);
520
521		/* this doesnt happen in practice */
522		if (idx == 6) {
523		   	/* we might copy out the entry_number as 3 */
524		   	info.entry_number = 3;
525			error = copyout(&info, (void *) td->td_frame->tf_esi, sizeof(struct l_user_desc));
526			if (error)
527	   		   	return (error);
528		}
529
530		a[0] = LDT_entry_a(&info);
531		a[1] = LDT_entry_b(&info);
532
533		memcpy(&sd, &a, sizeof(a));
534#ifdef DEBUG
535	if (ldebug(clone))
536	   	printf("Segment created in clone with CLONE_SETTLS: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
537			sd.sd_hibase,
538			sd.sd_lolimit,
539			sd.sd_hilimit,
540			sd.sd_type,
541			sd.sd_dpl,
542			sd.sd_p,
543			sd.sd_xx,
544			sd.sd_def32,
545			sd.sd_gran);
546#endif
547
548		/* set %gs */
549		td2->td_pcb->pcb_gsd = sd;
550		td2->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL);
551	}
552
553#ifdef DEBUG
554	if (ldebug(clone))
555		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
556		    (long)p2->p_pid, args->stack, exit_signal);
557#endif
558	if (args->flags & CLONE_VFORK) {
559	   	PROC_LOCK(p2);
560		p2->p_flag |= P_PPWAIT;
561	   	PROC_UNLOCK(p2);
562	}
563
564	/*
565	 * Make this runnable after we are finished with it.
566	 */
567	mtx_lock_spin(&sched_lock);
568	TD_SET_CAN_RUN(td2);
569	setrunqueue(td2, SRQ_BORING);
570	mtx_unlock_spin(&sched_lock);
571
572	td->td_retval[0] = p2->p_pid;
573	td->td_retval[1] = 0;
574
575	if (args->flags & CLONE_VFORK) {
576   	   	/* wait for the children to exit, ie. emulate vfork */
577   	   	PROC_LOCK(p2);
578		while (p2->p_flag & P_PPWAIT)
579   		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
580		PROC_UNLOCK(p2);
581	}
582
583	return (0);
584}
585
/*
 * Argument block for the Linux mmap family.  linux_mmap() fills it with a
 * raw copyin() from user space, so the layout must not be changed;
 * linux_mmap2() builds it from its own arguments.  linux_mmap_common()
 * consumes it.
 *
 * XXX move
 */
struct l_mmap_argv {
	l_caddr_t	addr;	/* requested address */
	l_int		len;	/* length of the mapping */
	l_int		prot;	/* protection, passed through unchanged */
	l_int		flags;	/* LINUX_MAP_* flags */
	l_int		fd;	/* descriptor; not consulted for LINUX_MAP_ANON */
	l_int		pos;	/* file offset in bytes */
};
595
/*
 * Used by linux_mmap_common() for LINUX_MAP_GROWSDOWN mappings: such a
 * region is given a maximum size of at least (STACK_SIZE - GUARD_SIZE)
 * bytes.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Common back end of linux_mmap() and linux_mmap2(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
600
601int
602linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
603{
604	struct l_mmap_argv linux_args;
605
606#ifdef DEBUG
607	if (ldebug(mmap2))
608		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
609		    (void *)args->addr, args->len, args->prot,
610		    args->flags, args->fd, args->pgoff);
611#endif
612
613	linux_args.addr = (l_caddr_t)args->addr;
614	linux_args.len = args->len;
615	linux_args.prot = args->prot;
616	linux_args.flags = args->flags;
617	linux_args.fd = args->fd;
618	linux_args.pos = args->pgoff * PAGE_SIZE;
619
620	return (linux_mmap_common(td, &linux_args));
621}
622
623int
624linux_mmap(struct thread *td, struct linux_mmap_args *args)
625{
626	int error;
627	struct l_mmap_argv linux_args;
628
629	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
630	if (error)
631		return (error);
632
633#ifdef DEBUG
634	if (ldebug(mmap))
635		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
636		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
637		    linux_args.flags, linux_args.fd, linux_args.pos);
638#endif
639
640	return (linux_mmap_common(td, &linux_args));
641}
642
643static int
644linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
645{
646	struct proc *p = td->td_proc;
647	struct mmap_args /* {
648		caddr_t addr;
649		size_t len;
650		int prot;
651		int flags;
652		int fd;
653		long pad;
654		off_t pos;
655	} */ bsd_args;
656	int error;
657	struct file *fp;
658
659	error = 0;
660	bsd_args.flags = 0;
661	fp = NULL;
662
663	/*
664	 * Linux mmap(2):
665	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
666	 */
667	if (! ((linux_args->flags & LINUX_MAP_SHARED) ^
668	    (linux_args->flags & LINUX_MAP_PRIVATE)))
669		return (EINVAL);
670
671	if (linux_args->flags & LINUX_MAP_SHARED)
672		bsd_args.flags |= MAP_SHARED;
673	if (linux_args->flags & LINUX_MAP_PRIVATE)
674		bsd_args.flags |= MAP_PRIVATE;
675	if (linux_args->flags & LINUX_MAP_FIXED)
676		bsd_args.flags |= MAP_FIXED;
677	if (linux_args->flags & LINUX_MAP_ANON)
678		bsd_args.flags |= MAP_ANON;
679	else
680		bsd_args.flags |= MAP_NOSYNC;
681	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
682		bsd_args.flags |= MAP_STACK;
683
684		/*
685		 * The linux MAP_GROWSDOWN option does not limit auto
686		 * growth of the region.  Linux mmap with this option
687		 * takes as addr the inital BOS, and as len, the initial
688		 * region size.  It can then grow down from addr without
689		 * limit.  However, linux threads has an implicit internal
690		 * limit to stack size of STACK_SIZE.  Its just not
691		 * enforced explicitly in linux.  But, here we impose
692		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
693		 * region, since we can do this with our mmap.
694		 *
695		 * Our mmap with MAP_STACK takes addr as the maximum
696		 * downsize limit on BOS, and as len the max size of
697		 * the region.  It them maps the top SGROWSIZ bytes,
698		 * and autgrows the region down, up to the limit
699		 * in addr.
700		 *
701		 * If we don't use the MAP_STACK option, the effect
702		 * of this code is to allocate a stack region of a
703		 * fixed size of (STACK_SIZE - GUARD_SIZE).
704		 */
705
706		/* This gives us TOS */
707		bsd_args.addr = linux_args->addr + linux_args->len;
708
709		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
710			/*
711			 * Some linux apps will attempt to mmap
712			 * thread stacks near the top of their
713			 * address space.  If their TOS is greater
714			 * than vm_maxsaddr, vm_map_growstack()
715			 * will confuse the thread stack with the
716			 * process stack and deliver a SEGV if they
717			 * attempt to grow the thread stack past their
718			 * current stacksize rlimit.  To avoid this,
719			 * adjust vm_maxsaddr upwards to reflect
720			 * the current stacksize rlimit rather
721			 * than the maximum possible stacksize.
722			 * It would be better to adjust the
723			 * mmap'ed region, but some apps do not check
724			 * mmap's return value.
725			 */
726			PROC_LOCK(p);
727			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
728			    lim_cur(p, RLIMIT_STACK);
729			PROC_UNLOCK(p);
730		}
731
732		/* This gives us our maximum stack size */
733		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
734			bsd_args.len = linux_args->len;
735		else
736			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
737
738		/*
739		 * This gives us a new BOS.  If we're using VM_STACK, then
740		 * mmap will just map the top SGROWSIZ bytes, and let
741		 * the stack grow down to the limit at BOS.  If we're
742		 * not using VM_STACK we map the full stack, since we
743		 * don't have a way to autogrow it.
744		 */
745		bsd_args.addr -= bsd_args.len;
746	} else {
747		bsd_args.addr = linux_args->addr;
748		bsd_args.len  = linux_args->len;
749	}
750
751	bsd_args.prot = linux_args->prot;
752	if (linux_args->flags & LINUX_MAP_ANON)
753		bsd_args.fd = -1;
754	else {
755		/*
756		 * Linux follows Solaris mmap(2) description:
757		 * The file descriptor fildes is opened with
758		 * read permission, regardless of the
759		 * protection options specified.
760		 * If PROT_WRITE is specified, the application
761		 * must have opened the file descriptor
762		 * fildes with write permission unless
763		 * MAP_PRIVATE is specified in the flag
764		 * argument as described below.
765		 */
766
767		if ((error = fget(td, linux_args->fd, &fp)) != 0)
768			return (error);
769		if (fp->f_type != DTYPE_VNODE) {
770			fdrop(fp, td);
771			return (EINVAL);
772		}
773
774		/* Linux mmap() just fails for O_WRONLY files */
775		if (! (fp->f_flag & FREAD)) {
776			fdrop(fp, td);
777			return (EACCES);
778		}
779
780		bsd_args.fd = linux_args->fd;
781		fdrop(fp, td);
782	}
783	bsd_args.pos = linux_args->pos;
784	bsd_args.pad = 0;
785
786#ifdef DEBUG
787	if (ldebug(mmap))
788		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
789		    __func__,
790		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
791		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
792#endif
793	error = mmap(td, &bsd_args);
794#ifdef DEBUG
795	if (ldebug(mmap))
796		printf("-> %s() return: 0x%x (0x%08x)\n",
797			__func__, error, (u_int)td->td_retval[0]);
798#endif
799	return (error);
800}
801
802int
803linux_pipe(struct thread *td, struct linux_pipe_args *args)
804{
805	int error;
806	int reg_edx;
807
808#ifdef DEBUG
809	if (ldebug(pipe))
810		printf(ARGS(pipe, "*"));
811#endif
812
813	reg_edx = td->td_retval[1];
814	error = pipe(td, 0);
815	if (error) {
816		td->td_retval[1] = reg_edx;
817		return (error);
818	}
819
820	error = copyout(td->td_retval, args->pipefds, 2*sizeof(int));
821	if (error) {
822		td->td_retval[1] = reg_edx;
823		return (error);
824	}
825
826	td->td_retval[1] = reg_edx;
827	td->td_retval[0] = 0;
828	return (0);
829}
830
831int
832linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
833{
834	int error;
835	struct i386_ioperm_args iia;
836
837	iia.start = args->start;
838	iia.length = args->length;
839	iia.enable = args->enable;
840	mtx_lock(&Giant);
841	error = i386_set_ioperm(td, &iia);
842	mtx_unlock(&Giant);
843	return (error);
844}
845
846int
847linux_iopl(struct thread *td, struct linux_iopl_args *args)
848{
849	int error;
850
851	if (args->level < 0 || args->level > 3)
852		return (EINVAL);
853	if ((error = priv_check(td, PRIV_IO)) != 0)
854		return (error);
855	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
856		return (error);
857	td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
858	    (args->level * (PSL_IOPL / 3));
859	return (0);
860}
861
862int
863linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
864{
865	int error;
866	struct i386_ldt_args ldt;
867	struct l_descriptor ld;
868	union descriptor desc;
869
870	if (uap->ptr == NULL)
871		return (EINVAL);
872
873	switch (uap->func) {
874	case 0x00: /* read_ldt */
875		ldt.start = 0;
876		ldt.descs = uap->ptr;
877		ldt.num = uap->bytecount / sizeof(union descriptor);
878		mtx_lock(&Giant);
879		error = i386_get_ldt(td, &ldt);
880		td->td_retval[0] *= sizeof(union descriptor);
881		mtx_unlock(&Giant);
882		break;
883	case 0x01: /* write_ldt */
884	case 0x11: /* write_ldt */
885		if (uap->bytecount != sizeof(ld))
886			return (EINVAL);
887
888		error = copyin(uap->ptr, &ld, sizeof(ld));
889		if (error)
890			return (error);
891
892		ldt.start = ld.entry_number;
893		ldt.descs = &desc;
894		ldt.num = 1;
895		desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
896		desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
897		desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
898		desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
899		desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
900			(ld.contents << 2);
901		desc.sd.sd_dpl = 3;
902		desc.sd.sd_p = (ld.seg_not_present ^ 1);
903		desc.sd.sd_xx = 0;
904		desc.sd.sd_def32 = ld.seg_32bit;
905		desc.sd.sd_gran = ld.limit_in_pages;
906		mtx_lock(&Giant);
907		error = i386_set_ldt(td, &ldt, &desc);
908		mtx_unlock(&Giant);
909		break;
910	default:
911		error = EINVAL;
912		break;
913	}
914
915	if (error == EOPNOTSUPP) {
916		printf("linux: modify_ldt needs kernel option USER_LDT\n");
917		error = ENOSYS;
918	}
919
920	return (error);
921}
922
923int
924linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
925{
926	l_osigaction_t osa;
927	l_sigaction_t act, oact;
928	int error;
929
930#ifdef DEBUG
931	if (ldebug(sigaction))
932		printf(ARGS(sigaction, "%d, %p, %p"),
933		    args->sig, (void *)args->nsa, (void *)args->osa);
934#endif
935
936	if (args->nsa != NULL) {
937		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
938		if (error)
939			return (error);
940		act.lsa_handler = osa.lsa_handler;
941		act.lsa_flags = osa.lsa_flags;
942		act.lsa_restorer = osa.lsa_restorer;
943		LINUX_SIGEMPTYSET(act.lsa_mask);
944		act.lsa_mask.__bits[0] = osa.lsa_mask;
945	}
946
947	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
948	    args->osa ? &oact : NULL);
949
950	if (args->osa != NULL && !error) {
951		osa.lsa_handler = oact.lsa_handler;
952		osa.lsa_flags = oact.lsa_flags;
953		osa.lsa_restorer = oact.lsa_restorer;
954		osa.lsa_mask = oact.lsa_mask.__bits[0];
955		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
956	}
957
958	return (error);
959}
960
961/*
962 * Linux has two extra args, restart and oldmask.  We dont use these,
963 * but it seems that "restart" is actually a context pointer that
964 * enables the signal to happen with a different register set.
965 */
966int
967linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
968{
969	sigset_t sigmask;
970	l_sigset_t mask;
971
972#ifdef DEBUG
973	if (ldebug(sigsuspend))
974		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
975#endif
976
977	LINUX_SIGEMPTYSET(mask);
978	mask.__bits[0] = args->mask;
979	linux_to_bsd_sigset(&mask, &sigmask);
980	return (kern_sigsuspend(td, sigmask));
981}
982
983int
984linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
985{
986	l_sigset_t lmask;
987	sigset_t sigmask;
988	int error;
989
990#ifdef DEBUG
991	if (ldebug(rt_sigsuspend))
992		printf(ARGS(rt_sigsuspend, "%p, %d"),
993		    (void *)uap->newset, uap->sigsetsize);
994#endif
995
996	if (uap->sigsetsize != sizeof(l_sigset_t))
997		return (EINVAL);
998
999	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
1000	if (error)
1001		return (error);
1002
1003	linux_to_bsd_sigset(&lmask, &sigmask);
1004	return (kern_sigsuspend(td, sigmask));
1005}
1006
1007int
1008linux_pause(struct thread *td, struct linux_pause_args *args)
1009{
1010	struct proc *p = td->td_proc;
1011	sigset_t sigmask;
1012
1013#ifdef DEBUG
1014	if (ldebug(pause))
1015		printf(ARGS(pause, ""));
1016#endif
1017
1018	PROC_LOCK(p);
1019	sigmask = td->td_sigmask;
1020	PROC_UNLOCK(p);
1021	return (kern_sigsuspend(td, sigmask));
1022}
1023
1024int
1025linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
1026{
1027	stack_t ss, oss;
1028	l_stack_t lss;
1029	int error;
1030
1031#ifdef DEBUG
1032	if (ldebug(sigaltstack))
1033		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
1034#endif
1035
1036	if (uap->uss != NULL) {
1037		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1038		if (error)
1039			return (error);
1040
1041		ss.ss_sp = lss.ss_sp;
1042		ss.ss_size = lss.ss_size;
1043		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1044	}
1045	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1046	    (uap->uoss != NULL) ? &oss : NULL);
1047	if (!error && uap->uoss != NULL) {
1048		lss.ss_sp = oss.ss_sp;
1049		lss.ss_size = oss.ss_size;
1050		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1051		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1052	}
1053
1054	return (error);
1055}
1056
1057int
1058linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1059{
1060	struct ftruncate_args sa;
1061
1062#ifdef DEBUG
1063	if (ldebug(ftruncate64))
1064		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1065		    (intmax_t)args->length);
1066#endif
1067
1068	sa.fd = args->fd;
1069	sa.pad = 0;
1070	sa.length = args->length;
1071	return ftruncate(td, &sa);
1072}
1073
1074int
1075linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
1076{
1077	struct l_user_desc info;
1078	int error;
1079	int idx;
1080	int a[2];
1081	struct segment_descriptor sd;
1082
1083	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1084	if (error)
1085		return (error);
1086
1087#ifdef DEBUG
1088	if (ldebug(set_thread_area))
1089	   	printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
1090		      info.entry_number,
1091      		      info.base_addr,
1092      		      info.limit,
1093      		      info.seg_32bit,
1094		      info.contents,
1095      		      info.read_exec_only,
1096      		      info.limit_in_pages,
1097      		      info.seg_not_present,
1098      		      info.useable);
1099#endif
1100
1101	idx = info.entry_number;
1102	/*
1103	 * Semantics of linux version: every thread in the system has array
1104	 * of 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
1105	 * syscall loads one of the selected tls decriptors with a value
1106	 * and also loads GDT descriptors 6, 7 and 8 with the content of the per-thread
1107	 * descriptors.
1108	 *
1109	 * Semantics of fbsd version: I think we can ignore that linux has 3 per-thread
1110	 * descriptors and use just the 1st one. The tls_array[] is used only in
1111	 * set/get-thread_area() syscalls and for loading the GDT descriptors. In fbsd
1112	 * we use just one GDT descriptor for TLS so we will load just one.
1113	 * XXX: this doesnt work when user-space process tries to use more then 1 TLS segment
1114	 * comment in the linux sources says wine might do that.
1115	 */
1116
1117	/*
1118	 * we support just GLIBC TLS now
1119	 * we should let 3 proceed as well because we use this segment so
1120	 * if code does two subsequent calls it should succeed
1121	 */
1122	if (idx != 6 && idx != -1 && idx != 3)
1123		return (EINVAL);
1124
1125	/*
1126	 * we have to copy out the GDT entry we use
1127	 * FreeBSD uses GDT entry #3 for storing %gs so load that
1128	 * XXX: what if userspace program doesnt check this value and tries
1129	 * to use 6, 7 or 8?
1130	 */
1131	idx = info.entry_number = 3;
1132	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1133	if (error)
1134		return (error);
1135
1136	if (LDT_empty(&info)) {
1137		a[0] = 0;
1138		a[1] = 0;
1139	} else {
1140		a[0] = LDT_entry_a(&info);
1141		a[1] = LDT_entry_b(&info);
1142	}
1143
1144	memcpy(&sd, &a, sizeof(a));
1145#ifdef DEBUG
1146	if (ldebug(set_thread_area))
1147	   	printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
1148			sd.sd_hibase,
1149			sd.sd_lolimit,
1150			sd.sd_hilimit,
1151			sd.sd_type,
1152			sd.sd_dpl,
1153			sd.sd_p,
1154			sd.sd_xx,
1155			sd.sd_def32,
1156			sd.sd_gran);
1157#endif
1158
1159	/* this is taken from i386 version of cpu_set_user_tls() */
1160	critical_enter();
1161	/* set %gs */
1162	td->td_pcb->pcb_gsd = sd;
1163	PCPU_GET(fsgs_gdt)[1] = sd;
1164	load_gs(GSEL(GUGS_SEL, SEL_UPL));
1165	critical_exit();
1166
1167	return (0);
1168}
1169
1170int
1171linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
1172{
1173
1174	struct l_user_desc info;
1175	int error;
1176	int idx;
1177	struct l_desc_struct desc;
1178	struct segment_descriptor sd;
1179
1180#ifdef DEBUG
1181	if (ldebug(get_thread_area))
1182		printf(ARGS(get_thread_area, "%p"), args->desc);
1183#endif
1184
1185	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1186	if (error)
1187		return (error);
1188
1189	idx = info.entry_number;
1190	/* XXX: I am not sure if we want 3 to be allowed too. */
1191	if (idx != 6 && idx != 3)
1192		return (EINVAL);
1193
1194	idx = 3;
1195
1196	memset(&info, 0, sizeof(info));
1197
1198	sd = PCPU_GET(fsgs_gdt)[1];
1199
1200	memcpy(&desc, &sd, sizeof(desc));
1201
1202	info.entry_number = idx;
1203	info.base_addr = GET_BASE(&desc);
1204	info.limit = GET_LIMIT(&desc);
1205	info.seg_32bit = GET_32BIT(&desc);
1206	info.contents = GET_CONTENTS(&desc);
1207	info.read_exec_only = !GET_WRITABLE(&desc);
1208	info.limit_in_pages = GET_LIMIT_PAGES(&desc);
1209	info.seg_not_present = !GET_PRESENT(&desc);
1210	info.useable = GET_USEABLE(&desc);
1211
1212	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1213	if (error)
1214	   	return (EFAULT);
1215
1216	return (0);
1217}
1218
1219/* copied from kern/kern_time.c */
/*
 * Linux timer_create entry point: forwards the call to ktimer_create()
 * and returns its result unchanged.
 *
 * NOTE(review): the cast assumes struct linux_timer_create_args is
 * layout-compatible with struct ktimer_create_args -- confirm.
 */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
	struct ktimer_create_args *native_args;

	native_args = (struct ktimer_create_args *)args;
	return (ktimer_create(td, native_args));
}
1225
/*
 * Linux timer_settime entry point: forwards the call to ktimer_settime()
 * and returns its result unchanged.
 *
 * NOTE(review): the cast assumes struct linux_timer_settime_args is
 * layout-compatible with struct ktimer_settime_args -- confirm.
 */
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
	struct ktimer_settime_args *native_args;

	native_args = (struct ktimer_settime_args *)args;
	return (ktimer_settime(td, native_args));
}
1231
/*
 * Linux timer_gettime entry point: forwards the call to ktimer_gettime()
 * and returns its result unchanged.
 *
 * NOTE(review): the cast assumes struct linux_timer_gettime_args is
 * layout-compatible with struct ktimer_gettime_args -- confirm.
 */
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
	struct ktimer_gettime_args *native_args;

	native_args = (struct ktimer_gettime_args *)args;
	return (ktimer_gettime(td, native_args));
}
1237
/*
 * Linux timer_getoverrun entry point: forwards the call to
 * ktimer_getoverrun() and returns its result unchanged.
 *
 * NOTE(review): the cast assumes struct linux_timer_getoverrun_args is
 * layout-compatible with struct ktimer_getoverrun_args -- confirm.
 */
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
	struct ktimer_getoverrun_args *native_args;

	native_args = (struct ktimer_getoverrun_args *)args;
	return (ktimer_getoverrun(td, native_args));
}
1243
/*
 * Linux timer_delete entry point: forwards the call to ktimer_delete()
 * and returns its result unchanged.
 *
 * NOTE(review): the cast assumes struct linux_timer_delete_args is
 * layout-compatible with struct ktimer_delete_args -- confirm.
 */
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
	struct ktimer_delete_args *native_args;

	native_args = (struct ktimer_delete_args *)args;
	return (ktimer_delete(td, native_args));
}
1249
/* XXX: this won't work with a module - convert it */
/*
 * Linux mq_open entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to kmq_open() and
 * its result is returned; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_open_args *native_args;

	native_args = (struct kmq_open_args *)args;
	return (kmq_open(td, native_args));
#endif
}
1260
/*
 * Linux mq_unlink entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to kmq_unlink()
 * and its result is returned; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_unlink_args *native_args;

	native_args = (struct kmq_unlink_args *)args;
	return (kmq_unlink(td, native_args));
#endif
}
1270
/*
 * Linux mq_timedsend entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to kmq_timedsend()
 * and its result is returned; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_timedsend_args *native_args;

	native_args = (struct kmq_timedsend_args *)args;
	return (kmq_timedsend(td, native_args));
#endif
}
1280
/*
 * Linux mq_timedreceive entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to
 * kmq_timedreceive() and its result is returned; otherwise the syscall
 * reports ENOSYS.
 */
int
linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_timedreceive_args *native_args;

	native_args = (struct kmq_timedreceive_args *)args;
	return (kmq_timedreceive(td, native_args));
#endif
}
1290
/*
 * Linux mq_notify entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to kmq_notify()
 * and its result is returned; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_notify_args *native_args;

	native_args = (struct kmq_notify_args *)args;
	return (kmq_notify(td, native_args));
#endif
}
1300
/*
 * Linux mq_getsetattr entry point.
 *
 * When P1003_1B_MQUEUE is defined the call is forwarded to kmq_setattr()
 * and its result is returned; otherwise the syscall reports ENOSYS.
 */
int
linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
{
#ifndef P1003_1B_MQUEUE
	/* No P1003_1B_MQUEUE: report the syscall as unimplemented. */
	return (ENOSYS);
#else
	struct kmq_setattr_args *native_args;

	native_args = (struct kmq_setattr_args *)args;
	return (kmq_setattr(td, native_args));
#endif
}
1310
1311