linux32_machdep.c revision 163372
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 163372 2006-10-15 13:25:23Z netchild $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/clock.h>
38#include <sys/imgact.h>
39#include <sys/limits.h>
40#include <sys/lock.h>
41#include <sys/malloc.h>
42#include <sys/mman.h>
43#include <sys/mutex.h>
44#include <sys/proc.h>
45#include <sys/resource.h>
46#include <sys/resourcevar.h>
47#include <sys/syscallsubr.h>
48#include <sys/sysproto.h>
49#include <sys/unistd.h>
50
51#include <machine/frame.h>
52
53#include <vm/vm.h>
54#include <vm/pmap.h>
55#include <vm/vm_extern.h>
56#include <vm/vm_kern.h>
57#include <vm/vm_map.h>
58
59#include <amd64/linux32/linux.h>
60#include <amd64/linux32/linux32_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66struct l_old_select_argv {
67	l_int		nfds;
68	l_uintptr_t	readfds;
69	l_uintptr_t	writefds;
70	l_uintptr_t	exceptfds;
71	l_uintptr_t	timeout;
72} __packed;
73
74int
75linux_to_bsd_sigaltstack(int lsa)
76{
77	int bsa = 0;
78
79	if (lsa & LINUX_SS_DISABLE)
80		bsa |= SS_DISABLE;
81	if (lsa & LINUX_SS_ONSTACK)
82		bsa |= SS_ONSTACK;
83	return (bsa);
84}
85
86int
87bsd_to_linux_sigaltstack(int bsa)
88{
89	int lsa = 0;
90
91	if (bsa & SS_DISABLE)
92		lsa |= LINUX_SS_DISABLE;
93	if (bsa & SS_ONSTACK)
94		lsa |= LINUX_SS_ONSTACK;
95	return (lsa);
96}
97
98/*
99 * Custom version of exec_copyin_args() so that we can translate
100 * the pointers.
101 */
102static int
103linux_exec_copyin_args(struct image_args *args, char *fname,
104    enum uio_seg segflg, char **argv, char **envv)
105{
106	char *argp, *envp;
107	u_int32_t *p32, arg;
108	size_t length;
109	int error;
110
111	bzero(args, sizeof(*args));
112	if (argv == NULL)
113		return (EFAULT);
114
115	/*
116	 * Allocate temporary demand zeroed space for argument and
117	 *	environment strings
118	 */
119	args->buf = (char *) kmem_alloc_wait(exec_map,
120	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
121	if (args->buf == NULL)
122		return (ENOMEM);
123	args->begin_argv = args->buf;
124	args->endp = args->begin_argv;
125	args->stringspace = ARG_MAX;
126
127	args->fname = args->buf + ARG_MAX;
128
129	/*
130	 * Copy the file name.
131	 */
132	error = (segflg == UIO_SYSSPACE) ?
133	    copystr(fname, args->fname, PATH_MAX, &length) :
134	    copyinstr(fname, args->fname, PATH_MAX, &length);
135	if (error != 0)
136		goto err_exit;
137
138	/*
139	 * extract arguments first
140	 */
141	p32 = (u_int32_t *)argv;
142	for (;;) {
143		error = copyin(p32++, &arg, sizeof(arg));
144		if (error)
145			goto err_exit;
146		if (arg == 0)
147			break;
148		argp = PTRIN(arg);
149		error = copyinstr(argp, args->endp, args->stringspace, &length);
150		if (error) {
151			if (error == ENAMETOOLONG)
152				error = E2BIG;
153
154			goto err_exit;
155		}
156		args->stringspace -= length;
157		args->endp += length;
158		args->argc++;
159	}
160
161	args->begin_envv = args->endp;
162
163	/*
164	 * extract environment strings
165	 */
166	if (envv) {
167		p32 = (u_int32_t *)envv;
168		for (;;) {
169			error = copyin(p32++, &arg, sizeof(arg));
170			if (error)
171				goto err_exit;
172			if (arg == 0)
173				break;
174			envp = PTRIN(arg);
175			error = copyinstr(envp, args->endp, args->stringspace,
176			    &length);
177			if (error) {
178				if (error == ENAMETOOLONG)
179					error = E2BIG;
180				goto err_exit;
181			}
182			args->stringspace -= length;
183			args->endp += length;
184			args->envc++;
185		}
186	}
187
188	return (0);
189
190err_exit:
191	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
192	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
193	args->buf = NULL;
194	return (error);
195}
196
197int
198linux_execve(struct thread *td, struct linux_execve_args *args)
199{
200	struct image_args eargs;
201	char *path;
202	int error;
203
204	LCONVPATHEXIST(td, args->path, &path);
205
206#ifdef DEBUG
207	if (ldebug(execve))
208		printf(ARGS(execve, "%s"), path);
209#endif
210
211	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
212	    args->envp);
213	free(path, M_TEMP);
214	if (error == 0)
215		error = kern_execve(td, &eargs, NULL);
216	if (error == 0)
217	   	/* linux process can exec fbsd one, dont attempt
218		 * to create emuldata for such process using
219		 * linux_proc_init, this leads to a panic on KASSERT
220		 * because such process has p->p_emuldata == NULL
221		 */
222	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
223   		   	error = linux_proc_init(td, 0, 0);
224	return (error);
225}
226
227struct iovec32 {
228	u_int32_t iov_base;
229	int	iov_len;
230};
231
232CTASSERT(sizeof(struct iovec32) == 8);
233
234static int
235linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
236{
237	struct iovec32 iov32;
238	struct iovec *iov;
239	struct uio *uio;
240	u_int iovlen;
241	int error, i;
242
243	*uiop = NULL;
244	if (iovcnt > UIO_MAXIOV)
245		return (EINVAL);
246	iovlen = iovcnt * sizeof(struct iovec);
247	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
248	iov = (struct iovec *)(uio + 1);
249	for (i = 0; i < iovcnt; i++) {
250		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
251		if (error) {
252			free(uio, M_IOV);
253			return (error);
254		}
255		iov[i].iov_base = PTRIN(iov32.iov_base);
256		iov[i].iov_len = iov32.iov_len;
257	}
258	uio->uio_iov = iov;
259	uio->uio_iovcnt = iovcnt;
260	uio->uio_segflg = UIO_USERSPACE;
261	uio->uio_offset = -1;
262	uio->uio_resid = 0;
263	for (i = 0; i < iovcnt; i++) {
264		if (iov->iov_len > INT_MAX - uio->uio_resid) {
265			free(uio, M_IOV);
266			return (EINVAL);
267		}
268		uio->uio_resid += iov->iov_len;
269		iov++;
270	}
271	*uiop = uio;
272	return (0);
273}
274
275int
276linux_readv(struct thread *td, struct linux_readv_args *uap)
277{
278	struct uio *auio;
279	int error;
280
281	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
282	if (error)
283		return (error);
284	error = kern_readv(td, uap->fd, auio);
285	free(auio, M_IOV);
286	return (error);
287}
288
289int
290linux_writev(struct thread *td, struct linux_writev_args *uap)
291{
292	struct uio *auio;
293	int error;
294
295	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
296	if (error)
297		return (error);
298	error = kern_writev(td, uap->fd, auio);
299	free(auio, M_IOV);
300	return (error);
301}
302
303struct l_ipc_kludge {
304	l_uintptr_t msgp;
305	l_long msgtyp;
306} __packed;
307
308int
309linux_ipc(struct thread *td, struct linux_ipc_args *args)
310{
311
312	switch (args->what & 0xFFFF) {
313	case LINUX_SEMOP: {
314		struct linux_semop_args a;
315
316		a.semid = args->arg1;
317		a.tsops = args->ptr;
318		a.nsops = args->arg2;
319		return (linux_semop(td, &a));
320	}
321	case LINUX_SEMGET: {
322		struct linux_semget_args a;
323
324		a.key = args->arg1;
325		a.nsems = args->arg2;
326		a.semflg = args->arg3;
327		return (linux_semget(td, &a));
328	}
329	case LINUX_SEMCTL: {
330		struct linux_semctl_args a;
331		int error;
332
333		a.semid = args->arg1;
334		a.semnum = args->arg2;
335		a.cmd = args->arg3;
336		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
337		if (error)
338			return (error);
339		return (linux_semctl(td, &a));
340	}
341	case LINUX_MSGSND: {
342		struct linux_msgsnd_args a;
343
344		a.msqid = args->arg1;
345		a.msgp = args->ptr;
346		a.msgsz = args->arg2;
347		a.msgflg = args->arg3;
348		return (linux_msgsnd(td, &a));
349	}
350	case LINUX_MSGRCV: {
351		struct linux_msgrcv_args a;
352
353		a.msqid = args->arg1;
354		a.msgsz = args->arg2;
355		a.msgflg = args->arg3;
356		if ((args->what >> 16) == 0) {
357			struct l_ipc_kludge tmp;
358			int error;
359
360			if (args->ptr == 0)
361				return (EINVAL);
362			error = copyin(args->ptr, &tmp, sizeof(tmp));
363			if (error)
364				return (error);
365			a.msgp = PTRIN(tmp.msgp);
366			a.msgtyp = tmp.msgtyp;
367		} else {
368			a.msgp = args->ptr;
369			a.msgtyp = args->arg5;
370		}
371		return (linux_msgrcv(td, &a));
372	}
373	case LINUX_MSGGET: {
374		struct linux_msgget_args a;
375
376		a.key = args->arg1;
377		a.msgflg = args->arg2;
378		return (linux_msgget(td, &a));
379	}
380	case LINUX_MSGCTL: {
381		struct linux_msgctl_args a;
382
383		a.msqid = args->arg1;
384		a.cmd = args->arg2;
385		a.buf = args->ptr;
386		return (linux_msgctl(td, &a));
387	}
388	case LINUX_SHMAT: {
389		struct linux_shmat_args a;
390
391		a.shmid = args->arg1;
392		a.shmaddr = args->ptr;
393		a.shmflg = args->arg2;
394		a.raddr = PTRIN((l_uint)args->arg3);
395		return (linux_shmat(td, &a));
396	}
397	case LINUX_SHMDT: {
398		struct linux_shmdt_args a;
399
400		a.shmaddr = args->ptr;
401		return (linux_shmdt(td, &a));
402	}
403	case LINUX_SHMGET: {
404		struct linux_shmget_args a;
405
406		a.key = args->arg1;
407		a.size = args->arg2;
408		a.shmflg = args->arg3;
409		return (linux_shmget(td, &a));
410	}
411	case LINUX_SHMCTL: {
412		struct linux_shmctl_args a;
413
414		a.shmid = args->arg1;
415		a.cmd = args->arg2;
416		a.buf = args->ptr;
417		return (linux_shmctl(td, &a));
418	}
419	default:
420		break;
421	}
422
423	return (EINVAL);
424}
425
426int
427linux_old_select(struct thread *td, struct linux_old_select_args *args)
428{
429	struct l_old_select_argv linux_args;
430	struct linux_select_args newsel;
431	int error;
432
433#ifdef DEBUG
434	if (ldebug(old_select))
435		printf(ARGS(old_select, "%p"), args->ptr);
436#endif
437
438	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
439	if (error)
440		return (error);
441
442	newsel.nfds = linux_args.nfds;
443	newsel.readfds = PTRIN(linux_args.readfds);
444	newsel.writefds = PTRIN(linux_args.writefds);
445	newsel.exceptfds = PTRIN(linux_args.exceptfds);
446	newsel.timeout = PTRIN(linux_args.timeout);
447	return (linux_select(td, &newsel));
448}
449
450int
451linux_fork(struct thread *td, struct linux_fork_args *args)
452{
453	int error;
454
455#ifdef DEBUG
456	if (ldebug(fork))
457		printf(ARGS(fork, ""));
458#endif
459
460	if ((error = fork(td, (struct fork_args *)args)) != 0)
461		return (error);
462
463	if (td->td_retval[1] == 1)
464		td->td_retval[0] = 0;
465	error = linux_proc_init(td, td->td_retval[0], 0);
466	if (error)
467		return (error);
468
469	return (0);
470}
471
472int
473linux_vfork(struct thread *td, struct linux_vfork_args *args)
474{
475	int error;
476	struct proc *p2;
477
478#ifdef DEBUG
479	if (ldebug(vfork))
480		printf(ARGS(vfork, ""));
481#endif
482
483	/* exclude RFPPWAIT */
484	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
485		return (error);
486	if (error == 0) {
487	   	td->td_retval[0] = p2->p_pid;
488		td->td_retval[1] = 0;
489	}
490	/* Are we the child? */
491	if (td->td_retval[1] == 1)
492		td->td_retval[0] = 0;
493	error = linux_proc_init(td, td->td_retval[0], 0);
494	if (error)
495		return (error);
496	/* wait for the children to exit, ie. emulate vfork */
497	PROC_LOCK(p2);
498	p2->p_flag |= P_PPWAIT;
499	while (p2->p_flag & P_PPWAIT)
500	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
501	PROC_UNLOCK(p2);
502	return (0);
503}
504
505int
506linux_clone(struct thread *td, struct linux_clone_args *args)
507{
508	int error, ff = RFPROC | RFSTOPPED;
509	struct proc *p2;
510	struct thread *td2;
511	int exit_signal;
512	struct linux_emuldata *em;
513
514#ifdef DEBUG
515	if (ldebug(clone)) {
516   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
517		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
518		    (unsigned int)(uintptr_t)args->parent_tidptr,
519		    (unsigned int)(uintptr_t)args->child_tidptr);
520	}
521#endif
522
523	exit_signal = args->flags & 0x000000ff;
524	if (exit_signal >= LINUX_NSIG)
525		return (EINVAL);
526
527	if (exit_signal <= LINUX_SIGTBLSZ)
528		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
529
530	if (args->flags & CLONE_VM)
531		ff |= RFMEM;
532	if (args->flags & CLONE_SIGHAND)
533		ff |= RFSIGSHARE;
534	/*
535	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
536	 * and open files is independant. in fbsd its in one
537	 * structure but in reality it doesnt make any problems
538	 * because both this flags are set at once usually.
539	 */
540	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
541		ff |= RFFDG;
542
543	/*
544	 * Attempt to detect when linux_clone(2) is used for creating
545	 * kernel threads. Unfortunately despite the existence of the
546	 * CLONE_THREAD flag, version of linuxthreads package used in
547	 * most popular distros as of beginning of 2005 doesn't make
548	 * any use of it. Therefore, this detection relay fully on
549	 * empirical observation that linuxthreads sets certain
550	 * combination of flags, so that we can make more or less
551	 * precise detection and notify the FreeBSD kernel that several
552	 * processes are in fact part of the same threading group, so
553	 * that special treatment is necessary for signal delivery
554	 * between those processes and fd locking.
555	 */
556	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
557		ff |= RFTHREAD;
558
559	error = fork1(td, ff, 0, &p2);
560	if (error)
561		return (error);
562
563	/* create the emuldata */
564	error = linux_proc_init(td, p2->p_pid, args->flags);
565	/* reference it - no need to check this */
566	em = em_find(p2, EMUL_UNLOCKED);
567	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
568	/* and adjust it */
569	if (args->flags & CLONE_PARENT_SETTID) {
570	   	if (args->parent_tidptr == NULL) {
571		   	EMUL_UNLOCK(&emul_lock);
572			return (EINVAL);
573		}
574		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
575		if (error) {
576		   	EMUL_UNLOCK(&emul_lock);
577			return (error);
578		}
579	}
580
581	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
582	   	sx_xlock(&proctree_lock);
583		PROC_LOCK(p2);
584		proc_reparent(p2, td->td_proc->p_pptr);
585		PROC_UNLOCK(p2);
586		sx_xunlock(&proctree_lock);
587	}
588
589	if (args->flags & CLONE_THREAD) {
590	   	/* XXX: linux mangles pgrp and pptr somehow
591		 * I think it might be this but I am not sure.
592		 */
593#ifdef notyet
594	   	PROC_LOCK(p2);
595	   	p2->p_pgrp = td->td_proc->p_pgrp;
596	   	PROC_UNLOCK(p2);
597#endif
598	 	exit_signal = 0;
599	}
600
601	if (args->flags & CLONE_CHILD_SETTID)
602		em->child_set_tid = args->child_tidptr;
603	else
604	   	em->child_set_tid = NULL;
605
606	if (args->flags & CLONE_CHILD_CLEARTID)
607		em->child_clear_tid = args->child_tidptr;
608	else
609	   	em->child_clear_tid = NULL;
610
611	EMUL_UNLOCK(&emul_lock);
612
613	PROC_LOCK(p2);
614	p2->p_sigparent = exit_signal;
615	PROC_UNLOCK(p2);
616	td2 = FIRST_THREAD_IN_PROC(p2);
617	/*
618	 * in a case of stack = NULL we are supposed to COW calling process stack
619	 * this is what normal fork() does so we just keep the tf_rsp arg intact
620	 */
621	if (args->stack)
622   	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
623
624	if (args->flags & CLONE_SETTLS) {
625	   	/* XXX: todo */
626	}
627
628#ifdef DEBUG
629	if (ldebug(clone))
630		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
631		    (long)p2->p_pid, args->stack, exit_signal);
632#endif
633
634	/*
635	 * Make this runnable after we are finished with it.
636	 */
637	mtx_lock_spin(&sched_lock);
638	TD_SET_CAN_RUN(td2);
639	setrunqueue(td2, SRQ_BORING);
640	mtx_unlock_spin(&sched_lock);
641
642	td->td_retval[0] = p2->p_pid;
643	td->td_retval[1] = 0;
644	return (0);
645}
646
647/* XXX move */
648struct l_mmap_argv {
649	l_ulong		addr;
650	l_ulong		len;
651	l_ulong		prot;
652	l_ulong		flags;
653	l_ulong		fd;
654	l_ulong		pgoff;
655};
656
657#define STACK_SIZE  (2 * 1024 * 1024)
658#define GUARD_SIZE  (4 * PAGE_SIZE)
659
660static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
661
662int
663linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
664{
665	struct l_mmap_argv linux_args;
666
667#ifdef DEBUG
668	if (ldebug(mmap2))
669		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
670		    (void *)(intptr_t)args->addr, args->len, args->prot,
671		    args->flags, args->fd, args->pgoff);
672#endif
673
674	linux_args.addr = PTROUT(args->addr);
675	linux_args.len = args->len;
676	linux_args.prot = args->prot;
677	linux_args.flags = args->flags;
678	linux_args.fd = args->fd;
679	linux_args.pgoff = args->pgoff;
680
681	return (linux_mmap_common(td, &linux_args));
682}
683
684int
685linux_mmap(struct thread *td, struct linux_mmap_args *args)
686{
687	int error;
688	struct l_mmap_argv linux_args;
689
690	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
691	if (error)
692		return (error);
693
694#ifdef DEBUG
695	if (ldebug(mmap))
696		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
697		    (void *)(intptr_t)linux_args.addr, linux_args.len,
698		    linux_args.prot, linux_args.flags, linux_args.fd,
699		    linux_args.pgoff);
700#endif
701	if ((linux_args.pgoff % PAGE_SIZE) != 0)
702		return (EINVAL);
703	linux_args.pgoff /= PAGE_SIZE;
704
705	return (linux_mmap_common(td, &linux_args));
706}
707
708static int
709linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
710{
711	struct proc *p = td->td_proc;
712	struct mmap_args /* {
713		caddr_t addr;
714		size_t len;
715		int prot;
716		int flags;
717		int fd;
718		long pad;
719		off_t pos;
720	} */ bsd_args;
721	int error;
722
723	error = 0;
724	bsd_args.flags = 0;
725	if (linux_args->flags & LINUX_MAP_SHARED)
726		bsd_args.flags |= MAP_SHARED;
727	if (linux_args->flags & LINUX_MAP_PRIVATE)
728		bsd_args.flags |= MAP_PRIVATE;
729	if (linux_args->flags & LINUX_MAP_FIXED)
730		bsd_args.flags |= MAP_FIXED;
731	if (linux_args->flags & LINUX_MAP_ANON)
732		bsd_args.flags |= MAP_ANON;
733	else
734		bsd_args.flags |= MAP_NOSYNC;
735	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
736		bsd_args.flags |= MAP_STACK;
737
738		/*
739		 * The linux MAP_GROWSDOWN option does not limit auto
740		 * growth of the region.  Linux mmap with this option
741		 * takes as addr the inital BOS, and as len, the initial
742		 * region size.  It can then grow down from addr without
743		 * limit.  However, linux threads has an implicit internal
744		 * limit to stack size of STACK_SIZE.  Its just not
745		 * enforced explicitly in linux.  But, here we impose
746		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
747		 * region, since we can do this with our mmap.
748		 *
749		 * Our mmap with MAP_STACK takes addr as the maximum
750		 * downsize limit on BOS, and as len the max size of
751		 * the region.  It them maps the top SGROWSIZ bytes,
752		 * and autgrows the region down, up to the limit
753		 * in addr.
754		 *
755		 * If we don't use the MAP_STACK option, the effect
756		 * of this code is to allocate a stack region of a
757		 * fixed size of (STACK_SIZE - GUARD_SIZE).
758		 */
759
760		/* This gives us TOS */
761		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
762		    linux_args->len;
763
764		if ((caddr_t)PTRIN(bsd_args.addr) >
765		    p->p_vmspace->vm_maxsaddr) {
766			/*
767			 * Some linux apps will attempt to mmap
768			 * thread stacks near the top of their
769			 * address space.  If their TOS is greater
770			 * than vm_maxsaddr, vm_map_growstack()
771			 * will confuse the thread stack with the
772			 * process stack and deliver a SEGV if they
773			 * attempt to grow the thread stack past their
774			 * current stacksize rlimit.  To avoid this,
775			 * adjust vm_maxsaddr upwards to reflect
776			 * the current stacksize rlimit rather
777			 * than the maximum possible stacksize.
778			 * It would be better to adjust the
779			 * mmap'ed region, but some apps do not check
780			 * mmap's return value.
781			 */
782			PROC_LOCK(p);
783			p->p_vmspace->vm_maxsaddr =
784			    (char *)LINUX32_USRSTACK -
785			    lim_cur(p, RLIMIT_STACK);
786			PROC_UNLOCK(p);
787		}
788
789		/* This gives us our maximum stack size */
790		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
791			bsd_args.len = linux_args->len;
792		else
793			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
794
795		/*
796		 * This gives us a new BOS.  If we're using VM_STACK, then
797		 * mmap will just map the top SGROWSIZ bytes, and let
798		 * the stack grow down to the limit at BOS.  If we're
799		 * not using VM_STACK we map the full stack, since we
800		 * don't have a way to autogrow it.
801		 */
802		bsd_args.addr -= bsd_args.len;
803	} else {
804		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
805		bsd_args.len  = linux_args->len;
806	}
807	/*
808	 * XXX i386 Linux always emulator forces PROT_READ on (why?)
809	 * so we do the same. We add PROT_EXEC to work around buggy
810	 * applications (e.g. Java) that take advantage of the fact
811	 * that execute permissions are not enforced by x86 CPUs.
812	 */
813	bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ;
814	if (linux_args->flags & LINUX_MAP_ANON)
815		bsd_args.fd = -1;
816	else
817		bsd_args.fd = linux_args->fd;
818	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
819	bsd_args.pad = 0;
820
821#ifdef DEBUG
822	if (ldebug(mmap))
823		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
824		    __func__,
825		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
826		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
827#endif
828	error = mmap(td, &bsd_args);
829#ifdef DEBUG
830	if (ldebug(mmap))
831		printf("-> %s() return: 0x%x (0x%08x)\n",
832			__func__, error, (u_int)td->td_retval[0]);
833#endif
834	return (error);
835}
836
837int
838linux_pipe(struct thread *td, struct linux_pipe_args *args)
839{
840	int pip[2];
841	int error;
842	register_t reg_rdx;
843
844#ifdef DEBUG
845	if (ldebug(pipe))
846		printf(ARGS(pipe, "*"));
847#endif
848
849	reg_rdx = td->td_retval[1];
850	error = pipe(td, 0);
851	if (error) {
852		td->td_retval[1] = reg_rdx;
853		return (error);
854	}
855
856	pip[0] = td->td_retval[0];
857	pip[1] = td->td_retval[1];
858	error = copyout(pip, args->pipefds, 2 * sizeof(int));
859	if (error) {
860		td->td_retval[1] = reg_rdx;
861		return (error);
862	}
863
864	td->td_retval[1] = reg_rdx;
865	td->td_retval[0] = 0;
866	return (0);
867}
868
869int
870linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
871{
872	l_osigaction_t osa;
873	l_sigaction_t act, oact;
874	int error;
875
876#ifdef DEBUG
877	if (ldebug(sigaction))
878		printf(ARGS(sigaction, "%d, %p, %p"),
879		    args->sig, (void *)args->nsa, (void *)args->osa);
880#endif
881
882	if (args->nsa != NULL) {
883		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
884		if (error)
885			return (error);
886		act.lsa_handler = osa.lsa_handler;
887		act.lsa_flags = osa.lsa_flags;
888		act.lsa_restorer = osa.lsa_restorer;
889		LINUX_SIGEMPTYSET(act.lsa_mask);
890		act.lsa_mask.__bits[0] = osa.lsa_mask;
891	}
892
893	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
894	    args->osa ? &oact : NULL);
895
896	if (args->osa != NULL && !error) {
897		osa.lsa_handler = oact.lsa_handler;
898		osa.lsa_flags = oact.lsa_flags;
899		osa.lsa_restorer = oact.lsa_restorer;
900		osa.lsa_mask = oact.lsa_mask.__bits[0];
901		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
902	}
903
904	return (error);
905}
906
907/*
908 * Linux has two extra args, restart and oldmask.  We dont use these,
909 * but it seems that "restart" is actually a context pointer that
910 * enables the signal to happen with a different register set.
911 */
912int
913linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
914{
915	sigset_t sigmask;
916	l_sigset_t mask;
917
918#ifdef DEBUG
919	if (ldebug(sigsuspend))
920		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
921#endif
922
923	LINUX_SIGEMPTYSET(mask);
924	mask.__bits[0] = args->mask;
925	linux_to_bsd_sigset(&mask, &sigmask);
926	return (kern_sigsuspend(td, sigmask));
927}
928
929int
930linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
931{
932	l_sigset_t lmask;
933	sigset_t sigmask;
934	int error;
935
936#ifdef DEBUG
937	if (ldebug(rt_sigsuspend))
938		printf(ARGS(rt_sigsuspend, "%p, %d"),
939		    (void *)uap->newset, uap->sigsetsize);
940#endif
941
942	if (uap->sigsetsize != sizeof(l_sigset_t))
943		return (EINVAL);
944
945	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
946	if (error)
947		return (error);
948
949	linux_to_bsd_sigset(&lmask, &sigmask);
950	return (kern_sigsuspend(td, sigmask));
951}
952
953int
954linux_pause(struct thread *td, struct linux_pause_args *args)
955{
956	struct proc *p = td->td_proc;
957	sigset_t sigmask;
958
959#ifdef DEBUG
960	if (ldebug(pause))
961		printf(ARGS(pause, ""));
962#endif
963
964	PROC_LOCK(p);
965	sigmask = td->td_sigmask;
966	PROC_UNLOCK(p);
967	return (kern_sigsuspend(td, sigmask));
968}
969
970int
971linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
972{
973	stack_t ss, oss;
974	l_stack_t lss;
975	int error;
976
977#ifdef DEBUG
978	if (ldebug(sigaltstack))
979		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
980#endif
981
982	if (uap->uss != NULL) {
983		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
984		if (error)
985			return (error);
986
987		ss.ss_sp = PTRIN(lss.ss_sp);
988		ss.ss_size = lss.ss_size;
989		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
990	}
991	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
992	    (uap->uoss != NULL) ? &oss : NULL);
993	if (!error && uap->uoss != NULL) {
994		lss.ss_sp = PTROUT(oss.ss_sp);
995		lss.ss_size = oss.ss_size;
996		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
997		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
998	}
999
1000	return (error);
1001}
1002
1003int
1004linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1005{
1006	struct ftruncate_args sa;
1007
1008#ifdef DEBUG
1009	if (ldebug(ftruncate64))
1010		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1011		    (intmax_t)args->length);
1012#endif
1013
1014	sa.fd = args->fd;
1015	sa.pad = 0;
1016	sa.length = args->length;
1017	return ftruncate(td, &sa);
1018}
1019
1020int
1021linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1022{
1023	struct timeval atv;
1024	l_timeval atv32;
1025	struct timezone rtz;
1026	int error = 0;
1027
1028	if (uap->tp) {
1029		microtime(&atv);
1030		atv32.tv_sec = atv.tv_sec;
1031		atv32.tv_usec = atv.tv_usec;
1032		error = copyout(&atv32, uap->tp, sizeof (atv32));
1033	}
1034	if (error == 0 && uap->tzp != NULL) {
1035		rtz.tz_minuteswest = tz_minuteswest;
1036		rtz.tz_dsttime = tz_dsttime;
1037		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1038	}
1039	return (error);
1040}
1041
1042int
1043linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap)
1044{
1045	struct timespec rqt, rmt;
1046	struct l_timespec ats32;
1047	int error;
1048
1049	error = copyin(uap->rqtp, &ats32, sizeof(ats32));
1050	if (error != 0)
1051		return (error);
1052	rqt.tv_sec = ats32.tv_sec;
1053	rqt.tv_nsec = ats32.tv_nsec;
1054	error = kern_nanosleep(td, &rqt, &rmt);
1055	if (uap->rmtp != NULL) {
1056		ats32.tv_sec = rmt.tv_sec;
1057		ats32.tv_nsec = rmt.tv_nsec;
1058		error = copyout(&ats32, uap->rmtp, sizeof(ats32));
1059	}
1060	return (error);
1061}
1062
1063int
1064linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1065{
1066	struct l_rusage s32;
1067	struct rusage s;
1068	int error;
1069
1070	error = kern_getrusage(td, uap->who, &s);
1071	if (error != 0)
1072		return (error);
1073	if (uap->rusage != NULL) {
1074		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1075		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1076		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1077		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1078		s32.ru_maxrss = s.ru_maxrss;
1079		s32.ru_ixrss = s.ru_ixrss;
1080		s32.ru_idrss = s.ru_idrss;
1081		s32.ru_isrss = s.ru_isrss;
1082		s32.ru_minflt = s.ru_minflt;
1083		s32.ru_majflt = s.ru_majflt;
1084		s32.ru_nswap = s.ru_nswap;
1085		s32.ru_inblock = s.ru_inblock;
1086		s32.ru_oublock = s.ru_oublock;
1087		s32.ru_msgsnd = s.ru_msgsnd;
1088		s32.ru_msgrcv = s.ru_msgrcv;
1089		s32.ru_nsignals = s.ru_nsignals;
1090		s32.ru_nvcsw = s.ru_nvcsw;
1091		s32.ru_nivcsw = s.ru_nivcsw;
1092		error = copyout(&s32, uap->rusage, sizeof(s32));
1093	}
1094	return (error);
1095}
1096
1097int
1098linux_sched_rr_get_interval(struct thread *td,
1099    struct linux_sched_rr_get_interval_args *uap)
1100{
1101	struct timespec ts;
1102	struct l_timespec ts32;
1103	int error;
1104
1105	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1106	if (error != 0)
1107		return (error);
1108	ts32.tv_sec = ts.tv_sec;
1109	ts32.tv_nsec = ts.tv_nsec;
1110	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1111}
1112
1113int
1114linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1115{
1116	struct mprotect_args bsd_args;
1117
1118	bsd_args.addr = uap->addr;
1119	bsd_args.len = uap->len;
1120	bsd_args.prot = uap->prot;
1121	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1122	if ((bsd_args.prot & PROT_READ) != 0)
1123		bsd_args.prot |= PROT_EXEC;
1124	return (mprotect(td, &bsd_args));
1125}
1126