linux32_machdep.c revision 163371
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 163371 2006-10-15 13:22:14Z netchild $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/clock.h>
38#include <sys/imgact.h>
39#include <sys/limits.h>
40#include <sys/lock.h>
41#include <sys/malloc.h>
42#include <sys/mman.h>
43#include <sys/mutex.h>
44#include <sys/proc.h>
45#include <sys/resource.h>
46#include <sys/resourcevar.h>
47#include <sys/syscallsubr.h>
48#include <sys/sysproto.h>
49#include <sys/unistd.h>
50
51#include <machine/frame.h>
52
53#include <vm/vm.h>
54#include <vm/pmap.h>
55#include <vm/vm_extern.h>
56#include <vm/vm_kern.h>
57#include <vm/vm_map.h>
58
59#include <amd64/linux32/linux.h>
60#include <amd64/linux32/linux32_proto.h>
61#include <compat/linux/linux_ipc.h>
62#include <compat/linux/linux_signal.h>
63#include <compat/linux/linux_util.h>
64#include <compat/linux/linux_emul.h>
65
66struct l_old_select_argv {
67	l_int		nfds;
68	l_uintptr_t	readfds;
69	l_uintptr_t	writefds;
70	l_uintptr_t	exceptfds;
71	l_uintptr_t	timeout;
72} __packed;
73
74int
75linux_to_bsd_sigaltstack(int lsa)
76{
77	int bsa = 0;
78
79	if (lsa & LINUX_SS_DISABLE)
80		bsa |= SS_DISABLE;
81	if (lsa & LINUX_SS_ONSTACK)
82		bsa |= SS_ONSTACK;
83	return (bsa);
84}
85
86int
87bsd_to_linux_sigaltstack(int bsa)
88{
89	int lsa = 0;
90
91	if (bsa & SS_DISABLE)
92		lsa |= LINUX_SS_DISABLE;
93	if (bsa & SS_ONSTACK)
94		lsa |= LINUX_SS_ONSTACK;
95	return (lsa);
96}
97
98/*
99 * Custom version of exec_copyin_args() so that we can translate
100 * the pointers.
101 */
102static int
103linux_exec_copyin_args(struct image_args *args, char *fname,
104    enum uio_seg segflg, char **argv, char **envv)
105{
106	char *argp, *envp;
107	u_int32_t *p32, arg;
108	size_t length;
109	int error;
110
111	bzero(args, sizeof(*args));
112	if (argv == NULL)
113		return (EFAULT);
114
115	/*
116	 * Allocate temporary demand zeroed space for argument and
117	 *	environment strings
118	 */
119	args->buf = (char *) kmem_alloc_wait(exec_map,
120	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
121	if (args->buf == NULL)
122		return (ENOMEM);
123	args->begin_argv = args->buf;
124	args->endp = args->begin_argv;
125	args->stringspace = ARG_MAX;
126
127	args->fname = args->buf + ARG_MAX;
128
129	/*
130	 * Copy the file name.
131	 */
132	error = (segflg == UIO_SYSSPACE) ?
133	    copystr(fname, args->fname, PATH_MAX, &length) :
134	    copyinstr(fname, args->fname, PATH_MAX, &length);
135	if (error != 0)
136		goto err_exit;
137
138	/*
139	 * extract arguments first
140	 */
141	p32 = (u_int32_t *)argv;
142	for (;;) {
143		error = copyin(p32++, &arg, sizeof(arg));
144		if (error)
145			goto err_exit;
146		if (arg == 0)
147			break;
148		argp = PTRIN(arg);
149		error = copyinstr(argp, args->endp, args->stringspace, &length);
150		if (error) {
151			if (error == ENAMETOOLONG)
152				error = E2BIG;
153
154			goto err_exit;
155		}
156		args->stringspace -= length;
157		args->endp += length;
158		args->argc++;
159	}
160
161	args->begin_envv = args->endp;
162
163	/*
164	 * extract environment strings
165	 */
166	if (envv) {
167		p32 = (u_int32_t *)envv;
168		for (;;) {
169			error = copyin(p32++, &arg, sizeof(arg));
170			if (error)
171				goto err_exit;
172			if (arg == 0)
173				break;
174			envp = PTRIN(arg);
175			error = copyinstr(envp, args->endp, args->stringspace,
176			    &length);
177			if (error) {
178				if (error == ENAMETOOLONG)
179					error = E2BIG;
180				goto err_exit;
181			}
182			args->stringspace -= length;
183			args->endp += length;
184			args->envc++;
185		}
186	}
187
188	return (0);
189
190err_exit:
191	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
192	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
193	args->buf = NULL;
194	return (error);
195}
196
197int
198linux_execve(struct thread *td, struct linux_execve_args *args)
199{
200	struct image_args eargs;
201	char *path;
202	int error;
203
204	LCONVPATHEXIST(td, args->path, &path);
205
206#ifdef DEBUG
207	if (ldebug(execve))
208		printf(ARGS(execve, "%s"), path);
209#endif
210
211	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
212	    args->envp);
213	free(path, M_TEMP);
214	if (error == 0)
215		error = kern_execve(td, &eargs, NULL);
216	if (error == 0)
217	   	/* linux process can exec fbsd one, dont attempt
218		 * to create emuldata for such process using
219		 * linux_proc_init, this leads to a panic on KASSERT
220		 * because such process has p->p_emuldata == NULL
221		 */
222	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
223   		   	error = linux_proc_init(td, 0, 0);
224	return (error);
225}
226
227struct iovec32 {
228	u_int32_t iov_base;
229	int	iov_len;
230};
231
232CTASSERT(sizeof(struct iovec32) == 8);
233
234static int
235linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
236{
237	struct iovec32 iov32;
238	struct iovec *iov;
239	struct uio *uio;
240	u_int iovlen;
241	int error, i;
242
243	*uiop = NULL;
244	if (iovcnt > UIO_MAXIOV)
245		return (EINVAL);
246	iovlen = iovcnt * sizeof(struct iovec);
247	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
248	iov = (struct iovec *)(uio + 1);
249	for (i = 0; i < iovcnt; i++) {
250		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
251		if (error) {
252			free(uio, M_IOV);
253			return (error);
254		}
255		iov[i].iov_base = PTRIN(iov32.iov_base);
256		iov[i].iov_len = iov32.iov_len;
257	}
258	uio->uio_iov = iov;
259	uio->uio_iovcnt = iovcnt;
260	uio->uio_segflg = UIO_USERSPACE;
261	uio->uio_offset = -1;
262	uio->uio_resid = 0;
263	for (i = 0; i < iovcnt; i++) {
264		if (iov->iov_len > INT_MAX - uio->uio_resid) {
265			free(uio, M_IOV);
266			return (EINVAL);
267		}
268		uio->uio_resid += iov->iov_len;
269		iov++;
270	}
271	*uiop = uio;
272	return (0);
273}
274
275int
276linux_readv(struct thread *td, struct linux_readv_args *uap)
277{
278	struct uio *auio;
279	int error;
280
281	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
282	if (error)
283		return (error);
284	error = kern_readv(td, uap->fd, auio);
285	free(auio, M_IOV);
286	return (error);
287}
288
289int
290linux_writev(struct thread *td, struct linux_writev_args *uap)
291{
292	struct uio *auio;
293	int error;
294
295	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
296	if (error)
297		return (error);
298	error = kern_writev(td, uap->fd, auio);
299	free(auio, M_IOV);
300	return (error);
301}
302
303struct l_ipc_kludge {
304	l_uintptr_t msgp;
305	l_long msgtyp;
306} __packed;
307
308int
309linux_ipc(struct thread *td, struct linux_ipc_args *args)
310{
311
312	switch (args->what & 0xFFFF) {
313	case LINUX_SEMOP: {
314		struct linux_semop_args a;
315
316		a.semid = args->arg1;
317		a.tsops = args->ptr;
318		a.nsops = args->arg2;
319		return (linux_semop(td, &a));
320	}
321	case LINUX_SEMGET: {
322		struct linux_semget_args a;
323
324		a.key = args->arg1;
325		a.nsems = args->arg2;
326		a.semflg = args->arg3;
327		return (linux_semget(td, &a));
328	}
329	case LINUX_SEMCTL: {
330		struct linux_semctl_args a;
331		int error;
332
333		a.semid = args->arg1;
334		a.semnum = args->arg2;
335		a.cmd = args->arg3;
336		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
337		if (error)
338			return (error);
339		return (linux_semctl(td, &a));
340	}
341	case LINUX_MSGSND: {
342		struct linux_msgsnd_args a;
343
344		a.msqid = args->arg1;
345		a.msgp = args->ptr;
346		a.msgsz = args->arg2;
347		a.msgflg = args->arg3;
348		return (linux_msgsnd(td, &a));
349	}
350	case LINUX_MSGRCV: {
351		struct linux_msgrcv_args a;
352
353		a.msqid = args->arg1;
354		a.msgsz = args->arg2;
355		a.msgflg = args->arg3;
356		if ((args->what >> 16) == 0) {
357			struct l_ipc_kludge tmp;
358			int error;
359
360			if (args->ptr == 0)
361				return (EINVAL);
362			error = copyin(args->ptr, &tmp, sizeof(tmp));
363			if (error)
364				return (error);
365			a.msgp = PTRIN(tmp.msgp);
366			a.msgtyp = tmp.msgtyp;
367		} else {
368			a.msgp = args->ptr;
369			a.msgtyp = args->arg5;
370		}
371		return (linux_msgrcv(td, &a));
372	}
373	case LINUX_MSGGET: {
374		struct linux_msgget_args a;
375
376		a.key = args->arg1;
377		a.msgflg = args->arg2;
378		return (linux_msgget(td, &a));
379	}
380	case LINUX_MSGCTL: {
381		struct linux_msgctl_args a;
382
383		a.msqid = args->arg1;
384		a.cmd = args->arg2;
385		a.buf = args->ptr;
386		return (linux_msgctl(td, &a));
387	}
388	case LINUX_SHMAT: {
389		struct linux_shmat_args a;
390
391		a.shmid = args->arg1;
392		a.shmaddr = args->ptr;
393		a.shmflg = args->arg2;
394		a.raddr = PTRIN((l_uint)args->arg3);
395		return (linux_shmat(td, &a));
396	}
397	case LINUX_SHMDT: {
398		struct linux_shmdt_args a;
399
400		a.shmaddr = args->ptr;
401		return (linux_shmdt(td, &a));
402	}
403	case LINUX_SHMGET: {
404		struct linux_shmget_args a;
405
406		a.key = args->arg1;
407		a.size = args->arg2;
408		a.shmflg = args->arg3;
409		return (linux_shmget(td, &a));
410	}
411	case LINUX_SHMCTL: {
412		struct linux_shmctl_args a;
413
414		a.shmid = args->arg1;
415		a.cmd = args->arg2;
416		a.buf = args->ptr;
417		return (linux_shmctl(td, &a));
418	}
419	default:
420		break;
421	}
422
423	return (EINVAL);
424}
425
426int
427linux_old_select(struct thread *td, struct linux_old_select_args *args)
428{
429	struct l_old_select_argv linux_args;
430	struct linux_select_args newsel;
431	int error;
432
433#ifdef DEBUG
434	if (ldebug(old_select))
435		printf(ARGS(old_select, "%p"), args->ptr);
436#endif
437
438	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
439	if (error)
440		return (error);
441
442	newsel.nfds = linux_args.nfds;
443	newsel.readfds = PTRIN(linux_args.readfds);
444	newsel.writefds = PTRIN(linux_args.writefds);
445	newsel.exceptfds = PTRIN(linux_args.exceptfds);
446	newsel.timeout = PTRIN(linux_args.timeout);
447	return (linux_select(td, &newsel));
448}
449
450int
451linux_fork(struct thread *td, struct linux_fork_args *args)
452{
453	int error;
454
455#ifdef DEBUG
456	if (ldebug(fork))
457		printf(ARGS(fork, ""));
458#endif
459
460	if ((error = fork(td, (struct fork_args *)args)) != 0)
461		return (error);
462
463	if (td->td_retval[1] == 1)
464		td->td_retval[0] = 0;
465	error = linux_proc_init(td, td->td_retval[0], 0);
466	if (error)
467		return (error);
468
469	return (0);
470}
471
472int
473linux_vfork(struct thread *td, struct linux_vfork_args *args)
474{
475	int error;
476	struct proc *p2;
477
478#ifdef DEBUG
479	if (ldebug(vfork))
480		printf(ARGS(vfork, ""));
481#endif
482
483	/* exclude RFPPWAIT */
484	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
485		return (error);
486	if (error == 0) {
487	   	td->td_retval[0] = p2->p_pid;
488		td->td_retval[1] = 0;
489	}
490	/* Are we the child? */
491	if (td->td_retval[1] == 1)
492		td->td_retval[0] = 0;
493	error = linux_proc_init(td, td->td_retval[0], 0);
494	if (error)
495		return (error);
496	/* wait for the children to exit, ie. emulate vfork */
497	PROC_LOCK(p2);
498	while (p2->p_flag & P_PPWAIT)
499	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
500	PROC_UNLOCK(p2);
501	return (0);
502}
503
504int
505linux_clone(struct thread *td, struct linux_clone_args *args)
506{
507	int error, ff = RFPROC | RFSTOPPED;
508	struct proc *p2;
509	struct thread *td2;
510	int exit_signal;
511	struct linux_emuldata *em;
512
513#ifdef DEBUG
514	if (ldebug(clone)) {
515   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
516		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
517		    (unsigned int)(uintptr_t)args->parent_tidptr,
518		    (unsigned int)(uintptr_t)args->child_tidptr);
519	}
520#endif
521
522	exit_signal = args->flags & 0x000000ff;
523	if (exit_signal >= LINUX_NSIG)
524		return (EINVAL);
525
526	if (exit_signal <= LINUX_SIGTBLSZ)
527		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
528
529	if (args->flags & CLONE_VM)
530		ff |= RFMEM;
531	if (args->flags & CLONE_SIGHAND)
532		ff |= RFSIGSHARE;
533	/*
534	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
535	 * and open files is independant. in fbsd its in one
536	 * structure but in reality it doesnt make any problems
537	 * because both this flags are set at once usually.
538	 */
539	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
540		ff |= RFFDG;
541
542	/*
543	 * Attempt to detect when linux_clone(2) is used for creating
544	 * kernel threads. Unfortunately despite the existence of the
545	 * CLONE_THREAD flag, version of linuxthreads package used in
546	 * most popular distros as of beginning of 2005 doesn't make
547	 * any use of it. Therefore, this detection relay fully on
548	 * empirical observation that linuxthreads sets certain
549	 * combination of flags, so that we can make more or less
550	 * precise detection and notify the FreeBSD kernel that several
551	 * processes are in fact part of the same threading group, so
552	 * that special treatment is necessary for signal delivery
553	 * between those processes and fd locking.
554	 */
555	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
556		ff |= RFTHREAD;
557
558	error = fork1(td, ff, 0, &p2);
559	if (error)
560		return (error);
561
562	/* create the emuldata */
563	error = linux_proc_init(td, p2->p_pid, args->flags);
564	/* reference it - no need to check this */
565	em = em_find(p2, EMUL_UNLOCKED);
566	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
567	/* and adjust it */
568	if (args->flags & CLONE_PARENT_SETTID) {
569	   	if (args->parent_tidptr == NULL) {
570		   	EMUL_UNLOCK(&emul_lock);
571			return (EINVAL);
572		}
573		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
574		if (error) {
575		   	EMUL_UNLOCK(&emul_lock);
576			return (error);
577		}
578	}
579
580	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
581	   	sx_xlock(&proctree_lock);
582		PROC_LOCK(p2);
583		proc_reparent(p2, td->td_proc->p_pptr);
584		PROC_UNLOCK(p2);
585		sx_xunlock(&proctree_lock);
586	}
587
588	if (args->flags & CLONE_THREAD) {
589	   	/* XXX: linux mangles pgrp and pptr somehow
590		 * I think it might be this but I am not sure.
591		 */
592#ifdef notyet
593	   	PROC_LOCK(p2);
594	   	p2->p_pgrp = td->td_proc->p_pgrp;
595	   	PROC_UNLOCK(p2);
596#endif
597	 	exit_signal = 0;
598	}
599
600	if (args->flags & CLONE_CHILD_SETTID)
601		em->child_set_tid = args->child_tidptr;
602	else
603	   	em->child_set_tid = NULL;
604
605	if (args->flags & CLONE_CHILD_CLEARTID)
606		em->child_clear_tid = args->child_tidptr;
607	else
608	   	em->child_clear_tid = NULL;
609
610	EMUL_UNLOCK(&emul_lock);
611
612	PROC_LOCK(p2);
613	p2->p_sigparent = exit_signal;
614	PROC_UNLOCK(p2);
615	td2 = FIRST_THREAD_IN_PROC(p2);
616	/*
617	 * in a case of stack = NULL we are supposed to COW calling process stack
618	 * this is what normal fork() does so we just keep the tf_rsp arg intact
619	 */
620	if (args->stack)
621   	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
622
623	if (args->flags & CLONE_SETTLS) {
624	   	/* XXX: todo */
625	}
626
627#ifdef DEBUG
628	if (ldebug(clone))
629		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
630		    (long)p2->p_pid, args->stack, exit_signal);
631#endif
632
633	/*
634	 * Make this runnable after we are finished with it.
635	 */
636	mtx_lock_spin(&sched_lock);
637	TD_SET_CAN_RUN(td2);
638	setrunqueue(td2, SRQ_BORING);
639	mtx_unlock_spin(&sched_lock);
640
641	td->td_retval[0] = p2->p_pid;
642	td->td_retval[1] = 0;
643	return (0);
644}
645
646/* XXX move */
647struct l_mmap_argv {
648	l_ulong		addr;
649	l_ulong		len;
650	l_ulong		prot;
651	l_ulong		flags;
652	l_ulong		fd;
653	l_ulong		pgoff;
654};
655
656#define STACK_SIZE  (2 * 1024 * 1024)
657#define GUARD_SIZE  (4 * PAGE_SIZE)
658
659static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
660
661int
662linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
663{
664	struct l_mmap_argv linux_args;
665
666#ifdef DEBUG
667	if (ldebug(mmap2))
668		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
669		    (void *)(intptr_t)args->addr, args->len, args->prot,
670		    args->flags, args->fd, args->pgoff);
671#endif
672
673	linux_args.addr = PTROUT(args->addr);
674	linux_args.len = args->len;
675	linux_args.prot = args->prot;
676	linux_args.flags = args->flags;
677	linux_args.fd = args->fd;
678	linux_args.pgoff = args->pgoff;
679
680	return (linux_mmap_common(td, &linux_args));
681}
682
683int
684linux_mmap(struct thread *td, struct linux_mmap_args *args)
685{
686	int error;
687	struct l_mmap_argv linux_args;
688
689	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
690	if (error)
691		return (error);
692
693#ifdef DEBUG
694	if (ldebug(mmap))
695		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
696		    (void *)(intptr_t)linux_args.addr, linux_args.len,
697		    linux_args.prot, linux_args.flags, linux_args.fd,
698		    linux_args.pgoff);
699#endif
700	if ((linux_args.pgoff % PAGE_SIZE) != 0)
701		return (EINVAL);
702	linux_args.pgoff /= PAGE_SIZE;
703
704	return (linux_mmap_common(td, &linux_args));
705}
706
707static int
708linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
709{
710	struct proc *p = td->td_proc;
711	struct mmap_args /* {
712		caddr_t addr;
713		size_t len;
714		int prot;
715		int flags;
716		int fd;
717		long pad;
718		off_t pos;
719	} */ bsd_args;
720	int error;
721
722	error = 0;
723	bsd_args.flags = 0;
724	if (linux_args->flags & LINUX_MAP_SHARED)
725		bsd_args.flags |= MAP_SHARED;
726	if (linux_args->flags & LINUX_MAP_PRIVATE)
727		bsd_args.flags |= MAP_PRIVATE;
728	if (linux_args->flags & LINUX_MAP_FIXED)
729		bsd_args.flags |= MAP_FIXED;
730	if (linux_args->flags & LINUX_MAP_ANON)
731		bsd_args.flags |= MAP_ANON;
732	else
733		bsd_args.flags |= MAP_NOSYNC;
734	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
735		bsd_args.flags |= MAP_STACK;
736
737		/*
738		 * The linux MAP_GROWSDOWN option does not limit auto
739		 * growth of the region.  Linux mmap with this option
740		 * takes as addr the inital BOS, and as len, the initial
741		 * region size.  It can then grow down from addr without
742		 * limit.  However, linux threads has an implicit internal
743		 * limit to stack size of STACK_SIZE.  Its just not
744		 * enforced explicitly in linux.  But, here we impose
745		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
746		 * region, since we can do this with our mmap.
747		 *
748		 * Our mmap with MAP_STACK takes addr as the maximum
749		 * downsize limit on BOS, and as len the max size of
750		 * the region.  It them maps the top SGROWSIZ bytes,
751		 * and autgrows the region down, up to the limit
752		 * in addr.
753		 *
754		 * If we don't use the MAP_STACK option, the effect
755		 * of this code is to allocate a stack region of a
756		 * fixed size of (STACK_SIZE - GUARD_SIZE).
757		 */
758
759		/* This gives us TOS */
760		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
761		    linux_args->len;
762
763		if ((caddr_t)PTRIN(bsd_args.addr) >
764		    p->p_vmspace->vm_maxsaddr) {
765			/*
766			 * Some linux apps will attempt to mmap
767			 * thread stacks near the top of their
768			 * address space.  If their TOS is greater
769			 * than vm_maxsaddr, vm_map_growstack()
770			 * will confuse the thread stack with the
771			 * process stack and deliver a SEGV if they
772			 * attempt to grow the thread stack past their
773			 * current stacksize rlimit.  To avoid this,
774			 * adjust vm_maxsaddr upwards to reflect
775			 * the current stacksize rlimit rather
776			 * than the maximum possible stacksize.
777			 * It would be better to adjust the
778			 * mmap'ed region, but some apps do not check
779			 * mmap's return value.
780			 */
781			PROC_LOCK(p);
782			p->p_vmspace->vm_maxsaddr =
783			    (char *)LINUX32_USRSTACK -
784			    lim_cur(p, RLIMIT_STACK);
785			PROC_UNLOCK(p);
786		}
787
788		/* This gives us our maximum stack size */
789		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
790			bsd_args.len = linux_args->len;
791		else
792			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
793
794		/*
795		 * This gives us a new BOS.  If we're using VM_STACK, then
796		 * mmap will just map the top SGROWSIZ bytes, and let
797		 * the stack grow down to the limit at BOS.  If we're
798		 * not using VM_STACK we map the full stack, since we
799		 * don't have a way to autogrow it.
800		 */
801		bsd_args.addr -= bsd_args.len;
802	} else {
803		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
804		bsd_args.len  = linux_args->len;
805	}
806	/*
807	 * XXX i386 Linux always emulator forces PROT_READ on (why?)
808	 * so we do the same. We add PROT_EXEC to work around buggy
809	 * applications (e.g. Java) that take advantage of the fact
810	 * that execute permissions are not enforced by x86 CPUs.
811	 */
812	bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ;
813	if (linux_args->flags & LINUX_MAP_ANON)
814		bsd_args.fd = -1;
815	else
816		bsd_args.fd = linux_args->fd;
817	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
818	bsd_args.pad = 0;
819
820#ifdef DEBUG
821	if (ldebug(mmap))
822		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
823		    __func__,
824		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
825		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
826#endif
827	error = mmap(td, &bsd_args);
828#ifdef DEBUG
829	if (ldebug(mmap))
830		printf("-> %s() return: 0x%x (0x%08x)\n",
831			__func__, error, (u_int)td->td_retval[0]);
832#endif
833	return (error);
834}
835
836int
837linux_pipe(struct thread *td, struct linux_pipe_args *args)
838{
839	int pip[2];
840	int error;
841	register_t reg_rdx;
842
843#ifdef DEBUG
844	if (ldebug(pipe))
845		printf(ARGS(pipe, "*"));
846#endif
847
848	reg_rdx = td->td_retval[1];
849	error = pipe(td, 0);
850	if (error) {
851		td->td_retval[1] = reg_rdx;
852		return (error);
853	}
854
855	pip[0] = td->td_retval[0];
856	pip[1] = td->td_retval[1];
857	error = copyout(pip, args->pipefds, 2 * sizeof(int));
858	if (error) {
859		td->td_retval[1] = reg_rdx;
860		return (error);
861	}
862
863	td->td_retval[1] = reg_rdx;
864	td->td_retval[0] = 0;
865	return (0);
866}
867
868int
869linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
870{
871	l_osigaction_t osa;
872	l_sigaction_t act, oact;
873	int error;
874
875#ifdef DEBUG
876	if (ldebug(sigaction))
877		printf(ARGS(sigaction, "%d, %p, %p"),
878		    args->sig, (void *)args->nsa, (void *)args->osa);
879#endif
880
881	if (args->nsa != NULL) {
882		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
883		if (error)
884			return (error);
885		act.lsa_handler = osa.lsa_handler;
886		act.lsa_flags = osa.lsa_flags;
887		act.lsa_restorer = osa.lsa_restorer;
888		LINUX_SIGEMPTYSET(act.lsa_mask);
889		act.lsa_mask.__bits[0] = osa.lsa_mask;
890	}
891
892	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
893	    args->osa ? &oact : NULL);
894
895	if (args->osa != NULL && !error) {
896		osa.lsa_handler = oact.lsa_handler;
897		osa.lsa_flags = oact.lsa_flags;
898		osa.lsa_restorer = oact.lsa_restorer;
899		osa.lsa_mask = oact.lsa_mask.__bits[0];
900		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
901	}
902
903	return (error);
904}
905
906/*
907 * Linux has two extra args, restart and oldmask.  We dont use these,
908 * but it seems that "restart" is actually a context pointer that
909 * enables the signal to happen with a different register set.
910 */
911int
912linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
913{
914	sigset_t sigmask;
915	l_sigset_t mask;
916
917#ifdef DEBUG
918	if (ldebug(sigsuspend))
919		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
920#endif
921
922	LINUX_SIGEMPTYSET(mask);
923	mask.__bits[0] = args->mask;
924	linux_to_bsd_sigset(&mask, &sigmask);
925	return (kern_sigsuspend(td, sigmask));
926}
927
928int
929linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
930{
931	l_sigset_t lmask;
932	sigset_t sigmask;
933	int error;
934
935#ifdef DEBUG
936	if (ldebug(rt_sigsuspend))
937		printf(ARGS(rt_sigsuspend, "%p, %d"),
938		    (void *)uap->newset, uap->sigsetsize);
939#endif
940
941	if (uap->sigsetsize != sizeof(l_sigset_t))
942		return (EINVAL);
943
944	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
945	if (error)
946		return (error);
947
948	linux_to_bsd_sigset(&lmask, &sigmask);
949	return (kern_sigsuspend(td, sigmask));
950}
951
952int
953linux_pause(struct thread *td, struct linux_pause_args *args)
954{
955	struct proc *p = td->td_proc;
956	sigset_t sigmask;
957
958#ifdef DEBUG
959	if (ldebug(pause))
960		printf(ARGS(pause, ""));
961#endif
962
963	PROC_LOCK(p);
964	sigmask = td->td_sigmask;
965	PROC_UNLOCK(p);
966	return (kern_sigsuspend(td, sigmask));
967}
968
969int
970linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
971{
972	stack_t ss, oss;
973	l_stack_t lss;
974	int error;
975
976#ifdef DEBUG
977	if (ldebug(sigaltstack))
978		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
979#endif
980
981	if (uap->uss != NULL) {
982		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
983		if (error)
984			return (error);
985
986		ss.ss_sp = PTRIN(lss.ss_sp);
987		ss.ss_size = lss.ss_size;
988		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
989	}
990	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
991	    (uap->uoss != NULL) ? &oss : NULL);
992	if (!error && uap->uoss != NULL) {
993		lss.ss_sp = PTROUT(oss.ss_sp);
994		lss.ss_size = oss.ss_size;
995		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
996		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
997	}
998
999	return (error);
1000}
1001
1002int
1003linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1004{
1005	struct ftruncate_args sa;
1006
1007#ifdef DEBUG
1008	if (ldebug(ftruncate64))
1009		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1010		    (intmax_t)args->length);
1011#endif
1012
1013	sa.fd = args->fd;
1014	sa.pad = 0;
1015	sa.length = args->length;
1016	return ftruncate(td, &sa);
1017}
1018
1019int
1020linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1021{
1022	struct timeval atv;
1023	l_timeval atv32;
1024	struct timezone rtz;
1025	int error = 0;
1026
1027	if (uap->tp) {
1028		microtime(&atv);
1029		atv32.tv_sec = atv.tv_sec;
1030		atv32.tv_usec = atv.tv_usec;
1031		error = copyout(&atv32, uap->tp, sizeof (atv32));
1032	}
1033	if (error == 0 && uap->tzp != NULL) {
1034		rtz.tz_minuteswest = tz_minuteswest;
1035		rtz.tz_dsttime = tz_dsttime;
1036		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1037	}
1038	return (error);
1039}
1040
1041int
1042linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap)
1043{
1044	struct timespec rqt, rmt;
1045	struct l_timespec ats32;
1046	int error;
1047
1048	error = copyin(uap->rqtp, &ats32, sizeof(ats32));
1049	if (error != 0)
1050		return (error);
1051	rqt.tv_sec = ats32.tv_sec;
1052	rqt.tv_nsec = ats32.tv_nsec;
1053	error = kern_nanosleep(td, &rqt, &rmt);
1054	if (uap->rmtp != NULL) {
1055		ats32.tv_sec = rmt.tv_sec;
1056		ats32.tv_nsec = rmt.tv_nsec;
1057		error = copyout(&ats32, uap->rmtp, sizeof(ats32));
1058	}
1059	return (error);
1060}
1061
1062int
1063linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1064{
1065	struct l_rusage s32;
1066	struct rusage s;
1067	int error;
1068
1069	error = kern_getrusage(td, uap->who, &s);
1070	if (error != 0)
1071		return (error);
1072	if (uap->rusage != NULL) {
1073		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1074		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1075		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1076		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1077		s32.ru_maxrss = s.ru_maxrss;
1078		s32.ru_ixrss = s.ru_ixrss;
1079		s32.ru_idrss = s.ru_idrss;
1080		s32.ru_isrss = s.ru_isrss;
1081		s32.ru_minflt = s.ru_minflt;
1082		s32.ru_majflt = s.ru_majflt;
1083		s32.ru_nswap = s.ru_nswap;
1084		s32.ru_inblock = s.ru_inblock;
1085		s32.ru_oublock = s.ru_oublock;
1086		s32.ru_msgsnd = s.ru_msgsnd;
1087		s32.ru_msgrcv = s.ru_msgrcv;
1088		s32.ru_nsignals = s.ru_nsignals;
1089		s32.ru_nvcsw = s.ru_nvcsw;
1090		s32.ru_nivcsw = s.ru_nivcsw;
1091		error = copyout(&s32, uap->rusage, sizeof(s32));
1092	}
1093	return (error);
1094}
1095
1096int
1097linux_sched_rr_get_interval(struct thread *td,
1098    struct linux_sched_rr_get_interval_args *uap)
1099{
1100	struct timespec ts;
1101	struct l_timespec ts32;
1102	int error;
1103
1104	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1105	if (error != 0)
1106		return (error);
1107	ts32.tv_sec = ts.tv_sec;
1108	ts32.tv_nsec = ts.tv_nsec;
1109	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1110}
1111
1112int
1113linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1114{
1115	struct mprotect_args bsd_args;
1116
1117	bsd_args.addr = uap->addr;
1118	bsd_args.len = uap->len;
1119	bsd_args.prot = uap->prot;
1120	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1121	if ((bsd_args.prot & PROT_READ) != 0)
1122		bsd_args.prot |= PROT_EXEC;
1123	return (mprotect(td, &bsd_args));
1124}
1125