linux32_machdep.c revision 166150
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 166150 2007-01-20 14:58:59Z netchild $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/file.h>
38#include <sys/fcntl.h>
39#include <sys/clock.h>
40#include <sys/imgact.h>
41#include <sys/limits.h>
42#include <sys/lock.h>
43#include <sys/malloc.h>
44#include <sys/mman.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/resource.h>
48#include <sys/resourcevar.h>
49#include <sys/syscallsubr.h>
50#include <sys/sysproto.h>
51#include <sys/unistd.h>
52
53#include <machine/frame.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_extern.h>
58#include <vm/vm_kern.h>
59#include <vm/vm_map.h>
60
61#include <amd64/linux32/linux.h>
62#include <amd64/linux32/linux32_proto.h>
63#include <compat/linux/linux_ipc.h>
64#include <compat/linux/linux_signal.h>
65#include <compat/linux/linux_util.h>
66#include <compat/linux/linux_emul.h>
67
/*
 * Argument block read by linux_old_select(): the caller passes a single
 * user pointer to one of these, it is copied in whole, and the members
 * are forwarded to linux_select().  The pointer-valued members are
 * carried as l_uintptr_t and widened with PTRIN() before use.
 */
struct l_old_select_argv {
	l_int		nfds;		/* forwarded as the nfds argument */
	l_uintptr_t	readfds;	/* user address, converted via PTRIN() */
	l_uintptr_t	writefds;	/* user address, converted via PTRIN() */
	l_uintptr_t	exceptfds;	/* user address, converted via PTRIN() */
	l_uintptr_t	timeout;	/* user address, converted via PTRIN() */
} __packed;
75
76int
77linux_to_bsd_sigaltstack(int lsa)
78{
79	int bsa = 0;
80
81	if (lsa & LINUX_SS_DISABLE)
82		bsa |= SS_DISABLE;
83	if (lsa & LINUX_SS_ONSTACK)
84		bsa |= SS_ONSTACK;
85	return (bsa);
86}
87
88int
89bsd_to_linux_sigaltstack(int bsa)
90{
91	int lsa = 0;
92
93	if (bsa & SS_DISABLE)
94		lsa |= LINUX_SS_DISABLE;
95	if (bsa & SS_ONSTACK)
96		lsa |= LINUX_SS_ONSTACK;
97	return (lsa);
98}
99
100/*
101 * Custom version of exec_copyin_args() so that we can translate
102 * the pointers.
103 */
104static int
105linux_exec_copyin_args(struct image_args *args, char *fname,
106    enum uio_seg segflg, char **argv, char **envv)
107{
108	char *argp, *envp;
109	u_int32_t *p32, arg;
110	size_t length;
111	int error;
112
113	bzero(args, sizeof(*args));
114	if (argv == NULL)
115		return (EFAULT);
116
117	/*
118	 * Allocate temporary demand zeroed space for argument and
119	 *	environment strings
120	 */
121	args->buf = (char *) kmem_alloc_wait(exec_map,
122	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
123	if (args->buf == NULL)
124		return (ENOMEM);
125	args->begin_argv = args->buf;
126	args->endp = args->begin_argv;
127	args->stringspace = ARG_MAX;
128
129	args->fname = args->buf + ARG_MAX;
130
131	/*
132	 * Copy the file name.
133	 */
134	error = (segflg == UIO_SYSSPACE) ?
135	    copystr(fname, args->fname, PATH_MAX, &length) :
136	    copyinstr(fname, args->fname, PATH_MAX, &length);
137	if (error != 0)
138		goto err_exit;
139
140	/*
141	 * extract arguments first
142	 */
143	p32 = (u_int32_t *)argv;
144	for (;;) {
145		error = copyin(p32++, &arg, sizeof(arg));
146		if (error)
147			goto err_exit;
148		if (arg == 0)
149			break;
150		argp = PTRIN(arg);
151		error = copyinstr(argp, args->endp, args->stringspace, &length);
152		if (error) {
153			if (error == ENAMETOOLONG)
154				error = E2BIG;
155
156			goto err_exit;
157		}
158		args->stringspace -= length;
159		args->endp += length;
160		args->argc++;
161	}
162
163	args->begin_envv = args->endp;
164
165	/*
166	 * extract environment strings
167	 */
168	if (envv) {
169		p32 = (u_int32_t *)envv;
170		for (;;) {
171			error = copyin(p32++, &arg, sizeof(arg));
172			if (error)
173				goto err_exit;
174			if (arg == 0)
175				break;
176			envp = PTRIN(arg);
177			error = copyinstr(envp, args->endp, args->stringspace,
178			    &length);
179			if (error) {
180				if (error == ENAMETOOLONG)
181					error = E2BIG;
182				goto err_exit;
183			}
184			args->stringspace -= length;
185			args->endp += length;
186			args->envc++;
187		}
188	}
189
190	return (0);
191
192err_exit:
193	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
194	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
195	args->buf = NULL;
196	return (error);
197}
198
199int
200linux_execve(struct thread *td, struct linux_execve_args *args)
201{
202	struct image_args eargs;
203	char *path;
204	int error;
205
206	LCONVPATHEXIST(td, args->path, &path);
207
208#ifdef DEBUG
209	if (ldebug(execve))
210		printf(ARGS(execve, "%s"), path);
211#endif
212
213	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
214	    args->envp);
215	free(path, M_TEMP);
216	if (error == 0)
217		error = kern_execve(td, &eargs, NULL);
218	if (error == 0)
219	   	/* linux process can exec fbsd one, dont attempt
220		 * to create emuldata for such process using
221		 * linux_proc_init, this leads to a panic on KASSERT
222		 * because such process has p->p_emuldata == NULL
223		 */
224	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
225   		   	error = linux_proc_init(td, 0, 0);
226	return (error);
227}
228
/*
 * 32-bit layout of an I/O vector entry as read from user space by
 * linux32_copyinuio(), which converts each entry into a native
 * struct iovec.
 */
struct iovec32 {
	u_int32_t iov_base;	/* 32-bit user address, widened via PTRIN() */
	int	iov_len;	/* length, assigned to the native iov_len */
};

/* The copyin in linux32_copyinuio() relies on this exact 8-byte size. */
CTASSERT(sizeof(struct iovec32) == 8);
235
236static int
237linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
238{
239	struct iovec32 iov32;
240	struct iovec *iov;
241	struct uio *uio;
242	u_int iovlen;
243	int error, i;
244
245	*uiop = NULL;
246	if (iovcnt > UIO_MAXIOV)
247		return (EINVAL);
248	iovlen = iovcnt * sizeof(struct iovec);
249	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
250	iov = (struct iovec *)(uio + 1);
251	for (i = 0; i < iovcnt; i++) {
252		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
253		if (error) {
254			free(uio, M_IOV);
255			return (error);
256		}
257		iov[i].iov_base = PTRIN(iov32.iov_base);
258		iov[i].iov_len = iov32.iov_len;
259	}
260	uio->uio_iov = iov;
261	uio->uio_iovcnt = iovcnt;
262	uio->uio_segflg = UIO_USERSPACE;
263	uio->uio_offset = -1;
264	uio->uio_resid = 0;
265	for (i = 0; i < iovcnt; i++) {
266		if (iov->iov_len > INT_MAX - uio->uio_resid) {
267			free(uio, M_IOV);
268			return (EINVAL);
269		}
270		uio->uio_resid += iov->iov_len;
271		iov++;
272	}
273	*uiop = uio;
274	return (0);
275}
276
277int
278linux_readv(struct thread *td, struct linux_readv_args *uap)
279{
280	struct uio *auio;
281	int error;
282
283	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
284	if (error)
285		return (error);
286	error = kern_readv(td, uap->fd, auio);
287	free(auio, M_IOV);
288	return (error);
289}
290
291int
292linux_writev(struct thread *td, struct linux_writev_args *uap)
293{
294	struct uio *auio;
295	int error;
296
297	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
298	if (error)
299		return (error);
300	error = kern_writev(td, uap->fd, auio);
301	free(auio, M_IOV);
302	return (error);
303}
304
/*
 * Indirect argument block used by the LINUX_MSGRCV case of linux_ipc()
 * when the upper 16 bits of 'what' are zero: the message buffer address
 * and message type are read from this structure instead of being taken
 * from the syscall arguments directly.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;	/* user address of the message buffer */
	l_long msgtyp;		/* requested message type */
} __packed;
309
310int
311linux_ipc(struct thread *td, struct linux_ipc_args *args)
312{
313
314	switch (args->what & 0xFFFF) {
315	case LINUX_SEMOP: {
316		struct linux_semop_args a;
317
318		a.semid = args->arg1;
319		a.tsops = args->ptr;
320		a.nsops = args->arg2;
321		return (linux_semop(td, &a));
322	}
323	case LINUX_SEMGET: {
324		struct linux_semget_args a;
325
326		a.key = args->arg1;
327		a.nsems = args->arg2;
328		a.semflg = args->arg3;
329		return (linux_semget(td, &a));
330	}
331	case LINUX_SEMCTL: {
332		struct linux_semctl_args a;
333		int error;
334
335		a.semid = args->arg1;
336		a.semnum = args->arg2;
337		a.cmd = args->arg3;
338		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
339		if (error)
340			return (error);
341		return (linux_semctl(td, &a));
342	}
343	case LINUX_MSGSND: {
344		struct linux_msgsnd_args a;
345
346		a.msqid = args->arg1;
347		a.msgp = args->ptr;
348		a.msgsz = args->arg2;
349		a.msgflg = args->arg3;
350		return (linux_msgsnd(td, &a));
351	}
352	case LINUX_MSGRCV: {
353		struct linux_msgrcv_args a;
354
355		a.msqid = args->arg1;
356		a.msgsz = args->arg2;
357		a.msgflg = args->arg3;
358		if ((args->what >> 16) == 0) {
359			struct l_ipc_kludge tmp;
360			int error;
361
362			if (args->ptr == 0)
363				return (EINVAL);
364			error = copyin(args->ptr, &tmp, sizeof(tmp));
365			if (error)
366				return (error);
367			a.msgp = PTRIN(tmp.msgp);
368			a.msgtyp = tmp.msgtyp;
369		} else {
370			a.msgp = args->ptr;
371			a.msgtyp = args->arg5;
372		}
373		return (linux_msgrcv(td, &a));
374	}
375	case LINUX_MSGGET: {
376		struct linux_msgget_args a;
377
378		a.key = args->arg1;
379		a.msgflg = args->arg2;
380		return (linux_msgget(td, &a));
381	}
382	case LINUX_MSGCTL: {
383		struct linux_msgctl_args a;
384
385		a.msqid = args->arg1;
386		a.cmd = args->arg2;
387		a.buf = args->ptr;
388		return (linux_msgctl(td, &a));
389	}
390	case LINUX_SHMAT: {
391		struct linux_shmat_args a;
392
393		a.shmid = args->arg1;
394		a.shmaddr = args->ptr;
395		a.shmflg = args->arg2;
396		a.raddr = PTRIN((l_uint)args->arg3);
397		return (linux_shmat(td, &a));
398	}
399	case LINUX_SHMDT: {
400		struct linux_shmdt_args a;
401
402		a.shmaddr = args->ptr;
403		return (linux_shmdt(td, &a));
404	}
405	case LINUX_SHMGET: {
406		struct linux_shmget_args a;
407
408		a.key = args->arg1;
409		a.size = args->arg2;
410		a.shmflg = args->arg3;
411		return (linux_shmget(td, &a));
412	}
413	case LINUX_SHMCTL: {
414		struct linux_shmctl_args a;
415
416		a.shmid = args->arg1;
417		a.cmd = args->arg2;
418		a.buf = args->ptr;
419		return (linux_shmctl(td, &a));
420	}
421	default:
422		break;
423	}
424
425	return (EINVAL);
426}
427
428int
429linux_old_select(struct thread *td, struct linux_old_select_args *args)
430{
431	struct l_old_select_argv linux_args;
432	struct linux_select_args newsel;
433	int error;
434
435#ifdef DEBUG
436	if (ldebug(old_select))
437		printf(ARGS(old_select, "%p"), args->ptr);
438#endif
439
440	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
441	if (error)
442		return (error);
443
444	newsel.nfds = linux_args.nfds;
445	newsel.readfds = PTRIN(linux_args.readfds);
446	newsel.writefds = PTRIN(linux_args.writefds);
447	newsel.exceptfds = PTRIN(linux_args.exceptfds);
448	newsel.timeout = PTRIN(linux_args.timeout);
449	return (linux_select(td, &newsel));
450}
451
452int
453linux_fork(struct thread *td, struct linux_fork_args *args)
454{
455	int error;
456	struct proc *p2;
457	struct thread *td2;
458
459#ifdef DEBUG
460	if (ldebug(fork))
461		printf(ARGS(fork, ""));
462#endif
463
464	if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0)
465		return (error);
466
467	if (error == 0) {
468		td->td_retval[0] = p2->p_pid;
469		td->td_retval[1] = 0;
470	}
471
472	if (td->td_retval[1] == 1)
473		td->td_retval[0] = 0;
474	error = linux_proc_init(td, td->td_retval[0], 0);
475	if (error)
476		return (error);
477
478	td2 = FIRST_THREAD_IN_PROC(p2);
479
480	/* make it run */
481	mtx_lock_spin(&sched_lock);
482	TD_SET_CAN_RUN(td2);
483	setrunqueue(td2, SRQ_BORING);
484	mtx_unlock_spin(&sched_lock);
485
486	return (0);
487}
488
489int
490linux_vfork(struct thread *td, struct linux_vfork_args *args)
491{
492	int error;
493	struct proc *p2;
494	struct thread *td2;
495
496#ifdef DEBUG
497	if (ldebug(vfork))
498		printf(ARGS(vfork, ""));
499#endif
500
501	/* exclude RFPPWAIT */
502	if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0)
503		return (error);
504	if (error == 0) {
505	   	td->td_retval[0] = p2->p_pid;
506		td->td_retval[1] = 0;
507	}
508	/* Are we the child? */
509	if (td->td_retval[1] == 1)
510		td->td_retval[0] = 0;
511	error = linux_proc_init(td, td->td_retval[0], 0);
512	if (error)
513		return (error);
514
515	PROC_LOCK(p2);
516	p2->p_flag |= P_PPWAIT;
517	PROC_UNLOCK(p2);
518
519	td2 = FIRST_THREAD_IN_PROC(p2);
520
521	/* make it run */
522	mtx_lock_spin(&sched_lock);
523	TD_SET_CAN_RUN(td2);
524	setrunqueue(td2, SRQ_BORING);
525	mtx_unlock_spin(&sched_lock);
526
527	/* wait for the children to exit, ie. emulate vfork */
528	PROC_LOCK(p2);
529	while (p2->p_flag & P_PPWAIT)
530	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
531	PROC_UNLOCK(p2);
532
533	return (0);
534}
535
536int
537linux_clone(struct thread *td, struct linux_clone_args *args)
538{
539	int error, ff = RFPROC | RFSTOPPED;
540	struct proc *p2;
541	struct thread *td2;
542	int exit_signal;
543	struct linux_emuldata *em;
544
545#ifdef DEBUG
546	if (ldebug(clone)) {
547   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
548		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
549		    (unsigned int)(uintptr_t)args->parent_tidptr,
550		    (unsigned int)(uintptr_t)args->child_tidptr);
551	}
552#endif
553
554	exit_signal = args->flags & 0x000000ff;
555	if (exit_signal >= LINUX_NSIG)
556		return (EINVAL);
557
558	if (exit_signal <= LINUX_SIGTBLSZ)
559		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
560
561	if (args->flags & CLONE_VM)
562		ff |= RFMEM;
563	if (args->flags & CLONE_SIGHAND)
564		ff |= RFSIGSHARE;
565	/*
566	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
567	 * and open files is independant. in fbsd its in one
568	 * structure but in reality it doesnt make any problems
569	 * because both this flags are set at once usually.
570	 */
571	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
572		ff |= RFFDG;
573
574	/*
575	 * Attempt to detect when linux_clone(2) is used for creating
576	 * kernel threads. Unfortunately despite the existence of the
577	 * CLONE_THREAD flag, version of linuxthreads package used in
578	 * most popular distros as of beginning of 2005 doesn't make
579	 * any use of it. Therefore, this detection relay fully on
580	 * empirical observation that linuxthreads sets certain
581	 * combination of flags, so that we can make more or less
582	 * precise detection and notify the FreeBSD kernel that several
583	 * processes are in fact part of the same threading group, so
584	 * that special treatment is necessary for signal delivery
585	 * between those processes and fd locking.
586	 */
587	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
588		ff |= RFTHREAD;
589
590	error = fork1(td, ff, 0, &p2);
591	if (error)
592		return (error);
593
594	/* create the emuldata */
595	error = linux_proc_init(td, p2->p_pid, args->flags);
596	/* reference it - no need to check this */
597	em = em_find(p2, EMUL_DOLOCK);
598	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
599	/* and adjust it */
600	if (args->flags & CLONE_PARENT_SETTID) {
601	   	if (args->parent_tidptr == NULL) {
602		   	EMUL_UNLOCK(&emul_lock);
603			return (EINVAL);
604		}
605		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
606		if (error) {
607		   	EMUL_UNLOCK(&emul_lock);
608			return (error);
609		}
610	}
611
612	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
613	   	sx_xlock(&proctree_lock);
614		PROC_LOCK(p2);
615		proc_reparent(p2, td->td_proc->p_pptr);
616		PROC_UNLOCK(p2);
617		sx_xunlock(&proctree_lock);
618	}
619
620	if (args->flags & CLONE_THREAD) {
621	   	/* XXX: linux mangles pgrp and pptr somehow
622		 * I think it might be this but I am not sure.
623		 */
624#ifdef notyet
625	   	PROC_LOCK(p2);
626	   	p2->p_pgrp = td->td_proc->p_pgrp;
627	   	PROC_UNLOCK(p2);
628#endif
629	 	exit_signal = 0;
630	}
631
632	if (args->flags & CLONE_CHILD_SETTID)
633		em->child_set_tid = args->child_tidptr;
634	else
635	   	em->child_set_tid = NULL;
636
637	if (args->flags & CLONE_CHILD_CLEARTID)
638		em->child_clear_tid = args->child_tidptr;
639	else
640	   	em->child_clear_tid = NULL;
641
642	EMUL_UNLOCK(&emul_lock);
643
644	PROC_LOCK(p2);
645	p2->p_sigparent = exit_signal;
646	PROC_UNLOCK(p2);
647	td2 = FIRST_THREAD_IN_PROC(p2);
648	/*
649	 * in a case of stack = NULL we are supposed to COW calling process stack
650	 * this is what normal fork() does so we just keep the tf_rsp arg intact
651	 */
652	if (args->stack)
653   	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
654
655	if (args->flags & CLONE_SETTLS) {
656	   	/* XXX: todo */
657	}
658
659#ifdef DEBUG
660	if (ldebug(clone))
661		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
662		    (long)p2->p_pid, args->stack, exit_signal);
663#endif
664	if (args->flags & CLONE_VFORK) {
665	   	PROC_LOCK(p2);
666	   	p2->p_flag |= P_PPWAIT;
667	   	PROC_UNLOCK(p2);
668	}
669
670	/*
671	 * Make this runnable after we are finished with it.
672	 */
673	mtx_lock_spin(&sched_lock);
674	TD_SET_CAN_RUN(td2);
675	setrunqueue(td2, SRQ_BORING);
676	mtx_unlock_spin(&sched_lock);
677
678	td->td_retval[0] = p2->p_pid;
679	td->td_retval[1] = 0;
680
681	if (args->flags & CLONE_VFORK) {
682   	   	/* wait for the children to exit, ie. emulate vfork */
683   	   	PROC_LOCK(p2);
684		while (p2->p_flag & P_PPWAIT)
685   		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
686		PROC_UNLOCK(p2);
687	}
688
689	return (0);
690}
691
/* XXX move */
/*
 * Argument block shared by linux_mmap() and linux_mmap2().  linux_mmap()
 * copies one in from user space; linux_mmap2() fills one from its
 * syscall arguments.  Both pass it to linux_mmap_common().
 */
struct l_mmap_argv {
	l_ulong		addr;	/* requested address, converted via PTRIN() */
	l_ulong		len;	/* length of the mapping */
	l_ulong		prot;	/* protection bits */
	l_ulong		flags;	/* LINUX_MAP_* flags */
	l_ulong		fd;	/* descriptor; unused for LINUX_MAP_ANON */
	l_ulong		pgoff;	/* offset in pages (multiplied by PAGE_SIZE) */
};

/*
 * For LINUX_MAP_GROWSDOWN mappings linux_mmap_common() uses
 * (STACK_SIZE - GUARD_SIZE) as the smallest region length it requests.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Forward declaration; the definition follows linux_mmap(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
706
707int
708linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
709{
710	struct l_mmap_argv linux_args;
711
712#ifdef DEBUG
713	if (ldebug(mmap2))
714		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
715		    (void *)(intptr_t)args->addr, args->len, args->prot,
716		    args->flags, args->fd, args->pgoff);
717#endif
718
719	linux_args.addr = PTROUT(args->addr);
720	linux_args.len = args->len;
721	linux_args.prot = args->prot;
722	linux_args.flags = args->flags;
723	linux_args.fd = args->fd;
724	linux_args.pgoff = args->pgoff;
725
726	return (linux_mmap_common(td, &linux_args));
727}
728
729int
730linux_mmap(struct thread *td, struct linux_mmap_args *args)
731{
732	int error;
733	struct l_mmap_argv linux_args;
734
735	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
736	if (error)
737		return (error);
738
739#ifdef DEBUG
740	if (ldebug(mmap))
741		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
742		    (void *)(intptr_t)linux_args.addr, linux_args.len,
743		    linux_args.prot, linux_args.flags, linux_args.fd,
744		    linux_args.pgoff);
745#endif
746	if ((linux_args.pgoff % PAGE_SIZE) != 0)
747		return (EINVAL);
748	linux_args.pgoff /= PAGE_SIZE;
749
750	return (linux_mmap_common(td, &linux_args));
751}
752
753static int
754linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
755{
756	struct proc *p = td->td_proc;
757	struct mmap_args /* {
758		caddr_t addr;
759		size_t len;
760		int prot;
761		int flags;
762		int fd;
763		long pad;
764		off_t pos;
765	} */ bsd_args;
766	int error;
767	struct file *fp;
768
769	error = 0;
770	bsd_args.flags = 0;
771	fp = NULL;
772
773	/*
774	 * Linux mmap(2):
775	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
776	 */
777	if (! ((linux_args->flags & LINUX_MAP_SHARED) ^
778	    (linux_args->flags & LINUX_MAP_PRIVATE)))
779		return (EINVAL);
780
781	if (linux_args->flags & LINUX_MAP_SHARED)
782		bsd_args.flags |= MAP_SHARED;
783	if (linux_args->flags & LINUX_MAP_PRIVATE)
784		bsd_args.flags |= MAP_PRIVATE;
785	if (linux_args->flags & LINUX_MAP_FIXED)
786		bsd_args.flags |= MAP_FIXED;
787	if (linux_args->flags & LINUX_MAP_ANON)
788		bsd_args.flags |= MAP_ANON;
789	else
790		bsd_args.flags |= MAP_NOSYNC;
791	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
792		bsd_args.flags |= MAP_STACK;
793
794		/*
795		 * The linux MAP_GROWSDOWN option does not limit auto
796		 * growth of the region.  Linux mmap with this option
797		 * takes as addr the inital BOS, and as len, the initial
798		 * region size.  It can then grow down from addr without
799		 * limit.  However, linux threads has an implicit internal
800		 * limit to stack size of STACK_SIZE.  Its just not
801		 * enforced explicitly in linux.  But, here we impose
802		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
803		 * region, since we can do this with our mmap.
804		 *
805		 * Our mmap with MAP_STACK takes addr as the maximum
806		 * downsize limit on BOS, and as len the max size of
807		 * the region.  It them maps the top SGROWSIZ bytes,
808		 * and autgrows the region down, up to the limit
809		 * in addr.
810		 *
811		 * If we don't use the MAP_STACK option, the effect
812		 * of this code is to allocate a stack region of a
813		 * fixed size of (STACK_SIZE - GUARD_SIZE).
814		 */
815
816		/* This gives us TOS */
817		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
818		    linux_args->len;
819
820		if ((caddr_t)PTRIN(bsd_args.addr) >
821		    p->p_vmspace->vm_maxsaddr) {
822			/*
823			 * Some linux apps will attempt to mmap
824			 * thread stacks near the top of their
825			 * address space.  If their TOS is greater
826			 * than vm_maxsaddr, vm_map_growstack()
827			 * will confuse the thread stack with the
828			 * process stack and deliver a SEGV if they
829			 * attempt to grow the thread stack past their
830			 * current stacksize rlimit.  To avoid this,
831			 * adjust vm_maxsaddr upwards to reflect
832			 * the current stacksize rlimit rather
833			 * than the maximum possible stacksize.
834			 * It would be better to adjust the
835			 * mmap'ed region, but some apps do not check
836			 * mmap's return value.
837			 */
838			PROC_LOCK(p);
839			p->p_vmspace->vm_maxsaddr =
840			    (char *)LINUX32_USRSTACK -
841			    lim_cur(p, RLIMIT_STACK);
842			PROC_UNLOCK(p);
843		}
844
845		/* This gives us our maximum stack size */
846		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
847			bsd_args.len = linux_args->len;
848		else
849			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
850
851		/*
852		 * This gives us a new BOS.  If we're using VM_STACK, then
853		 * mmap will just map the top SGROWSIZ bytes, and let
854		 * the stack grow down to the limit at BOS.  If we're
855		 * not using VM_STACK we map the full stack, since we
856		 * don't have a way to autogrow it.
857		 */
858		bsd_args.addr -= bsd_args.len;
859	} else {
860		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
861		bsd_args.len  = linux_args->len;
862	}
863
864	/*
865	 * We add PROT_EXEC to work around buggy applications (e.g. Java)
866	 * that take advantage of the fact that execute permissions are not
867	 * enforced by x86 CPUs.
868	 */
869	bsd_args.prot = linux_args->prot | PROT_EXEC;
870	if (linux_args->flags & LINUX_MAP_ANON)
871		bsd_args.fd = -1;
872	else {
873		/*
874		 * Linux follows Solaris mmap(2) description:
875		 * The file descriptor fildes is opened with
876		 * read permission, regardless of the
877		 * protection options specified.
878		 * If PROT_WRITE is specified, the application
879		 * must have opened the file descriptor
880		 * fildes with write permission unless
881		 * MAP_PRIVATE is specified in the flag
882		 * argument as described below.
883		 */
884
885		if ((error = fget(td, linux_args->fd, &fp)) != 0)
886			return (error);
887		if (fp->f_type != DTYPE_VNODE) {
888			fdrop(fp, td);
889			return (EINVAL);
890		}
891
892		/* Linux mmap() just fails for O_WRONLY files */
893		if (! (fp->f_flag & FREAD)) {
894			fdrop(fp, td);
895			return (EACCES);
896		}
897
898		bsd_args.fd = linux_args->fd;
899		fdrop(fp, td);
900	}
901	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
902	bsd_args.pad = 0;
903
904#ifdef DEBUG
905	if (ldebug(mmap))
906		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
907		    __func__,
908		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
909		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
910#endif
911	error = mmap(td, &bsd_args);
912#ifdef DEBUG
913	if (ldebug(mmap))
914		printf("-> %s() return: 0x%x (0x%08x)\n",
915			__func__, error, (u_int)td->td_retval[0]);
916#endif
917	return (error);
918}
919
920int
921linux_pipe(struct thread *td, struct linux_pipe_args *args)
922{
923	int pip[2];
924	int error;
925	register_t reg_rdx;
926
927#ifdef DEBUG
928	if (ldebug(pipe))
929		printf(ARGS(pipe, "*"));
930#endif
931
932	reg_rdx = td->td_retval[1];
933	error = pipe(td, 0);
934	if (error) {
935		td->td_retval[1] = reg_rdx;
936		return (error);
937	}
938
939	pip[0] = td->td_retval[0];
940	pip[1] = td->td_retval[1];
941	error = copyout(pip, args->pipefds, 2 * sizeof(int));
942	if (error) {
943		td->td_retval[1] = reg_rdx;
944		return (error);
945	}
946
947	td->td_retval[1] = reg_rdx;
948	td->td_retval[0] = 0;
949	return (0);
950}
951
952int
953linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
954{
955	l_osigaction_t osa;
956	l_sigaction_t act, oact;
957	int error;
958
959#ifdef DEBUG
960	if (ldebug(sigaction))
961		printf(ARGS(sigaction, "%d, %p, %p"),
962		    args->sig, (void *)args->nsa, (void *)args->osa);
963#endif
964
965	if (args->nsa != NULL) {
966		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
967		if (error)
968			return (error);
969		act.lsa_handler = osa.lsa_handler;
970		act.lsa_flags = osa.lsa_flags;
971		act.lsa_restorer = osa.lsa_restorer;
972		LINUX_SIGEMPTYSET(act.lsa_mask);
973		act.lsa_mask.__bits[0] = osa.lsa_mask;
974	}
975
976	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
977	    args->osa ? &oact : NULL);
978
979	if (args->osa != NULL && !error) {
980		osa.lsa_handler = oact.lsa_handler;
981		osa.lsa_flags = oact.lsa_flags;
982		osa.lsa_restorer = oact.lsa_restorer;
983		osa.lsa_mask = oact.lsa_mask.__bits[0];
984		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
985	}
986
987	return (error);
988}
989
990/*
991 * Linux has two extra args, restart and oldmask.  We dont use these,
992 * but it seems that "restart" is actually a context pointer that
993 * enables the signal to happen with a different register set.
994 */
995int
996linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
997{
998	sigset_t sigmask;
999	l_sigset_t mask;
1000
1001#ifdef DEBUG
1002	if (ldebug(sigsuspend))
1003		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
1004#endif
1005
1006	LINUX_SIGEMPTYSET(mask);
1007	mask.__bits[0] = args->mask;
1008	linux_to_bsd_sigset(&mask, &sigmask);
1009	return (kern_sigsuspend(td, sigmask));
1010}
1011
1012int
1013linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
1014{
1015	l_sigset_t lmask;
1016	sigset_t sigmask;
1017	int error;
1018
1019#ifdef DEBUG
1020	if (ldebug(rt_sigsuspend))
1021		printf(ARGS(rt_sigsuspend, "%p, %d"),
1022		    (void *)uap->newset, uap->sigsetsize);
1023#endif
1024
1025	if (uap->sigsetsize != sizeof(l_sigset_t))
1026		return (EINVAL);
1027
1028	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
1029	if (error)
1030		return (error);
1031
1032	linux_to_bsd_sigset(&lmask, &sigmask);
1033	return (kern_sigsuspend(td, sigmask));
1034}
1035
1036int
1037linux_pause(struct thread *td, struct linux_pause_args *args)
1038{
1039	struct proc *p = td->td_proc;
1040	sigset_t sigmask;
1041
1042#ifdef DEBUG
1043	if (ldebug(pause))
1044		printf(ARGS(pause, ""));
1045#endif
1046
1047	PROC_LOCK(p);
1048	sigmask = td->td_sigmask;
1049	PROC_UNLOCK(p);
1050	return (kern_sigsuspend(td, sigmask));
1051}
1052
1053int
1054linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
1055{
1056	stack_t ss, oss;
1057	l_stack_t lss;
1058	int error;
1059
1060#ifdef DEBUG
1061	if (ldebug(sigaltstack))
1062		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
1063#endif
1064
1065	if (uap->uss != NULL) {
1066		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1067		if (error)
1068			return (error);
1069
1070		ss.ss_sp = PTRIN(lss.ss_sp);
1071		ss.ss_size = lss.ss_size;
1072		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1073	}
1074	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1075	    (uap->uoss != NULL) ? &oss : NULL);
1076	if (!error && uap->uoss != NULL) {
1077		lss.ss_sp = PTROUT(oss.ss_sp);
1078		lss.ss_size = oss.ss_size;
1079		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1080		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1081	}
1082
1083	return (error);
1084}
1085
1086int
1087linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1088{
1089	struct ftruncate_args sa;
1090
1091#ifdef DEBUG
1092	if (ldebug(ftruncate64))
1093		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1094		    (intmax_t)args->length);
1095#endif
1096
1097	sa.fd = args->fd;
1098	sa.pad = 0;
1099	sa.length = args->length;
1100	return ftruncate(td, &sa);
1101}
1102
1103int
1104linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1105{
1106	struct timeval atv;
1107	l_timeval atv32;
1108	struct timezone rtz;
1109	int error = 0;
1110
1111	if (uap->tp) {
1112		microtime(&atv);
1113		atv32.tv_sec = atv.tv_sec;
1114		atv32.tv_usec = atv.tv_usec;
1115		error = copyout(&atv32, uap->tp, sizeof (atv32));
1116	}
1117	if (error == 0 && uap->tzp != NULL) {
1118		rtz.tz_minuteswest = tz_minuteswest;
1119		rtz.tz_dsttime = tz_dsttime;
1120		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1121	}
1122	return (error);
1123}
1124
1125int
1126linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1127{
1128	struct l_rusage s32;
1129	struct rusage s;
1130	int error;
1131
1132	error = kern_getrusage(td, uap->who, &s);
1133	if (error != 0)
1134		return (error);
1135	if (uap->rusage != NULL) {
1136		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1137		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1138		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1139		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1140		s32.ru_maxrss = s.ru_maxrss;
1141		s32.ru_ixrss = s.ru_ixrss;
1142		s32.ru_idrss = s.ru_idrss;
1143		s32.ru_isrss = s.ru_isrss;
1144		s32.ru_minflt = s.ru_minflt;
1145		s32.ru_majflt = s.ru_majflt;
1146		s32.ru_nswap = s.ru_nswap;
1147		s32.ru_inblock = s.ru_inblock;
1148		s32.ru_oublock = s.ru_oublock;
1149		s32.ru_msgsnd = s.ru_msgsnd;
1150		s32.ru_msgrcv = s.ru_msgrcv;
1151		s32.ru_nsignals = s.ru_nsignals;
1152		s32.ru_nvcsw = s.ru_nvcsw;
1153		s32.ru_nivcsw = s.ru_nivcsw;
1154		error = copyout(&s32, uap->rusage, sizeof(s32));
1155	}
1156	return (error);
1157}
1158
1159int
1160linux_sched_rr_get_interval(struct thread *td,
1161    struct linux_sched_rr_get_interval_args *uap)
1162{
1163	struct timespec ts;
1164	struct l_timespec ts32;
1165	int error;
1166
1167	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1168	if (error != 0)
1169		return (error);
1170	ts32.tv_sec = ts.tv_sec;
1171	ts32.tv_nsec = ts.tv_nsec;
1172	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1173}
1174
1175int
1176linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1177{
1178	struct mprotect_args bsd_args;
1179
1180	bsd_args.addr = uap->addr;
1181	bsd_args.len = uap->len;
1182	bsd_args.prot = uap->prot;
1183	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1184	if ((bsd_args.prot & PROT_READ) != 0)
1185		bsd_args.prot |= PROT_EXEC;
1186	return (mprotect(td, &bsd_args));
1187}
1188