linux32_machdep.c revision 165832
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 165832 2007-01-06 15:58:34Z netchild $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/file.h>
38#include <sys/fcntl.h>
39#include <sys/clock.h>
40#include <sys/imgact.h>
41#include <sys/limits.h>
42#include <sys/lock.h>
43#include <sys/malloc.h>
44#include <sys/mman.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/resource.h>
48#include <sys/resourcevar.h>
49#include <sys/syscallsubr.h>
50#include <sys/sysproto.h>
51#include <sys/unistd.h>
52
53#include <machine/frame.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_extern.h>
58#include <vm/vm_kern.h>
59#include <vm/vm_map.h>
60
61#include <amd64/linux32/linux.h>
62#include <amd64/linux32/linux32_proto.h>
63#include <compat/linux/linux_ipc.h>
64#include <compat/linux/linux_signal.h>
65#include <compat/linux/linux_util.h>
66#include <compat/linux/linux_emul.h>
67
/*
 * Argument block copied in from user space by linux_old_select().
 * The pointer members are stored as l_uintptr_t values and are
 * converted to kernel-visible user pointers with PTRIN() by the caller.
 */
struct l_old_select_argv {
	l_int		nfds;		/* becomes linux_select_args.nfds */
	l_uintptr_t	readfds;	/* user address of the read set */
	l_uintptr_t	writefds;	/* user address of the write set */
	l_uintptr_t	exceptfds;	/* user address of the except set */
	l_uintptr_t	timeout;	/* user address of the timeout */
} __packed;
75
76int
77linux_to_bsd_sigaltstack(int lsa)
78{
79	int bsa = 0;
80
81	if (lsa & LINUX_SS_DISABLE)
82		bsa |= SS_DISABLE;
83	if (lsa & LINUX_SS_ONSTACK)
84		bsa |= SS_ONSTACK;
85	return (bsa);
86}
87
88int
89bsd_to_linux_sigaltstack(int bsa)
90{
91	int lsa = 0;
92
93	if (bsa & SS_DISABLE)
94		lsa |= LINUX_SS_DISABLE;
95	if (bsa & SS_ONSTACK)
96		lsa |= LINUX_SS_ONSTACK;
97	return (lsa);
98}
99
100/*
101 * Custom version of exec_copyin_args() so that we can translate
102 * the pointers.
103 */
104static int
105linux_exec_copyin_args(struct image_args *args, char *fname,
106    enum uio_seg segflg, char **argv, char **envv)
107{
108	char *argp, *envp;
109	u_int32_t *p32, arg;
110	size_t length;
111	int error;
112
113	bzero(args, sizeof(*args));
114	if (argv == NULL)
115		return (EFAULT);
116
117	/*
118	 * Allocate temporary demand zeroed space for argument and
119	 *	environment strings
120	 */
121	args->buf = (char *) kmem_alloc_wait(exec_map,
122	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
123	if (args->buf == NULL)
124		return (ENOMEM);
125	args->begin_argv = args->buf;
126	args->endp = args->begin_argv;
127	args->stringspace = ARG_MAX;
128
129	args->fname = args->buf + ARG_MAX;
130
131	/*
132	 * Copy the file name.
133	 */
134	error = (segflg == UIO_SYSSPACE) ?
135	    copystr(fname, args->fname, PATH_MAX, &length) :
136	    copyinstr(fname, args->fname, PATH_MAX, &length);
137	if (error != 0)
138		goto err_exit;
139
140	/*
141	 * extract arguments first
142	 */
143	p32 = (u_int32_t *)argv;
144	for (;;) {
145		error = copyin(p32++, &arg, sizeof(arg));
146		if (error)
147			goto err_exit;
148		if (arg == 0)
149			break;
150		argp = PTRIN(arg);
151		error = copyinstr(argp, args->endp, args->stringspace, &length);
152		if (error) {
153			if (error == ENAMETOOLONG)
154				error = E2BIG;
155
156			goto err_exit;
157		}
158		args->stringspace -= length;
159		args->endp += length;
160		args->argc++;
161	}
162
163	args->begin_envv = args->endp;
164
165	/*
166	 * extract environment strings
167	 */
168	if (envv) {
169		p32 = (u_int32_t *)envv;
170		for (;;) {
171			error = copyin(p32++, &arg, sizeof(arg));
172			if (error)
173				goto err_exit;
174			if (arg == 0)
175				break;
176			envp = PTRIN(arg);
177			error = copyinstr(envp, args->endp, args->stringspace,
178			    &length);
179			if (error) {
180				if (error == ENAMETOOLONG)
181					error = E2BIG;
182				goto err_exit;
183			}
184			args->stringspace -= length;
185			args->endp += length;
186			args->envc++;
187		}
188	}
189
190	return (0);
191
192err_exit:
193	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
194	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
195	args->buf = NULL;
196	return (error);
197}
198
199int
200linux_execve(struct thread *td, struct linux_execve_args *args)
201{
202	struct image_args eargs;
203	char *path;
204	int error;
205
206	LCONVPATHEXIST(td, args->path, &path);
207
208#ifdef DEBUG
209	if (ldebug(execve))
210		printf(ARGS(execve, "%s"), path);
211#endif
212
213	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
214	    args->envp);
215	free(path, M_TEMP);
216	if (error == 0)
217		error = kern_execve(td, &eargs, NULL);
218	if (error == 0)
219	   	/* linux process can exec fbsd one, dont attempt
220		 * to create emuldata for such process using
221		 * linux_proc_init, this leads to a panic on KASSERT
222		 * because such process has p->p_emuldata == NULL
223		 */
224	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
225   		   	error = linux_proc_init(td, 0, 0);
226	return (error);
227}
228
/*
 * Layout of one I/O vector element as copied in from user space by
 * linux32_copyinuio(), which expands it into a native struct iovec.
 */
struct iovec32 {
	u_int32_t iov_base;	/* user buffer address, widened with PTRIN() */
	int	iov_len;	/* length, copied into iovec.iov_len */
};

/* Build-time check that the structure occupies exactly 8 bytes. */
CTASSERT(sizeof(struct iovec32) == 8);
235
236static int
237linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
238{
239	struct iovec32 iov32;
240	struct iovec *iov;
241	struct uio *uio;
242	u_int iovlen;
243	int error, i;
244
245	*uiop = NULL;
246	if (iovcnt > UIO_MAXIOV)
247		return (EINVAL);
248	iovlen = iovcnt * sizeof(struct iovec);
249	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
250	iov = (struct iovec *)(uio + 1);
251	for (i = 0; i < iovcnt; i++) {
252		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
253		if (error) {
254			free(uio, M_IOV);
255			return (error);
256		}
257		iov[i].iov_base = PTRIN(iov32.iov_base);
258		iov[i].iov_len = iov32.iov_len;
259	}
260	uio->uio_iov = iov;
261	uio->uio_iovcnt = iovcnt;
262	uio->uio_segflg = UIO_USERSPACE;
263	uio->uio_offset = -1;
264	uio->uio_resid = 0;
265	for (i = 0; i < iovcnt; i++) {
266		if (iov->iov_len > INT_MAX - uio->uio_resid) {
267			free(uio, M_IOV);
268			return (EINVAL);
269		}
270		uio->uio_resid += iov->iov_len;
271		iov++;
272	}
273	*uiop = uio;
274	return (0);
275}
276
277int
278linux_readv(struct thread *td, struct linux_readv_args *uap)
279{
280	struct uio *auio;
281	int error;
282
283	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
284	if (error)
285		return (error);
286	error = kern_readv(td, uap->fd, auio);
287	free(auio, M_IOV);
288	return (error);
289}
290
291int
292linux_writev(struct thread *td, struct linux_writev_args *uap)
293{
294	struct uio *auio;
295	int error;
296
297	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
298	if (error)
299		return (error);
300	error = kern_writev(td, uap->fd, auio);
301	free(auio, M_IOV);
302	return (error);
303}
304
/*
 * Helper structure copied in by the LINUX_MSGRCV case of linux_ipc()
 * when the upper 16 bits of "what" are zero: it carries the message
 * buffer address and the message type together.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;	/* user address of the message buffer */
	l_long msgtyp;		/* requested message type */
} __packed;
309
310int
311linux_ipc(struct thread *td, struct linux_ipc_args *args)
312{
313
314	switch (args->what & 0xFFFF) {
315	case LINUX_SEMOP: {
316		struct linux_semop_args a;
317
318		a.semid = args->arg1;
319		a.tsops = args->ptr;
320		a.nsops = args->arg2;
321		return (linux_semop(td, &a));
322	}
323	case LINUX_SEMGET: {
324		struct linux_semget_args a;
325
326		a.key = args->arg1;
327		a.nsems = args->arg2;
328		a.semflg = args->arg3;
329		return (linux_semget(td, &a));
330	}
331	case LINUX_SEMCTL: {
332		struct linux_semctl_args a;
333		int error;
334
335		a.semid = args->arg1;
336		a.semnum = args->arg2;
337		a.cmd = args->arg3;
338		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
339		if (error)
340			return (error);
341		return (linux_semctl(td, &a));
342	}
343	case LINUX_MSGSND: {
344		struct linux_msgsnd_args a;
345
346		a.msqid = args->arg1;
347		a.msgp = args->ptr;
348		a.msgsz = args->arg2;
349		a.msgflg = args->arg3;
350		return (linux_msgsnd(td, &a));
351	}
352	case LINUX_MSGRCV: {
353		struct linux_msgrcv_args a;
354
355		a.msqid = args->arg1;
356		a.msgsz = args->arg2;
357		a.msgflg = args->arg3;
358		if ((args->what >> 16) == 0) {
359			struct l_ipc_kludge tmp;
360			int error;
361
362			if (args->ptr == 0)
363				return (EINVAL);
364			error = copyin(args->ptr, &tmp, sizeof(tmp));
365			if (error)
366				return (error);
367			a.msgp = PTRIN(tmp.msgp);
368			a.msgtyp = tmp.msgtyp;
369		} else {
370			a.msgp = args->ptr;
371			a.msgtyp = args->arg5;
372		}
373		return (linux_msgrcv(td, &a));
374	}
375	case LINUX_MSGGET: {
376		struct linux_msgget_args a;
377
378		a.key = args->arg1;
379		a.msgflg = args->arg2;
380		return (linux_msgget(td, &a));
381	}
382	case LINUX_MSGCTL: {
383		struct linux_msgctl_args a;
384
385		a.msqid = args->arg1;
386		a.cmd = args->arg2;
387		a.buf = args->ptr;
388		return (linux_msgctl(td, &a));
389	}
390	case LINUX_SHMAT: {
391		struct linux_shmat_args a;
392
393		a.shmid = args->arg1;
394		a.shmaddr = args->ptr;
395		a.shmflg = args->arg2;
396		a.raddr = PTRIN((l_uint)args->arg3);
397		return (linux_shmat(td, &a));
398	}
399	case LINUX_SHMDT: {
400		struct linux_shmdt_args a;
401
402		a.shmaddr = args->ptr;
403		return (linux_shmdt(td, &a));
404	}
405	case LINUX_SHMGET: {
406		struct linux_shmget_args a;
407
408		a.key = args->arg1;
409		a.size = args->arg2;
410		a.shmflg = args->arg3;
411		return (linux_shmget(td, &a));
412	}
413	case LINUX_SHMCTL: {
414		struct linux_shmctl_args a;
415
416		a.shmid = args->arg1;
417		a.cmd = args->arg2;
418		a.buf = args->ptr;
419		return (linux_shmctl(td, &a));
420	}
421	default:
422		break;
423	}
424
425	return (EINVAL);
426}
427
428int
429linux_old_select(struct thread *td, struct linux_old_select_args *args)
430{
431	struct l_old_select_argv linux_args;
432	struct linux_select_args newsel;
433	int error;
434
435#ifdef DEBUG
436	if (ldebug(old_select))
437		printf(ARGS(old_select, "%p"), args->ptr);
438#endif
439
440	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
441	if (error)
442		return (error);
443
444	newsel.nfds = linux_args.nfds;
445	newsel.readfds = PTRIN(linux_args.readfds);
446	newsel.writefds = PTRIN(linux_args.writefds);
447	newsel.exceptfds = PTRIN(linux_args.exceptfds);
448	newsel.timeout = PTRIN(linux_args.timeout);
449	return (linux_select(td, &newsel));
450}
451
452int
453linux_fork(struct thread *td, struct linux_fork_args *args)
454{
455	int error;
456
457#ifdef DEBUG
458	if (ldebug(fork))
459		printf(ARGS(fork, ""));
460#endif
461
462	if ((error = fork(td, (struct fork_args *)args)) != 0)
463		return (error);
464
465	if (td->td_retval[1] == 1)
466		td->td_retval[0] = 0;
467	error = linux_proc_init(td, td->td_retval[0], 0);
468	if (error)
469		return (error);
470
471	return (0);
472}
473
474int
475linux_vfork(struct thread *td, struct linux_vfork_args *args)
476{
477	int error;
478	struct proc *p2;
479
480#ifdef DEBUG
481	if (ldebug(vfork))
482		printf(ARGS(vfork, ""));
483#endif
484
485	/* exclude RFPPWAIT */
486	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
487		return (error);
488	if (error == 0) {
489	   	td->td_retval[0] = p2->p_pid;
490		td->td_retval[1] = 0;
491	}
492	/* Are we the child? */
493	if (td->td_retval[1] == 1)
494		td->td_retval[0] = 0;
495	error = linux_proc_init(td, td->td_retval[0], 0);
496	if (error)
497		return (error);
498	/* wait for the children to exit, ie. emulate vfork */
499	PROC_LOCK(p2);
500	while (p2->p_flag & P_PPWAIT)
501	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
502	PROC_UNLOCK(p2);
503	return (0);
504}
505
506int
507linux_clone(struct thread *td, struct linux_clone_args *args)
508{
509	int error, ff = RFPROC | RFSTOPPED;
510	struct proc *p2;
511	struct thread *td2;
512	int exit_signal;
513	struct linux_emuldata *em;
514
515#ifdef DEBUG
516	if (ldebug(clone)) {
517   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
518		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
519		    (unsigned int)(uintptr_t)args->parent_tidptr,
520		    (unsigned int)(uintptr_t)args->child_tidptr);
521	}
522#endif
523
524	exit_signal = args->flags & 0x000000ff;
525	if (exit_signal >= LINUX_NSIG)
526		return (EINVAL);
527
528	if (exit_signal <= LINUX_SIGTBLSZ)
529		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
530
531	if (args->flags & CLONE_VM)
532		ff |= RFMEM;
533	if (args->flags & CLONE_SIGHAND)
534		ff |= RFSIGSHARE;
535	/*
536	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
537	 * and open files is independant. in fbsd its in one
538	 * structure but in reality it doesnt make any problems
539	 * because both this flags are set at once usually.
540	 */
541	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
542		ff |= RFFDG;
543
544	/*
545	 * Attempt to detect when linux_clone(2) is used for creating
546	 * kernel threads. Unfortunately despite the existence of the
547	 * CLONE_THREAD flag, version of linuxthreads package used in
548	 * most popular distros as of beginning of 2005 doesn't make
549	 * any use of it. Therefore, this detection relay fully on
550	 * empirical observation that linuxthreads sets certain
551	 * combination of flags, so that we can make more or less
552	 * precise detection and notify the FreeBSD kernel that several
553	 * processes are in fact part of the same threading group, so
554	 * that special treatment is necessary for signal delivery
555	 * between those processes and fd locking.
556	 */
557	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
558		ff |= RFTHREAD;
559
560	error = fork1(td, ff, 0, &p2);
561	if (error)
562		return (error);
563
564	/* create the emuldata */
565	error = linux_proc_init(td, p2->p_pid, args->flags);
566	/* reference it - no need to check this */
567	em = em_find(p2, EMUL_UNLOCKED);
568	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
569	/* and adjust it */
570	if (args->flags & CLONE_PARENT_SETTID) {
571	   	if (args->parent_tidptr == NULL) {
572		   	EMUL_UNLOCK(&emul_lock);
573			return (EINVAL);
574		}
575		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
576		if (error) {
577		   	EMUL_UNLOCK(&emul_lock);
578			return (error);
579		}
580	}
581
582	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
583	   	sx_xlock(&proctree_lock);
584		PROC_LOCK(p2);
585		proc_reparent(p2, td->td_proc->p_pptr);
586		PROC_UNLOCK(p2);
587		sx_xunlock(&proctree_lock);
588	}
589
590	if (args->flags & CLONE_THREAD) {
591	   	/* XXX: linux mangles pgrp and pptr somehow
592		 * I think it might be this but I am not sure.
593		 */
594#ifdef notyet
595	   	PROC_LOCK(p2);
596	   	p2->p_pgrp = td->td_proc->p_pgrp;
597	   	PROC_UNLOCK(p2);
598#endif
599	 	exit_signal = 0;
600	}
601
602	if (args->flags & CLONE_CHILD_SETTID)
603		em->child_set_tid = args->child_tidptr;
604	else
605	   	em->child_set_tid = NULL;
606
607	if (args->flags & CLONE_CHILD_CLEARTID)
608		em->child_clear_tid = args->child_tidptr;
609	else
610	   	em->child_clear_tid = NULL;
611
612	EMUL_UNLOCK(&emul_lock);
613
614	PROC_LOCK(p2);
615	p2->p_sigparent = exit_signal;
616	PROC_UNLOCK(p2);
617	td2 = FIRST_THREAD_IN_PROC(p2);
618	/*
619	 * in a case of stack = NULL we are supposed to COW calling process stack
620	 * this is what normal fork() does so we just keep the tf_rsp arg intact
621	 */
622	if (args->stack)
623   	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
624
625	if (args->flags & CLONE_SETTLS) {
626	   	/* XXX: todo */
627	}
628
629#ifdef DEBUG
630	if (ldebug(clone))
631		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
632		    (long)p2->p_pid, args->stack, exit_signal);
633#endif
634
635	/*
636	 * Make this runnable after we are finished with it.
637	 */
638	mtx_lock_spin(&sched_lock);
639	TD_SET_CAN_RUN(td2);
640	setrunqueue(td2, SRQ_BORING);
641	mtx_unlock_spin(&sched_lock);
642
643	td->td_retval[0] = p2->p_pid;
644	td->td_retval[1] = 0;
645
646	if (args->flags & CLONE_VFORK) {
647   	   	/* wait for the children to exit, ie. emulate vfork */
648   	   	PROC_LOCK(p2);
649		p2->p_flag |= P_PPWAIT;
650		while (p2->p_flag & P_PPWAIT)
651   		   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
652		PROC_UNLOCK(p2);
653	}
654
655	return (0);
656}
657
/* XXX move */
/*
 * Common argument block for the mmap emulation.  linux_mmap() copies it
 * in from user space; linux_mmap2() fills it from its syscall arguments.
 * By the time linux_mmap_common() sees it, pgoff is counted in pages.
 */
struct l_mmap_argv {
	l_ulong		addr;	/* requested mapping address */
	l_ulong		len;	/* mapping length */
	l_ulong		prot;	/* protection bits */
	l_ulong		flags;	/* LINUX_MAP_* flags */
	l_ulong		fd;	/* file descriptor (unused for MAP_ANON) */
	l_ulong		pgoff;	/* file offset */
};

/*
 * Stack region parameters used by linux_mmap_common() for
 * LINUX_MAP_GROWSDOWN mappings: such a region is at least
 * (STACK_SIZE - GUARD_SIZE) bytes long.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)

/* Shared back end of linux_mmap() and linux_mmap2(). */
static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
672
673int
674linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
675{
676	struct l_mmap_argv linux_args;
677
678#ifdef DEBUG
679	if (ldebug(mmap2))
680		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
681		    (void *)(intptr_t)args->addr, args->len, args->prot,
682		    args->flags, args->fd, args->pgoff);
683#endif
684
685	linux_args.addr = PTROUT(args->addr);
686	linux_args.len = args->len;
687	linux_args.prot = args->prot;
688	linux_args.flags = args->flags;
689	linux_args.fd = args->fd;
690	linux_args.pgoff = args->pgoff;
691
692	return (linux_mmap_common(td, &linux_args));
693}
694
695int
696linux_mmap(struct thread *td, struct linux_mmap_args *args)
697{
698	int error;
699	struct l_mmap_argv linux_args;
700
701	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
702	if (error)
703		return (error);
704
705#ifdef DEBUG
706	if (ldebug(mmap))
707		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
708		    (void *)(intptr_t)linux_args.addr, linux_args.len,
709		    linux_args.prot, linux_args.flags, linux_args.fd,
710		    linux_args.pgoff);
711#endif
712	if ((linux_args.pgoff % PAGE_SIZE) != 0)
713		return (EINVAL);
714	linux_args.pgoff /= PAGE_SIZE;
715
716	return (linux_mmap_common(td, &linux_args));
717}
718
719static int
720linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
721{
722	struct proc *p = td->td_proc;
723	struct mmap_args /* {
724		caddr_t addr;
725		size_t len;
726		int prot;
727		int flags;
728		int fd;
729		long pad;
730		off_t pos;
731	} */ bsd_args;
732	int error;
733	struct file *fp;
734
735	error = 0;
736	bsd_args.flags = 0;
737	fp = NULL;
738
739	/*
740	 * Linux mmap(2):
741	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
742	 */
743	if (! ((linux_args->flags & LINUX_MAP_SHARED) ^
744	    (linux_args->flags & LINUX_MAP_PRIVATE)))
745		return (EINVAL);
746
747	if (linux_args->flags & LINUX_MAP_SHARED)
748		bsd_args.flags |= MAP_SHARED;
749	if (linux_args->flags & LINUX_MAP_PRIVATE)
750		bsd_args.flags |= MAP_PRIVATE;
751	if (linux_args->flags & LINUX_MAP_FIXED)
752		bsd_args.flags |= MAP_FIXED;
753	if (linux_args->flags & LINUX_MAP_ANON)
754		bsd_args.flags |= MAP_ANON;
755	else
756		bsd_args.flags |= MAP_NOSYNC;
757	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
758		bsd_args.flags |= MAP_STACK;
759
760		/*
761		 * The linux MAP_GROWSDOWN option does not limit auto
762		 * growth of the region.  Linux mmap with this option
763		 * takes as addr the inital BOS, and as len, the initial
764		 * region size.  It can then grow down from addr without
765		 * limit.  However, linux threads has an implicit internal
766		 * limit to stack size of STACK_SIZE.  Its just not
767		 * enforced explicitly in linux.  But, here we impose
768		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
769		 * region, since we can do this with our mmap.
770		 *
771		 * Our mmap with MAP_STACK takes addr as the maximum
772		 * downsize limit on BOS, and as len the max size of
773		 * the region.  It them maps the top SGROWSIZ bytes,
774		 * and autgrows the region down, up to the limit
775		 * in addr.
776		 *
777		 * If we don't use the MAP_STACK option, the effect
778		 * of this code is to allocate a stack region of a
779		 * fixed size of (STACK_SIZE - GUARD_SIZE).
780		 */
781
782		/* This gives us TOS */
783		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
784		    linux_args->len;
785
786		if ((caddr_t)PTRIN(bsd_args.addr) >
787		    p->p_vmspace->vm_maxsaddr) {
788			/*
789			 * Some linux apps will attempt to mmap
790			 * thread stacks near the top of their
791			 * address space.  If their TOS is greater
792			 * than vm_maxsaddr, vm_map_growstack()
793			 * will confuse the thread stack with the
794			 * process stack and deliver a SEGV if they
795			 * attempt to grow the thread stack past their
796			 * current stacksize rlimit.  To avoid this,
797			 * adjust vm_maxsaddr upwards to reflect
798			 * the current stacksize rlimit rather
799			 * than the maximum possible stacksize.
800			 * It would be better to adjust the
801			 * mmap'ed region, but some apps do not check
802			 * mmap's return value.
803			 */
804			PROC_LOCK(p);
805			p->p_vmspace->vm_maxsaddr =
806			    (char *)LINUX32_USRSTACK -
807			    lim_cur(p, RLIMIT_STACK);
808			PROC_UNLOCK(p);
809		}
810
811		/* This gives us our maximum stack size */
812		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
813			bsd_args.len = linux_args->len;
814		else
815			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
816
817		/*
818		 * This gives us a new BOS.  If we're using VM_STACK, then
819		 * mmap will just map the top SGROWSIZ bytes, and let
820		 * the stack grow down to the limit at BOS.  If we're
821		 * not using VM_STACK we map the full stack, since we
822		 * don't have a way to autogrow it.
823		 */
824		bsd_args.addr -= bsd_args.len;
825	} else {
826		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
827		bsd_args.len  = linux_args->len;
828	}
829
830	/*
831	 * We add PROT_EXEC to work around buggy applications (e.g. Java)
832	 * that take advantage of the fact that execute permissions are not
833	 * enforced by x86 CPUs.
834	 */
835	bsd_args.prot = linux_args->prot | PROT_EXEC;
836	if (linux_args->flags & LINUX_MAP_ANON)
837		bsd_args.fd = -1;
838	else {
839		/*
840		 * Linux follows Solaris mmap(2) description:
841		 * The file descriptor fildes is opened with
842		 * read permission, regardless of the
843		 * protection options specified.
844		 * If PROT_WRITE is specified, the application
845		 * must have opened the file descriptor
846		 * fildes with write permission unless
847		 * MAP_PRIVATE is specified in the flag
848		 * argument as described below.
849		 */
850
851		if ((error = fget(td, linux_args->fd, &fp)) != 0)
852			return (error);
853		if (fp->f_type != DTYPE_VNODE) {
854			fdrop(fp, td);
855			return (EINVAL);
856		}
857
858		/* Linux mmap() just fails for O_WRONLY files */
859		if (! (fp->f_flag & FREAD)) {
860			fdrop(fp, td);
861			return (EACCES);
862		}
863
864		bsd_args.fd = linux_args->fd;
865		fdrop(fp, td);
866	}
867	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
868	bsd_args.pad = 0;
869
870#ifdef DEBUG
871	if (ldebug(mmap))
872		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
873		    __func__,
874		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
875		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
876#endif
877	error = mmap(td, &bsd_args);
878#ifdef DEBUG
879	if (ldebug(mmap))
880		printf("-> %s() return: 0x%x (0x%08x)\n",
881			__func__, error, (u_int)td->td_retval[0]);
882#endif
883	return (error);
884}
885
886int
887linux_pipe(struct thread *td, struct linux_pipe_args *args)
888{
889	int pip[2];
890	int error;
891	register_t reg_rdx;
892
893#ifdef DEBUG
894	if (ldebug(pipe))
895		printf(ARGS(pipe, "*"));
896#endif
897
898	reg_rdx = td->td_retval[1];
899	error = pipe(td, 0);
900	if (error) {
901		td->td_retval[1] = reg_rdx;
902		return (error);
903	}
904
905	pip[0] = td->td_retval[0];
906	pip[1] = td->td_retval[1];
907	error = copyout(pip, args->pipefds, 2 * sizeof(int));
908	if (error) {
909		td->td_retval[1] = reg_rdx;
910		return (error);
911	}
912
913	td->td_retval[1] = reg_rdx;
914	td->td_retval[0] = 0;
915	return (0);
916}
917
918int
919linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
920{
921	l_osigaction_t osa;
922	l_sigaction_t act, oact;
923	int error;
924
925#ifdef DEBUG
926	if (ldebug(sigaction))
927		printf(ARGS(sigaction, "%d, %p, %p"),
928		    args->sig, (void *)args->nsa, (void *)args->osa);
929#endif
930
931	if (args->nsa != NULL) {
932		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
933		if (error)
934			return (error);
935		act.lsa_handler = osa.lsa_handler;
936		act.lsa_flags = osa.lsa_flags;
937		act.lsa_restorer = osa.lsa_restorer;
938		LINUX_SIGEMPTYSET(act.lsa_mask);
939		act.lsa_mask.__bits[0] = osa.lsa_mask;
940	}
941
942	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
943	    args->osa ? &oact : NULL);
944
945	if (args->osa != NULL && !error) {
946		osa.lsa_handler = oact.lsa_handler;
947		osa.lsa_flags = oact.lsa_flags;
948		osa.lsa_restorer = oact.lsa_restorer;
949		osa.lsa_mask = oact.lsa_mask.__bits[0];
950		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
951	}
952
953	return (error);
954}
955
956/*
957 * Linux has two extra args, restart and oldmask.  We dont use these,
958 * but it seems that "restart" is actually a context pointer that
959 * enables the signal to happen with a different register set.
960 */
961int
962linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
963{
964	sigset_t sigmask;
965	l_sigset_t mask;
966
967#ifdef DEBUG
968	if (ldebug(sigsuspend))
969		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
970#endif
971
972	LINUX_SIGEMPTYSET(mask);
973	mask.__bits[0] = args->mask;
974	linux_to_bsd_sigset(&mask, &sigmask);
975	return (kern_sigsuspend(td, sigmask));
976}
977
978int
979linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
980{
981	l_sigset_t lmask;
982	sigset_t sigmask;
983	int error;
984
985#ifdef DEBUG
986	if (ldebug(rt_sigsuspend))
987		printf(ARGS(rt_sigsuspend, "%p, %d"),
988		    (void *)uap->newset, uap->sigsetsize);
989#endif
990
991	if (uap->sigsetsize != sizeof(l_sigset_t))
992		return (EINVAL);
993
994	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
995	if (error)
996		return (error);
997
998	linux_to_bsd_sigset(&lmask, &sigmask);
999	return (kern_sigsuspend(td, sigmask));
1000}
1001
1002int
1003linux_pause(struct thread *td, struct linux_pause_args *args)
1004{
1005	struct proc *p = td->td_proc;
1006	sigset_t sigmask;
1007
1008#ifdef DEBUG
1009	if (ldebug(pause))
1010		printf(ARGS(pause, ""));
1011#endif
1012
1013	PROC_LOCK(p);
1014	sigmask = td->td_sigmask;
1015	PROC_UNLOCK(p);
1016	return (kern_sigsuspend(td, sigmask));
1017}
1018
1019int
1020linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
1021{
1022	stack_t ss, oss;
1023	l_stack_t lss;
1024	int error;
1025
1026#ifdef DEBUG
1027	if (ldebug(sigaltstack))
1028		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
1029#endif
1030
1031	if (uap->uss != NULL) {
1032		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1033		if (error)
1034			return (error);
1035
1036		ss.ss_sp = PTRIN(lss.ss_sp);
1037		ss.ss_size = lss.ss_size;
1038		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1039	}
1040	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1041	    (uap->uoss != NULL) ? &oss : NULL);
1042	if (!error && uap->uoss != NULL) {
1043		lss.ss_sp = PTROUT(oss.ss_sp);
1044		lss.ss_size = oss.ss_size;
1045		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1046		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1047	}
1048
1049	return (error);
1050}
1051
1052int
1053linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1054{
1055	struct ftruncate_args sa;
1056
1057#ifdef DEBUG
1058	if (ldebug(ftruncate64))
1059		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1060		    (intmax_t)args->length);
1061#endif
1062
1063	sa.fd = args->fd;
1064	sa.pad = 0;
1065	sa.length = args->length;
1066	return ftruncate(td, &sa);
1067}
1068
1069int
1070linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1071{
1072	struct timeval atv;
1073	l_timeval atv32;
1074	struct timezone rtz;
1075	int error = 0;
1076
1077	if (uap->tp) {
1078		microtime(&atv);
1079		atv32.tv_sec = atv.tv_sec;
1080		atv32.tv_usec = atv.tv_usec;
1081		error = copyout(&atv32, uap->tp, sizeof (atv32));
1082	}
1083	if (error == 0 && uap->tzp != NULL) {
1084		rtz.tz_minuteswest = tz_minuteswest;
1085		rtz.tz_dsttime = tz_dsttime;
1086		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1087	}
1088	return (error);
1089}
1090
1091int
1092linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1093{
1094	struct l_rusage s32;
1095	struct rusage s;
1096	int error;
1097
1098	error = kern_getrusage(td, uap->who, &s);
1099	if (error != 0)
1100		return (error);
1101	if (uap->rusage != NULL) {
1102		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1103		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1104		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1105		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1106		s32.ru_maxrss = s.ru_maxrss;
1107		s32.ru_ixrss = s.ru_ixrss;
1108		s32.ru_idrss = s.ru_idrss;
1109		s32.ru_isrss = s.ru_isrss;
1110		s32.ru_minflt = s.ru_minflt;
1111		s32.ru_majflt = s.ru_majflt;
1112		s32.ru_nswap = s.ru_nswap;
1113		s32.ru_inblock = s.ru_inblock;
1114		s32.ru_oublock = s.ru_oublock;
1115		s32.ru_msgsnd = s.ru_msgsnd;
1116		s32.ru_msgrcv = s.ru_msgrcv;
1117		s32.ru_nsignals = s.ru_nsignals;
1118		s32.ru_nvcsw = s.ru_nvcsw;
1119		s32.ru_nivcsw = s.ru_nivcsw;
1120		error = copyout(&s32, uap->rusage, sizeof(s32));
1121	}
1122	return (error);
1123}
1124
1125int
1126linux_sched_rr_get_interval(struct thread *td,
1127    struct linux_sched_rr_get_interval_args *uap)
1128{
1129	struct timespec ts;
1130	struct l_timespec ts32;
1131	int error;
1132
1133	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1134	if (error != 0)
1135		return (error);
1136	ts32.tv_sec = ts.tv_sec;
1137	ts32.tv_nsec = ts.tv_nsec;
1138	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1139}
1140
1141int
1142linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1143{
1144	struct mprotect_args bsd_args;
1145
1146	bsd_args.addr = uap->addr;
1147	bsd_args.len = uap->len;
1148	bsd_args.prot = uap->prot;
1149	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1150	if ((bsd_args.prot & PROT_READ) != 0)
1151		bsd_args.prot |= PROT_EXEC;
1152	return (mprotect(td, &bsd_args));
1153}
1154