linux32_machdep.c revision 163371
134355Sjb/*-
234355Sjb * Copyright (c) 2004 Tim J. Robbins
389985Sbde * Copyright (c) 2002 Doug Rabson
4151317Sdavidxu * Copyright (c) 2000 Marcel Moolenaar
534355Sjb * All rights reserved.
634355Sjb *
764002Speter * Redistribution and use in source and binary forms, with or without
834355Sjb * modification, are permitted provided that the following conditions
934355Sjb * are met:
1034355Sjb * 1. Redistributions of source code must retain the above copyright
1134355Sjb *    notice, this list of conditions and the following disclaimer
1234355Sjb *    in this position and unchanged.
1334355Sjb * 2. Redistributions in binary form must reproduce the above copyright
1434355Sjb *    notice, this list of conditions and the following disclaimer in the
1534355Sjb *    documentation and/or other materials provided with the distribution.
1634355Sjb * 3. The name of the author may not be used to endorse or promote products
1734355Sjb *    derived from this software without specific prior written permission.
1834355Sjb *
1934355Sjb * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
2034355Sjb * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
2134355Sjb * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
2234355Sjb * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
2334355Sjb * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
2434355Sjb * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
2534355Sjb * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
2634355Sjb * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2734355Sjb * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
2834355Sjb * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2934355Sjb */
3034355Sjb
3134355Sjb#include <sys/cdefs.h>
3234355Sjb__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 163371 2006-10-15 13:22:14Z netchild $");
3334355Sjb
3434355Sjb#include <sys/param.h>
3534355Sjb#include <sys/kernel.h>
3634355Sjb#include <sys/systm.h>
3734355Sjb#include <sys/clock.h>
3834355Sjb#include <sys/imgact.h>
3934355Sjb#include <sys/limits.h>
4034355Sjb#include <sys/lock.h>
4134355Sjb#include <sys/malloc.h>
4234355Sjb#include <sys/mman.h>
4334355Sjb#include <sys/mutex.h>
4434355Sjb#include <sys/proc.h>
4534355Sjb#include <sys/resource.h>
4634355Sjb#include <sys/resourcevar.h>
4734355Sjb#include <sys/syscallsubr.h>
4834355Sjb#include <sys/sysproto.h>
4934355Sjb#include <sys/unistd.h>
5034355Sjb
5134355Sjb#include <machine/frame.h>
5234355Sjb
5334355Sjb#include <vm/vm.h>
5434355Sjb#include <vm/pmap.h>
5534355Sjb#include <vm/vm_extern.h>
5634355Sjb#include <vm/vm_kern.h>
5734355Sjb#include <vm/vm_map.h>
5834355Sjb
5934355Sjb#include <amd64/linux32/linux.h>
6034355Sjb#include <amd64/linux32/linux32_proto.h>
6134355Sjb#include <compat/linux/linux_ipc.h>
6234355Sjb#include <compat/linux/linux_signal.h>
6334355Sjb#include <compat/linux/linux_util.h>
6434355Sjb#include <compat/linux/linux_emul.h>
6534355Sjb
6634355Sjbstruct l_old_select_argv {
6734355Sjb	l_int		nfds;
6834355Sjb	l_uintptr_t	readfds;
6934355Sjb	l_uintptr_t	writefds;
7034355Sjb	l_uintptr_t	exceptfds;
7134355Sjb	l_uintptr_t	timeout;
7234355Sjb} __packed;
7334355Sjb
7434355Sjbint
7534355Sjblinux_to_bsd_sigaltstack(int lsa)
7634355Sjb{
7734355Sjb	int bsa = 0;
7834355Sjb
7934355Sjb	if (lsa & LINUX_SS_DISABLE)
8034355Sjb		bsa |= SS_DISABLE;
8134355Sjb	if (lsa & LINUX_SS_ONSTACK)
8234355Sjb		bsa |= SS_ONSTACK;
8334355Sjb	return (bsa);
8434355Sjb}
8534355Sjb
8634355Sjbint
8734355Sjbbsd_to_linux_sigaltstack(int bsa)
8834355Sjb{
8934355Sjb	int lsa = 0;
9034355Sjb
9134355Sjb	if (bsa & SS_DISABLE)
9234355Sjb		lsa |= LINUX_SS_DISABLE;
9334355Sjb	if (bsa & SS_ONSTACK)
9434355Sjb		lsa |= LINUX_SS_ONSTACK;
9534355Sjb	return (lsa);
9634355Sjb}
9734355Sjb
9834355Sjb/*
9934355Sjb * Custom version of exec_copyin_args() so that we can translate
10034355Sjb * the pointers.
10134355Sjb */
10234355Sjbstatic int
10334355Sjblinux_exec_copyin_args(struct image_args *args, char *fname,
10434355Sjb    enum uio_seg segflg, char **argv, char **envv)
10534355Sjb{
10634355Sjb	char *argp, *envp;
10734355Sjb	u_int32_t *p32, arg;
10834355Sjb	size_t length;
10934355Sjb	int error;
110127891Sdfr
11134355Sjb	bzero(args, sizeof(*args));
11234355Sjb	if (argv == NULL)
11334355Sjb		return (EFAULT);
11434355Sjb
11534355Sjb	/*
11634355Sjb	 * Allocate temporary demand zeroed space for argument and
11734355Sjb	 *	environment strings
11834355Sjb	 */
11934355Sjb	args->buf = (char *) kmem_alloc_wait(exec_map,
12045065Salc	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
12145065Salc	if (args->buf == NULL)
12234355Sjb		return (ENOMEM);
12334355Sjb	args->begin_argv = args->buf;
12434355Sjb	args->endp = args->begin_argv;
12534355Sjb	args->stringspace = ARG_MAX;
12634355Sjb
12734355Sjb	args->fname = args->buf + ARG_MAX;
12834355Sjb
12934355Sjb	/*
13034355Sjb	 * Copy the file name.
13134355Sjb	 */
13234355Sjb	error = (segflg == UIO_SYSSPACE) ?
13334355Sjb	    copystr(fname, args->fname, PATH_MAX, &length) :
13434355Sjb	    copyinstr(fname, args->fname, PATH_MAX, &length);
13534355Sjb	if (error != 0)
13634355Sjb		goto err_exit;
13734355Sjb
13834355Sjb	/*
13934355Sjb	 * extract arguments first
14034355Sjb	 */
14134355Sjb	p32 = (u_int32_t *)argv;
14234355Sjb	for (;;) {
14335938Sdyson		error = copyin(p32++, &arg, sizeof(arg));
14434355Sjb		if (error)
14534355Sjb			goto err_exit;
14634355Sjb		if (arg == 0)
14734355Sjb			break;
14834355Sjb		argp = PTRIN(arg);
14934355Sjb		error = copyinstr(argp, args->endp, args->stringspace, &length);
15034355Sjb		if (error) {
15134355Sjb			if (error == ENAMETOOLONG)
15234355Sjb				error = E2BIG;
15334355Sjb
15434355Sjb			goto err_exit;
15534355Sjb		}
15634355Sjb		args->stringspace -= length;
15734355Sjb		args->endp += length;
15834355Sjb		args->argc++;
15934355Sjb	}
16034355Sjb
161137875Smarks	args->begin_envv = args->endp;
16234355Sjb
16334355Sjb	/*
16434355Sjb	 * extract environment strings
16534355Sjb	 */
16634355Sjb	if (envv) {
16735938Sdyson		p32 = (u_int32_t *)envv;
16835938Sdyson		for (;;) {
16935938Sdyson			error = copyin(p32++, &arg, sizeof(arg));
17035938Sdyson			if (error)
17135938Sdyson				goto err_exit;
17235938Sdyson			if (arg == 0)
17335938Sdyson				break;
17435938Sdyson			envp = PTRIN(arg);
175147814Sjhb			error = copyinstr(envp, args->endp, args->stringspace,
176147814Sjhb			    &length);
17751138Salfred			if (error) {
17851138Salfred				if (error == ENAMETOOLONG)
17934355Sjb					error = E2BIG;
18034355Sjb				goto err_exit;
18134355Sjb			}
18234355Sjb			args->stringspace -= length;
18334355Sjb			args->endp += length;
18434355Sjb			args->envc++;
18534355Sjb		}
18634355Sjb	}
18734355Sjb
18834355Sjb	return (0);
18934355Sjb
19056115Spetererr_exit:
19156115Speter	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
19234355Sjb	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
19334355Sjb	args->buf = NULL;
19434355Sjb	return (error);
19534355Sjb}
19634355Sjb
19734355Sjbint
19834355Sjblinux_execve(struct thread *td, struct linux_execve_args *args)
19934355Sjb{
20034355Sjb	struct image_args eargs;
20134355Sjb	char *path;
20234925Sdufault	int error;
20334925Sdufault
20434925Sdufault	LCONVPATHEXIST(td, args->path, &path);
20534925Sdufault
20634925Sdufault#ifdef DEBUG
20734925Sdufault	if (ldebug(execve))
20834925Sdufault		printf(ARGS(execve, "%s"), path);
20934925Sdufault#endif
21035938Sdyson
21140931Sdg	error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp,
21246155Sphk	    args->envp);
21351791Smarcel	free(path, M_TEMP);
21451791Smarcel	if (error == 0)
21551791Smarcel		error = kern_execve(td, &eargs, NULL);
21651791Smarcel	if (error == 0)
217112895Sjeff	   	/* linux process can exec fbsd one, dont attempt
218112895Sjeff		 * to create emuldata for such process using
21956272Srwatson		 * linux_proc_init, this leads to a panic on KASSERT
22056272Srwatson		 * because such process has p->p_emuldata == NULL
22156272Srwatson		 */
22256272Srwatson	   	if (td->td_proc->p_sysent == &elf_linux_sysvec)
22356272Srwatson   		   	error = linux_proc_init(td, 0, 0);
22456272Srwatson	return (error);
22556272Srwatson}
22656272Srwatson
22754803Srwatsonstruct iovec32 {
22854803Srwatson	u_int32_t iov_base;
22954803Srwatson	int	iov_len;
23055943Sjasone};
23156115Speter
23256115SpeterCTASSERT(sizeof(struct iovec32) == 8);
23359288Sjlemon
23459288Sjlemonstatic int
23561719Srwatsonlinux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
23675039Srwatson{
23775039Srwatson	struct iovec32 iov32;
23875427Srwatson	struct iovec *iov;
23983652Speter	struct uio *uio;
24083796Srwatson	u_int iovlen;
24185891Sphk	int error, i;
24290889Sjulian
24390889Sjulian	*uiop = NULL;
24490889Sjulian	if (iovcnt > UIO_MAXIOV)
245103972Sarchie		return (EINVAL);
246103972Sarchie	iovlen = iovcnt * sizeof(struct iovec);
247103972Sarchie	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
248100897Srwatson	iov = (struct iovec *)(uio + 1);
249100897Srwatson	for (i = 0; i < iovcnt; i++) {
250100897Srwatson		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
251100897Srwatson		if (error) {
252100897Srwatson			free(uio, M_IOV);
253100897Srwatson			return (error);
25496084Smux		}
25597372Smarcel		iov[i].iov_base = PTRIN(iov32.iov_base);
25699856Salfred		iov[i].iov_len = iov32.iov_len;
257100956Srwatson	}
258103575Salfred	uio->uio_iov = iov;
259122540Smckusick	uio->uio_iovcnt = iovcnt;
260122540Smckusick	uio->uio_segflg = UIO_USERSPACE;
261122540Smckusick	uio->uio_offset = -1;
262122540Smckusick	uio->uio_resid = 0;
263103575Salfred	for (i = 0; i < iovcnt; i++) {
264103575Salfred		if (iov->iov_len > INT_MAX - uio->uio_resid) {
265103575Salfred			free(uio, M_IOV);
266103575Salfred			return (EINVAL);
267103575Salfred		}
268103575Salfred		uio->uio_resid += iov->iov_len;
269103575Salfred		iov++;
270103575Salfred	}
271104731Srwatson	*uiop = uio;
272105692Srwatson	return (0);
273105692Srwatson}
274105692Srwatson
275104731Srwatsonint
276104731Srwatsonlinux_readv(struct thread *td, struct linux_readv_args *uap)
277105950Speter{
278106467Srwatson	struct uio *auio;
279105950Speter	int error;
280106978Sdeischen
281106978Sdeischen	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
282106978Sdeischen	if (error)
283107914Sdillon		return (error);
284108406Srwatson	error = kern_readv(td, uap->fd, auio);
285108406Srwatson	free(auio, M_IOV);
286108406Srwatson	return (error);
287108406Srwatson}
288112895Sjeff
289112902Sjeffint
290112902Sjefflinux_writev(struct thread *td, struct linux_writev_args *uap)
291112902Sjeff{
292112902Sjeff	struct uio *auio;
293112909Sjeff	int error;
294112909Sjeff
295113276Smike	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
296115800Srwatson	if (error)
297115800Srwatson		return (error);
298115800Srwatson	error = kern_writev(td, uap->fd, auio);
299123253Smarcel	free(auio, M_IOV);
300125369Sdeischen	return (error);
301127484Smtm}
302127484Smtm
303132117Sphkstruct l_ipc_kludge {
304136831Srwatson	l_uintptr_t msgp;
305136831Srwatson	l_long msgtyp;
306136831Srwatson} __packed;
307136831Srwatson
308136831Srwatsonint
309136831Srwatsonlinux_ipc(struct thread *td, struct linux_ipc_args *args)
310136831Srwatson{
311136831Srwatson
312136831Srwatson	switch (args->what & 0xFFFF) {
313139013Sdavidxu	case LINUX_SEMOP: {
314145435Sdavidxu		struct linux_semop_args a;
315151317Sdavidxu
316151317Sdavidxu		a.semid = args->arg1;
317		a.tsops = args->ptr;
318		a.nsops = args->arg2;
319		return (linux_semop(td, &a));
320	}
321	case LINUX_SEMGET: {
322		struct linux_semget_args a;
323
324		a.key = args->arg1;
325		a.nsems = args->arg2;
326		a.semflg = args->arg3;
327		return (linux_semget(td, &a));
328	}
329	case LINUX_SEMCTL: {
330		struct linux_semctl_args a;
331		int error;
332
333		a.semid = args->arg1;
334		a.semnum = args->arg2;
335		a.cmd = args->arg3;
336		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
337		if (error)
338			return (error);
339		return (linux_semctl(td, &a));
340	}
341	case LINUX_MSGSND: {
342		struct linux_msgsnd_args a;
343
344		a.msqid = args->arg1;
345		a.msgp = args->ptr;
346		a.msgsz = args->arg2;
347		a.msgflg = args->arg3;
348		return (linux_msgsnd(td, &a));
349	}
350	case LINUX_MSGRCV: {
351		struct linux_msgrcv_args a;
352
353		a.msqid = args->arg1;
354		a.msgsz = args->arg2;
355		a.msgflg = args->arg3;
356		if ((args->what >> 16) == 0) {
357			struct l_ipc_kludge tmp;
358			int error;
359
360			if (args->ptr == 0)
361				return (EINVAL);
362			error = copyin(args->ptr, &tmp, sizeof(tmp));
363			if (error)
364				return (error);
365			a.msgp = PTRIN(tmp.msgp);
366			a.msgtyp = tmp.msgtyp;
367		} else {
368			a.msgp = args->ptr;
369			a.msgtyp = args->arg5;
370		}
371		return (linux_msgrcv(td, &a));
372	}
373	case LINUX_MSGGET: {
374		struct linux_msgget_args a;
375
376		a.key = args->arg1;
377		a.msgflg = args->arg2;
378		return (linux_msgget(td, &a));
379	}
380	case LINUX_MSGCTL: {
381		struct linux_msgctl_args a;
382
383		a.msqid = args->arg1;
384		a.cmd = args->arg2;
385		a.buf = args->ptr;
386		return (linux_msgctl(td, &a));
387	}
388	case LINUX_SHMAT: {
389		struct linux_shmat_args a;
390
391		a.shmid = args->arg1;
392		a.shmaddr = args->ptr;
393		a.shmflg = args->arg2;
394		a.raddr = PTRIN((l_uint)args->arg3);
395		return (linux_shmat(td, &a));
396	}
397	case LINUX_SHMDT: {
398		struct linux_shmdt_args a;
399
400		a.shmaddr = args->ptr;
401		return (linux_shmdt(td, &a));
402	}
403	case LINUX_SHMGET: {
404		struct linux_shmget_args a;
405
406		a.key = args->arg1;
407		a.size = args->arg2;
408		a.shmflg = args->arg3;
409		return (linux_shmget(td, &a));
410	}
411	case LINUX_SHMCTL: {
412		struct linux_shmctl_args a;
413
414		a.shmid = args->arg1;
415		a.cmd = args->arg2;
416		a.buf = args->ptr;
417		return (linux_shmctl(td, &a));
418	}
419	default:
420		break;
421	}
422
423	return (EINVAL);
424}
425
426int
427linux_old_select(struct thread *td, struct linux_old_select_args *args)
428{
429	struct l_old_select_argv linux_args;
430	struct linux_select_args newsel;
431	int error;
432
433#ifdef DEBUG
434	if (ldebug(old_select))
435		printf(ARGS(old_select, "%p"), args->ptr);
436#endif
437
438	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
439	if (error)
440		return (error);
441
442	newsel.nfds = linux_args.nfds;
443	newsel.readfds = PTRIN(linux_args.readfds);
444	newsel.writefds = PTRIN(linux_args.writefds);
445	newsel.exceptfds = PTRIN(linux_args.exceptfds);
446	newsel.timeout = PTRIN(linux_args.timeout);
447	return (linux_select(td, &newsel));
448}
449
450int
451linux_fork(struct thread *td, struct linux_fork_args *args)
452{
453	int error;
454
455#ifdef DEBUG
456	if (ldebug(fork))
457		printf(ARGS(fork, ""));
458#endif
459
460	if ((error = fork(td, (struct fork_args *)args)) != 0)
461		return (error);
462
463	if (td->td_retval[1] == 1)
464		td->td_retval[0] = 0;
465	error = linux_proc_init(td, td->td_retval[0], 0);
466	if (error)
467		return (error);
468
469	return (0);
470}
471
472int
473linux_vfork(struct thread *td, struct linux_vfork_args *args)
474{
475	int error;
476	struct proc *p2;
477
478#ifdef DEBUG
479	if (ldebug(vfork))
480		printf(ARGS(vfork, ""));
481#endif
482
483	/* exclude RFPPWAIT */
484	if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0)
485		return (error);
486	if (error == 0) {
487	   	td->td_retval[0] = p2->p_pid;
488		td->td_retval[1] = 0;
489	}
490	/* Are we the child? */
491	if (td->td_retval[1] == 1)
492		td->td_retval[0] = 0;
493	error = linux_proc_init(td, td->td_retval[0], 0);
494	if (error)
495		return (error);
496	/* wait for the children to exit, ie. emulate vfork */
497	PROC_LOCK(p2);
498	while (p2->p_flag & P_PPWAIT)
499	   	msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0);
500	PROC_UNLOCK(p2);
501	return (0);
502}
503
504int
505linux_clone(struct thread *td, struct linux_clone_args *args)
506{
507	int error, ff = RFPROC | RFSTOPPED;
508	struct proc *p2;
509	struct thread *td2;
510	int exit_signal;
511	struct linux_emuldata *em;
512
513#ifdef DEBUG
514	if (ldebug(clone)) {
515   	   	printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"),
516		    (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack,
517		    (unsigned int)(uintptr_t)args->parent_tidptr,
518		    (unsigned int)(uintptr_t)args->child_tidptr);
519	}
520#endif
521
522	exit_signal = args->flags & 0x000000ff;
523	if (exit_signal >= LINUX_NSIG)
524		return (EINVAL);
525
526	if (exit_signal <= LINUX_SIGTBLSZ)
527		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
528
529	if (args->flags & CLONE_VM)
530		ff |= RFMEM;
531	if (args->flags & CLONE_SIGHAND)
532		ff |= RFSIGSHARE;
533	/*
534	 * XXX: in linux sharing of fs info (chroot/cwd/umask)
535	 * and open files is independant. in fbsd its in one
536	 * structure but in reality it doesnt make any problems
537	 * because both this flags are set at once usually.
538	 */
539	if (!(args->flags & (CLONE_FILES | CLONE_FS)))
540		ff |= RFFDG;
541
542	/*
543	 * Attempt to detect when linux_clone(2) is used for creating
544	 * kernel threads. Unfortunately despite the existence of the
545	 * CLONE_THREAD flag, version of linuxthreads package used in
546	 * most popular distros as of beginning of 2005 doesn't make
547	 * any use of it. Therefore, this detection relay fully on
548	 * empirical observation that linuxthreads sets certain
549	 * combination of flags, so that we can make more or less
550	 * precise detection and notify the FreeBSD kernel that several
551	 * processes are in fact part of the same threading group, so
552	 * that special treatment is necessary for signal delivery
553	 * between those processes and fd locking.
554	 */
555	if ((args->flags & 0xffffff00) == THREADING_FLAGS)
556		ff |= RFTHREAD;
557
558	error = fork1(td, ff, 0, &p2);
559	if (error)
560		return (error);
561
562	/* create the emuldata */
563	error = linux_proc_init(td, p2->p_pid, args->flags);
564	/* reference it - no need to check this */
565	em = em_find(p2, EMUL_UNLOCKED);
566	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
567	/* and adjust it */
568	if (args->flags & CLONE_PARENT_SETTID) {
569	   	if (args->parent_tidptr == NULL) {
570		   	EMUL_UNLOCK(&emul_lock);
571			return (EINVAL);
572		}
573		error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid));
574		if (error) {
575		   	EMUL_UNLOCK(&emul_lock);
576			return (error);
577		}
578	}
579
580	if (args->flags & (CLONE_PARENT|CLONE_THREAD)) {
581	   	sx_xlock(&proctree_lock);
582		PROC_LOCK(p2);
583		proc_reparent(p2, td->td_proc->p_pptr);
584		PROC_UNLOCK(p2);
585		sx_xunlock(&proctree_lock);
586	}
587
588	if (args->flags & CLONE_THREAD) {
589	   	/* XXX: linux mangles pgrp and pptr somehow
590		 * I think it might be this but I am not sure.
591		 */
592#ifdef notyet
593	   	PROC_LOCK(p2);
594	   	p2->p_pgrp = td->td_proc->p_pgrp;
595	   	PROC_UNLOCK(p2);
596#endif
597	 	exit_signal = 0;
598	}
599
600	if (args->flags & CLONE_CHILD_SETTID)
601		em->child_set_tid = args->child_tidptr;
602	else
603	   	em->child_set_tid = NULL;
604
605	if (args->flags & CLONE_CHILD_CLEARTID)
606		em->child_clear_tid = args->child_tidptr;
607	else
608	   	em->child_clear_tid = NULL;
609
610	EMUL_UNLOCK(&emul_lock);
611
612	PROC_LOCK(p2);
613	p2->p_sigparent = exit_signal;
614	PROC_UNLOCK(p2);
615	td2 = FIRST_THREAD_IN_PROC(p2);
616	/*
617	 * in a case of stack = NULL we are supposed to COW calling process stack
618	 * this is what normal fork() does so we just keep the tf_rsp arg intact
619	 */
620	if (args->stack)
621   	   	td2->td_frame->tf_rsp = PTROUT(args->stack);
622
623	if (args->flags & CLONE_SETTLS) {
624	   	/* XXX: todo */
625	}
626
627#ifdef DEBUG
628	if (ldebug(clone))
629		printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"),
630		    (long)p2->p_pid, args->stack, exit_signal);
631#endif
632
633	/*
634	 * Make this runnable after we are finished with it.
635	 */
636	mtx_lock_spin(&sched_lock);
637	TD_SET_CAN_RUN(td2);
638	setrunqueue(td2, SRQ_BORING);
639	mtx_unlock_spin(&sched_lock);
640
641	td->td_retval[0] = p2->p_pid;
642	td->td_retval[1] = 0;
643	return (0);
644}
645
646/* XXX move */
647struct l_mmap_argv {
648	l_ulong		addr;
649	l_ulong		len;
650	l_ulong		prot;
651	l_ulong		flags;
652	l_ulong		fd;
653	l_ulong		pgoff;
654};
655
656#define STACK_SIZE  (2 * 1024 * 1024)
657#define GUARD_SIZE  (4 * PAGE_SIZE)
658
659static int linux_mmap_common(struct thread *, struct l_mmap_argv *);
660
661int
662linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
663{
664	struct l_mmap_argv linux_args;
665
666#ifdef DEBUG
667	if (ldebug(mmap2))
668		printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
669		    (void *)(intptr_t)args->addr, args->len, args->prot,
670		    args->flags, args->fd, args->pgoff);
671#endif
672
673	linux_args.addr = PTROUT(args->addr);
674	linux_args.len = args->len;
675	linux_args.prot = args->prot;
676	linux_args.flags = args->flags;
677	linux_args.fd = args->fd;
678	linux_args.pgoff = args->pgoff;
679
680	return (linux_mmap_common(td, &linux_args));
681}
682
683int
684linux_mmap(struct thread *td, struct linux_mmap_args *args)
685{
686	int error;
687	struct l_mmap_argv linux_args;
688
689	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
690	if (error)
691		return (error);
692
693#ifdef DEBUG
694	if (ldebug(mmap))
695		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
696		    (void *)(intptr_t)linux_args.addr, linux_args.len,
697		    linux_args.prot, linux_args.flags, linux_args.fd,
698		    linux_args.pgoff);
699#endif
700	if ((linux_args.pgoff % PAGE_SIZE) != 0)
701		return (EINVAL);
702	linux_args.pgoff /= PAGE_SIZE;
703
704	return (linux_mmap_common(td, &linux_args));
705}
706
707static int
708linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args)
709{
710	struct proc *p = td->td_proc;
711	struct mmap_args /* {
712		caddr_t addr;
713		size_t len;
714		int prot;
715		int flags;
716		int fd;
717		long pad;
718		off_t pos;
719	} */ bsd_args;
720	int error;
721
722	error = 0;
723	bsd_args.flags = 0;
724	if (linux_args->flags & LINUX_MAP_SHARED)
725		bsd_args.flags |= MAP_SHARED;
726	if (linux_args->flags & LINUX_MAP_PRIVATE)
727		bsd_args.flags |= MAP_PRIVATE;
728	if (linux_args->flags & LINUX_MAP_FIXED)
729		bsd_args.flags |= MAP_FIXED;
730	if (linux_args->flags & LINUX_MAP_ANON)
731		bsd_args.flags |= MAP_ANON;
732	else
733		bsd_args.flags |= MAP_NOSYNC;
734	if (linux_args->flags & LINUX_MAP_GROWSDOWN) {
735		bsd_args.flags |= MAP_STACK;
736
737		/*
738		 * The linux MAP_GROWSDOWN option does not limit auto
739		 * growth of the region.  Linux mmap with this option
740		 * takes as addr the inital BOS, and as len, the initial
741		 * region size.  It can then grow down from addr without
742		 * limit.  However, linux threads has an implicit internal
743		 * limit to stack size of STACK_SIZE.  Its just not
744		 * enforced explicitly in linux.  But, here we impose
745		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
746		 * region, since we can do this with our mmap.
747		 *
748		 * Our mmap with MAP_STACK takes addr as the maximum
749		 * downsize limit on BOS, and as len the max size of
750		 * the region.  It them maps the top SGROWSIZ bytes,
751		 * and autgrows the region down, up to the limit
752		 * in addr.
753		 *
754		 * If we don't use the MAP_STACK option, the effect
755		 * of this code is to allocate a stack region of a
756		 * fixed size of (STACK_SIZE - GUARD_SIZE).
757		 */
758
759		/* This gives us TOS */
760		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) +
761		    linux_args->len;
762
763		if ((caddr_t)PTRIN(bsd_args.addr) >
764		    p->p_vmspace->vm_maxsaddr) {
765			/*
766			 * Some linux apps will attempt to mmap
767			 * thread stacks near the top of their
768			 * address space.  If their TOS is greater
769			 * than vm_maxsaddr, vm_map_growstack()
770			 * will confuse the thread stack with the
771			 * process stack and deliver a SEGV if they
772			 * attempt to grow the thread stack past their
773			 * current stacksize rlimit.  To avoid this,
774			 * adjust vm_maxsaddr upwards to reflect
775			 * the current stacksize rlimit rather
776			 * than the maximum possible stacksize.
777			 * It would be better to adjust the
778			 * mmap'ed region, but some apps do not check
779			 * mmap's return value.
780			 */
781			PROC_LOCK(p);
782			p->p_vmspace->vm_maxsaddr =
783			    (char *)LINUX32_USRSTACK -
784			    lim_cur(p, RLIMIT_STACK);
785			PROC_UNLOCK(p);
786		}
787
788		/* This gives us our maximum stack size */
789		if (linux_args->len > STACK_SIZE - GUARD_SIZE)
790			bsd_args.len = linux_args->len;
791		else
792			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
793
794		/*
795		 * This gives us a new BOS.  If we're using VM_STACK, then
796		 * mmap will just map the top SGROWSIZ bytes, and let
797		 * the stack grow down to the limit at BOS.  If we're
798		 * not using VM_STACK we map the full stack, since we
799		 * don't have a way to autogrow it.
800		 */
801		bsd_args.addr -= bsd_args.len;
802	} else {
803		bsd_args.addr = (caddr_t)PTRIN(linux_args->addr);
804		bsd_args.len  = linux_args->len;
805	}
806	/*
807	 * XXX i386 Linux always emulator forces PROT_READ on (why?)
808	 * so we do the same. We add PROT_EXEC to work around buggy
809	 * applications (e.g. Java) that take advantage of the fact
810	 * that execute permissions are not enforced by x86 CPUs.
811	 */
812	bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ;
813	if (linux_args->flags & LINUX_MAP_ANON)
814		bsd_args.fd = -1;
815	else
816		bsd_args.fd = linux_args->fd;
817	bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE;
818	bsd_args.pad = 0;
819
820#ifdef DEBUG
821	if (ldebug(mmap))
822		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
823		    __func__,
824		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
825		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
826#endif
827	error = mmap(td, &bsd_args);
828#ifdef DEBUG
829	if (ldebug(mmap))
830		printf("-> %s() return: 0x%x (0x%08x)\n",
831			__func__, error, (u_int)td->td_retval[0]);
832#endif
833	return (error);
834}
835
836int
837linux_pipe(struct thread *td, struct linux_pipe_args *args)
838{
839	int pip[2];
840	int error;
841	register_t reg_rdx;
842
843#ifdef DEBUG
844	if (ldebug(pipe))
845		printf(ARGS(pipe, "*"));
846#endif
847
848	reg_rdx = td->td_retval[1];
849	error = pipe(td, 0);
850	if (error) {
851		td->td_retval[1] = reg_rdx;
852		return (error);
853	}
854
855	pip[0] = td->td_retval[0];
856	pip[1] = td->td_retval[1];
857	error = copyout(pip, args->pipefds, 2 * sizeof(int));
858	if (error) {
859		td->td_retval[1] = reg_rdx;
860		return (error);
861	}
862
863	td->td_retval[1] = reg_rdx;
864	td->td_retval[0] = 0;
865	return (0);
866}
867
868int
869linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
870{
871	l_osigaction_t osa;
872	l_sigaction_t act, oact;
873	int error;
874
875#ifdef DEBUG
876	if (ldebug(sigaction))
877		printf(ARGS(sigaction, "%d, %p, %p"),
878		    args->sig, (void *)args->nsa, (void *)args->osa);
879#endif
880
881	if (args->nsa != NULL) {
882		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
883		if (error)
884			return (error);
885		act.lsa_handler = osa.lsa_handler;
886		act.lsa_flags = osa.lsa_flags;
887		act.lsa_restorer = osa.lsa_restorer;
888		LINUX_SIGEMPTYSET(act.lsa_mask);
889		act.lsa_mask.__bits[0] = osa.lsa_mask;
890	}
891
892	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
893	    args->osa ? &oact : NULL);
894
895	if (args->osa != NULL && !error) {
896		osa.lsa_handler = oact.lsa_handler;
897		osa.lsa_flags = oact.lsa_flags;
898		osa.lsa_restorer = oact.lsa_restorer;
899		osa.lsa_mask = oact.lsa_mask.__bits[0];
900		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
901	}
902
903	return (error);
904}
905
906/*
907 * Linux has two extra args, restart and oldmask.  We dont use these,
908 * but it seems that "restart" is actually a context pointer that
909 * enables the signal to happen with a different register set.
910 */
911int
912linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
913{
914	sigset_t sigmask;
915	l_sigset_t mask;
916
917#ifdef DEBUG
918	if (ldebug(sigsuspend))
919		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
920#endif
921
922	LINUX_SIGEMPTYSET(mask);
923	mask.__bits[0] = args->mask;
924	linux_to_bsd_sigset(&mask, &sigmask);
925	return (kern_sigsuspend(td, sigmask));
926}
927
928int
929linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
930{
931	l_sigset_t lmask;
932	sigset_t sigmask;
933	int error;
934
935#ifdef DEBUG
936	if (ldebug(rt_sigsuspend))
937		printf(ARGS(rt_sigsuspend, "%p, %d"),
938		    (void *)uap->newset, uap->sigsetsize);
939#endif
940
941	if (uap->sigsetsize != sizeof(l_sigset_t))
942		return (EINVAL);
943
944	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
945	if (error)
946		return (error);
947
948	linux_to_bsd_sigset(&lmask, &sigmask);
949	return (kern_sigsuspend(td, sigmask));
950}
951
952int
953linux_pause(struct thread *td, struct linux_pause_args *args)
954{
955	struct proc *p = td->td_proc;
956	sigset_t sigmask;
957
958#ifdef DEBUG
959	if (ldebug(pause))
960		printf(ARGS(pause, ""));
961#endif
962
963	PROC_LOCK(p);
964	sigmask = td->td_sigmask;
965	PROC_UNLOCK(p);
966	return (kern_sigsuspend(td, sigmask));
967}
968
969int
970linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
971{
972	stack_t ss, oss;
973	l_stack_t lss;
974	int error;
975
976#ifdef DEBUG
977	if (ldebug(sigaltstack))
978		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
979#endif
980
981	if (uap->uss != NULL) {
982		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
983		if (error)
984			return (error);
985
986		ss.ss_sp = PTRIN(lss.ss_sp);
987		ss.ss_size = lss.ss_size;
988		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
989	}
990	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
991	    (uap->uoss != NULL) ? &oss : NULL);
992	if (!error && uap->uoss != NULL) {
993		lss.ss_sp = PTROUT(oss.ss_sp);
994		lss.ss_size = oss.ss_size;
995		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
996		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
997	}
998
999	return (error);
1000}
1001
1002int
1003linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1004{
1005	struct ftruncate_args sa;
1006
1007#ifdef DEBUG
1008	if (ldebug(ftruncate64))
1009		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1010		    (intmax_t)args->length);
1011#endif
1012
1013	sa.fd = args->fd;
1014	sa.pad = 0;
1015	sa.length = args->length;
1016	return ftruncate(td, &sa);
1017}
1018
1019int
1020linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1021{
1022	struct timeval atv;
1023	l_timeval atv32;
1024	struct timezone rtz;
1025	int error = 0;
1026
1027	if (uap->tp) {
1028		microtime(&atv);
1029		atv32.tv_sec = atv.tv_sec;
1030		atv32.tv_usec = atv.tv_usec;
1031		error = copyout(&atv32, uap->tp, sizeof (atv32));
1032	}
1033	if (error == 0 && uap->tzp != NULL) {
1034		rtz.tz_minuteswest = tz_minuteswest;
1035		rtz.tz_dsttime = tz_dsttime;
1036		error = copyout(&rtz, uap->tzp, sizeof (rtz));
1037	}
1038	return (error);
1039}
1040
1041int
1042linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap)
1043{
1044	struct timespec rqt, rmt;
1045	struct l_timespec ats32;
1046	int error;
1047
1048	error = copyin(uap->rqtp, &ats32, sizeof(ats32));
1049	if (error != 0)
1050		return (error);
1051	rqt.tv_sec = ats32.tv_sec;
1052	rqt.tv_nsec = ats32.tv_nsec;
1053	error = kern_nanosleep(td, &rqt, &rmt);
1054	if (uap->rmtp != NULL) {
1055		ats32.tv_sec = rmt.tv_sec;
1056		ats32.tv_nsec = rmt.tv_nsec;
1057		error = copyout(&ats32, uap->rmtp, sizeof(ats32));
1058	}
1059	return (error);
1060}
1061
1062int
1063linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1064{
1065	struct l_rusage s32;
1066	struct rusage s;
1067	int error;
1068
1069	error = kern_getrusage(td, uap->who, &s);
1070	if (error != 0)
1071		return (error);
1072	if (uap->rusage != NULL) {
1073		s32.ru_utime.tv_sec = s.ru_utime.tv_sec;
1074		s32.ru_utime.tv_usec = s.ru_utime.tv_usec;
1075		s32.ru_stime.tv_sec = s.ru_stime.tv_sec;
1076		s32.ru_stime.tv_usec = s.ru_stime.tv_usec;
1077		s32.ru_maxrss = s.ru_maxrss;
1078		s32.ru_ixrss = s.ru_ixrss;
1079		s32.ru_idrss = s.ru_idrss;
1080		s32.ru_isrss = s.ru_isrss;
1081		s32.ru_minflt = s.ru_minflt;
1082		s32.ru_majflt = s.ru_majflt;
1083		s32.ru_nswap = s.ru_nswap;
1084		s32.ru_inblock = s.ru_inblock;
1085		s32.ru_oublock = s.ru_oublock;
1086		s32.ru_msgsnd = s.ru_msgsnd;
1087		s32.ru_msgrcv = s.ru_msgrcv;
1088		s32.ru_nsignals = s.ru_nsignals;
1089		s32.ru_nvcsw = s.ru_nvcsw;
1090		s32.ru_nivcsw = s.ru_nivcsw;
1091		error = copyout(&s32, uap->rusage, sizeof(s32));
1092	}
1093	return (error);
1094}
1095
1096int
1097linux_sched_rr_get_interval(struct thread *td,
1098    struct linux_sched_rr_get_interval_args *uap)
1099{
1100	struct timespec ts;
1101	struct l_timespec ts32;
1102	int error;
1103
1104	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1105	if (error != 0)
1106		return (error);
1107	ts32.tv_sec = ts.tv_sec;
1108	ts32.tv_nsec = ts.tv_nsec;
1109	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1110}
1111
1112int
1113linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
1114{
1115	struct mprotect_args bsd_args;
1116
1117	bsd_args.addr = uap->addr;
1118	bsd_args.len = uap->len;
1119	bsd_args.prot = uap->prot;
1120	/* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */
1121	if ((bsd_args.prot & PROT_READ) != 0)
1122		bsd_args.prot |= PROT_EXEC;
1123	return (mprotect(td, &bsd_args));
1124}
1125