linux32_machdep.c revision 218059
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 218059 2011-01-29 07:22:33Z dchagin $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/file.h>
38#include <sys/fcntl.h>
39#include <sys/clock.h>
40#include <sys/imgact.h>
41#include <sys/limits.h>
42#include <sys/lock.h>
43#include <sys/malloc.h>
44#include <sys/mman.h>
45#include <sys/mutex.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/resource.h>
49#include <sys/resourcevar.h>
50#include <sys/sched.h>
51#include <sys/syscallsubr.h>
52#include <sys/sysproto.h>
53#include <sys/unistd.h>
54#include <sys/wait.h>
55
56#include <machine/frame.h>
57#include <machine/pcb.h>
58#include <machine/psl.h>
59#include <machine/segments.h>
60#include <machine/specialreg.h>
61
62#include <vm/vm.h>
63#include <vm/pmap.h>
64#include <vm/vm_map.h>
65
66#include <compat/freebsd32/freebsd32_util.h>
67#include <amd64/linux32/linux.h>
68#include <amd64/linux32/linux32_proto.h>
69#include <compat/linux/linux_ipc.h>
70#include <compat/linux/linux_misc.h>
71#include <compat/linux/linux_signal.h>
72#include <compat/linux/linux_util.h>
73#include <compat/linux/linux_emul.h>
74
75struct l_old_select_argv {
76	l_int		nfds;
77	l_uintptr_t	readfds;
78	l_uintptr_t	writefds;
79	l_uintptr_t	exceptfds;
80	l_uintptr_t	timeout;
81} __packed;
82
83int
84linux_to_bsd_sigaltstack(int lsa)
85{
86	int bsa = 0;
87
88	if (lsa & LINUX_SS_DISABLE)
89		bsa |= SS_DISABLE;
90	if (lsa & LINUX_SS_ONSTACK)
91		bsa |= SS_ONSTACK;
92	return (bsa);
93}
94
95static int	linux_mmap_common(struct thread *td, l_uintptr_t addr,
96		    l_size_t len, l_int prot, l_int flags, l_int fd,
97		    l_loff_t pos);
98
99int
100bsd_to_linux_sigaltstack(int bsa)
101{
102	int lsa = 0;
103
104	if (bsa & SS_DISABLE)
105		lsa |= LINUX_SS_DISABLE;
106	if (bsa & SS_ONSTACK)
107		lsa |= LINUX_SS_ONSTACK;
108	return (lsa);
109}
110
111static void
112bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
113{
114
115	lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
116	lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
117	lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
118	lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
119	lru->ru_maxrss = ru->ru_maxrss;
120	lru->ru_ixrss = ru->ru_ixrss;
121	lru->ru_idrss = ru->ru_idrss;
122	lru->ru_isrss = ru->ru_isrss;
123	lru->ru_minflt = ru->ru_minflt;
124	lru->ru_majflt = ru->ru_majflt;
125	lru->ru_nswap = ru->ru_nswap;
126	lru->ru_inblock = ru->ru_inblock;
127	lru->ru_oublock = ru->ru_oublock;
128	lru->ru_msgsnd = ru->ru_msgsnd;
129	lru->ru_msgrcv = ru->ru_msgrcv;
130	lru->ru_nsignals = ru->ru_nsignals;
131	lru->ru_nvcsw = ru->ru_nvcsw;
132	lru->ru_nivcsw = ru->ru_nivcsw;
133}
134
135int
136linux_execve(struct thread *td, struct linux_execve_args *args)
137{
138	struct image_args eargs;
139	char *path;
140	int error;
141
142	LCONVPATHEXIST(td, args->path, &path);
143
144#ifdef DEBUG
145	if (ldebug(execve))
146		printf(ARGS(execve, "%s"), path);
147#endif
148
149	error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
150	    args->argp, args->envp);
151	free(path, M_TEMP);
152	if (error == 0)
153		error = kern_execve(td, &eargs, NULL);
154	if (error == 0)
155		/* Linux process can execute FreeBSD one, do not attempt
156		 * to create emuldata for such process using
157		 * linux_proc_init, this leads to a panic on KASSERT
158		 * because such process has p->p_emuldata == NULL.
159		 */
160		if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
161			error = linux_proc_init(td, 0, 0);
162	return (error);
163}
164
165CTASSERT(sizeof(struct l_iovec32) == 8);
166
167static int
168linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
169{
170	struct l_iovec32 iov32;
171	struct iovec *iov;
172	struct uio *uio;
173	uint32_t iovlen;
174	int error, i;
175
176	*uiop = NULL;
177	if (iovcnt > UIO_MAXIOV)
178		return (EINVAL);
179	iovlen = iovcnt * sizeof(struct iovec);
180	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
181	iov = (struct iovec *)(uio + 1);
182	for (i = 0; i < iovcnt; i++) {
183		error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
184		if (error) {
185			free(uio, M_IOV);
186			return (error);
187		}
188		iov[i].iov_base = PTRIN(iov32.iov_base);
189		iov[i].iov_len = iov32.iov_len;
190	}
191	uio->uio_iov = iov;
192	uio->uio_iovcnt = iovcnt;
193	uio->uio_segflg = UIO_USERSPACE;
194	uio->uio_offset = -1;
195	uio->uio_resid = 0;
196	for (i = 0; i < iovcnt; i++) {
197		if (iov->iov_len > INT_MAX - uio->uio_resid) {
198			free(uio, M_IOV);
199			return (EINVAL);
200		}
201		uio->uio_resid += iov->iov_len;
202		iov++;
203	}
204	*uiop = uio;
205	return (0);
206}
207
208int
209linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
210    int error)
211{
212	struct l_iovec32 iov32;
213	struct iovec *iov;
214	uint32_t iovlen;
215	int i;
216
217	*iovp = NULL;
218	if (iovcnt > UIO_MAXIOV)
219		return (error);
220	iovlen = iovcnt * sizeof(struct iovec);
221	iov = malloc(iovlen, M_IOV, M_WAITOK);
222	for (i = 0; i < iovcnt; i++) {
223		error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
224		if (error) {
225			free(iov, M_IOV);
226			return (error);
227		}
228		iov[i].iov_base = PTRIN(iov32.iov_base);
229		iov[i].iov_len = iov32.iov_len;
230	}
231	*iovp = iov;
232	return(0);
233
234}
235
236int
237linux_readv(struct thread *td, struct linux_readv_args *uap)
238{
239	struct uio *auio;
240	int error;
241
242	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
243	if (error)
244		return (error);
245	error = kern_readv(td, uap->fd, auio);
246	free(auio, M_IOV);
247	return (error);
248}
249
250int
251linux_writev(struct thread *td, struct linux_writev_args *uap)
252{
253	struct uio *auio;
254	int error;
255
256	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
257	if (error)
258		return (error);
259	error = kern_writev(td, uap->fd, auio);
260	free(auio, M_IOV);
261	return (error);
262}
263
264struct l_ipc_kludge {
265	l_uintptr_t msgp;
266	l_long msgtyp;
267} __packed;
268
269int
270linux_ipc(struct thread *td, struct linux_ipc_args *args)
271{
272
273	switch (args->what & 0xFFFF) {
274	case LINUX_SEMOP: {
275		struct linux_semop_args a;
276
277		a.semid = args->arg1;
278		a.tsops = args->ptr;
279		a.nsops = args->arg2;
280		return (linux_semop(td, &a));
281	}
282	case LINUX_SEMGET: {
283		struct linux_semget_args a;
284
285		a.key = args->arg1;
286		a.nsems = args->arg2;
287		a.semflg = args->arg3;
288		return (linux_semget(td, &a));
289	}
290	case LINUX_SEMCTL: {
291		struct linux_semctl_args a;
292		int error;
293
294		a.semid = args->arg1;
295		a.semnum = args->arg2;
296		a.cmd = args->arg3;
297		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
298		if (error)
299			return (error);
300		return (linux_semctl(td, &a));
301	}
302	case LINUX_MSGSND: {
303		struct linux_msgsnd_args a;
304
305		a.msqid = args->arg1;
306		a.msgp = args->ptr;
307		a.msgsz = args->arg2;
308		a.msgflg = args->arg3;
309		return (linux_msgsnd(td, &a));
310	}
311	case LINUX_MSGRCV: {
312		struct linux_msgrcv_args a;
313
314		a.msqid = args->arg1;
315		a.msgsz = args->arg2;
316		a.msgflg = args->arg3;
317		if ((args->what >> 16) == 0) {
318			struct l_ipc_kludge tmp;
319			int error;
320
321			if (args->ptr == 0)
322				return (EINVAL);
323			error = copyin(args->ptr, &tmp, sizeof(tmp));
324			if (error)
325				return (error);
326			a.msgp = PTRIN(tmp.msgp);
327			a.msgtyp = tmp.msgtyp;
328		} else {
329			a.msgp = args->ptr;
330			a.msgtyp = args->arg5;
331		}
332		return (linux_msgrcv(td, &a));
333	}
334	case LINUX_MSGGET: {
335		struct linux_msgget_args a;
336
337		a.key = args->arg1;
338		a.msgflg = args->arg2;
339		return (linux_msgget(td, &a));
340	}
341	case LINUX_MSGCTL: {
342		struct linux_msgctl_args a;
343
344		a.msqid = args->arg1;
345		a.cmd = args->arg2;
346		a.buf = args->ptr;
347		return (linux_msgctl(td, &a));
348	}
349	case LINUX_SHMAT: {
350		struct linux_shmat_args a;
351
352		a.shmid = args->arg1;
353		a.shmaddr = args->ptr;
354		a.shmflg = args->arg2;
355		a.raddr = PTRIN((l_uint)args->arg3);
356		return (linux_shmat(td, &a));
357	}
358	case LINUX_SHMDT: {
359		struct linux_shmdt_args a;
360
361		a.shmaddr = args->ptr;
362		return (linux_shmdt(td, &a));
363	}
364	case LINUX_SHMGET: {
365		struct linux_shmget_args a;
366
367		a.key = args->arg1;
368		a.size = args->arg2;
369		a.shmflg = args->arg3;
370		return (linux_shmget(td, &a));
371	}
372	case LINUX_SHMCTL: {
373		struct linux_shmctl_args a;
374
375		a.shmid = args->arg1;
376		a.cmd = args->arg2;
377		a.buf = args->ptr;
378		return (linux_shmctl(td, &a));
379	}
380	default:
381		break;
382	}
383
384	return (EINVAL);
385}
386
387int
388linux_old_select(struct thread *td, struct linux_old_select_args *args)
389{
390	struct l_old_select_argv linux_args;
391	struct linux_select_args newsel;
392	int error;
393
394#ifdef DEBUG
395	if (ldebug(old_select))
396		printf(ARGS(old_select, "%p"), args->ptr);
397#endif
398
399	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
400	if (error)
401		return (error);
402
403	newsel.nfds = linux_args.nfds;
404	newsel.readfds = PTRIN(linux_args.readfds);
405	newsel.writefds = PTRIN(linux_args.writefds);
406	newsel.exceptfds = PTRIN(linux_args.exceptfds);
407	newsel.timeout = PTRIN(linux_args.timeout);
408	return (linux_select(td, &newsel));
409}
410
411int
412linux_fork(struct thread *td, struct linux_fork_args *args)
413{
414	int error;
415	struct proc *p2;
416	struct thread *td2;
417
418#ifdef DEBUG
419	if (ldebug(fork))
420		printf(ARGS(fork, ""));
421#endif
422
423	if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0)
424		return (error);
425
426	if (error == 0) {
427		td->td_retval[0] = p2->p_pid;
428		td->td_retval[1] = 0;
429	}
430
431	if (td->td_retval[1] == 1)
432		td->td_retval[0] = 0;
433	error = linux_proc_init(td, td->td_retval[0], 0);
434	if (error)
435		return (error);
436
437	td2 = FIRST_THREAD_IN_PROC(p2);
438
439	/*
440	 * Make this runnable after we are finished with it.
441	 */
442	thread_lock(td2);
443	TD_SET_CAN_RUN(td2);
444	sched_add(td2, SRQ_BORING);
445	thread_unlock(td2);
446
447	return (0);
448}
449
450int
451linux_vfork(struct thread *td, struct linux_vfork_args *args)
452{
453	int error;
454	struct proc *p2;
455	struct thread *td2;
456
457#ifdef DEBUG
458	if (ldebug(vfork))
459		printf(ARGS(vfork, ""));
460#endif
461
462	/* Exclude RFPPWAIT */
463	if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0)
464		return (error);
465	if (error == 0) {
466	   	td->td_retval[0] = p2->p_pid;
467		td->td_retval[1] = 0;
468	}
469	/* Are we the child? */
470	if (td->td_retval[1] == 1)
471		td->td_retval[0] = 0;
472	error = linux_proc_init(td, td->td_retval[0], 0);
473	if (error)
474		return (error);
475
476	PROC_LOCK(p2);
477	p2->p_flag |= P_PPWAIT;
478	PROC_UNLOCK(p2);
479
480	td2 = FIRST_THREAD_IN_PROC(p2);
481
482	/*
483	 * Make this runnable after we are finished with it.
484	 */
485	thread_lock(td2);
486	TD_SET_CAN_RUN(td2);
487	sched_add(td2, SRQ_BORING);
488	thread_unlock(td2);
489
490	/* wait for the children to exit, ie. emulate vfork */
491	PROC_LOCK(p2);
492	while (p2->p_flag & P_PPWAIT)
493		cv_wait(&p2->p_pwait, &p2->p_mtx);
494	PROC_UNLOCK(p2);
495
496	return (0);
497}
498
499int
500linux_clone(struct thread *td, struct linux_clone_args *args)
501{
502	int error, ff = RFPROC | RFSTOPPED;
503	struct proc *p2;
504	struct thread *td2;
505	int exit_signal;
506	struct linux_emuldata *em;
507
508#ifdef DEBUG
509	if (ldebug(clone)) {
510		printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, "
511		    "child tid: %p"), (unsigned)args->flags,
512		    args->stack, args->parent_tidptr, args->child_tidptr);
513	}
514#endif
515
516	exit_signal = args->flags & 0x000000ff;
517	if (LINUX_SIG_VALID(exit_signal)) {
518		if (exit_signal <= LINUX_SIGTBLSZ)
519			exit_signal =
520			    linux_to_bsd_signal[_SIG_IDX(exit_signal)];
521	} else if (exit_signal != 0)
522		return (EINVAL);
523
524	if (args->flags & LINUX_CLONE_VM)
525		ff |= RFMEM;
526	if (args->flags & LINUX_CLONE_SIGHAND)
527		ff |= RFSIGSHARE;
528	/*
529	 * XXX: In Linux, sharing of fs info (chroot/cwd/umask)
530	 * and open files is independant.  In FreeBSD, its in one
531	 * structure but in reality it does not cause any problems
532	 * because both of these flags are usually set together.
533	 */
534	if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS)))
535		ff |= RFFDG;
536
537	/*
538	 * Attempt to detect when linux_clone(2) is used for creating
539	 * kernel threads. Unfortunately despite the existence of the
540	 * CLONE_THREAD flag, version of linuxthreads package used in
541	 * most popular distros as of beginning of 2005 doesn't make
542	 * any use of it. Therefore, this detection relies on
543	 * empirical observation that linuxthreads sets certain
544	 * combination of flags, so that we can make more or less
545	 * precise detection and notify the FreeBSD kernel that several
546	 * processes are in fact part of the same threading group, so
547	 * that special treatment is necessary for signal delivery
548	 * between those processes and fd locking.
549	 */
550	if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS)
551		ff |= RFTHREAD;
552
553	if (args->flags & LINUX_CLONE_PARENT_SETTID)
554		if (args->parent_tidptr == NULL)
555			return (EINVAL);
556
557	error = fork1(td, ff, 0, &p2);
558	if (error)
559		return (error);
560
561	if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) {
562	   	sx_xlock(&proctree_lock);
563		PROC_LOCK(p2);
564		proc_reparent(p2, td->td_proc->p_pptr);
565		PROC_UNLOCK(p2);
566		sx_xunlock(&proctree_lock);
567	}
568
569	/* create the emuldata */
570	error = linux_proc_init(td, p2->p_pid, args->flags);
571	/* reference it - no need to check this */
572	em = em_find(p2, EMUL_DOLOCK);
573	KASSERT(em != NULL, ("clone: emuldata not found.\n"));
574	/* and adjust it */
575
576	if (args->flags & LINUX_CLONE_THREAD) {
577#ifdef notyet
578	   	PROC_LOCK(p2);
579	   	p2->p_pgrp = td->td_proc->p_pgrp;
580	   	PROC_UNLOCK(p2);
581#endif
582		exit_signal = 0;
583	}
584
585	if (args->flags & LINUX_CLONE_CHILD_SETTID)
586		em->child_set_tid = args->child_tidptr;
587	else
588	   	em->child_set_tid = NULL;
589
590	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
591		em->child_clear_tid = args->child_tidptr;
592	else
593	   	em->child_clear_tid = NULL;
594
595	EMUL_UNLOCK(&emul_lock);
596
597	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
598		error = copyout(&p2->p_pid, args->parent_tidptr,
599		    sizeof(p2->p_pid));
600		if (error)
601			printf(LMSG("copyout failed!"));
602	}
603
604	PROC_LOCK(p2);
605	p2->p_sigparent = exit_signal;
606	PROC_UNLOCK(p2);
607	td2 = FIRST_THREAD_IN_PROC(p2);
608	/*
609	 * In a case of stack = NULL, we are supposed to COW calling process
610	 * stack. This is what normal fork() does, so we just keep tf_rsp arg
611	 * intact.
612	 */
613	if (args->stack)
614		td2->td_frame->tf_rsp = PTROUT(args->stack);
615
616	if (args->flags & LINUX_CLONE_SETTLS) {
617		struct user_segment_descriptor sd;
618		struct l_user_desc info;
619		struct pcb *pcb;
620		int a[2];
621
622		error = copyin((void *)td->td_frame->tf_rsi, &info,
623		    sizeof(struct l_user_desc));
624		if (error) {
625			printf(LMSG("copyin failed!"));
626		} else {
627			/* We might copy out the entry_number as GUGS32_SEL. */
628			info.entry_number = GUGS32_SEL;
629			error = copyout(&info, (void *)td->td_frame->tf_rsi,
630			    sizeof(struct l_user_desc));
631			if (error)
632				printf(LMSG("copyout failed!"));
633
634			a[0] = LINUX_LDT_entry_a(&info);
635			a[1] = LINUX_LDT_entry_b(&info);
636
637			memcpy(&sd, &a, sizeof(a));
638#ifdef DEBUG
639			if (ldebug(clone))
640				printf("Segment created in clone with "
641				    "CLONE_SETTLS: lobase: %x, hibase: %x, "
642				    "lolimit: %x, hilimit: %x, type: %i, "
643				    "dpl: %i, p: %i, xx: %i, long: %i, "
644				    "def32: %i, gran: %i\n", sd.sd_lobase,
645				    sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
646				    sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
647				    sd.sd_long, sd.sd_def32, sd.sd_gran);
648#endif
649			pcb = td2->td_pcb;
650			pcb->pcb_gsbase = (register_t)info.base_addr;
651/* XXXKIB		pcb->pcb_gs32sd = sd; */
652			td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
653			set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT);
654		}
655	}
656
657#ifdef DEBUG
658	if (ldebug(clone))
659		printf(LMSG("clone: successful rfork to %d, "
660		    "stack %p sig = %d"), (int)p2->p_pid, args->stack,
661		    exit_signal);
662#endif
663	if (args->flags & LINUX_CLONE_VFORK) {
664	   	PROC_LOCK(p2);
665	   	p2->p_flag |= P_PPWAIT;
666	   	PROC_UNLOCK(p2);
667	}
668
669	/*
670	 * Make this runnable after we are finished with it.
671	 */
672	thread_lock(td2);
673	TD_SET_CAN_RUN(td2);
674	sched_add(td2, SRQ_BORING);
675	thread_unlock(td2);
676
677	td->td_retval[0] = p2->p_pid;
678	td->td_retval[1] = 0;
679
680	if (args->flags & LINUX_CLONE_VFORK) {
681		/* wait for the children to exit, ie. emulate vfork */
682		PROC_LOCK(p2);
683		while (p2->p_flag & P_PPWAIT)
684			cv_wait(&p2->p_pwait, &p2->p_mtx);
685		PROC_UNLOCK(p2);
686	}
687
688	return (0);
689}
690
/*
 * Sizing used by linux_mmap_common() for LINUX_MAP_GROWSDOWN mappings:
 * such regions are sized to at least (STACK_SIZE - GUARD_SIZE) bytes.
 */
#define	STACK_SIZE	(2 * 1024 * 1024)
#define	GUARD_SIZE	(4 * PAGE_SIZE)
693
694int
695linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
696{
697
698#ifdef DEBUG
699	if (ldebug(mmap2))
700		printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
701		    args->addr, args->len, args->prot,
702		    args->flags, args->fd, args->pgoff);
703#endif
704
705	return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
706		args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
707		PAGE_SIZE));
708}
709
710int
711linux_mmap(struct thread *td, struct linux_mmap_args *args)
712{
713	int error;
714	struct l_mmap_argv linux_args;
715
716	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
717	if (error)
718		return (error);
719
720#ifdef DEBUG
721	if (ldebug(mmap))
722		printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
723		    linux_args.addr, linux_args.len, linux_args.prot,
724		    linux_args.flags, linux_args.fd, linux_args.pgoff);
725#endif
726
727	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
728	    linux_args.prot, linux_args.flags, linux_args.fd,
729	    (uint32_t)linux_args.pgoff));
730}
731
732static int
733linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
734    l_int flags, l_int fd, l_loff_t pos)
735{
736	struct proc *p = td->td_proc;
737	struct mmap_args /* {
738		caddr_t addr;
739		size_t len;
740		int prot;
741		int flags;
742		int fd;
743		long pad;
744		off_t pos;
745	} */ bsd_args;
746	int error;
747	struct file *fp;
748
749	error = 0;
750	bsd_args.flags = 0;
751	fp = NULL;
752
753	/*
754	 * Linux mmap(2):
755	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
756	 */
757	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
758		return (EINVAL);
759
760	if (flags & LINUX_MAP_SHARED)
761		bsd_args.flags |= MAP_SHARED;
762	if (flags & LINUX_MAP_PRIVATE)
763		bsd_args.flags |= MAP_PRIVATE;
764	if (flags & LINUX_MAP_FIXED)
765		bsd_args.flags |= MAP_FIXED;
766	if (flags & LINUX_MAP_ANON) {
767		/* Enforce pos to be on page boundary, then ignore. */
768		if ((pos & PAGE_MASK) != 0)
769			return (EINVAL);
770		pos = 0;
771		bsd_args.flags |= MAP_ANON;
772	} else
773		bsd_args.flags |= MAP_NOSYNC;
774	if (flags & LINUX_MAP_GROWSDOWN)
775		bsd_args.flags |= MAP_STACK;
776
777	/*
778	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
779	 * on Linux/i386. We do this to ensure maximum compatibility.
780	 * Linux/ia64 does the same in i386 emulation mode.
781	 */
782	bsd_args.prot = prot;
783	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
784		bsd_args.prot |= PROT_READ | PROT_EXEC;
785
786	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
787	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
788	if (bsd_args.fd != -1) {
789		/*
790		 * Linux follows Solaris mmap(2) description:
791		 * The file descriptor fildes is opened with
792		 * read permission, regardless of the
793		 * protection options specified.
794		 */
795
796		if ((error = fget(td, bsd_args.fd, &fp)) != 0)
797			return (error);
798		if (fp->f_type != DTYPE_VNODE) {
799			fdrop(fp, td);
800			return (EINVAL);
801		}
802
803		/* Linux mmap() just fails for O_WRONLY files */
804		if (!(fp->f_flag & FREAD)) {
805			fdrop(fp, td);
806			return (EACCES);
807		}
808
809		fdrop(fp, td);
810	}
811
812	if (flags & LINUX_MAP_GROWSDOWN) {
813		/*
814		 * The Linux MAP_GROWSDOWN option does not limit auto
815		 * growth of the region.  Linux mmap with this option
816		 * takes as addr the inital BOS, and as len, the initial
817		 * region size.  It can then grow down from addr without
818		 * limit.  However, Linux threads has an implicit internal
819		 * limit to stack size of STACK_SIZE.  Its just not
820		 * enforced explicitly in Linux.  But, here we impose
821		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
822		 * region, since we can do this with our mmap.
823		 *
824		 * Our mmap with MAP_STACK takes addr as the maximum
825		 * downsize limit on BOS, and as len the max size of
826		 * the region.  It then maps the top SGROWSIZ bytes,
827		 * and auto grows the region down, up to the limit
828		 * in addr.
829		 *
830		 * If we don't use the MAP_STACK option, the effect
831		 * of this code is to allocate a stack region of a
832		 * fixed size of (STACK_SIZE - GUARD_SIZE).
833		 */
834
835		if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
836			/*
837			 * Some Linux apps will attempt to mmap
838			 * thread stacks near the top of their
839			 * address space.  If their TOS is greater
840			 * than vm_maxsaddr, vm_map_growstack()
841			 * will confuse the thread stack with the
842			 * process stack and deliver a SEGV if they
843			 * attempt to grow the thread stack past their
844			 * current stacksize rlimit.  To avoid this,
845			 * adjust vm_maxsaddr upwards to reflect
846			 * the current stacksize rlimit rather
847			 * than the maximum possible stacksize.
848			 * It would be better to adjust the
849			 * mmap'ed region, but some apps do not check
850			 * mmap's return value.
851			 */
852			PROC_LOCK(p);
853			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
854			    lim_cur(p, RLIMIT_STACK);
855			PROC_UNLOCK(p);
856		}
857
858		/*
859		 * This gives us our maximum stack size and a new BOS.
860		 * If we're using VM_STACK, then mmap will just map
861		 * the top SGROWSIZ bytes, and let the stack grow down
862		 * to the limit at BOS.  If we're not using VM_STACK
863		 * we map the full stack, since we don't have a way
864		 * to autogrow it.
865		 */
866		if (len > STACK_SIZE - GUARD_SIZE) {
867			bsd_args.addr = (caddr_t)PTRIN(addr);
868			bsd_args.len = len;
869		} else {
870			bsd_args.addr = (caddr_t)PTRIN(addr) -
871			    (STACK_SIZE - GUARD_SIZE - len);
872			bsd_args.len = STACK_SIZE - GUARD_SIZE;
873		}
874	} else {
875		bsd_args.addr = (caddr_t)PTRIN(addr);
876		bsd_args.len  = len;
877	}
878	bsd_args.pos = pos;
879
880#ifdef DEBUG
881	if (ldebug(mmap))
882		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
883		    __func__,
884		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
885		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
886#endif
887	error = mmap(td, &bsd_args);
888#ifdef DEBUG
889	if (ldebug(mmap))
890		printf("-> %s() return: 0x%x (0x%08x)\n",
891			__func__, error, (u_int)td->td_retval[0]);
892#endif
893	return (error);
894}
895
896int
897linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
898{
899	struct mprotect_args bsd_args;
900
901	bsd_args.addr = uap->addr;
902	bsd_args.len = uap->len;
903	bsd_args.prot = uap->prot;
904	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
905		bsd_args.prot |= PROT_READ | PROT_EXEC;
906	return (mprotect(td, &bsd_args));
907}
908
909int
910linux_iopl(struct thread *td, struct linux_iopl_args *args)
911{
912	int error;
913
914	if (args->level < 0 || args->level > 3)
915		return (EINVAL);
916	if ((error = priv_check(td, PRIV_IO)) != 0)
917		return (error);
918	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
919		return (error);
920	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
921	    (args->level * (PSL_IOPL / 3));
922
923	return (0);
924}
925
926int
927linux_pipe(struct thread *td, struct linux_pipe_args *args)
928{
929	int error;
930	int fildes[2];
931
932#ifdef DEBUG
933	if (ldebug(pipe))
934		printf(ARGS(pipe, "*"));
935#endif
936
937	error = kern_pipe(td, fildes);
938	if (error)
939		return (error);
940
941	/* XXX: Close descriptors on error. */
942	return (copyout(fildes, args->pipefds, sizeof fildes));
943}
944
945int
946linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
947{
948	l_osigaction_t osa;
949	l_sigaction_t act, oact;
950	int error;
951
952#ifdef DEBUG
953	if (ldebug(sigaction))
954		printf(ARGS(sigaction, "%d, %p, %p"),
955		    args->sig, (void *)args->nsa, (void *)args->osa);
956#endif
957
958	if (args->nsa != NULL) {
959		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
960		if (error)
961			return (error);
962		act.lsa_handler = osa.lsa_handler;
963		act.lsa_flags = osa.lsa_flags;
964		act.lsa_restorer = osa.lsa_restorer;
965		LINUX_SIGEMPTYSET(act.lsa_mask);
966		act.lsa_mask.__bits[0] = osa.lsa_mask;
967	}
968
969	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
970	    args->osa ? &oact : NULL);
971
972	if (args->osa != NULL && !error) {
973		osa.lsa_handler = oact.lsa_handler;
974		osa.lsa_flags = oact.lsa_flags;
975		osa.lsa_restorer = oact.lsa_restorer;
976		osa.lsa_mask = oact.lsa_mask.__bits[0];
977		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
978	}
979
980	return (error);
981}
982
983/*
984 * Linux has two extra args, restart and oldmask.  We don't use these,
985 * but it seems that "restart" is actually a context pointer that
986 * enables the signal to happen with a different register set.
987 */
988int
989linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
990{
991	sigset_t sigmask;
992	l_sigset_t mask;
993
994#ifdef DEBUG
995	if (ldebug(sigsuspend))
996		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
997#endif
998
999	LINUX_SIGEMPTYSET(mask);
1000	mask.__bits[0] = args->mask;
1001	linux_to_bsd_sigset(&mask, &sigmask);
1002	return (kern_sigsuspend(td, sigmask));
1003}
1004
1005int
1006linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
1007{
1008	l_sigset_t lmask;
1009	sigset_t sigmask;
1010	int error;
1011
1012#ifdef DEBUG
1013	if (ldebug(rt_sigsuspend))
1014		printf(ARGS(rt_sigsuspend, "%p, %d"),
1015		    (void *)uap->newset, uap->sigsetsize);
1016#endif
1017
1018	if (uap->sigsetsize != sizeof(l_sigset_t))
1019		return (EINVAL);
1020
1021	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
1022	if (error)
1023		return (error);
1024
1025	linux_to_bsd_sigset(&lmask, &sigmask);
1026	return (kern_sigsuspend(td, sigmask));
1027}
1028
1029int
1030linux_pause(struct thread *td, struct linux_pause_args *args)
1031{
1032	struct proc *p = td->td_proc;
1033	sigset_t sigmask;
1034
1035#ifdef DEBUG
1036	if (ldebug(pause))
1037		printf(ARGS(pause, ""));
1038#endif
1039
1040	PROC_LOCK(p);
1041	sigmask = td->td_sigmask;
1042	PROC_UNLOCK(p);
1043	return (kern_sigsuspend(td, sigmask));
1044}
1045
1046int
1047linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
1048{
1049	stack_t ss, oss;
1050	l_stack_t lss;
1051	int error;
1052
1053#ifdef DEBUG
1054	if (ldebug(sigaltstack))
1055		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
1056#endif
1057
1058	if (uap->uss != NULL) {
1059		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
1060		if (error)
1061			return (error);
1062
1063		ss.ss_sp = PTRIN(lss.ss_sp);
1064		ss.ss_size = lss.ss_size;
1065		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
1066	}
1067	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
1068	    (uap->uoss != NULL) ? &oss : NULL);
1069	if (!error && uap->uoss != NULL) {
1070		lss.ss_sp = PTROUT(oss.ss_sp);
1071		lss.ss_size = oss.ss_size;
1072		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
1073		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
1074	}
1075
1076	return (error);
1077}
1078
1079int
1080linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
1081{
1082	struct ftruncate_args sa;
1083
1084#ifdef DEBUG
1085	if (ldebug(ftruncate64))
1086		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
1087		    (intmax_t)args->length);
1088#endif
1089
1090	sa.fd = args->fd;
1091	sa.length = args->length;
1092	return ftruncate(td, &sa);
1093}
1094
1095int
1096linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
1097{
1098	struct timeval atv;
1099	l_timeval atv32;
1100	struct timezone rtz;
1101	int error = 0;
1102
1103	if (uap->tp) {
1104		microtime(&atv);
1105		atv32.tv_sec = atv.tv_sec;
1106		atv32.tv_usec = atv.tv_usec;
1107		error = copyout(&atv32, uap->tp, sizeof(atv32));
1108	}
1109	if (error == 0 && uap->tzp != NULL) {
1110		rtz.tz_minuteswest = tz_minuteswest;
1111		rtz.tz_dsttime = tz_dsttime;
1112		error = copyout(&rtz, uap->tzp, sizeof(rtz));
1113	}
1114	return (error);
1115}
1116
1117int
1118linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
1119{
1120	l_timeval atv32;
1121	struct timeval atv, *tvp;
1122	struct timezone atz, *tzp;
1123	int error;
1124
1125	if (uap->tp) {
1126		error = copyin(uap->tp, &atv32, sizeof(atv32));
1127		if (error)
1128			return (error);
1129		atv.tv_sec = atv32.tv_sec;
1130		atv.tv_usec = atv32.tv_usec;
1131		tvp = &atv;
1132	} else
1133		tvp = NULL;
1134	if (uap->tzp) {
1135		error = copyin(uap->tzp, &atz, sizeof(atz));
1136		if (error)
1137			return (error);
1138		tzp = &atz;
1139	} else
1140		tzp = NULL;
1141	return (kern_settimeofday(td, tvp, tzp));
1142}
1143
1144int
1145linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
1146{
1147	struct l_rusage s32;
1148	struct rusage s;
1149	int error;
1150
1151	error = kern_getrusage(td, uap->who, &s);
1152	if (error != 0)
1153		return (error);
1154	if (uap->rusage != NULL) {
1155		bsd_to_linux_rusage(&s, &s32);
1156		error = copyout(&s32, uap->rusage, sizeof(s32));
1157	}
1158	return (error);
1159}
1160
1161int
1162linux_sched_rr_get_interval(struct thread *td,
1163    struct linux_sched_rr_get_interval_args *uap)
1164{
1165	struct timespec ts;
1166	struct l_timespec ts32;
1167	int error;
1168
1169	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
1170	if (error != 0)
1171		return (error);
1172	ts32.tv_sec = ts.tv_sec;
1173	ts32.tv_nsec = ts.tv_nsec;
1174	return (copyout(&ts32, uap->interval, sizeof(ts32)));
1175}
1176
1177int
1178linux_set_thread_area(struct thread *td,
1179    struct linux_set_thread_area_args *args)
1180{
1181	struct l_user_desc info;
1182	struct user_segment_descriptor sd;
1183	struct pcb *pcb;
1184	int a[2];
1185	int error;
1186
1187	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
1188	if (error)
1189		return (error);
1190
1191#ifdef DEBUG
1192	if (ldebug(set_thread_area))
1193		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
1194		    "%i, %i, %i"), info.entry_number, info.base_addr,
1195		    info.limit, info.seg_32bit, info.contents,
1196		    info.read_exec_only, info.limit_in_pages,
1197		    info.seg_not_present, info.useable);
1198#endif
1199
1200	/*
1201	 * Semantics of Linux version: every thread in the system has array
1202	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
1203	 * This syscall loads one of the selected TLS decriptors with a value
1204	 * and also loads GDT descriptors 6, 7 and 8 with the content of
1205	 * the per-thread descriptors.
1206	 *
1207	 * Semantics of FreeBSD version: I think we can ignore that Linux has
1208	 * three per-thread descriptors and use just the first one.
1209	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
1210	 * for loading the GDT descriptors. We use just one GDT descriptor
1211	 * for TLS, so we will load just one.
1212	 *
1213	 * XXX: This doesn't work when a user space process tries to use more
1214	 * than one TLS segment. Comment in the Linux source says wine might
1215	 * do this.
1216	 */
1217
1218	/*
1219	 * GLIBC reads current %gs and call set_thread_area() with it.
1220	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
1221	 * we use these segments.
1222	 */
1223	switch (info.entry_number) {
1224	case GUGS32_SEL:
1225	case GUDATA_SEL:
1226	case 6:
1227	case -1:
1228		info.entry_number = GUGS32_SEL;
1229		break;
1230	default:
1231		return (EINVAL);
1232	}
1233
1234	/*
1235	 * We have to copy out the GDT entry we use.
1236	 *
1237	 * XXX: What if a user space program does not check the return value
1238	 * and tries to use 6, 7 or 8?
1239	 */
1240	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
1241	if (error)
1242		return (error);
1243
1244	if (LINUX_LDT_empty(&info)) {
1245		a[0] = 0;
1246		a[1] = 0;
1247	} else {
1248		a[0] = LINUX_LDT_entry_a(&info);
1249		a[1] = LINUX_LDT_entry_b(&info);
1250	}
1251
1252	memcpy(&sd, &a, sizeof(a));
1253#ifdef DEBUG
1254	if (ldebug(set_thread_area))
1255		printf("Segment created in set_thread_area: "
1256		    "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
1257		    "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
1258		    "def32: %i, gran: %i\n",
1259		    sd.sd_lobase,
1260		    sd.sd_hibase,
1261		    sd.sd_lolimit,
1262		    sd.sd_hilimit,
1263		    sd.sd_type,
1264		    sd.sd_dpl,
1265		    sd.sd_p,
1266		    sd.sd_xx,
1267		    sd.sd_long,
1268		    sd.sd_def32,
1269		    sd.sd_gran);
1270#endif
1271
1272	pcb = td->td_pcb;
1273	pcb->pcb_gsbase = (register_t)info.base_addr;
1274	set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
1275	update_gdt_gsbase(td, info.base_addr);
1276
1277	return (0);
1278}
1279
1280int
1281linux_wait4(struct thread *td, struct linux_wait4_args *args)
1282{
1283	int error, options;
1284	struct rusage ru, *rup;
1285	struct l_rusage lru;
1286	struct proc *p;
1287
1288#ifdef DEBUG
1289	if (ldebug(wait4))
1290		printf(ARGS(wait4, "%d, %p, %d, %p"),
1291		    args->pid, (void *)args->status, args->options,
1292		    (void *)args->rusage);
1293#endif
1294
1295	options = (args->options & (WNOHANG | WUNTRACED));
1296	/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
1297	if (args->options & __WCLONE)
1298		options |= WLINUXCLONE;
1299
1300	if (args->rusage != NULL)
1301		rup = &ru;
1302	else
1303		rup = NULL;
1304	error = linux_common_wait(td, args->pid, args->status, options, rup);
1305	if (error)
1306		return (error);
1307
1308	p = td->td_proc;
1309	PROC_LOCK(p);
1310	sigqueue_delete(&p->p_sigqueue, SIGCHLD);
1311	PROC_UNLOCK(p);
1312
1313	if (args->rusage != NULL) {
1314		bsd_to_linux_rusage(rup, &lru);
1315		error = copyout(&lru, args->rusage, sizeof(lru));
1316	}
1317
1318	return (error);
1319}
1320