1/*	$NetBSD: linux_misc.c,v 1.263 2024/02/10 18:43:52 andvar Exp $	*/
2
3/*-
4 * Copyright (c) 1995, 1998, 1999, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Frank van der Linden and Eric Haszlakiewicz; by Jason R. Thorpe
9 * of the Numerical Aerospace Simulation Facility, NASA Ames Research Center.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Linux compatibility module. Try to deal with various Linux system calls.
35 */
36
37/*
38 * These functions have been moved to multiarch to allow
39 * selection of which machines include them to be
40 * determined by the individual files.linux_<arch> files.
41 *
 * Functions in multiarch:
43 *	linux_sys_break			: linux_break.c
44 *	linux_sys_alarm			: linux_misc_notalpha.c
45 *	linux_sys_getresgid		: linux_misc_notalpha.c
46 *	linux_sys_nice			: linux_misc_notalpha.c
47 *	linux_sys_readdir		: linux_misc_notalpha.c
48 *	linux_sys_setresgid		: linux_misc_notalpha.c
49 *	linux_sys_time			: linux_misc_notalpha.c
50 *	linux_sys_utime			: linux_misc_notalpha.c
51 *	linux_sys_waitpid		: linux_misc_notalpha.c
52 *	linux_sys_old_mmap		: linux_oldmmap.c
53 *	linux_sys_oldolduname		: linux_oldolduname.c
54 *	linux_sys_oldselect		: linux_oldselect.c
55 *	linux_sys_olduname		: linux_olduname.c
56 *	linux_sys_pipe			: linux_pipe.c
57 */
58
59#include <sys/cdefs.h>
60__KERNEL_RCSID(0, "$NetBSD: linux_misc.c,v 1.263 2024/02/10 18:43:52 andvar Exp $");
61
62#include <sys/param.h>
63#include <sys/systm.h>
64#include <sys/namei.h>
65#include <sys/proc.h>
66#include <sys/dirent.h>
67#include <sys/epoll.h>
68#include <sys/eventfd.h>
69#include <sys/file.h>
70#include <sys/stat.h>
71#include <sys/filedesc.h>
72#include <sys/ioctl.h>
73#include <sys/kernel.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/mman.h>
77#include <sys/mount.h>
78#include <sys/poll.h>
79#include <sys/prot.h>
80#include <sys/reboot.h>
81#include <sys/resource.h>
82#include <sys/resourcevar.h>
83#include <sys/select.h>
84#include <sys/signal.h>
85#include <sys/signalvar.h>
86#include <sys/socket.h>
87#include <sys/time.h>
88#include <sys/times.h>
89#include <sys/vnode.h>
90#include <sys/uio.h>
91#include <sys/wait.h>
92#include <sys/utsname.h>
93#include <sys/unistd.h>
94#include <sys/vfs_syscalls.h>
95#include <sys/swap.h>		/* for SWAP_ON */
96#include <sys/sysctl.h>		/* for KERN_DOMAINNAME */
97#include <sys/kauth.h>
98#include <sys/futex.h>
99
100#include <sys/ptrace.h>
101#include <machine/ptrace.h>
102
103#include <sys/syscall.h>
104#include <sys/syscallargs.h>
105
106#include <compat/sys/resource.h>
107
108#include <compat/linux/common/linux_machdep.h>
109#include <compat/linux/common/linux_types.h>
110#include <compat/linux/common/linux_signal.h>
111#include <compat/linux/common/linux_ipc.h>
112#include <compat/linux/common/linux_sem.h>
113
114#include <compat/linux/common/linux_fcntl.h>
115#include <compat/linux/common/linux_mmap.h>
116#include <compat/linux/common/linux_dirent.h>
117#include <compat/linux/common/linux_util.h>
118#include <compat/linux/common/linux_misc.h>
119#include <compat/linux/common/linux_statfs.h>
120#include <compat/linux/common/linux_limit.h>
121#include <compat/linux/common/linux_ptrace.h>
122#include <compat/linux/common/linux_reboot.h>
123#include <compat/linux/common/linux_emuldata.h>
124#include <compat/linux/common/linux_sched.h>
125
126#include <compat/linux/linux_syscallargs.h>
127
128const int linux_ptrace_request_map[] = {
129	LINUX_PTRACE_TRACEME,	PT_TRACE_ME,
130	LINUX_PTRACE_PEEKTEXT,	PT_READ_I,
131	LINUX_PTRACE_PEEKDATA,	PT_READ_D,
132	LINUX_PTRACE_POKETEXT,	PT_WRITE_I,
133	LINUX_PTRACE_POKEDATA,	PT_WRITE_D,
134	LINUX_PTRACE_CONT,	PT_CONTINUE,
135	LINUX_PTRACE_KILL,	PT_KILL,
136	LINUX_PTRACE_ATTACH,	PT_ATTACH,
137	LINUX_PTRACE_DETACH,	PT_DETACH,
138# ifdef PT_STEP
139	LINUX_PTRACE_SINGLESTEP,	PT_STEP,
140# endif
141	LINUX_PTRACE_SYSCALL,	PT_SYSCALL,
142	-1
143};
144
145const struct linux_mnttypes linux_fstypes[] = {
146	{ MOUNT_FFS,		LINUX_DEFAULT_SUPER_MAGIC	},
147	{ MOUNT_NFS,		LINUX_NFS_SUPER_MAGIC 		},
148	{ MOUNT_MFS,		LINUX_DEFAULT_SUPER_MAGIC	},
149	{ MOUNT_MSDOS,		LINUX_MSDOS_SUPER_MAGIC		},
150	{ MOUNT_LFS,		LINUX_DEFAULT_SUPER_MAGIC	},
151	{ MOUNT_FDESC,		LINUX_DEFAULT_SUPER_MAGIC	},
152	{ MOUNT_NULL,		LINUX_DEFAULT_SUPER_MAGIC	},
153	{ MOUNT_OVERLAY,	LINUX_DEFAULT_SUPER_MAGIC	},
154	{ MOUNT_UMAP,		LINUX_DEFAULT_SUPER_MAGIC	},
155	{ MOUNT_KERNFS,		LINUX_DEFAULT_SUPER_MAGIC	},
156	{ MOUNT_PROCFS,		LINUX_PROC_SUPER_MAGIC		},
157	{ MOUNT_AFS,		LINUX_DEFAULT_SUPER_MAGIC	},
158	{ MOUNT_CD9660,		LINUX_ISOFS_SUPER_MAGIC		},
159	{ MOUNT_UNION,		LINUX_DEFAULT_SUPER_MAGIC	},
160	{ MOUNT_ADOSFS,		LINUX_ADFS_SUPER_MAGIC		},
161	{ MOUNT_EXT2FS,		LINUX_EXT2_SUPER_MAGIC		},
162	{ MOUNT_CFS,		LINUX_DEFAULT_SUPER_MAGIC	},
163	{ MOUNT_CODA,		LINUX_CODA_SUPER_MAGIC		},
164	{ MOUNT_FILECORE,	LINUX_DEFAULT_SUPER_MAGIC	},
165	{ MOUNT_NTFS,		LINUX_DEFAULT_SUPER_MAGIC	},
166	{ MOUNT_SMBFS,		LINUX_SMB_SUPER_MAGIC		},
167	{ MOUNT_PTYFS,		LINUX_DEVPTS_SUPER_MAGIC	},
168	{ MOUNT_TMPFS,		LINUX_TMPFS_SUPER_MAGIC		}
169};
170const int linux_fstypes_cnt = sizeof(linux_fstypes) / sizeof(linux_fstypes[0]);
171
/*
 * Debug output: DPRINTF((fmt, ...)) expands to a uprintf() call when
 * DEBUG_LINUX is defined and to nothing otherwise.
 */
#ifndef DEBUG_LINUX
#define	DPRINTF(a)
#else
#define	DPRINTF(a)	uprintf a
#endif
177
178/* Local linux_misc.c functions: */
179static void linux_to_bsd_mmap_args(struct sys_mmap_args *,
180    const struct linux_sys_mmap_args *);
181static int linux_mmap(struct lwp *, const struct linux_sys_mmap_args *,
182    register_t *, off_t);
183static int linux_to_native_wait_options(int);
184
185/*
186 * The information on a terminated (or stopped) process needs
187 * to be converted in order for Linux binaries to get a valid signal
188 * number out of it.
189 */
190int
191bsd_to_linux_wstat(int st)
192{
193
194	int sig;
195
196	if (WIFSIGNALED(st)) {
197		sig = WTERMSIG(st);
198		if (sig >= 0 && sig < NSIG)
199			st= (st & ~0177) | native_to_linux_signo[sig];
200	} else if (WIFSTOPPED(st)) {
201		sig = WSTOPSIG(st);
202		if (sig >= 0 && sig < NSIG)
203			st = (st & ~0xff00) |
204			    (native_to_linux_signo[sig] << 8);
205	}
206	return st;
207}
208
209/*
210 * wait4(2).  Passed on to the NetBSD call, surrounded by code to
211 * reserve some space for a NetBSD-style wait status, and converting
212 * it to what Linux wants.
213 */
214int
215linux_sys_wait4(struct lwp *l, const struct linux_sys_wait4_args *uap, register_t *retval)
216{
217	/* {
218		syscallarg(int) pid;
219		syscallarg(int *) status;
220		syscallarg(int) options;
221		syscallarg(struct rusage50 *) rusage;
222	} */
223	int error, status, options, linux_options, pid = SCARG(uap, pid);
224	struct rusage50 ru50;
225	struct rusage ru;
226	proc_t *p;
227
228	linux_options = SCARG(uap, options);
229	if (linux_options & ~(LINUX_WAIT4_KNOWNFLAGS))
230		return (EINVAL);
231
232	options = linux_to_native_wait_options(linux_options);
233# ifdef DIAGNOSTIC
234	if (linux_options & LINUX_WNOTHREAD)
235		printf("WARNING: %s: linux process %d.%d called "
236		       "waitpid with __WNOTHREAD set!\n",
237		       __FILE__, l->l_proc->p_pid, l->l_lid);
238
239# endif
240
241	error = do_sys_wait(&pid, &status, options,
242	    SCARG(uap, rusage) != NULL ? &ru : NULL);
243
244	retval[0] = pid;
245	if (pid == 0)
246		return error;
247
248	p = curproc;
249	mutex_enter(p->p_lock);
250	sigdelset(&p->p_sigpend.sp_set, SIGCHLD); /* XXXAD ksiginfo leak */
251	mutex_exit(p->p_lock);
252
253	if (SCARG(uap, rusage) != NULL) {
254		rusage_to_rusage50(&ru, &ru50);
255		error = copyout(&ru, SCARG(uap, rusage), sizeof(ru));
256	}
257
258	if (error == 0 && SCARG(uap, status) != NULL) {
259		status = bsd_to_linux_wstat(status);
260		error = copyout(&status, SCARG(uap, status), sizeof status);
261	}
262
263	return error;
264}
265
266/*
267 * waitid(2).  Converting arguments to the NetBSD equivalent and
268 * calling it.
269 */
270int
271linux_sys_waitid(struct lwp *l, const struct linux_sys_waitid_args *uap, register_t *retval)
272{
273	/* {
274		syscallarg(int) idtype;
275		syscallarg(id_t) id;
276		syscallarg(linux_siginfo_t *) infop;
277		syscallarg(int) options;
278		syscallarg(struct rusage50 *) rusage;
279	} */
280	int error, linux_options, options, linux_idtype, status;
281	pid_t pid;
282	idtype_t idtype;
283	id_t id;
284	siginfo_t info;
285	linux_siginfo_t linux_info;
286	struct wrusage wru;
287	struct rusage50 ru50;
288
289	linux_idtype = SCARG(uap, idtype);
290	switch (linux_idtype) {
291	case LINUX_P_ALL:
292		idtype = P_ALL;
293		break;
294	case LINUX_P_PID:
295		idtype = P_PID;
296		break;
297	case LINUX_P_PGID:
298		idtype = P_PGID;
299		break;
300	case LINUX_P_PIDFD:
301		return EOPNOTSUPP;
302	default:
303		return EINVAL;
304	}
305
306	linux_options = SCARG(uap, options);
307	if (linux_options & ~(LINUX_WAITID_KNOWNFLAGS))
308		return EINVAL;
309
310	options = linux_to_native_wait_options(linux_options);
311	id = SCARG(uap, id);
312
313	error = do_sys_waitid(idtype, id, &pid, &status, options, &wru, &info);
314	if (pid == 0 && options & WNOHANG) {
315		info.si_signo = 0;
316		info.si_pid = 0;
317	}
318
319	if (error == 0 && SCARG(uap, infop) != NULL) {
320		/* POSIX says that this NULL check is a bug, but Linux does this. */
321		native_to_linux_siginfo(&linux_info, &info._info);
322		error = copyout(&linux_info, SCARG(uap, infop), sizeof(linux_info));
323	}
324
325	if (error == 0 && SCARG(uap, rusage) != NULL) {
326		rusage_to_rusage50(&wru.wru_children, &ru50);
327		error = copyout(&ru50, SCARG(uap, rusage), sizeof(ru50));
328	}
329
330	return error;
331}
332
333/*
334 * Convert the options argument for wait4(2) and waitid(2) from what
335 * Linux wants to what NetBSD wants.
336 */
337static int
338linux_to_native_wait_options(int linux_options)
339{
340	int options = 0;
341
342	if (linux_options & LINUX_WNOHANG)
343		options |= WNOHANG;
344	if (linux_options & LINUX_WUNTRACED)
345		options |= WUNTRACED;
346	if (linux_options & LINUX_WEXITED)
347		options |= WEXITED;
348	if (linux_options & LINUX_WCONTINUED)
349		options |= WCONTINUED;
350	if (linux_options & LINUX_WNOWAIT)
351		options |= WNOWAIT;
352	if (linux_options & LINUX_WALL)
353		options |= WALLSIG;
354	if (linux_options & LINUX_WCLONE)
355		options |= WALTSIG;
356
357	return options;
358}
359
360/*
361 * Linux brk(2).  Like native, but always return the new break value.
362 */
363int
364linux_sys_brk(struct lwp *l, const struct linux_sys_brk_args *uap, register_t *retval)
365{
366	/* {
367		syscallarg(char *) nsize;
368	} */
369	struct proc *p = l->l_proc;
370	struct vmspace *vm = p->p_vmspace;
371	struct sys_obreak_args oba;
372
373	SCARG(&oba, nsize) = SCARG(uap, nsize);
374
375	(void) sys_obreak(l, &oba, retval);
376	retval[0] = (register_t)((char *)vm->vm_daddr + ptoa(vm->vm_dsize));
377	return 0;
378}
379
380/*
381 * Implement the fs stat functions. Straightforward.
382 */
383int
384linux_sys_statfs(struct lwp *l, const struct linux_sys_statfs_args *uap, register_t *retval)
385{
386	/* {
387		syscallarg(const char *) path;
388		syscallarg(struct linux_statfs *) sp;
389	} */
390	struct statvfs *sb;
391	struct linux_statfs ltmp;
392	int error;
393
394	sb = STATVFSBUF_GET();
395	error = do_sys_pstatvfs(l, SCARG(uap, path), ST_WAIT, sb);
396	if (error == 0) {
397		bsd_to_linux_statfs(sb, &ltmp);
398		error = copyout(&ltmp, SCARG(uap, sp), sizeof ltmp);
399	}
400	STATVFSBUF_PUT(sb);
401
402	return error;
403}
404
405int
406linux_sys_fstatfs(struct lwp *l, const struct linux_sys_fstatfs_args *uap, register_t *retval)
407{
408	/* {
409		syscallarg(int) fd;
410		syscallarg(struct linux_statfs *) sp;
411	} */
412	struct statvfs *sb;
413	struct linux_statfs ltmp;
414	int error;
415
416	sb = STATVFSBUF_GET();
417	error = do_sys_fstatvfs(l, SCARG(uap, fd), ST_WAIT, sb);
418	if (error == 0) {
419		bsd_to_linux_statfs(sb, &ltmp);
420		error = copyout(&ltmp, SCARG(uap, sp), sizeof ltmp);
421	}
422	STATVFSBUF_PUT(sb);
423
424	return error;
425}
426
427/*
428 * uname(). Just copy the info from the various strings stored in the
429 * kernel, and put it in the Linux utsname structure. That structure
430 * is almost the same as the NetBSD one, only it has fields 65 characters
431 * long, and an extra domainname field.
432 */
433int
434linux_sys_uname(struct lwp *l, const struct linux_sys_uname_args *uap, register_t *retval)
435{
436	/* {
437		syscallarg(struct linux_utsname *) up;
438	} */
439	struct linux_utsname luts;
440
441	memset(&luts, 0, sizeof(luts));
442	strlcpy(luts.l_sysname, linux_sysname, sizeof(luts.l_sysname));
443	strlcpy(luts.l_nodename, hostname, sizeof(luts.l_nodename));
444	strlcpy(luts.l_release, linux_release, sizeof(luts.l_release));
445	strlcpy(luts.l_version, linux_version, sizeof(luts.l_version));
446	strlcpy(luts.l_machine, LINUX_UNAME_ARCH, sizeof(luts.l_machine));
447	strlcpy(luts.l_domainname, domainname, sizeof(luts.l_domainname));
448
449	return copyout(&luts, SCARG(uap, up), sizeof(luts));
450}
451
452/* Used directly on: alpha, mips, ppc, sparc, sparc64 */
453/* Used indirectly on: arm, i386, m68k */
454
455/*
456 * New type Linux mmap call.
457 * Only called directly on machines with >= 6 free regs.
458 */
459int
460linux_sys_mmap(struct lwp *l, const struct linux_sys_mmap_args *uap, register_t *retval)
461{
462	/* {
463		syscallarg(unsigned long) addr;
464		syscallarg(size_t) len;
465		syscallarg(int) prot;
466		syscallarg(int) flags;
467		syscallarg(int) fd;
468		syscallarg(linux_off_t) offset;
469	} */
470
471	if (SCARG(uap, offset) & PAGE_MASK)
472		return EINVAL;
473
474	return linux_mmap(l, uap, retval, SCARG(uap, offset));
475}
476
477/*
478 * Guts of most architectures' mmap64() implementations.  This shares
479 * its list of arguments with linux_sys_mmap().
480 *
481 * The difference in linux_sys_mmap2() is that "offset" is actually
482 * (offset / pagesize), not an absolute byte count.  This translation
483 * to pagesize offsets is done inside glibc between the mmap64() call
484 * point, and the actual syscall.
485 */
486int
487linux_sys_mmap2(struct lwp *l, const struct linux_sys_mmap2_args *uap, register_t *retval)
488{
489	/* {
490		syscallarg(unsigned long) addr;
491		syscallarg(size_t) len;
492		syscallarg(int) prot;
493		syscallarg(int) flags;
494		syscallarg(int) fd;
495		syscallarg(linux_off_t) offset;
496	} */
497
498	return linux_mmap(l, uap, retval,
499	    ((off_t)SCARG(uap, offset)) << PAGE_SHIFT);
500}
501
502/*
503 * Massage arguments and call system mmap(2).
504 */
505static int
506linux_mmap(struct lwp *l, const struct linux_sys_mmap_args *uap, register_t *retval, off_t offset)
507{
508	struct sys_mmap_args cma;
509	int error;
510	size_t mmoff=0;
511
512	linux_to_bsd_mmap_args(&cma, uap);
513	SCARG(&cma, pos) = offset;
514
515	if (SCARG(uap, flags) & LINUX_MAP_GROWSDOWN) {
516		/*
517		 * Request for stack-like memory segment. On linux, this
518		 * works by mmap()ping (small) segment, which is automatically
519		 * extended when page fault happens below the currently
520		 * allocated area. We emulate this by allocating (typically
521		 * bigger) segment sized at current stack size limit, and
522		 * offsetting the requested and returned address accordingly.
523		 * Since physical pages are only allocated on-demand, this
524		 * is effectively identical.
525		 */
526		rlim_t ssl = l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur;
527
528		if (SCARG(&cma, len) < ssl) {
529			/* Compute the address offset */
530			mmoff = round_page(ssl) - SCARG(uap, len);
531
532			if (SCARG(&cma, addr))
533				SCARG(&cma, addr) = (char *)SCARG(&cma, addr) - mmoff;
534
535			SCARG(&cma, len) = (size_t) ssl;
536		}
537	}
538
539	error = sys_mmap(l, &cma, retval);
540	if (error)
541		return (error);
542
543	/* Shift the returned address for stack-like segment if necessary */
544	retval[0] += mmoff;
545
546	return (0);
547}
548
549static void
550linux_to_bsd_mmap_args(struct sys_mmap_args *cma, const struct linux_sys_mmap_args *uap)
551{
552	int flags = MAP_TRYFIXED, fl = SCARG(uap, flags);
553
554	flags |= cvtto_bsd_mask(fl, LINUX_MAP_SHARED, MAP_SHARED);
555	flags |= cvtto_bsd_mask(fl, LINUX_MAP_PRIVATE, MAP_PRIVATE);
556	flags |= cvtto_bsd_mask(fl, LINUX_MAP_FIXED, MAP_FIXED);
557	flags |= cvtto_bsd_mask(fl, LINUX_MAP_ANON, MAP_ANON);
558	flags |= cvtto_bsd_mask(fl, LINUX_MAP_LOCKED, MAP_WIRED);
559	/* XXX XAX ERH: Any other flags here?  There are more defined... */
560
561	SCARG(cma, addr) = (void *)SCARG(uap, addr);
562	SCARG(cma, len) = SCARG(uap, len);
563	SCARG(cma, prot) = SCARG(uap, prot);
564	if (SCARG(cma, prot) & VM_PROT_WRITE) /* XXX */
565		SCARG(cma, prot) |= VM_PROT_READ;
566	SCARG(cma, flags) = flags;
567	SCARG(cma, fd) = flags & MAP_ANON ? -1 : SCARG(uap, fd);
568	SCARG(cma, PAD) = 0;
569}
570
571#define	LINUX_MREMAP_MAYMOVE	1
572#define	LINUX_MREMAP_FIXED	2
573
574int
575linux_sys_mremap(struct lwp *l, const struct linux_sys_mremap_args *uap, register_t *retval)
576{
577	/* {
578		syscallarg(void *) old_address;
579		syscallarg(size_t) old_size;
580		syscallarg(size_t) new_size;
581		syscallarg(u_long) flags;
582	} */
583
584	struct proc *p;
585	struct vm_map *map;
586	vaddr_t oldva;
587	vaddr_t newva;
588	size_t oldsize;
589	size_t newsize;
590	int flags;
591	int uvmflags;
592	int error;
593
594	flags = SCARG(uap, flags);
595	oldva = (vaddr_t)SCARG(uap, old_address);
596	oldsize = round_page(SCARG(uap, old_size));
597	newsize = round_page(SCARG(uap, new_size));
598	if ((flags & ~(LINUX_MREMAP_FIXED|LINUX_MREMAP_MAYMOVE)) != 0) {
599		error = EINVAL;
600		goto done;
601	}
602	if ((flags & LINUX_MREMAP_FIXED) != 0) {
603		if ((flags & LINUX_MREMAP_MAYMOVE) == 0) {
604			error = EINVAL;
605			goto done;
606		}
607#if 0 /* notyet */
608		newva = SCARG(uap, new_address);
609		uvmflags = MAP_FIXED;
610#else /* notyet */
611		error = EOPNOTSUPP;
612		goto done;
613#endif /* notyet */
614	} else if ((flags & LINUX_MREMAP_MAYMOVE) != 0) {
615		uvmflags = 0;
616	} else {
617		newva = oldva;
618		uvmflags = MAP_FIXED;
619	}
620	p = l->l_proc;
621	map = &p->p_vmspace->vm_map;
622	error = uvm_mremap(map, oldva, oldsize, map, &newva, newsize, p,
623	    uvmflags);
624
625done:
626	*retval = (error != 0) ? 0 : (register_t)newva;
627	return error;
628}
629
#ifdef USRSTACK
/*
 * mprotect(2), including an approximation of the Linux-only
 * PROT_GROWSDOWN / PROT_GROWSUP modifiers.
 *
 * EINVAL is returned for an unaligned start, a range that wraps,
 * unknown protection bits, or both GROWS flags at once; ENOMEM if no
 * map entry covers "start".  A zero-length range succeeds without
 * doing anything.
 */
int
linux_sys_mprotect(struct lwp *l, const struct linux_sys_mprotect_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) start;
		syscallarg(unsigned long) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	struct vm_map *map = &p->p_vmspace->vm_map;
	struct vm_map_entry *ent;
	vaddr_t start, end, len, stack_rlim;
	int prot, growdir;

	start = (vaddr_t)SCARG(uap, start);
	len = round_page(SCARG(uap, len));
	end = start + len;

	/* Separate the GROWS modifiers from the real protection bits. */
	prot = SCARG(uap, prot);
	growdir = prot & (LINUX_PROT_GROWSDOWN | LINUX_PROT_GROWSUP);
	prot &= ~growdir;

	if ((start & PAGE_MASK) != 0 || end < start)
		return EINVAL;
	if (end == start)
		return 0;

	if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return EINVAL;
	if (growdir == (LINUX_PROT_GROWSDOWN | LINUX_PROT_GROWSUP))
		return EINVAL;

	vm_map_lock(map);
#ifdef notdef
	VM_MAP_RANGE_CHECK(map, start, end);
#endif
	if (!uvm_map_lookup_entry(map, start, &ent) || ent->start > start) {
		vm_map_unlock(map);
		return ENOMEM;
	}

	/*
	 * Approximate the behaviour of PROT_GROWS{DOWN,UP}: inside the
	 * window given by the stack limit around USRSTACK the range is
	 * stretched to the edge of that window, anywhere else to the
	 * edge of the map entry containing "start".
	 */
	stack_rlim = (vaddr_t)p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur;
	if ((growdir & LINUX_PROT_GROWSDOWN) != 0) {
		if (USRSTACK - stack_rlim <= start && start < USRSTACK)
			start = USRSTACK - stack_rlim;
		else
			start = ent->start;
	} else if ((growdir & LINUX_PROT_GROWSUP) != 0) {
		if (USRSTACK <= end && end < USRSTACK + stack_rlim)
			end = USRSTACK + stack_rlim;
		else
			end = ent->end;
	}
	vm_map_unlock(map);

	return uvm_map_protect_user(l, start, end, prot);
}
#endif /* USRSTACK */
697
698/*
699 * This code is partly stolen from src/lib/libc/compat-43/times.c
700 */
701
702#define	CONVTCK(r)	(r.tv_sec * hz + r.tv_usec / (1000000 / hz))
703
704int
705linux_sys_times(struct lwp *l, const struct linux_sys_times_args *uap, register_t *retval)
706{
707	/* {
708		syscallarg(struct times *) tms;
709	} */
710	struct proc *p = l->l_proc;
711	struct timeval t;
712	int error;
713
714	if (SCARG(uap, tms)) {
715		struct linux_tms ltms;
716		struct rusage ru;
717
718		memset(&ltms, 0, sizeof(ltms));
719
720		mutex_enter(p->p_lock);
721		calcru(p, &ru.ru_utime, &ru.ru_stime, NULL, NULL);
722		ltms.ltms_utime = CONVTCK(ru.ru_utime);
723		ltms.ltms_stime = CONVTCK(ru.ru_stime);
724		ltms.ltms_cutime = CONVTCK(p->p_stats->p_cru.ru_utime);
725		ltms.ltms_cstime = CONVTCK(p->p_stats->p_cru.ru_stime);
726		mutex_exit(p->p_lock);
727
728		if ((error = copyout(&ltms, SCARG(uap, tms), sizeof ltms)))
729			return error;
730	}
731
732	getmicrouptime(&t);
733
734	retval[0] = ((linux_clock_t)(CONVTCK(t)));
735	return 0;
736}
737
738#undef CONVTCK
739
740#if !defined(__aarch64__)
741/*
742 * Linux 'readdir' call. This code is mostly taken from the
743 * SunOS getdents call (see compat/sunos/sunos_misc.c), though
744 * an attempt has been made to keep it a little cleaner (failing
745 * miserably, because of the cruft needed if count 1 is passed).
746 *
747 * The d_off field should contain the offset of the next valid entry,
748 * but in Linux it has the offset of the entry itself. We emulate
749 * that bug here.
750 *
751 * Read in BSD-style entries, convert them, and copy them out.
752 *
753 * Note that this doesn't handle union-mounted filesystems.
754 */
755int
756linux_sys_getdents(struct lwp *l, const struct linux_sys_getdents_args *uap, register_t *retval)
757{
758	/* {
759		syscallarg(int) fd;
760		syscallarg(struct linux_dirent *) dent;
761		syscallarg(unsigned int) count;
762	} */
763	struct dirent *bdp;
764	struct vnode *vp;
765	char *inp, *tbuf;		/* BSD-format */
766	int len, reclen;		/* BSD-format */
767	char *outp;			/* Linux-format */
768	int resid, linux_reclen = 0;	/* Linux-format */
769	struct file *fp;
770	struct uio auio;
771	struct iovec aiov;
772	struct linux_dirent idb;
773	off_t off;		/* true file offset */
774	int buflen, error, eofflag, nbytes, oldcall;
775	struct vattr va;
776	off_t *cookiebuf = NULL, *cookie;
777	int ncookies;
778
779	/* fd_getvnode() will use the descriptor for us */
780	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
781		return (error);
782
783	if ((fp->f_flag & FREAD) == 0) {
784		error = EBADF;
785		goto out1;
786	}
787
788	vp = (struct vnode *)fp->f_data;
789	if (vp->v_type != VDIR) {
790		error = ENOTDIR;
791		goto out1;
792	}
793
794	vn_lock(vp, LK_SHARED | LK_RETRY);
795	error = VOP_GETATTR(vp, &va, l->l_cred);
796	VOP_UNLOCK(vp);
797	if (error)
798		goto out1;
799
800	nbytes = SCARG(uap, count);
801	if (nbytes == 1) {	/* emulating old, broken behaviour */
802		nbytes = sizeof (idb);
803		buflen = uimax(va.va_blocksize, nbytes);
804		oldcall = 1;
805	} else {
806		buflen = uimin(MAXBSIZE, nbytes);
807		if (buflen < va.va_blocksize)
808			buflen = va.va_blocksize;
809		oldcall = 0;
810	}
811	tbuf = malloc(buflen, M_TEMP, M_WAITOK);
812
813	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
814	off = fp->f_offset;
815again:
816	aiov.iov_base = tbuf;
817	aiov.iov_len = buflen;
818	auio.uio_iov = &aiov;
819	auio.uio_iovcnt = 1;
820	auio.uio_rw = UIO_READ;
821	auio.uio_resid = buflen;
822	auio.uio_offset = off;
823	UIO_SETUP_SYSSPACE(&auio);
824	/*
825         * First we read into the malloc'ed buffer, then
826         * we massage it into user space, one record at a time.
827         */
828	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &cookiebuf,
829	    &ncookies);
830	if (error)
831		goto out;
832
833	inp = tbuf;
834	outp = (void *)SCARG(uap, dent);
835	resid = nbytes;
836	if ((len = buflen - auio.uio_resid) == 0)
837		goto eof;
838
839	for (cookie = cookiebuf; len > 0; len -= reclen) {
840		bdp = (struct dirent *)inp;
841		reclen = bdp->d_reclen;
842		if (reclen & 3) {
843			error = EIO;
844			goto out;
845		}
846		if (bdp->d_fileno == 0) {
847			inp += reclen;	/* it is a hole; squish it out */
848			if (cookie)
849				off = *cookie++;
850			else
851				off += reclen;
852			continue;
853		}
854		linux_reclen = LINUX_RECLEN(&idb, bdp->d_namlen);
855		if (reclen > len || resid < linux_reclen) {
856			/* entry too big for buffer, so just stop */
857			outp++;
858			break;
859		}
860		/*
861		 * Massage in place to make a Linux-shaped dirent (otherwise
862		 * we have to worry about touching user memory outside of
863		 * the copyout() call).
864		 */
865		memset(&idb, 0, sizeof(idb));
866		idb.d_ino = bdp->d_fileno;
867		/*
868		 * The old readdir() call misuses the offset and reclen fields.
869		 */
870		if (oldcall) {
871			idb.d_off = (linux_off_t)linux_reclen;
872			idb.d_reclen = (u_short)bdp->d_namlen;
873		} else {
874			if (sizeof (idb.d_off) <= 4 && (off >> 32) != 0) {
875				compat_offseterr(vp, "linux_getdents");
876				error = EINVAL;
877				goto out;
878			}
879			idb.d_off = (linux_off_t)off;
880			idb.d_reclen = (u_short)linux_reclen;
881			/* Linux puts d_type at the end of each record */
882			*((char *)&idb + idb.d_reclen - 1) = bdp->d_type;
883		}
884		memcpy(idb.d_name, bdp->d_name,
885		    MIN(sizeof(idb.d_name), bdp->d_namlen + 1));
886		if ((error = copyout((void *)&idb, outp, linux_reclen)))
887			goto out;
888		/* advance past this real entry */
889		inp += reclen;
890		if (cookie)
891			off = *cookie++; /* each entry points to itself */
892		else
893			off += reclen;
894		/* advance output past Linux-shaped entry */
895		outp += linux_reclen;
896		resid -= linux_reclen;
897		if (oldcall)
898			break;
899	}
900
901	/* if we squished out the whole block, try again */
902	if (outp == (void *)SCARG(uap, dent)) {
903		if (cookiebuf)
904			free(cookiebuf, M_TEMP);
905		cookiebuf = NULL;
906		goto again;
907	}
908	fp->f_offset = off;	/* update the vnode offset */
909
910	if (oldcall)
911		nbytes = resid + linux_reclen;
912
913eof:
914	*retval = nbytes - resid;
915out:
916	VOP_UNLOCK(vp);
917	if (cookiebuf)
918		free(cookiebuf, M_TEMP);
919	free(tbuf, M_TEMP);
920out1:
921	fd_putfile(SCARG(uap, fd));
922	return error;
923}
924#endif
925
926#if !defined(__aarch64__)
927/*
928 * Even when just using registers to pass arguments to syscalls you can
929 * have 5 of them on the i386. So this newer version of select() does
930 * this.
931 */
932int
933linux_sys_select(struct lwp *l, const struct linux_sys_select_args *uap, register_t *retval)
934{
935	/* {
936		syscallarg(int) nfds;
937		syscallarg(fd_set *) readfds;
938		syscallarg(fd_set *) writefds;
939		syscallarg(fd_set *) exceptfds;
940		syscallarg(struct timeval50 *) timeout;
941	} */
942
943	return linux_select1(l, retval, SCARG(uap, nfds), SCARG(uap, readfds),
944	    SCARG(uap, writefds), SCARG(uap, exceptfds),
945	    (struct linux_timeval *)SCARG(uap, timeout));
946}
947
948/*
949 * Common code for the old and new versions of select(). A couple of
950 * things are important:
951 * 1) return the amount of time left in the 'timeout' parameter
952 * 2) select never returns ERESTART on Linux, always return EINTR
953 */
954int
955linux_select1(struct lwp *l, register_t *retval, int nfds, fd_set *readfds,
956    fd_set *writefds, fd_set *exceptfds, struct linux_timeval *timeout)
957{
958	struct timespec ts0, ts1, uts, *ts = NULL;
959	struct linux_timeval ltv;
960	int error;
961
962	/*
963	 * Store current time for computation of the amount of
964	 * time left.
965	 */
966	if (timeout) {
967		if ((error = copyin(timeout, &ltv, sizeof(ltv))))
968			return error;
969		uts.tv_sec = ltv.tv_sec;
970		uts.tv_nsec = (long)((unsigned long)ltv.tv_usec * 1000);
971		if (itimespecfix(&uts)) {
972			/*
973			 * The timeval was invalid.  Convert it to something
974			 * valid that will act as it does under Linux.
975			 */
976			uts.tv_sec += uts.tv_nsec / 1000000000;
977			uts.tv_nsec %= 1000000000;
978			if (uts.tv_nsec < 0) {
979				uts.tv_sec -= 1;
980				uts.tv_nsec += 1000000000;
981			}
982			if (uts.tv_sec < 0)
983				timespecclear(&uts);
984		}
985		ts = &uts;
986		nanotime(&ts0);
987	}
988
989	error = selcommon(retval, nfds, readfds, writefds, exceptfds, ts, NULL);
990
991	if (error) {
992		/*
993		 * See fs/select.c in the Linux kernel.  Without this,
994		 * Maelstrom doesn't work.
995		 */
996		if (error == ERESTART)
997			error = EINTR;
998		return error;
999	}
1000
1001	if (timeout) {
1002		if (*retval) {
1003			/*
1004			 * Compute how much time was left of the timeout,
1005			 * by subtracting the current time and the time
1006			 * before we started the call, and subtracting
1007			 * that result from the user-supplied value.
1008			 */
1009			nanotime(&ts1);
1010			timespecsub(&ts1, &ts0, &ts1);
1011			timespecsub(&uts, &ts1, &uts);
1012			if (uts.tv_sec < 0)
1013				timespecclear(&uts);
1014		} else
1015			timespecclear(&uts);
1016		ltv.tv_sec = uts.tv_sec;
1017		ltv.tv_usec = uts.tv_nsec / 1000;
1018		if ((error = copyout(&ltv, timeout, sizeof(ltv))))
1019			return error;
1020	}
1021
1022	return 0;
1023}
1024#endif
1025
1026/*
1027 * Derived from FreeBSD's sys/compat/linux/linux_misc.c:linux_pselect6()
1028 * which was contributed by Dmitry Chagin
1029 * https://svnweb.freebsd.org/base?view=revision&revision=283403
1030 */
1031int
1032linux_sys_pselect6(struct lwp *l,
1033	const struct linux_sys_pselect6_args *uap, register_t *retval)
1034{
1035	/* {
1036		syscallarg(int) nfds;
1037		syscallarg(fd_set *) readfds;
1038		syscallarg(fd_set *) writefds;
1039		syscallarg(fd_set *) exceptfds;
1040		syscallarg(struct timespec *) timeout;
1041		syscallarg(linux_sized_sigset_t *) ss;
1042	} */
1043	struct timespec uts, ts0, ts1, *tsp;
1044	linux_sized_sigset_t lsss;
1045	struct linux_timespec lts;
1046	linux_sigset_t lss;
1047	sigset_t *ssp;
1048	sigset_t ss;
1049	int error;
1050
1051	ssp = NULL;
1052	if (SCARG(uap, ss) != NULL) {
1053		if ((error = copyin(SCARG(uap, ss), &lsss, sizeof(lsss))) != 0)
1054			return (error);
1055		if (lsss.ss_len != sizeof(lss))
1056			return (EINVAL);
1057		if (lsss.ss != NULL) {
1058			if ((error = copyin(lsss.ss, &lss, sizeof(lss))) != 0)
1059				return (error);
1060			linux_to_native_sigset(&ss, &lss);
1061			ssp = &ss;
1062		}
1063	}
1064
1065	if (SCARG(uap, timeout) != NULL) {
1066		error = copyin(SCARG(uap, timeout), &lts, sizeof(lts));
1067		if (error != 0)
1068			return (error);
1069		linux_to_native_timespec(&uts, &lts);
1070
1071		if (itimespecfix(&uts))
1072			return (EINVAL);
1073
1074		nanotime(&ts0);
1075		tsp = &uts;
1076	} else {
1077		tsp = NULL;
1078	}
1079
1080	error = selcommon(retval, SCARG(uap, nfds), SCARG(uap, readfds),
1081	    SCARG(uap, writefds), SCARG(uap, exceptfds), tsp, ssp);
1082
1083	if (error == 0 && tsp != NULL) {
1084		if (retval != 0) {
1085			/*
1086			 * Compute how much time was left of the timeout,
1087			 * by subtracting the current time and the time
1088			 * before we started the call, and subtracting
1089			 * that result from the user-supplied value.
1090			 */
1091			nanotime(&ts1);
1092			timespecsub(&ts1, &ts0, &ts1);
1093			timespecsub(&uts, &ts1, &uts);
1094			if (uts.tv_sec < 0)
1095				timespecclear(&uts);
1096		} else {
1097			timespecclear(&uts);
1098		}
1099
1100		native_to_linux_timespec(&lts, &uts);
1101		error = copyout(&lts, SCARG(uap, timeout), sizeof(lts));
1102	}
1103
1104	return (error);
1105}
1106
1107int
1108linux_sys_ppoll(struct lwp *l,
1109	const struct linux_sys_ppoll_args *uap, register_t *retval)
1110{
1111	/* {
1112		syscallarg(struct pollfd *) fds;
1113		syscallarg(u_int) nfds;
1114		syscallarg(struct linux_timespec *) timeout;
1115		syscallarg(linux_sigset_t *) sigset;
1116	} */
1117	struct linux_timespec lts0, *lts;
1118	struct timespec ts0, *ts = NULL;
1119	linux_sigset_t lsigmask0, *lsigmask;
1120	sigset_t sigmask0, *sigmask = NULL;
1121	int error;
1122
1123	lts = SCARG(uap, timeout);
1124	if (lts) {
1125		if ((error = copyin(lts, &lts0, sizeof(lts0))) != 0)
1126			return error;
1127		linux_to_native_timespec(&ts0, &lts0);
1128		ts = &ts0;
1129	}
1130
1131	lsigmask = SCARG(uap, sigset);
1132	if (lsigmask) {
1133		if ((error = copyin(lsigmask, &lsigmask0, sizeof(lsigmask0))))
1134			return error;
1135		linux_to_native_sigset(&sigmask0, &lsigmask0);
1136		sigmask = &sigmask0;
1137	}
1138
1139	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds),
1140	    ts, sigmask);
1141}
1142
1143/*
1144 * Set the 'personality' (emulation mode) for the current process. Only
1145 * accept the Linux personality here (0). This call is needed because
1146 * the Linux ELF crt0 issues it in an ugly kludge to make sure that
1147 * ELF binaries run in Linux mode, not SVR4 mode.
1148 */
1149int
1150linux_sys_personality(struct lwp *l, const struct linux_sys_personality_args *uap, register_t *retval)
1151{
1152	/* {
1153		syscallarg(unsigned long) per;
1154	} */
1155	struct linux_emuldata *led;
1156	int per;
1157
1158	per = SCARG(uap, per);
1159	led = l->l_emuldata;
1160	if (per == LINUX_PER_QUERY) {
1161		retval[0] = led->led_personality;
1162		return 0;
1163	}
1164
1165	switch (per & LINUX_PER_MASK) {
1166	case LINUX_PER_LINUX:
1167	case LINUX_PER_LINUX32:
1168		led->led_personality = per;
1169		break;
1170
1171	default:
1172		return EINVAL;
1173	}
1174
1175	retval[0] = per;
1176	return 0;
1177}
1178
1179/*
1180 * We have nonexistent fsuid equal to uid.
1181 * If modification is requested, refuse.
1182 */
1183int
1184linux_sys_setfsuid(struct lwp *l, const struct linux_sys_setfsuid_args *uap, register_t *retval)
1185{
1186	 /* {
1187		 syscallarg(uid_t) uid;
1188	 } */
1189	 uid_t uid;
1190
1191	 uid = SCARG(uap, uid);
1192	 if (kauth_cred_getuid(l->l_cred) != uid)
1193		 return sys_nosys(l, uap, retval);
1194
1195	 *retval = uid;
1196	 return 0;
1197}
1198
1199int
1200linux_sys_setfsgid(struct lwp *l, const struct linux_sys_setfsgid_args *uap, register_t *retval)
1201{
1202	/* {
1203		syscallarg(gid_t) gid;
1204	} */
1205	gid_t gid;
1206
1207	gid = SCARG(uap, gid);
1208	if (kauth_cred_getgid(l->l_cred) != gid)
1209		return sys_nosys(l, uap, retval);
1210
1211	*retval = gid;
1212	return 0;
1213}
1214
1215int
1216linux_sys_setresuid(struct lwp *l, const struct linux_sys_setresuid_args *uap, register_t *retval)
1217{
1218	/* {
1219		syscallarg(uid_t) ruid;
1220		syscallarg(uid_t) euid;
1221		syscallarg(uid_t) suid;
1222	} */
1223
1224	/*
1225	 * Note: These checks are a little different than the NetBSD
1226	 * setreuid(2) call performs.  This precisely follows the
1227	 * behavior of the Linux kernel.
1228	 */
1229
1230	return do_setresuid(l, SCARG(uap, ruid), SCARG(uap, euid),
1231			    SCARG(uap, suid),
1232			    ID_R_EQ_R | ID_R_EQ_E | ID_R_EQ_S |
1233			    ID_E_EQ_R | ID_E_EQ_E | ID_E_EQ_S |
1234			    ID_S_EQ_R | ID_S_EQ_E | ID_S_EQ_S );
1235}
1236
1237int
1238linux_sys_getresuid(struct lwp *l, const struct linux_sys_getresuid_args *uap, register_t *retval)
1239{
1240	/* {
1241		syscallarg(uid_t *) ruid;
1242		syscallarg(uid_t *) euid;
1243		syscallarg(uid_t *) suid;
1244	} */
1245	kauth_cred_t pc = l->l_cred;
1246	int error;
1247	uid_t uid;
1248
1249	/*
1250	 * Linux copies these values out to userspace like so:
1251	 *
1252	 *	1. Copy out ruid.
1253	 *	2. If that succeeds, copy out euid.
1254	 *	3. If both of those succeed, copy out suid.
1255	 */
1256	uid = kauth_cred_getuid(pc);
1257	if ((error = copyout(&uid, SCARG(uap, ruid), sizeof(uid_t))) != 0)
1258		return (error);
1259
1260	uid = kauth_cred_geteuid(pc);
1261	if ((error = copyout(&uid, SCARG(uap, euid), sizeof(uid_t))) != 0)
1262		return (error);
1263
1264	uid = kauth_cred_getsvuid(pc);
1265
1266	return (copyout(&uid, SCARG(uap, suid), sizeof(uid_t)));
1267}
1268
1269int
1270linux_sys_ptrace(struct lwp *l, const struct linux_sys_ptrace_args *uap, register_t *retval)
1271{
1272	/* {
1273		i386, m68k, powerpc: T=int
1274		alpha, amd64: T=long
1275		syscallarg(T) request;
1276		syscallarg(T) pid;
1277		syscallarg(T) addr;
1278		syscallarg(T) data;
1279	} */
1280	const int *ptr;
1281	int request;
1282	int error;
1283
1284	ptr = linux_ptrace_request_map;
1285	request = SCARG(uap, request);
1286	while (*ptr != -1)
1287		if (*ptr++ == request) {
1288			struct sys_ptrace_args pta;
1289
1290			SCARG(&pta, req) = *ptr;
1291			SCARG(&pta, pid) = SCARG(uap, pid);
1292			SCARG(&pta, addr) = (void *)SCARG(uap, addr);
1293			SCARG(&pta, data) = SCARG(uap, data);
1294
1295			/*
1296			 * Linux ptrace(PTRACE_CONT, pid, 0, 0) means actually
1297			 * to continue where the process left off previously.
1298 			 * The same thing is achieved by addr == (void *) 1
1299			 * on NetBSD, so rewrite 'addr' appropriately.
1300			 */
1301			if (request == LINUX_PTRACE_CONT && SCARG(uap, addr)==0)
1302				SCARG(&pta, addr) = (void *) 1;
1303
1304			error = sysent[SYS_ptrace].sy_call(l, &pta, retval);
1305			if (error)
1306				return error;
1307			switch (request) {
1308			case LINUX_PTRACE_PEEKTEXT:
1309			case LINUX_PTRACE_PEEKDATA:
1310				error = copyout (retval,
1311				    (void *)SCARG(uap, data),
1312				    sizeof *retval);
1313				*retval = SCARG(uap, data);
1314				break;
1315			default:
1316				break;
1317			}
1318			return error;
1319		}
1320		else
1321			ptr++;
1322
1323	return LINUX_SYS_PTRACE_ARCH(l, uap, retval);
1324}
1325
1326int
1327linux_sys_reboot(struct lwp *l, const struct linux_sys_reboot_args *uap, register_t *retval)
1328{
1329	/* {
1330		syscallarg(int) magic1;
1331		syscallarg(int) magic2;
1332		syscallarg(int) cmd;
1333		syscallarg(void *) arg;
1334	} */
1335	struct sys_reboot_args /* {
1336		syscallarg(int) opt;
1337		syscallarg(char *) bootstr;
1338	} */ sra;
1339	int error;
1340
1341	if ((error = kauth_authorize_system(l->l_cred,
1342	    KAUTH_SYSTEM_REBOOT, 0, NULL, NULL, NULL)) != 0)
1343		return(error);
1344
1345	if (SCARG(uap, magic1) != LINUX_REBOOT_MAGIC1)
1346		return(EINVAL);
1347	if (SCARG(uap, magic2) != LINUX_REBOOT_MAGIC2 &&
1348	    SCARG(uap, magic2) != LINUX_REBOOT_MAGIC2A &&
1349	    SCARG(uap, magic2) != LINUX_REBOOT_MAGIC2B)
1350		return(EINVAL);
1351
1352	switch ((unsigned long)SCARG(uap, cmd)) {
1353	case LINUX_REBOOT_CMD_RESTART:
1354		SCARG(&sra, opt) = RB_AUTOBOOT;
1355		break;
1356	case LINUX_REBOOT_CMD_HALT:
1357		SCARG(&sra, opt) = RB_HALT;
1358		break;
1359	case LINUX_REBOOT_CMD_POWER_OFF:
1360		SCARG(&sra, opt) = RB_HALT|RB_POWERDOWN;
1361		break;
1362	case LINUX_REBOOT_CMD_RESTART2:
1363		/* Reboot with an argument. */
1364		SCARG(&sra, opt) = RB_AUTOBOOT|RB_STRING;
1365		SCARG(&sra, bootstr) = SCARG(uap, arg);
1366		break;
1367	case LINUX_REBOOT_CMD_CAD_ON:
1368		return(EINVAL);	/* We don't implement ctrl-alt-delete */
1369	case LINUX_REBOOT_CMD_CAD_OFF:
1370		return(0);
1371	default:
1372		return(EINVAL);
1373	}
1374
1375	return(sys_reboot(l, &sra, retval));
1376}
1377
1378/*
1379 * Copy of compat_12_sys_swapon().
1380 */
1381int
1382linux_sys_swapon(struct lwp *l, const struct linux_sys_swapon_args *uap, register_t *retval)
1383{
1384	/* {
1385		syscallarg(const char *) name;
1386	} */
1387	struct sys_swapctl_args ua;
1388
1389	SCARG(&ua, cmd) = SWAP_ON;
1390	SCARG(&ua, arg) = (void *)__UNCONST(SCARG(uap, name));
1391	SCARG(&ua, misc) = 0;	/* priority */
1392	return (sys_swapctl(l, &ua, retval));
1393}
1394
1395/*
1396 * Stop swapping to the file or block device specified by path.
1397 */
1398int
1399linux_sys_swapoff(struct lwp *l, const struct linux_sys_swapoff_args *uap, register_t *retval)
1400{
1401	/* {
1402		syscallarg(const char *) path;
1403	} */
1404	struct sys_swapctl_args ua;
1405
1406	SCARG(&ua, cmd) = SWAP_OFF;
1407	SCARG(&ua, arg) = __UNCONST(SCARG(uap, path)); /*XXXUNCONST*/
1408	return (sys_swapctl(l, &ua, retval));
1409}
1410
1411/*
1412 * Copy of compat_09_sys_setdomainname()
1413 */
1414/* ARGSUSED */
1415int
1416linux_sys_setdomainname(struct lwp *l, const struct linux_sys_setdomainname_args *uap, register_t *retval)
1417{
1418	/* {
1419		syscallarg(char *) domainname;
1420		syscallarg(int) len;
1421	} */
1422	int name[2];
1423
1424	name[0] = CTL_KERN;
1425	name[1] = KERN_DOMAINNAME;
1426	return (old_sysctl(&name[0], 2, 0, 0, SCARG(uap, domainname),
1427			    SCARG(uap, len), l));
1428}
1429
1430/*
1431 * sysinfo()
1432 */
1433/* ARGSUSED */
1434int
1435linux_sys_sysinfo(struct lwp *l, const struct linux_sys_sysinfo_args *uap, register_t *retval)
1436{
1437	/* {
1438		syscallarg(struct linux_sysinfo *) arg;
1439	} */
1440	struct linux_sysinfo si;
1441	struct loadavg *la;
1442	int64_t filepg;
1443
1444	memset(&si, 0, sizeof(si));
1445	si.uptime = time_uptime;
1446	la = &averunnable;
1447	si.loads[0] = la->ldavg[0] * LINUX_SYSINFO_LOADS_SCALE / la->fscale;
1448	si.loads[1] = la->ldavg[1] * LINUX_SYSINFO_LOADS_SCALE / la->fscale;
1449	si.loads[2] = la->ldavg[2] * LINUX_SYSINFO_LOADS_SCALE / la->fscale;
1450	si.totalram = ctob((u_long)physmem);
1451	/* uvm_availmem() may sync the counters. */
1452	si.freeram = (u_long)uvm_availmem(true) * uvmexp.pagesize;
1453	filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
1454	    cpu_count_get(CPU_COUNT_FILEDIRTY) +
1455	    cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
1456	    cpu_count_get(CPU_COUNT_EXECPAGES);
1457	si.sharedram = 0;	/* XXX */
1458	si.bufferram = (u_long)(filepg * uvmexp.pagesize);
1459	si.totalswap = (u_long)uvmexp.swpages * uvmexp.pagesize;
1460	si.freeswap =
1461	    (u_long)(uvmexp.swpages - uvmexp.swpginuse) * uvmexp.pagesize;
1462	si.procs = atomic_load_relaxed(&nprocs);
1463
1464	/* The following are only present in newer Linux kernels. */
1465	si.totalbig = 0;
1466	si.freebig = 0;
1467	si.mem_unit = 1;
1468
1469	return (copyout(&si, SCARG(uap, arg), sizeof si));
1470}
1471
1472int
1473linux_sys_getrlimit(struct lwp *l, const struct linux_sys_getrlimit_args *uap, register_t *retval)
1474{
1475	/* {
1476		syscallarg(int) which;
1477# ifdef LINUX_LARGEFILE64
1478		syscallarg(struct rlimit *) rlp;
1479# else
1480		syscallarg(struct orlimit *) rlp;
1481# endif
1482	} */
1483# ifdef LINUX_LARGEFILE64
1484	struct rlimit orl;
1485# else
1486	struct orlimit orl;
1487# endif
1488	int which;
1489
1490	which = linux_to_bsd_limit(SCARG(uap, which));
1491	if (which < 0)
1492		return -which;
1493
1494	memset(&orl, 0, sizeof(orl));
1495	bsd_to_linux_rlimit(&orl, &l->l_proc->p_rlimit[which]);
1496
1497	return copyout(&orl, SCARG(uap, rlp), sizeof(orl));
1498}
1499
1500int
1501linux_sys_setrlimit(struct lwp *l, const struct linux_sys_setrlimit_args *uap, register_t *retval)
1502{
1503	/* {
1504		syscallarg(int) which;
1505# ifdef LINUX_LARGEFILE64
1506		syscallarg(struct rlimit *) rlp;
1507# else
1508		syscallarg(struct orlimit *) rlp;
1509# endif
1510	} */
1511	struct rlimit rl;
1512# ifdef LINUX_LARGEFILE64
1513	struct rlimit orl;
1514# else
1515	struct orlimit orl;
1516# endif
1517	int error;
1518	int which;
1519
1520	if ((error = copyin(SCARG(uap, rlp), &orl, sizeof(orl))) != 0)
1521		return error;
1522
1523	which = linux_to_bsd_limit(SCARG(uap, which));
1524	if (which < 0)
1525		return -which;
1526
1527	linux_to_bsd_rlimit(&rl, &orl);
1528	return dosetrlimit(l, l->l_proc, which, &rl);
1529}
1530
# if !defined(__aarch64__) && !defined(__mips__) && !defined(__amd64__)
/*
 * ugetrlimit(2): handled by linux_sys_getrlimit() with the argument
 * structure passed through unchanged.
 * XXX: this doesn't look 100% common, at least mips doesn't have it
 */
int
linux_sys_ugetrlimit(struct lwp *l, const struct linux_sys_ugetrlimit_args *uap, register_t *retval)
{
	const void *args = uap;

	return linux_sys_getrlimit(l, args, retval);
}
# endif
1539
1540int
1541linux_sys_prlimit64(struct lwp *l, const struct linux_sys_prlimit64_args *uap, register_t *retval)
1542{
1543	/* {
1544		syscallarg(pid_t) pid;
1545		syscallarg(int) witch;
1546		syscallarg(struct rlimit *) new_rlp;
1547		syscallarg(struct rlimit *) old_rlp;
1548	}; */
1549	struct rlimit rl, nrl, orl;
1550	struct rlimit *p;
1551	int which;
1552	int error;
1553
1554	/* XXX: Cannot operate any process other than its own */
1555	if (SCARG(uap, pid) != 0)
1556		return EPERM;
1557
1558	which = linux_to_bsd_limit(SCARG(uap, which));
1559	if (which < 0)
1560		return -which;
1561
1562	p = SCARG(uap, old_rlp);
1563	if (p != NULL) {
1564		memset(&orl, 0, sizeof(orl));
1565		bsd_to_linux_rlimit64(&orl, &l->l_proc->p_rlimit[which]);
1566		if ((error = copyout(&orl, p, sizeof(orl))) != 0)
1567			return error;
1568	}
1569
1570	p = SCARG(uap, new_rlp);
1571	if (p != NULL) {
1572		if ((error = copyin(p, &nrl, sizeof(nrl))) != 0)
1573			return error;
1574
1575		linux_to_bsd_rlimit(&rl, &nrl);
1576		return dosetrlimit(l, l->l_proc, which, &rl);
1577	}
1578
1579	return 0;
1580}
1581
1582/*
1583 * This gets called for unsupported syscalls. The difference to sys_nosys()
1584 * is that process does not get SIGSYS, the call just returns with ENOSYS.
1585 * This is the way Linux does it and glibc depends on this behaviour.
1586 */
1587int
1588linux_sys_nosys(struct lwp *l, const void *v, register_t *retval)
1589{
1590	return (ENOSYS);
1591}
1592
1593int
1594linux_sys_getpriority(struct lwp *l, const struct linux_sys_getpriority_args *uap, register_t *retval)
1595{
1596        /* {
1597                syscallarg(int) which;
1598                syscallarg(int) who;
1599        } */
1600        struct sys_getpriority_args bsa;
1601        int error;
1602
1603        SCARG(&bsa, which) = SCARG(uap, which);
1604        SCARG(&bsa, who) = SCARG(uap, who);
1605
1606        if ((error = sys_getpriority(l, &bsa, retval)))
1607                return error;
1608
1609        *retval = NZERO - *retval;
1610
1611        return 0;
1612}
1613
1614int
1615linux_do_sys_utimensat(struct lwp *l, int fd, const char *path, struct timespec *tsp, int flags, register_t *retval)
1616{
1617	int follow, error;
1618
1619	follow = (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
1620
1621	if (path == NULL && fd != AT_FDCWD) {
1622		file_t *fp;
1623
1624		/* fd_getvnode() will use the descriptor for us */
1625		if ((error = fd_getvnode(fd, &fp)) != 0)
1626			return error;
1627		error = do_sys_utimensat(l, AT_FDCWD, fp->f_data, NULL, 0,
1628		    tsp, UIO_SYSSPACE);
1629		fd_putfile(fd);
1630		return error;
1631	}
1632
1633	return do_sys_utimensat(l, fd, NULL, path, follow, tsp, UIO_SYSSPACE);
1634}
1635
1636int
1637linux_sys_utimensat(struct lwp *l, const struct linux_sys_utimensat_args *uap,
1638	register_t *retval)
1639{
1640	/* {
1641		syscallarg(int) fd;
1642		syscallarg(const char *) path;
1643		syscallarg(const struct linux_timespec *) times;
1644		syscallarg(int) flag;
1645	} */
1646	int error;
1647	struct linux_timespec lts[2];
1648	struct timespec *tsp = NULL, ts[2];
1649
1650	if (SCARG(uap, times)) {
1651		error = copyin(SCARG(uap, times), &lts, sizeof(lts));
1652		if (error != 0)
1653			return error;
1654		linux_to_native_timespec(&ts[0], &lts[0]);
1655		linux_to_native_timespec(&ts[1], &lts[1]);
1656		tsp = ts;
1657	}
1658
1659	return linux_do_sys_utimensat(l, SCARG(uap, fd), SCARG(uap, path),
1660	    tsp, SCARG(uap, flag), retval);
1661}
1662
1663int
1664linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap,
1665	register_t *retval)
1666{
1667	/* {
1668		syscallarg(int *) uaddr;
1669		syscallarg(int) op;
1670		syscallarg(int) val;
1671		syscallarg(const struct linux_timespec *) timeout;
1672		syscallarg(int *) uaddr2;
1673		syscallarg(int) val3;
1674	} */
1675	struct linux_timespec lts;
1676	struct timespec ts, *tsp = NULL;
1677	int val2 = 0;
1678	int error;
1679
1680	/*
1681	 * Linux overlays the "timeout" field and the "val2" field.
1682	 * "timeout" is only valid for FUTEX_WAIT and FUTEX_WAIT_BITSET
1683	 * on Linux.
1684	 */
1685	const int op = (SCARG(uap, op) & FUTEX_CMD_MASK);
1686	if ((op == FUTEX_WAIT || op == FUTEX_WAIT_BITSET) &&
1687	    SCARG(uap, timeout) != NULL) {
1688		if ((error = copyin(SCARG(uap, timeout),
1689		    &lts, sizeof(lts))) != 0) {
1690			return error;
1691		}
1692		linux_to_native_timespec(&ts, &lts);
1693		tsp = &ts;
1694	} else {
1695		val2 = (int)(uintptr_t)SCARG(uap, timeout);
1696	}
1697
1698	return linux_do_futex(SCARG(uap, uaddr), SCARG(uap, op),
1699	    SCARG(uap, val), tsp, SCARG(uap, uaddr2), val2,
1700	    SCARG(uap, val3), retval);
1701}
1702
1703int
1704linux_do_futex(int *uaddr, int op, int val, struct timespec *timeout,
1705    int *uaddr2, int val2, int val3, register_t *retval)
1706{
1707	/*
1708	 * Always clear FUTEX_PRIVATE_FLAG for Linux processes.
1709	 * NetBSD-native futexes exist in different namespace
1710	 * depending on FUTEX_PRIVATE_FLAG.  This appears not
1711	 * to be the case in Linux, and some futex users will
1712	 * mix private and non-private ops on the same futex
1713	 * object.
1714	 */
1715	return do_futex(uaddr, op & ~FUTEX_PRIVATE_FLAG,
1716			val, timeout, uaddr2, val2, val3, retval);
1717}
1718
1719#define	LINUX_EFD_SEMAPHORE	0x0001
1720#define	LINUX_EFD_CLOEXEC	LINUX_O_CLOEXEC
1721#define	LINUX_EFD_NONBLOCK	LINUX_O_NONBLOCK
1722
1723static int
1724linux_do_eventfd2(struct lwp *l, unsigned int initval, int flags,
1725    register_t *retval)
1726{
1727	int nflags = 0;
1728
1729	if (flags & ~(LINUX_EFD_SEMAPHORE | LINUX_EFD_CLOEXEC |
1730		      LINUX_EFD_NONBLOCK)) {
1731		return EINVAL;
1732	}
1733	if (flags & LINUX_EFD_SEMAPHORE) {
1734		nflags |= EFD_SEMAPHORE;
1735	}
1736	if (flags & LINUX_EFD_CLOEXEC) {
1737		nflags |= EFD_CLOEXEC;
1738	}
1739	if (flags & LINUX_EFD_NONBLOCK) {
1740		nflags |= EFD_NONBLOCK;
1741	}
1742
1743	return do_eventfd(l, initval, nflags, retval);
1744}
1745
1746int
1747linux_sys_eventfd(struct lwp *l, const struct linux_sys_eventfd_args *uap,
1748    register_t *retval)
1749{
1750	/* {
1751		syscallarg(unsigned int) initval;
1752	} */
1753
1754	return linux_do_eventfd2(l, SCARG(uap, initval), 0, retval);
1755}
1756
1757int
1758linux_sys_eventfd2(struct lwp *l, const struct linux_sys_eventfd2_args *uap,
1759    register_t *retval)
1760{
1761	/* {
1762		syscallarg(unsigned int) initval;
1763		syscallarg(int) flags;
1764	} */
1765
1766	return linux_do_eventfd2(l, SCARG(uap, initval), SCARG(uap, flags),
1767				 retval);
1768}
1769
1770#ifndef __aarch64__
1771/*
1772 * epoll_create(2).  Check size and call sys_epoll_create1.
1773 */
1774int
1775linux_sys_epoll_create(struct lwp *l,
1776    const struct linux_sys_epoll_create_args *uap, register_t *retval)
1777{
1778	/* {
1779		syscallarg(int) size;
1780	} */
1781	struct sys_epoll_create1_args ca;
1782
1783	/*
1784	 * SCARG(uap, size) is unused.  Linux just tests it and then
1785	 * forgets it as well.
1786	 */
1787	if (SCARG(uap, size) <= 0)
1788		return EINVAL;
1789
1790	SCARG(&ca, flags) = 0;
1791	return sys_epoll_create1(l, &ca, retval);
1792}
1793#endif /* !__aarch64__ */
1794
1795/*
1796 * epoll_create1(2).  Translate the flags and call sys_epoll_create1.
1797 */
1798int
1799linux_sys_epoll_create1(struct lwp *l,
1800    const struct linux_sys_epoll_create1_args *uap, register_t *retval)
1801{
1802	/* {
1803		syscallarg(int) flags;
1804	} */
1805	struct sys_epoll_create1_args ca;
1806
1807        if ((SCARG(uap, flags) & ~(LINUX_O_CLOEXEC)) != 0)
1808		return EINVAL;
1809
1810	SCARG(&ca, flags) = 0;
1811	if ((SCARG(uap, flags) & LINUX_O_CLOEXEC) != 0)
1812		SCARG(&ca, flags) |= EPOLL_CLOEXEC;
1813
1814	return sys_epoll_create1(l, &ca, retval);
1815}
1816
1817/*
1818 * epoll_ctl(2).  Copyin event and translate it if necessary and then
1819 * call epoll_ctl_common().
1820 */
1821int
1822linux_sys_epoll_ctl(struct lwp *l, const struct linux_sys_epoll_ctl_args *uap,
1823    register_t *retval)
1824{
1825	/* {
1826		syscallarg(int) epfd;
1827		syscallarg(int) op;
1828		syscallarg(int) fd;
1829		syscallarg(struct linux_epoll_event *) event;
1830	} */
1831	struct linux_epoll_event lee;
1832	struct epoll_event ee;
1833	struct epoll_event *eep;
1834	int error;
1835
1836	if (SCARG(uap, op) != EPOLL_CTL_DEL) {
1837		error = copyin(SCARG(uap, event), &lee, sizeof(lee));
1838		if (error != 0)
1839			return error;
1840
1841		/*
1842		 * On some architectures, struct linux_epoll_event and
1843		 * struct epoll_event are packed differently... but otherwise
1844		 * the contents are the same.
1845		 */
1846		ee.events = lee.events;
1847		ee.data = lee.data;
1848
1849		eep = &ee;
1850	} else
1851		eep = NULL;
1852
1853	return epoll_ctl_common(l, retval, SCARG(uap, epfd), SCARG(uap, op),
1854	    SCARG(uap, fd), eep);
1855}
1856
1857#ifndef __aarch64__
1858/*
1859 * epoll_wait(2).  Call sys_epoll_pwait().
1860 */
1861int
1862linux_sys_epoll_wait(struct lwp *l,
1863    const struct linux_sys_epoll_wait_args *uap, register_t *retval)
1864{
1865	/* {
1866		syscallarg(int) epfd;
1867		syscallarg(struct linux_epoll_event *) events;
1868		syscallarg(int) maxevents;
1869		syscallarg(int) timeout;
1870	} */
1871	struct linux_sys_epoll_pwait_args ea;
1872
1873	SCARG(&ea, epfd) = SCARG(uap, epfd);
1874	SCARG(&ea, events) = SCARG(uap, events);
1875	SCARG(&ea, maxevents) = SCARG(uap, maxevents);
1876	SCARG(&ea, timeout) = SCARG(uap, timeout);
1877	SCARG(&ea, sigmask) = NULL;
1878
1879	return linux_sys_epoll_pwait(l, &ea, retval);
1880}
1881#endif /* !__aarch64__ */
1882
1883/*
1884 * Main body of epoll_pwait2(2).  Translate timeout and sigmask and
1885 * call epoll_wait_common.
1886 */
1887static int
1888linux_epoll_pwait2_common(struct lwp *l, register_t *retval, int epfd,
1889    struct linux_epoll_event *events, int maxevents,
1890    struct linux_timespec *timeout, const linux_sigset_t *sigmask)
1891{
1892	struct timespec ts, *tsp;
1893	linux_sigset_t lss;
1894	sigset_t ss, *ssp;
1895	struct epoll_event *eep;
1896	struct linux_epoll_event *leep;
1897	int i, error;
1898
1899	if (maxevents <= 0 || maxevents > EPOLL_MAX_EVENTS)
1900		return EINVAL;
1901
1902	if (timeout != NULL) {
1903		linux_to_native_timespec(&ts, timeout);
1904		tsp = &ts;
1905	} else
1906		tsp = NULL;
1907
1908	if (sigmask != NULL) {
1909		error = copyin(sigmask, &lss, sizeof(lss));
1910		if (error != 0)
1911			return error;
1912
1913		linux_to_native_sigset(&ss, &lss);
1914		ssp = &ss;
1915	} else
1916		ssp = NULL;
1917
1918	eep = kmem_alloc(maxevents * sizeof(*eep), KM_SLEEP);
1919
1920	error = epoll_wait_common(l, retval, epfd, eep, maxevents, tsp,
1921	    ssp);
1922	if (error == 0 && *retval > 0) {
1923		leep = kmem_alloc((*retval) * sizeof(*leep), KM_SLEEP);
1924
1925		/* Translate the events (because of packing). */
1926		for (i = 0; i < *retval; i++) {
1927			leep[i].events = eep[i].events;
1928			leep[i].data = eep[i].data;
1929		}
1930
1931		error = copyout(leep, events, (*retval) * sizeof(*leep));
1932		kmem_free(leep, (*retval) * sizeof(*leep));
1933	}
1934
1935	kmem_free(eep, maxevents * sizeof(*eep));
1936	return error;
1937}
1938
1939/*
1940 * epoll_pwait(2).  Translate timeout and call sys_epoll_pwait2.
1941 */
1942int
1943linux_sys_epoll_pwait(struct lwp *l,
1944    const struct linux_sys_epoll_pwait_args *uap, register_t *retval)
1945{
1946	/* {
1947		syscallarg(int) epfd;
1948		syscallarg(struct linux_epoll_event *) events;
1949		syscallarg(int) maxevents;
1950		syscallarg(int) timeout;
1951		syscallarg(linux_sigset_t *) sigmask;
1952	} */
1953        struct linux_timespec lts, *ltsp;
1954	const int timeout = SCARG(uap, timeout);
1955
1956	if (timeout >= 0) {
1957		/* Convert from milliseconds to timespec. */
1958		lts.tv_sec = timeout / 1000;
1959		lts.tv_nsec = (timeout % 1000) * 1000000;
1960
1961	        ltsp = &lts;
1962	} else
1963		ltsp = NULL;
1964
1965	return linux_epoll_pwait2_common(l, retval, SCARG(uap, epfd),
1966	    SCARG(uap, events), SCARG(uap, maxevents), ltsp,
1967	    SCARG(uap, sigmask));
1968}
1969
1970
1971/*
1972 * epoll_pwait2(2).  Copyin timeout and call linux_epoll_pwait2_common().
1973 */
1974int
1975linux_sys_epoll_pwait2(struct lwp *l,
1976    const struct linux_sys_epoll_pwait2_args *uap, register_t *retval)
1977{
1978	/* {
1979		syscallarg(int) epfd;
1980		syscallarg(struct linux_epoll_event *) events;
1981		syscallarg(int) maxevents;
1982	        syscallarg(struct linux_timespec *) timeout;
1983		syscallarg(linux_sigset_t *) sigmask;
1984	} */
1985	struct linux_timespec lts, *ltsp;
1986	int error;
1987
1988	if (SCARG(uap, timeout) != NULL) {
1989		error = copyin(SCARG(uap, timeout), &lts, sizeof(lts));
1990		if (error != 0)
1991			return error;
1992
1993		ltsp = &lts;
1994	} else
1995		ltsp = NULL;
1996
1997	return linux_epoll_pwait2_common(l, retval, SCARG(uap, epfd),
1998	    SCARG(uap, events), SCARG(uap, maxevents), ltsp,
1999	    SCARG(uap, sigmask));
2000}
2001
2002#define	LINUX_MFD_CLOEXEC	0x0001U
2003#define	LINUX_MFD_ALLOW_SEALING	0x0002U
2004#define	LINUX_MFD_HUGETLB	0x0004U
2005#define	LINUX_MFD_NOEXEC_SEAL	0x0008U
2006#define	LINUX_MFD_EXEC		0x0010U
2007#define	LINUX_MFD_HUGE_FLAGS	(0x3f << 26)
2008
2009#define	LINUX_MFD_ALL_FLAGS	(LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING \
2010				|LINUX_MFD_HUGETLB|LINUX_MFD_NOEXEC_SEAL \
2011				|LINUX_MFD_EXEC|LINUX_MFD_HUGE_FLAGS)
2012#define	LINUX_MFD_KNOWN_FLAGS	(LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING)
2013
2014#define LINUX_MFD_NAME_MAX	249
2015
2016/*
2017 * memfd_create(2).  Do some error checking and then call NetBSD's
2018 * version.
2019 */
2020int
2021linux_sys_memfd_create(struct lwp *l,
2022    const struct linux_sys_memfd_create_args *uap, register_t *retval)
2023{
2024	/* {
2025		syscallarg(const char *) name;
2026		syscallarg(unsigned int) flags;
2027	} */
2028	int error;
2029	char *pbuf;
2030	struct sys_memfd_create_args muap;
2031	const unsigned int lflags = SCARG(uap, flags);
2032
2033	KASSERT(LINUX_MFD_NAME_MAX < NAME_MAX); /* sanity check */
2034
2035	if (lflags & ~LINUX_MFD_ALL_FLAGS)
2036		return EINVAL;
2037	if ((lflags & LINUX_MFD_HUGE_FLAGS) != 0 &&
2038	    (lflags & LINUX_MFD_HUGETLB) == 0)
2039		return EINVAL;
2040	if ((lflags & LINUX_MFD_HUGETLB) && (lflags & LINUX_MFD_ALLOW_SEALING))
2041		return EINVAL;
2042
2043	/* Linux has a stricter limit for name size */
2044	pbuf = PNBUF_GET();
2045	error = copyinstr(SCARG(uap, name), pbuf, LINUX_MFD_NAME_MAX+1, NULL);
2046	PNBUF_PUT(pbuf);
2047	pbuf = NULL;
2048	if (error != 0) {
2049		if (error == ENAMETOOLONG)
2050			error = EINVAL;
2051		return error;
2052	}
2053
2054	if (lflags & ~LINUX_MFD_KNOWN_FLAGS) {
2055		DPRINTF(("linux_sys_memfd_create: ignored flags %x\n",
2056		    lflags & ~LINUX_MFD_KNOWN_FLAGS));
2057	}
2058
2059	SCARG(&muap, name) = SCARG(uap, name);
2060	SCARG(&muap, flags) = lflags & LINUX_MFD_KNOWN_FLAGS;
2061
2062	return sys_memfd_create(l, &muap, retval);
2063}
2064
2065#define	LINUX_CLOSE_RANGE_UNSHARE	0x02U
2066#define	LINUX_CLOSE_RANGE_CLOEXEC	0x04U
2067
2068/*
2069 * close_range(2).
2070 */
2071int
2072linux_sys_close_range(struct lwp *l,
2073    const struct linux_sys_close_range_args *uap, register_t *retval)
2074{
2075	/* {
2076		syscallarg(unsigned int) first;
2077		syscallarg(unsigned int) last;
2078		syscallarg(unsigned int) flags;
2079	} */
2080	unsigned int fd, last;
2081	file_t *fp;
2082	filedesc_t *fdp;
2083	const unsigned int flags = SCARG(uap, flags);
2084
2085	if (flags & ~(LINUX_CLOSE_RANGE_CLOEXEC|LINUX_CLOSE_RANGE_UNSHARE))
2086		return EINVAL;
2087	if (SCARG(uap, first) > SCARG(uap, last))
2088		return EINVAL;
2089
2090	if (flags & LINUX_CLOSE_RANGE_UNSHARE) {
2091		fdp = fd_copy();
2092		fd_free();
2093	        l->l_proc->p_fd = fdp;
2094	        l->l_fd = fdp;
2095	}
2096
2097	last = MIN(SCARG(uap, last), l->l_proc->p_fd->fd_lastfile);
2098	for (fd = SCARG(uap, first); fd <= last; fd++) {
2099		fp = fd_getfile(fd);
2100		if (fp == NULL)
2101			continue;
2102
2103		if (flags & LINUX_CLOSE_RANGE_CLOEXEC) {
2104			fd_set_exclose(l, fd, true);
2105			fd_putfile(fd);
2106		} else
2107			fd_close(fd);
2108	}
2109
2110	return 0;
2111}
2112
2113/*
2114 * readahead(2).  Call posix_fadvise with POSIX_FADV_WILLNEED with some extra
2115 * error checking.
2116 */
2117int
2118linux_sys_readahead(struct lwp *l, const struct linux_sys_readahead_args *uap,
2119    register_t *retval)
2120{
2121	/* {
2122		syscallarg(int) fd;
2123		syscallarg(off_t) offset;
2124		syscallarg(size_t) count;
2125	} */
2126	file_t *fp;
2127	int error = 0;
2128	const int fd = SCARG(uap, fd);
2129
2130	fp = fd_getfile(fd);
2131	if (fp == NULL)
2132		return EBADF;
2133	if ((fp->f_flag & FREAD) == 0)
2134		error = EBADF;
2135	else if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VREG)
2136		error = EINVAL;
2137	fd_putfile(fd);
2138	if (error != 0)
2139		return error;
2140
2141	return do_posix_fadvise(fd, SCARG(uap, offset), SCARG(uap, count),
2142	    POSIX_FADV_WILLNEED);
2143}
2144