1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2002 Doug Rabson
 * Copyright (c) 1994-1995 Søren Schmidt
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer
13 *    in this position and unchanged.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. The name of the author may not be used to endorse or promote products
18 *    derived from this software without specific prior written permission
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include "opt_compat.h"
36
37#include <sys/param.h>
38#include <sys/blist.h>
39#include <sys/fcntl.h>
40#if defined(__i386__)
41#include <sys/imgact_aout.h>
42#endif
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/limits.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/mman.h>
49#include <sys/mount.h>
50#include <sys/msgbuf.h>
51#include <sys/mutex.h>
52#include <sys/namei.h>
53#include <sys/priv.h>
54#include <sys/proc.h>
55#include <sys/procctl.h>
56#include <sys/reboot.h>
57#include <sys/racct.h>
58#include <sys/random.h>
59#include <sys/resourcevar.h>
60#include <sys/sched.h>
61#include <sys/sdt.h>
62#include <sys/signalvar.h>
63#include <sys/stat.h>
64#include <sys/syscallsubr.h>
65#include <sys/sysctl.h>
66#include <sys/sysproto.h>
67#include <sys/systm.h>
68#include <sys/time.h>
69#include <sys/vmmeter.h>
70#include <sys/vnode.h>
71#include <sys/wait.h>
72#include <sys/cpuset.h>
73#include <sys/uio.h>
74
75#include <security/mac/mac_framework.h>
76
77#include <vm/vm.h>
78#include <vm/pmap.h>
79#include <vm/vm_kern.h>
80#include <vm/vm_map.h>
81#include <vm/vm_extern.h>
82#include <vm/swap_pager.h>
83
84#ifdef COMPAT_LINUX32
85#include <machine/../linux32/linux.h>
86#include <machine/../linux32/linux32_proto.h>
87#else
88#include <machine/../linux/linux.h>
89#include <machine/../linux/linux_proto.h>
90#endif
91
92#include <compat/linux/linux_dtrace.h>
93#include <compat/linux/linux_file.h>
94#include <compat/linux/linux_mib.h>
95#include <compat/linux/linux_signal.h>
96#include <compat/linux/linux_timer.h>
97#include <compat/linux/linux_util.h>
98#include <compat/linux/linux_sysproto.h>
99#include <compat/linux/linux_emul.h>
100#include <compat/linux/linux_misc.h>
101
102int stclohz;				/* Statistics clock frequency */
103
/*
 * Translation table from Linux resource-limit numbers to the native
 * RLIMIT_* values, indexed by the Linux resource number.
 */
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
	RLIMIT_MEMLOCK, RLIMIT_AS
};
109
/* Linux layout of the structure returned by sysinfo(2). */
struct l_sysinfo {
	l_long		uptime;		/* Seconds since boot */
	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
#define LINUX_SYSINFO_LOADS_SCALE 65536
	l_ulong		totalram;	/* Total usable main memory size */
	l_ulong		freeram;	/* Available memory size */
	l_ulong		sharedram;	/* Amount of shared memory */
	l_ulong		bufferram;	/* Memory used by buffers */
	l_ulong		totalswap;	/* Total swap space size */
	l_ulong		freeswap;	/* swap space still available */
	l_ushort	procs;		/* Number of current processes */
	l_ushort	pads;
	l_ulong		totalhigh;	/* Total high memory size */
	l_ulong		freehigh;	/* Available high memory size */
	l_uint		mem_unit;	/* Memory unit size in bytes */
	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
};
127
/*
 * Sixth argument of Linux pselect6(2): a pointer/length pair
 * describing the signal mask to apply for the duration of the call.
 */
struct l_pselect6arg {
	l_uintptr_t	ss;		/* user pointer to the sigset */
	l_size_t	ss_len;		/* size of that sigset, in bytes */
};

static int	linux_utimensat_nsec_valid(l_long);
134
135
136int
137linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
138{
139	struct l_sysinfo sysinfo;
140	int i, j;
141	struct timespec ts;
142
143	bzero(&sysinfo, sizeof(sysinfo));
144	getnanouptime(&ts);
145	if (ts.tv_nsec != 0)
146		ts.tv_sec++;
147	sysinfo.uptime = ts.tv_sec;
148
149	/* Use the information from the mib to get our load averages */
150	for (i = 0; i < 3; i++)
151		sysinfo.loads[i] = averunnable.ldavg[i] *
152		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
153
154	sysinfo.totalram = physmem * PAGE_SIZE;
155	sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
156
157	/*
158	 * sharedram counts pages allocated to named, swap-backed objects such
159	 * as shared memory segments and tmpfs files.  There is no cheap way to
160	 * compute this, so just leave the field unpopulated.  Linux itself only
161	 * started setting this field in the 3.x timeframe.
162	 */
163	sysinfo.sharedram = 0;
164	sysinfo.bufferram = 0;
165
166	swap_pager_status(&i, &j);
167	sysinfo.totalswap = i * PAGE_SIZE;
168	sysinfo.freeswap = (i - j) * PAGE_SIZE;
169
170	sysinfo.procs = nprocs;
171
172	/*
173	 * Platforms supported by the emulation layer do not have a notion of
174	 * high memory.
175	 */
176	sysinfo.totalhigh = 0;
177	sysinfo.freehigh = 0;
178
179	sysinfo.mem_unit = 1;
180
181	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
182}
183
184#ifdef LINUX_LEGACY_SYSCALLS
/*
 * alarm(2): arm (or, with secs == 0, cancel) the process' one-shot
 * SIGALRM timer and return the number of seconds remaining on the
 * previously armed timer.
 */
int
linux_alarm(struct thread *td, struct linux_alarm_args *args)
{
	struct itimerval it, old_it;
	u_int secs;
	int error;

	secs = args->secs;
	/*
	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
	 * to match kern_setitimer()'s limit to avoid error from it.
	 *
	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
	 * platforms.
	 */
	if (secs > INT32_MAX / 2)
		secs = INT32_MAX / 2;

	it.it_value.tv_sec = secs;
	it.it_value.tv_usec = 0;
	timevalclear(&it.it_interval);
	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
	KASSERT(error == 0, ("kern_setitimer returns %d", error));

	/*
	 * Round the remaining time to whole seconds: up when the timer is
	 * still armed but below one second (never report 0 for a live
	 * timer), to nearest otherwise.
	 */
	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
	    old_it.it_value.tv_usec >= 500000)
		old_it.it_value.tv_sec++;
	td->td_retval[0] = old_it.it_value.tv_sec;
	return (0);
}
215#endif
216
/*
 * brk(2): Linux semantics differ from the native syscall -- on success
 * the new break address is returned, and on failure the current break
 * is returned rather than an error.
 */
int
linux_brk(struct thread *td, struct linux_brk_args *args)
{
	struct vmspace *vm = td->td_proc->p_vmspace;
	uintptr_t new, old;

	old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
	new = (uintptr_t)args->dsend;
	/* Only attempt the move when the target lies above the data start. */
	if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
		td->td_retval[0] = (register_t)new;
	else
		td->td_retval[0] = (register_t)old;

	return (0);
}
232
233#if defined(__i386__)
234/* XXX: what about amd64/linux32? */
235
/*
 * uselib(2): map a Linux a.out shared library into the calling
 * process' address space.  Only ZMAGIC and QMAGIC images are accepted.
 * The vnode is opened, sanity-checked much like exec_check_permissions()
 * would, mapped (or read in, for unaligned file offsets), and BSS space
 * is allocated after text+data.  All failure paths funnel through the
 * "cleanup" label, which unwinds the open/lock/text-ref state tracked
 * by the opened/locked/textset flags.
 */
int
linux_uselib(struct thread *td, struct linux_uselib_args *args)
{
	struct nameidata ni;
	struct vnode *vp;
	struct exec *a_out;
	vm_map_t map;
	vm_map_entry_t entry;
	struct vattr attr;
	vm_offset_t vmaddr;
	unsigned long file_offset;
	unsigned long bss_size;
	char *library;
	ssize_t aresid;
	int error;
	bool locked, opened, textset;

	LCONVPATHEXIST(td, args->library, &library);

	a_out = NULL;
	vp = NULL;
	locked = false;
	textset = false;
	opened = false;

	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, library, td);
	error = namei(&ni);
	LFREEPATH(library);
	if (error)
		goto cleanup;

	vp = ni.ni_vp;
	NDFREE(&ni, NDF_ONLY_PNBUF);

	/*
	 * From here on down, we have a locked vnode that must be unlocked.
	 * XXX: The code below largely duplicates exec_check_permissions().
	 */
	locked = true;

	/* Executable? */
	error = VOP_GETATTR(vp, &attr, td->td_ucred);
	if (error)
		goto cleanup;

	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
		/* EACCESS is what exec(2) returns. */
		error = ENOEXEC;
		goto cleanup;
	}

	/* Sensible size? */
	if (attr.va_size == 0) {
		error = ENOEXEC;
		goto cleanup;
	}

	/* Can we access it? */
	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
	if (error)
		goto cleanup;

	/*
	 * XXX: This should use vn_open() so that it is properly authorized,
	 * and to reduce code redundancy all over the place here.
	 * XXX: Not really, it duplicates far more of exec_check_permissions()
	 * than vn_open().
	 */
#ifdef MAC
	error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
	if (error)
		goto cleanup;
#endif
	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
	if (error)
		goto cleanup;
	opened = true;

	/* Pull in executable header into exec_map */
	error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
	    VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
	if (error)
		goto cleanup;

	/* Is it a Linux binary ? */
	if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
		error = ENOEXEC;
		goto cleanup;
	}

	/*
	 * While we are here, we should REALLY do some more checks
	 */

	/* Set file/virtual offset based on a.out variant. */
	switch ((int)(a_out->a_magic & 0xffff)) {
	case 0413:			/* ZMAGIC */
		file_offset = 1024;
		break;
	case 0314:			/* QMAGIC */
		file_offset = 0;
		break;
	default:
		error = ENOEXEC;
		goto cleanup;
	}

	bss_size = round_page(a_out->a_bss);

	/* Check various fields in header for validity/bounds. */
	if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
		error = ENOEXEC;
		goto cleanup;
	}

	/* text + data can't exceed file size */
	if (a_out->a_data + a_out->a_text > attr.va_size) {
		error = EFAULT;
		goto cleanup;
	}

	/*
	 * text/data/bss must not exceed limits
	 * XXX - this is not complete. it should check current usage PLUS
	 * the resources needed by this library.
	 */
	PROC_LOCK(td->td_proc);
	if (a_out->a_text > maxtsiz ||
	    a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
	    racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
	    bss_size) != 0) {
		PROC_UNLOCK(td->td_proc);
		error = ENOMEM;
		goto cleanup;
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * Prevent more writers.
	 */
	error = VOP_SET_TEXT(vp);
	if (error != 0)
		goto cleanup;
	textset = true;

	/*
	 * Lock no longer needed
	 */
	locked = false;
	VOP_UNLOCK(vp, 0);

	/*
	 * Check if file_offset page aligned. Currently we cannot handle
	 * misalinged file offsets, and so we read in the entire image
	 * (what a waste).
	 */
	if (file_offset & PAGE_MASK) {
		/* Map text+data read/write/execute */

		/* a_entry is the load address and is page aligned */
		vmaddr = trunc_page(a_out->a_entry);

		/* get anon user mapping, read+write+execute */
		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
		    &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
		    VM_PROT_ALL, VM_PROT_ALL, 0);
		if (error)
			goto cleanup;

		error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
		    a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
		    td->td_ucred, NOCRED, &aresid, td);
		if (error != 0)
			goto cleanup;
		/* A short read means a truncated image. */
		if (aresid != 0) {
			error = ENOEXEC;
			goto cleanup;
		}
	} else {
		/*
		 * for QMAGIC, a_entry is 20 bytes beyond the load address
		 * to skip the executable header
		 */
		vmaddr = trunc_page(a_out->a_entry);

		/*
		 * Map it all into the process's space as a single
		 * copy-on-write "data" segment.
		 */
		map = &td->td_proc->p_vmspace->vm_map;
		error = vm_mmap(map, &vmaddr,
		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
		if (error)
			goto cleanup;
		/* Tag the entry so the text reference is dropped on unmap. */
		vm_map_lock(map);
		if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
			vm_map_unlock(map);
			error = EDOOFUS;
			goto cleanup;
		}
		entry->eflags |= MAP_ENTRY_VN_EXEC;
		vm_map_unlock(map);
		/* The mapping now owns the text reference; don't unset it. */
		textset = false;
	}

	if (bss_size != 0) {
		/* Calculate BSS start address */
		vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
		    a_out->a_data;

		/* allocate some 'anon' space */
		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
		    &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
		    VM_PROT_ALL, 0);
		if (error)
			goto cleanup;
	}

cleanup:
	if (opened) {
		if (locked)
			VOP_UNLOCK(vp, 0);
		locked = false;
		VOP_CLOSE(vp, FREAD, td->td_ucred, td);
	}
	if (textset) {
		if (!locked) {
			locked = true;
			VOP_LOCK(vp, LK_SHARED | LK_RETRY);
		}
		VOP_UNSET_TEXT_CHECKED(vp);
	}
	if (locked)
		VOP_UNLOCK(vp, 0);

	/* Release the temporary mapping. */
	if (a_out)
		kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);

	return (error);
}
480
481#endif	/* __i386__ */
482
483#ifdef LINUX_LEGACY_SYSCALLS
/*
 * select(2): forward to the native implementation, then emulate the
 * Linux behavior of writing the unslept time back into *timeout.
 */
int
linux_select(struct thread *td, struct linux_select_args *args)
{
	l_timeval ltv;
	struct timeval tv0, tv1, utv, *tvp;
	int error;

	/*
	 * Store current time for computation of the amount of
	 * time left.
	 */
	if (args->timeout) {
		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
			goto select_out;
		utv.tv_sec = ltv.tv_sec;
		utv.tv_usec = ltv.tv_usec;

		if (itimerfix(&utv)) {
			/*
			 * The timeval was invalid.  Convert it to something
			 * valid that will act as it does under Linux.
			 */
			utv.tv_sec += utv.tv_usec / 1000000;
			utv.tv_usec %= 1000000;
			if (utv.tv_usec < 0) {
				utv.tv_sec -= 1;
				utv.tv_usec += 1000000;
			}
			if (utv.tv_sec < 0)
				timevalclear(&utv);
		}
		microtime(&tv0);
		tvp = &utv;
	} else
		tvp = NULL;

	error = kern_select(td, args->nfds, args->readfds, args->writefds,
	    args->exceptfds, tvp, LINUX_NFDBITS);
	if (error)
		goto select_out;

	if (args->timeout) {
		if (td->td_retval[0]) {
			/*
			 * Compute how much time was left of the timeout,
			 * by subtracting the current time and the time
			 * before we started the call, and subtracting
			 * that result from the user-supplied value.
			 */
			microtime(&tv1);
			timevalsub(&tv1, &tv0);
			timevalsub(&utv, &tv1);
			if (utv.tv_sec < 0)
				timevalclear(&utv);
		} else
			/* Timed out: report zero time remaining. */
			timevalclear(&utv);
		ltv.tv_sec = utv.tv_sec;
		ltv.tv_usec = utv.tv_usec;
		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
			goto select_out;
	}

select_out:
	return (error);
}
549#endif
550
/*
 * mremap(2): only the shrinking case is implemented -- the tail of the
 * mapping is unmapped.  Growing a mapping fails with ENOMEM even when
 * MREMAP_MAYMOVE is set.  On failure 0 is placed in the return value,
 * matching Linux's MAP_FAILED-style convention.
 */
int
linux_mremap(struct thread *td, struct linux_mremap_args *args)
{
	uintptr_t addr;
	size_t len;
	int error = 0;

	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
		td->td_retval[0] = 0;
		return (EINVAL);
	}

	/*
	 * Check for the page alignment.
	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
	 */
	if (args->addr & PAGE_MASK) {
		td->td_retval[0] = 0;
		return (EINVAL);
	}

	args->new_len = round_page(args->new_len);
	args->old_len = round_page(args->old_len);

	if (args->new_len > args->old_len) {
		td->td_retval[0] = 0;
		return (ENOMEM);
	}

	if (args->new_len < args->old_len) {
		/* Unmap the region between the new and the old end. */
		addr = args->addr + args->new_len;
		len = args->old_len - args->new_len;
		error = kern_munmap(td, addr, len);
	}

	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
	return (error);
}
589
590#define LINUX_MS_ASYNC       0x0001
591#define LINUX_MS_INVALIDATE  0x0002
592#define LINUX_MS_SYNC        0x0004
593
594int
595linux_msync(struct thread *td, struct linux_msync_args *args)
596{
597
598	return (kern_msync(td, args->addr, args->len,
599	    args->fl & ~LINUX_MS_SYNC));
600}
601
602#ifdef LINUX_LEGACY_SYSCALLS
603int
604linux_time(struct thread *td, struct linux_time_args *args)
605{
606	struct timeval tv;
607	l_time_t tm;
608	int error;
609
610	microtime(&tv);
611	tm = tv.tv_sec;
612	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
613		return (error);
614	td->td_retval[0] = tm;
615	return (0);
616}
617#endif
618
/* Linux layout of the structure filled in by times(2). */
struct l_times_argv {
	l_clock_t	tms_utime;	/* user CPU time */
	l_clock_t	tms_stime;	/* system CPU time */
	l_clock_t	tms_cutime;	/* user CPU time of reaped children */
	l_clock_t	tms_cstime;	/* system CPU time of reaped children */
};


/*
 * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 * auxiliary vector entry.
 */
#define	CLK_TCK		100

/* Convert a timeval to clock ticks: old (fixed) and new (stclohz) rate. */
#define	CONVOTCK(r)	(r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
#define	CONVNTCK(r)	(r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))

/* Pick the conversion matching the emulated kernel version. */
#define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER_2004000 ?		\
			    CONVNTCK(r) : CONVOTCK(r))
639
/*
 * times(2): report the process' CPU time usage in clock ticks and
 * return the uptime (also in ticks) as the syscall result.
 */
int
linux_times(struct thread *td, struct linux_times_args *args)
{
	struct timeval tv, utime, stime, cutime, cstime;
	struct l_times_argv tms;
	struct proc *p;
	int error;

	/* Linux tolerates a NULL buf; only the return value is produced. */
	if (args->buf != NULL) {
		p = td->td_proc;
		PROC_LOCK(p);
		PROC_STATLOCK(p);
		calcru(p, &utime, &stime);
		PROC_STATUNLOCK(p);
		calccru(p, &cutime, &cstime);
		PROC_UNLOCK(p);

		tms.tms_utime = CONVTCK(utime);
		tms.tms_stime = CONVTCK(stime);

		tms.tms_cutime = CONVTCK(cutime);
		tms.tms_cstime = CONVTCK(cstime);

		if ((error = copyout(&tms, args->buf, sizeof(tms))))
			return (error);
	}

	microuptime(&tv);
	td->td_retval[0] = (int)CONVTCK(tv);
	return (0);
}
671
/*
 * uname(2): build a Linux struct new_utsname from the emulated OS
 * name/release, the credential's host and domain names, and the kernel
 * version string (truncated at its first newline).
 */
int
linux_newuname(struct thread *td, struct linux_newuname_args *args)
{
	struct l_new_utsname utsname;
	char osname[LINUX_MAX_UTSNAME];
	char osrelease[LINUX_MAX_UTSNAME];
	char *p;

	linux_get_osname(td, osname);
	linux_get_osrelease(td, osrelease);

	bzero(&utsname, sizeof(utsname));
	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
	/* Keep only the first line of the (multi-line) version string. */
	for (p = utsname.version; *p != '\0'; ++p)
		if (*p == '\n') {
			*p = '\0';
			break;
		}
#if defined(__amd64__)
	/*
	 * On amd64, Linux uname(2) needs to return "x86_64"
	 * for both 64-bit and 32-bit applications.  On 32-bit,
	 * the string returned by getauxval(AT_PLATFORM) needs
	 * to remain "i686", though.
	 */
	strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
#else
	strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
#endif

	return (copyout(&utsname, args->buf, sizeof(utsname)));
}
708
/* Linux layout of the utimbuf structure passed to utime(2). */
struct l_utimbuf {
	l_time_t l_actime;	/* access time, seconds since the Epoch */
	l_time_t l_modtime;	/* modification time, seconds since the Epoch */
};
713
714#ifdef LINUX_LEGACY_SYSCALLS
/*
 * utime(2): set the access/modification times of a file from a
 * struct utimbuf (whole seconds), or to the current time when the
 * times argument is NULL.
 */
int
linux_utime(struct thread *td, struct linux_utime_args *args)
{
	struct timeval tv[2], *tvp;
	struct l_utimbuf lut;
	char *fname;
	int error;

	LCONVPATHEXIST(td, args->fname, &fname);

	if (args->times) {
		if ((error = copyin(args->times, &lut, sizeof lut))) {
			LFREEPATH(fname);
			return (error);
		}
		/* Widen the second-granularity Linux times to timevals. */
		tv[0].tv_sec = lut.l_actime;
		tv[0].tv_usec = 0;
		tv[1].tv_sec = lut.l_modtime;
		tv[1].tv_usec = 0;
		tvp = tv;
	} else
		tvp = NULL;

	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
	    UIO_SYSSPACE);
	LFREEPATH(fname);
	return (error);
}
743#endif
744
745#ifdef LINUX_LEGACY_SYSCALLS
/*
 * utimes(2): set the access/modification times of a file from a pair
 * of timevals, or to the current time when tptr is NULL.
 */
int
linux_utimes(struct thread *td, struct linux_utimes_args *args)
{
	l_timeval ltv[2];
	struct timeval tv[2], *tvp = NULL;
	char *fname;
	int error;

	LCONVPATHEXIST(td, args->fname, &fname);

	if (args->tptr != NULL) {
		if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
			LFREEPATH(fname);
			return (error);
		}
		tv[0].tv_sec = ltv[0].tv_sec;
		tv[0].tv_usec = ltv[0].tv_usec;
		tv[1].tv_sec = ltv[1].tv_sec;
		tv[1].tv_usec = ltv[1].tv_usec;
		tvp = tv;
	}

	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
	    tvp, UIO_SYSSPACE);
	LFREEPATH(fname);
	return (error);
}
773#endif
774
775static int
776linux_utimensat_nsec_valid(l_long nsec)
777{
778
779	if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW)
780		return (0);
781	if (nsec >= 0 && nsec <= 999999999)
782		return (0);
783	return (1);
784}
785
/*
 * utimensat(2): set a file's timestamps with nanosecond resolution,
 * translating the Linux UTIME_OMIT/UTIME_NOW tokens and the
 * AT_SYMLINK_NOFOLLOW flag to their native equivalents.  A NULL
 * pathname operates on the descriptor itself (futimens-style).
 */
int
linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
{
	struct l_timespec l_times[2];
	struct timespec times[2], *timesp = NULL;
	char *path = NULL;
	int error, dfd, flags = 0;

	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;

	if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
		return (EINVAL);

	if (args->times != NULL) {
		error = copyin(args->times, l_times, sizeof(l_times));
		if (error != 0)
			return (error);

		if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
		    linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
			return (EINVAL);

		/* Map the Linux special tokens onto the native ones. */
		times[0].tv_sec = l_times[0].tv_sec;
		switch (l_times[0].tv_nsec)
		{
		case LINUX_UTIME_OMIT:
			times[0].tv_nsec = UTIME_OMIT;
			break;
		case LINUX_UTIME_NOW:
			times[0].tv_nsec = UTIME_NOW;
			break;
		default:
			times[0].tv_nsec = l_times[0].tv_nsec;
		}

		times[1].tv_sec = l_times[1].tv_sec;
		switch (l_times[1].tv_nsec)
		{
		case LINUX_UTIME_OMIT:
			times[1].tv_nsec = UTIME_OMIT;
			break;
		case LINUX_UTIME_NOW:
			times[1].tv_nsec = UTIME_NOW;
			break;
		default:
			times[1].tv_nsec = l_times[1].tv_nsec;
			break;
		}
		timesp = times;

		/* This breaks POSIX, but is what the Linux kernel does
		 * _on purpose_ (documented in the man page for utimensat(2)),
		 * so we must follow that behaviour. */
		if (times[0].tv_nsec == UTIME_OMIT &&
		    times[1].tv_nsec == UTIME_OMIT)
			return (0);
	}

	if (args->pathname != NULL)
		LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
	else if (args->flags != 0)
		/* Linux rejects flags when operating on the fd itself. */
		return (EINVAL);

	if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
		flags |= AT_SYMLINK_NOFOLLOW;

	if (path == NULL)
		error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
	else {
		error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
			UIO_SYSSPACE, flags);
		LFREEPATH(path);
	}

	return (error);
}
862
863#ifdef LINUX_LEGACY_SYSCALLS
/*
 * futimesat(2): like utimes(2), but the path is resolved relative to
 * the directory descriptor dfd (LINUX_AT_FDCWD meaning the CWD).
 */
int
linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
{
	l_timeval ltv[2];
	struct timeval tv[2], *tvp = NULL;
	char *fname;
	int error, dfd;

	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
	LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);

	if (args->utimes != NULL) {
		if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
			LFREEPATH(fname);
			return (error);
		}
		tv[0].tv_sec = ltv[0].tv_sec;
		tv[0].tv_usec = ltv[0].tv_usec;
		tv[1].tv_sec = ltv[1].tv_sec;
		tv[1].tv_usec = ltv[1].tv_usec;
		tvp = tv;
	}

	error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
	LFREEPATH(fname);
	return (error);
}
891#endif
892
/*
 * Common back end for the Linux wait-family syscalls: translate the
 * Linux pid encoding (-1 = any child, <-1 = process group, >0 = exact
 * pid) into a kern_wait6() idtype/id pair, perform the wait, and
 * rewrite the signal numbers embedded in the status word from BSD to
 * Linux numbering before copying it out to *statusp (if non-NULL).
 */
static int
linux_common_wait(struct thread *td, int pid, int *statusp,
    int options, struct __wrusage *wrup)
{
	siginfo_t siginfo;
	idtype_t idtype;
	id_t id;
	int error, status, tmpstat;

	if (pid == WAIT_ANY) {
		idtype = P_ALL;
		id = 0;
	} else if (pid < 0) {
		idtype = P_PGID;
		id = (id_t)-pid;
	} else {
		idtype = P_PID;
		id = (id_t)pid;
	}

	/*
	 * For backward compatibility we implicitly add flags WEXITED
	 * and WTRAPPED here.
	 */
	options |= WEXITED | WTRAPPED;
	error = kern_wait6(td, idtype, id, &status, options, wrup, &siginfo);
	if (error)
		return (error);

	if (statusp) {
		tmpstat = status & 0xffff;
		if (WIFSIGNALED(tmpstat)) {
			/* Termination signal lives in the low 7 bits. */
			tmpstat = (tmpstat & 0xffffff80) |
			    bsd_to_linux_signal(WTERMSIG(tmpstat));
		} else if (WIFSTOPPED(tmpstat)) {
			/* Stop signal lives in bits 8-15. */
			tmpstat = (tmpstat & 0xffff00ff) |
			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
#if defined(__amd64__) && !defined(COMPAT_LINUX32)
			if (WSTOPSIG(status) == SIGTRAP) {
				tmpstat = linux_ptrace_status(td,
				    siginfo.si_pid, tmpstat);
			}
#endif
		} else if (WIFCONTINUED(tmpstat)) {
			/* Linux encodes "continued" as 0xffff. */
			tmpstat = 0xffff;
		}
		error = copyout(&tmpstat, statusp, sizeof(int));
	}

	return (error);
}
944
945#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
946int
947linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
948{
949	struct linux_wait4_args wait4_args;
950
951	wait4_args.pid = args->pid;
952	wait4_args.status = args->status;
953	wait4_args.options = args->options;
954	wait4_args.rusage = NULL;
955
956	return (linux_wait4(td, &wait4_args));
957}
958#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
959
/*
 * wait4(2): validate the Linux option bits, translate them to native
 * wait options, wait via linux_common_wait(), and optionally copy the
 * child's resource usage out in Linux format.
 */
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
	int error, options;
	struct __wrusage wru, *wrup;

	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
		return (EINVAL);

	options = WEXITED;
	linux_to_bsd_waitopts(args->options, &options);

	/* Only gather rusage if the caller asked for it. */
	if (args->rusage != NULL)
		wrup = &wru;
	else
		wrup = NULL;
	error = linux_common_wait(td, args->pid, args->status, options, wrup);
	if (error != 0)
		return (error);
	if (args->rusage != NULL)
		error = linux_copyout_rusage(&wru.wru_self, args->rusage);
	return (error);
}
984
/*
 * waitid(2): wait for state changes in a child selected by
 * idtype/id, optionally returning rusage and a Linux siginfo
 * describing the event.
 */
int
linux_waitid(struct thread *td, struct linux_waitid_args *args)
{
	int status, options, sig;
	struct __wrusage wru;
	siginfo_t siginfo;
	l_siginfo_t lsi;
	idtype_t idtype;
	struct proc *p;
	int error;

	options = 0;
	linux_to_bsd_waitopts(args->options, &options);

	/* At least one of the event-selection flags must be given. */
	if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
		return (EINVAL);
	if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
		return (EINVAL);

	switch (args->idtype) {
	case LINUX_P_ALL:
		idtype = P_ALL;
		break;
	case LINUX_P_PID:
		if (args->id <= 0)
			return (EINVAL);
		idtype = P_PID;
		break;
	case LINUX_P_PGID:
		if (args->id <= 0)
			return (EINVAL);
		idtype = P_PGID;
		break;
	default:
		return (EINVAL);
	}

	error = kern_wait6(td, idtype, args->id, &status, options,
	    &wru, &siginfo);
	if (error != 0)
		return (error);
	if (args->rusage != NULL) {
		error = linux_copyout_rusage(&wru.wru_children,
		    args->rusage);
		if (error != 0)
			return (error);
	}
	if (args->info != NULL) {
		p = td->td_proc;
		bzero(&lsi, sizeof(lsi));
		/* td_retval[0] == 0 means WNOHANG with nothing to report. */
		if (td->td_retval[0] != 0) {
			sig = bsd_to_linux_signal(siginfo.si_signo);
			siginfo_to_lsiginfo(&siginfo, &lsi, sig);
		}
		error = copyout(&lsi, args->info, sizeof(lsi));
	}
	/* Linux waitid() always returns 0 on success. */
	td->td_retval[0] = 0;

	return (error);
}
1045
1046#ifdef LINUX_LEGACY_SYSCALLS
/*
 * mknod(2): create a filesystem node.  FIFOs and sockets map to
 * mkfifo, devices to mknod, and a regular-file request is emulated by
 * an open/close with O_CREAT.  Directories are rejected with EPERM.
 */
int
linux_mknod(struct thread *td, struct linux_mknod_args *args)
{
	char *path;
	int error;

	LCONVPATHCREAT(td, args->path, &path);

	switch (args->mode & S_IFMT) {
	case S_IFIFO:
	case S_IFSOCK:
		error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE,
		    args->mode);
		break;

	case S_IFCHR:
	case S_IFBLK:
		error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE,
		    args->mode, args->dev);
		break;

	case S_IFDIR:
		error = EPERM;
		break;

	case 0:
		/* No file type given: Linux treats this as a regular file. */
		args->mode |= S_IFREG;
		/* FALLTHROUGH */
	case S_IFREG:
		error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE,
		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
		if (error == 0)
			kern_close(td, td->td_retval[0]);
		break;

	default:
		error = EINVAL;
		break;
	}
	LFREEPATH(path);
	return (error);
}
1089#endif
1090
/*
 * mknodat(2): like linux_mknod(), but the path is resolved relative to
 * the directory descriptor dfd (LINUX_AT_FDCWD meaning the CWD).
 */
int
linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
{
	char *path;
	int error, dfd;

	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
	LCONVPATHCREAT_AT(td, args->filename, &path, dfd);

	switch (args->mode & S_IFMT) {
	case S_IFIFO:
	case S_IFSOCK:
		error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
		break;

	case S_IFCHR:
	case S_IFBLK:
		error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
		    args->dev);
		break;

	case S_IFDIR:
		error = EPERM;
		break;

	case 0:
		/* No file type given: Linux treats this as a regular file. */
		args->mode |= S_IFREG;
		/* FALLTHROUGH */
	case S_IFREG:
		error = kern_openat(td, dfd, path, UIO_SYSSPACE,
		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
		if (error == 0)
			kern_close(td, td->td_retval[0]);
		break;

	default:
		error = EINVAL;
		break;
	}
	LFREEPATH(path);
	return (error);
}
1133
1134/*
1135 * UGH! This is just about the dumbest idea I've ever heard!!
1136 */
1137int
1138linux_personality(struct thread *td, struct linux_personality_args *args)
1139{
1140	struct linux_pemuldata *pem;
1141	struct proc *p = td->td_proc;
1142	uint32_t old;
1143
1144	PROC_LOCK(p);
1145	pem = pem_find(p);
1146	old = pem->persona;
1147	if (args->per != 0xffffffff)
1148		pem->persona = args->per;
1149	PROC_UNLOCK(p);
1150
1151	td->td_retval[0] = old;
1152	return (0);
1153}
1154
/* Linux layout of the itimerval structure used by {get,set}itimer(2). */
struct l_itimerval {
	l_timeval it_interval;	/* reload value for periodic timers */
	l_timeval it_value;	/* time until the next expiration */
};
1159
/*
 * Copy an itimerval between the native and Linux layouts field by
 * field (works in either direction since the field names match).
 * Wrapped in do/while(0) so the multi-statement body expands safely
 * as a single statement, e.g. under an unbraced if.
 */
#define	B2L_ITIMERVAL(bip, lip)	do {					\
	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
} while (0)
1165
/*
 * setitimer(2): set an interval timer, optionally returning the old
 * value.  A NULL new-value pointer degenerates into getitimer(2),
 * which Linux permits.
 */
int
linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
{
	int error;
	struct l_itimerval ls;
	struct itimerval aitv, oitv;

	if (uap->itv == NULL) {
		uap->itv = uap->oitv;
		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
	}

	error = copyin(uap->itv, &ls, sizeof(ls));
	if (error != 0)
		return (error);
	B2L_ITIMERVAL(&aitv, &ls);
	error = kern_setitimer(td, uap->which, &aitv, &oitv);
	/* Only report the previous value if the caller supplied a buffer. */
	if (error != 0 || uap->oitv == NULL)
		return (error);
	B2L_ITIMERVAL(&ls, &oitv);

	return (copyout(&ls, uap->oitv, sizeof(ls)));
}
1189
1190int
1191linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1192{
1193	int error;
1194	struct l_itimerval ls;
1195	struct itimerval aitv;
1196
1197	error = kern_getitimer(td, uap->which, &aitv);
1198	if (error != 0)
1199		return (error);
1200	B2L_ITIMERVAL(&ls, &aitv);
1201	return (copyout(&ls, uap->itv, sizeof(ls)));
1202}
1203
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * nice(2) emulation: adjust the priority of the calling process by
 * args->inc, delegating to the native setpriority(2).
 */
int
linux_nice(struct thread *td, struct linux_nice_args *args)
{
	struct setpriority_args pri_args;

	pri_args.which = PRIO_PROCESS;
	pri_args.who = 0;		/* 0 selects the current process */
	pri_args.prio = args->inc;
	return (sys_setpriority(td, &pri_args));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1216
/*
 * setgroups(2) emulation.  Installs the supplied Linux gid array as
 * the supplementary group set.  On FreeBSD cr_groups[0] holds the
 * effective gid, so the new groups are stored starting at index 1.
 */
int
linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
{
	struct ucred *newcred, *oldcred;
	l_gid_t *linux_gidset;
	gid_t *bsd_gidset;
	int ngrp, error;
	struct proc *p;

	ngrp = args->gidsetsize;
	if (ngrp < 0 || ngrp >= ngroups_max + 1)
		return (EINVAL);
	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
	if (error)
		goto out;
	newcred = crget();
	/* Reserve room for the egid kept in cr_groups[0]. */
	crextend(newcred, ngrp + 1);
	p = td->td_proc;
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	crcopy(newcred, oldcred);

	/*
	 * cr_groups[0] holds egid. Setting the whole set from
	 * the supplied set will cause egid to be changed too.
	 * Keep cr_groups[0] unchanged to prevent that.
	 */

	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
		PROC_UNLOCK(p);
		crfree(newcred);
		goto out;
	}

	if (ngrp > 0) {
		newcred->cr_ngroups = ngrp + 1;

		bsd_gidset = newcred->cr_groups;
		ngrp--;
		/* Copy backwards, shifted one slot past the egid. */
		while (ngrp >= 0) {
			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
			ngrp--;
		}
	} else
		newcred->cr_ngroups = 1;

	setsugid(p);
	proc_set_cred(p, newcred);
	PROC_UNLOCK(p);
	crfree(oldcred);
	error = 0;
out:
	free(linux_gidset, M_LINUX);
	return (error);
}
1273
/*
 * getgroups(2) emulation.  Reports the supplementary group set in
 * Linux format; cr_groups[0] is the effective gid and is excluded.
 */
int
linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
{
	struct ucred *cred;
	l_gid_t *linux_gidset;
	gid_t *bsd_gidset;
	int bsd_gidsetsz, ngrp, error;

	cred = td->td_ucred;
	bsd_gidset = cred->cr_groups;
	bsd_gidsetsz = cred->cr_ngroups - 1;

	/*
	 * cr_groups[0] holds egid. Returning the whole set
	 * here will cause a duplicate. Exclude cr_groups[0]
	 * to prevent that.
	 */

	if ((ngrp = args->gidsetsize) == 0) {
		/* Query mode: report how many groups would be returned. */
		td->td_retval[0] = bsd_gidsetsz;
		return (0);
	}

	if (ngrp < bsd_gidsetsz)
		return (EINVAL);

	ngrp = 0;
	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
	    M_LINUX, M_WAITOK);
	while (ngrp < bsd_gidsetsz) {
		/* Skip the egid stored in slot 0. */
		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
		ngrp++;
	}

	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
	free(linux_gidset, M_LINUX);
	if (error)
		return (error);

	td->td_retval[0] = ngrp;
	return (0);
}
1316
1317int
1318linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1319{
1320	struct rlimit bsd_rlim;
1321	struct l_rlimit rlim;
1322	u_int which;
1323	int error;
1324
1325	if (args->resource >= LINUX_RLIM_NLIMITS)
1326		return (EINVAL);
1327
1328	which = linux_to_bsd_resource[args->resource];
1329	if (which == -1)
1330		return (EINVAL);
1331
1332	error = copyin(args->rlim, &rlim, sizeof(rlim));
1333	if (error)
1334		return (error);
1335
1336	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1337	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1338	return (kern_setrlimit(td, which, &bsd_rlim));
1339}
1340
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Old-style getrlimit(2) emulation.  The legacy Linux ABI cannot
 * represent an "infinite" limit, so values that clamp to the unsigned
 * maximum are reported as the signed maximum instead.
 */
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
	struct l_rlimit rlim;
	struct rlimit bsd_rlim;
	u_int which;

	if (args->resource >= LINUX_RLIM_NLIMITS)
		return (EINVAL);

	which = linux_to_bsd_resource[args->resource];
	if (which == -1)
		return (EINVAL);

	lim_rlimit(td, which, &bsd_rlim);

#ifdef COMPAT_LINUX32
	/* 32-bit ABI: clamp to 32 bits and map saturation to INT_MAX. */
	rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
	if (rlim.rlim_cur == UINT_MAX)
		rlim.rlim_cur = INT_MAX;
	rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
	if (rlim.rlim_max == UINT_MAX)
		rlim.rlim_max = INT_MAX;
#else
	/* Native-width ABI: map saturation to LONG_MAX. */
	rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
	if (rlim.rlim_cur == ULONG_MAX)
		rlim.rlim_cur = LONG_MAX;
	rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
	if (rlim.rlim_max == ULONG_MAX)
		rlim.rlim_max = LONG_MAX;
#endif
	return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1376
1377int
1378linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1379{
1380	struct l_rlimit rlim;
1381	struct rlimit bsd_rlim;
1382	u_int which;
1383
1384	if (args->resource >= LINUX_RLIM_NLIMITS)
1385		return (EINVAL);
1386
1387	which = linux_to_bsd_resource[args->resource];
1388	if (which == -1)
1389		return (EINVAL);
1390
1391	lim_rlimit(td, which, &bsd_rlim);
1392
1393	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1394	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1395	return (copyout(&rlim, args->rlim, sizeof(rlim)));
1396}
1397
/*
 * sched_setscheduler(2) emulation.  Translates the Linux policy and,
 * when linux_map_sched_prio is enabled, rescales the Linux priority
 * range onto the native realtime/timeshare ranges before applying it
 * to the target thread.
 */
int
linux_sched_setscheduler(struct thread *td,
    struct linux_sched_setscheduler_args *args)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	switch (args->policy) {
	case LINUX_SCHED_OTHER:
		policy = SCHED_OTHER;
		break;
	case LINUX_SCHED_FIFO:
		policy = SCHED_FIFO;
		break;
	case LINUX_SCHED_RR:
		policy = SCHED_RR;
		break;
	default:
		return (EINVAL);
	}

	error = copyin(args->param, &sched_param, sizeof(sched_param));
	if (error)
		return (error);

	if (linux_map_sched_prio) {
		switch (policy) {
		case SCHED_OTHER:
			/* Linux requires priority 0 for SCHED_OTHER. */
			if (sched_param.sched_priority != 0)
				return (EINVAL);

			sched_param.sched_priority =
			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			if (sched_param.sched_priority < 1 ||
			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
				return (EINVAL);

			/*
			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority - 1) *
			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
			    (LINUX_MAX_RT_PRIO - 1);
			break;
		}
	}

	/* linux_tdfind() returns with the target proc locked. */
	tdt = linux_tdfind(td, args->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
	PROC_UNLOCK(tdt->td_proc);
	return (error);
}
1459
1460int
1461linux_sched_getscheduler(struct thread *td,
1462    struct linux_sched_getscheduler_args *args)
1463{
1464	struct thread *tdt;
1465	int error, policy;
1466
1467	tdt = linux_tdfind(td, args->pid, -1);
1468	if (tdt == NULL)
1469		return (ESRCH);
1470
1471	error = kern_sched_getscheduler(td, tdt, &policy);
1472	PROC_UNLOCK(tdt->td_proc);
1473
1474	switch (policy) {
1475	case SCHED_OTHER:
1476		td->td_retval[0] = LINUX_SCHED_OTHER;
1477		break;
1478	case SCHED_FIFO:
1479		td->td_retval[0] = LINUX_SCHED_FIFO;
1480		break;
1481	case SCHED_RR:
1482		td->td_retval[0] = LINUX_SCHED_RR;
1483		break;
1484	}
1485	return (error);
1486}
1487
1488int
1489linux_sched_get_priority_max(struct thread *td,
1490    struct linux_sched_get_priority_max_args *args)
1491{
1492	struct sched_get_priority_max_args bsd;
1493
1494	if (linux_map_sched_prio) {
1495		switch (args->policy) {
1496		case LINUX_SCHED_OTHER:
1497			td->td_retval[0] = 0;
1498			return (0);
1499		case LINUX_SCHED_FIFO:
1500		case LINUX_SCHED_RR:
1501			td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1502			return (0);
1503		default:
1504			return (EINVAL);
1505		}
1506	}
1507
1508	switch (args->policy) {
1509	case LINUX_SCHED_OTHER:
1510		bsd.policy = SCHED_OTHER;
1511		break;
1512	case LINUX_SCHED_FIFO:
1513		bsd.policy = SCHED_FIFO;
1514		break;
1515	case LINUX_SCHED_RR:
1516		bsd.policy = SCHED_RR;
1517		break;
1518	default:
1519		return (EINVAL);
1520	}
1521	return (sys_sched_get_priority_max(td, &bsd));
1522}
1523
1524int
1525linux_sched_get_priority_min(struct thread *td,
1526    struct linux_sched_get_priority_min_args *args)
1527{
1528	struct sched_get_priority_min_args bsd;
1529
1530	if (linux_map_sched_prio) {
1531		switch (args->policy) {
1532		case LINUX_SCHED_OTHER:
1533			td->td_retval[0] = 0;
1534			return (0);
1535		case LINUX_SCHED_FIFO:
1536		case LINUX_SCHED_RR:
1537			td->td_retval[0] = 1;
1538			return (0);
1539		default:
1540			return (EINVAL);
1541		}
1542	}
1543
1544	switch (args->policy) {
1545	case LINUX_SCHED_OTHER:
1546		bsd.policy = SCHED_OTHER;
1547		break;
1548	case LINUX_SCHED_FIFO:
1549		bsd.policy = SCHED_FIFO;
1550		break;
1551	case LINUX_SCHED_RR:
1552		bsd.policy = SCHED_RR;
1553		break;
1554	default:
1555		return (EINVAL);
1556	}
1557	return (sys_sched_get_priority_min(td, &bsd));
1558}
1559
/*
 * Command and magic-cookie values from the Linux reboot(2) ABI.
 * magic1 must always be REBOOT_MAGIC1; magic2 must be one of the
 * MAGIC2* values below.
 */
#define REBOOT_CAD_ON	0x89abcdef
#define REBOOT_CAD_OFF	0
#define REBOOT_HALT	0xcdef0123
#define REBOOT_RESTART	0x01234567
#define REBOOT_RESTART2	0xA1B2C3D4
#define REBOOT_POWEROFF	0x4321FEDC
#define REBOOT_MAGIC1	0xfee1dead
#define REBOOT_MAGIC2	0x28121969
#define REBOOT_MAGIC2A	0x05121996
#define REBOOT_MAGIC2B	0x16041998
1570
1571int
1572linux_reboot(struct thread *td, struct linux_reboot_args *args)
1573{
1574	struct reboot_args bsd_args;
1575
1576	if (args->magic1 != REBOOT_MAGIC1)
1577		return (EINVAL);
1578
1579	switch (args->magic2) {
1580	case REBOOT_MAGIC2:
1581	case REBOOT_MAGIC2A:
1582	case REBOOT_MAGIC2B:
1583		break;
1584	default:
1585		return (EINVAL);
1586	}
1587
1588	switch (args->cmd) {
1589	case REBOOT_CAD_ON:
1590	case REBOOT_CAD_OFF:
1591		return (priv_check(td, PRIV_REBOOT));
1592	case REBOOT_HALT:
1593		bsd_args.opt = RB_HALT;
1594		break;
1595	case REBOOT_RESTART:
1596	case REBOOT_RESTART2:
1597		bsd_args.opt = 0;
1598		break;
1599	case REBOOT_POWEROFF:
1600		bsd_args.opt = RB_POWEROFF;
1601		break;
1602	default:
1603		return (EINVAL);
1604	}
1605	return (sys_reboot(td, &bsd_args));
1606}
1607
1608
1609int
1610linux_getpid(struct thread *td, struct linux_getpid_args *args)
1611{
1612
1613	td->td_retval[0] = td->td_proc->p_pid;
1614
1615	return (0);
1616}
1617
1618int
1619linux_gettid(struct thread *td, struct linux_gettid_args *args)
1620{
1621	struct linux_emuldata *em;
1622
1623	em = em_find(td);
1624	KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1625
1626	td->td_retval[0] = em->em_tid;
1627
1628	return (0);
1629}
1630
1631
1632int
1633linux_getppid(struct thread *td, struct linux_getppid_args *args)
1634{
1635
1636	td->td_retval[0] = kern_getppid(td);
1637	return (0);
1638}
1639
1640int
1641linux_getgid(struct thread *td, struct linux_getgid_args *args)
1642{
1643
1644	td->td_retval[0] = td->td_ucred->cr_rgid;
1645	return (0);
1646}
1647
1648int
1649linux_getuid(struct thread *td, struct linux_getuid_args *args)
1650{
1651
1652	td->td_retval[0] = td->td_ucred->cr_ruid;
1653	return (0);
1654}
1655
1656
1657int
1658linux_getsid(struct thread *td, struct linux_getsid_args *args)
1659{
1660	struct getsid_args bsd;
1661
1662	bsd.pid = args->pid;
1663	return (sys_getsid(td, &bsd));
1664}
1665
/*
 * Catch-all handler for Linux system calls with no implementation.
 */
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{

	return (ENOSYS);
}
1672
1673int
1674linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1675{
1676	struct getpriority_args bsd_args;
1677	int error;
1678
1679	bsd_args.which = args->which;
1680	bsd_args.who = args->who;
1681	error = sys_getpriority(td, &bsd_args);
1682	td->td_retval[0] = 20 - td->td_retval[0];
1683	return (error);
1684}
1685
1686int
1687linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1688{
1689	int name[2];
1690
1691	name[0] = CTL_KERN;
1692	name[1] = KERN_HOSTNAME;
1693	return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1694	    args->len, 0, 0));
1695}
1696
1697int
1698linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1699{
1700	int name[2];
1701
1702	name[0] = CTL_KERN;
1703	name[1] = KERN_NISDOMAINNAME;
1704	return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1705	    args->len, 0, 0));
1706}
1707
/*
 * exit_group(2) emulation: terminate every thread in the calling
 * process.  This function does not return.
 */
int
linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
{

	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
	    args->error_code);

	/*
	 * XXX: we should send a signal to the parent if
	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
	 * as it doesnt occur often.
	 */
	exit1(td, args->error_code, 0);
		/* NOTREACHED */
}
1723
/* Linux capability ABI versions accepted in the capget/capset header. */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522

/* User-space header passed to capget(2)/capset(2). */
struct l_user_cap_header {
	l_int	version;
	l_int	pid;
};

/* One 32-bit slice of the effective/permitted/inheritable sets. */
struct l_user_cap_data {
	l_int	effective;
	l_int	permitted;
	l_int	inheritable;
};
1738
/*
 * capget(2) emulation.  Capabilities are not implemented, so the call
 * validates the header and reports empty capability sets.
 */
int
linux_capget(struct thread *td, struct linux_capget_args *uap)
{
	struct l_user_cap_header luch;
	struct l_user_cap_data lucd[2];
	int error, u32s;

	if (uap->hdrp == NULL)
		return (EFAULT);

	error = copyin(uap->hdrp, &luch, sizeof(luch));
	if (error != 0)
		return (error);

	switch (luch.version) {
	case _LINUX_CAPABILITY_VERSION_1:
		u32s = 1;
		break;
	case _LINUX_CAPABILITY_VERSION_2:
	case _LINUX_CAPABILITY_VERSION_3:
		u32s = 2;
		break;
	default:
		/*
		 * Unknown version: tell the caller which version we
		 * speak, per the Linux convention, then fail.
		 */
		luch.version = _LINUX_CAPABILITY_VERSION_1;
		error = copyout(&luch, uap->hdrp, sizeof(luch));
		if (error)
			return (error);
		return (EINVAL);
	}

	/* Only the calling process (pid 0) is supported. */
	if (luch.pid)
		return (EPERM);

	if (uap->datap) {
		/*
		 * The current implementation doesn't support setting
		 * a capability (it's essentially a stub) so indicate
		 * that no capabilities are currently set or available
		 * to request.
		 */
		memset(&lucd, 0, u32s * sizeof(lucd[0]));
		error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
	}

	return (error);
}
1785
/*
 * capset(2) emulation.  Setting capabilities is not supported; any
 * request for a non-empty set fails with EPERM.
 */
int
linux_capset(struct thread *td, struct linux_capset_args *uap)
{
	struct l_user_cap_header luch;
	struct l_user_cap_data lucd[2];
	int error, i, u32s;

	if (uap->hdrp == NULL || uap->datap == NULL)
		return (EFAULT);

	error = copyin(uap->hdrp, &luch, sizeof(luch));
	if (error != 0)
		return (error);

	switch (luch.version) {
	case _LINUX_CAPABILITY_VERSION_1:
		u32s = 1;
		break;
	case _LINUX_CAPABILITY_VERSION_2:
	case _LINUX_CAPABILITY_VERSION_3:
		u32s = 2;
		break;
	default:
		/*
		 * Unknown version: report the version we speak, per the
		 * Linux convention, then fail.
		 */
		luch.version = _LINUX_CAPABILITY_VERSION_1;
		error = copyout(&luch, uap->hdrp, sizeof(luch));
		if (error)
			return (error);
		return (EINVAL);
	}

	/* Only the calling process (pid 0) is supported. */
	if (luch.pid)
		return (EPERM);

	error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
	if (error != 0)
		return (error);

	/* We currently don't support setting any capabilities. */
	for (i = 0; i < u32s; i++) {
		if (lucd[i].effective || lucd[i].permitted ||
		    lucd[i].inheritable) {
			linux_msg(td,
			    "capset[%d] effective=0x%x, permitted=0x%x, "
			    "inheritable=0x%x is not implemented", i,
			    (int)lucd[i].effective, (int)lucd[i].permitted,
			    (int)lucd[i].inheritable);
			return (EPERM);
		}
	}

	return (0);
}
1838
/*
 * prctl(2) emulation: miscellaneous process-control operations.
 * Only the options listed below are handled; anything else is EINVAL.
 */
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
	int error = 0, max_size;
	struct proc *p = td->td_proc;
	char comm[LINUX_MAX_COMM_LEN];
	int pdeath_signal;

	switch (args->option) {
	case LINUX_PR_SET_PDEATHSIG:
		/* Deliver arg2 (a Linux signal) on parent death. */
		if (!LINUX_SIG_VALID(args->arg2))
			return (EINVAL);
		pdeath_signal = linux_to_bsd_signal(args->arg2);
		return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
		    &pdeath_signal));
	case LINUX_PR_GET_PDEATHSIG:
		/* arg2 is a user pointer receiving the Linux signal number. */
		error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
		    &pdeath_signal);
		if (error != 0)
			return (error);
		pdeath_signal = bsd_to_linux_signal(pdeath_signal);
		return (copyout(&pdeath_signal,
		    (void *)(register_t)args->arg2,
		    sizeof(pdeath_signal)));
		break;
	case LINUX_PR_GET_KEEPCAPS:
		/*
		 * Indicate that we always clear the effective and
		 * permitted capability sets when the user id becomes
		 * non-zero (actually the capability sets are simply
		 * always zero in the current implementation).
		 */
		td->td_retval[0] = 0;
		break;
	case LINUX_PR_SET_KEEPCAPS:
		/*
		 * Ignore requests to keep the effective and permitted
		 * capability sets when the user id becomes non-zero.
		 */
		break;
	case LINUX_PR_SET_NAME:
		/*
		 * To be on the safe side we need to make sure to not
		 * overflow the size a Linux program expects. We already
		 * do this here in the copyin, so that we don't need to
		 * check on copyout.
		 */
		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
		error = copyinstr((void *)(register_t)args->arg2, comm,
		    max_size, NULL);

		/* Linux silently truncates the name if it is too long. */
		if (error == ENAMETOOLONG) {
			/*
			 * XXX: copyinstr() isn't documented to populate the
			 * array completely, so do a copyin() to be on the
			 * safe side. This should be changed in case
			 * copyinstr() is changed to guarantee this.
			 */
			error = copyin((void *)(register_t)args->arg2, comm,
			    max_size - 1);
			comm[max_size - 1] = '\0';
		}
		if (error)
			return (error);

		PROC_LOCK(p);
		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
		PROC_UNLOCK(p);
		break;
	case LINUX_PR_GET_NAME:
		/* Copy the process name out, including the terminator. */
		PROC_LOCK(p);
		strlcpy(comm, p->p_comm, sizeof(comm));
		PROC_UNLOCK(p);
		error = copyout(comm, (void *)(register_t)args->arg2,
		    strlen(comm) + 1);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}
1923
/*
 * sched_setparam(2) emulation.  When priority mapping is enabled the
 * Linux priority is rescaled onto the native range appropriate for
 * the target thread's current policy.
 */
int
linux_sched_setparam(struct thread *td,
    struct linux_sched_setparam_args *uap)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	error = copyin(uap->param, &sched_param, sizeof(sched_param));
	if (error)
		return (error);

	/* linux_tdfind() returns with the target proc locked. */
	tdt = linux_tdfind(td, uap->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	if (linux_map_sched_prio) {
		error = kern_sched_getscheduler(td, tdt, &policy);
		if (error)
			goto out;

		switch (policy) {
		case SCHED_OTHER:
			/* Linux requires priority 0 for SCHED_OTHER. */
			if (sched_param.sched_priority != 0) {
				error = EINVAL;
				goto out;
			}
			sched_param.sched_priority =
			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			if (sched_param.sched_priority < 1 ||
			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority - 1) *
			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
			    (LINUX_MAX_RT_PRIO - 1);
			break;
		}
	}

	error = kern_sched_setparam(td, tdt, &sched_param);
out:	PROC_UNLOCK(tdt->td_proc);
	return (error);
}
1977
/*
 * sched_getparam(2) emulation.  When priority mapping is enabled the
 * native priority is rescaled back into the Linux range for the
 * thread's current policy before being copied out.
 */
int
linux_sched_getparam(struct thread *td,
    struct linux_sched_getparam_args *uap)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	/* linux_tdfind() returns with the target proc locked. */
	tdt = linux_tdfind(td, uap->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_getparam(td, tdt, &sched_param);
	if (error) {
		PROC_UNLOCK(tdt->td_proc);
		return (error);
	}

	if (linux_map_sched_prio) {
		error = kern_sched_getscheduler(td, tdt, &policy);
		PROC_UNLOCK(tdt->td_proc);
		if (error)
			return (error);

		switch (policy) {
		case SCHED_OTHER:
			sched_param.sched_priority = 0;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			/*
			 * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
			 * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority *
			    (LINUX_MAX_RT_PRIO - 1) +
			    (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
			    (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
			break;
		}
	} else
		PROC_UNLOCK(tdt->td_proc);

	error = copyout(&sched_param, uap->param, sizeof(sched_param));
	return (error);
}
2025
2026/*
2027 * Get affinity of a process.
2028 */
int
linux_sched_getaffinity(struct thread *td,
    struct linux_sched_getaffinity_args *args)
{
	int error;
	struct thread *tdt;

	/* The user buffer must be able to hold a full native cpuset. */
	if (args->len < sizeof(cpuset_t))
		return (EINVAL);

	tdt = linux_tdfind(td, args->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	/* linux_tdfind() returned with the target proc locked. */
	PROC_UNLOCK(tdt->td_proc);

	error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
	    tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
	if (error == 0)
		/* Linux returns the number of bytes written to the mask. */
		td->td_retval[0] = sizeof(cpuset_t);

	return (error);
}
2052
2053/*
2054 *  Set affinity of a process.
2055 */
int
linux_sched_setaffinity(struct thread *td,
    struct linux_sched_setaffinity_args *args)
{
	struct thread *tdt;

	/* The supplied mask must cover a full native cpuset. */
	if (args->len < sizeof(cpuset_t))
		return (EINVAL);

	tdt = linux_tdfind(td, args->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	/* linux_tdfind() returned with the target proc locked. */
	PROC_UNLOCK(tdt->td_proc);

	return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
	    tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
}
2074
/* Linux struct rlimit64 layout used by prlimit64(2). */
struct linux_rlimit64 {
	uint64_t	rlim_cur;
	uint64_t	rlim_max;
};
2079
2080int
2081linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2082{
2083	struct rlimit rlim, nrlim;
2084	struct linux_rlimit64 lrlim;
2085	struct proc *p;
2086	u_int which;
2087	int flags;
2088	int error;
2089
2090	if (args->resource >= LINUX_RLIM_NLIMITS)
2091		return (EINVAL);
2092
2093	which = linux_to_bsd_resource[args->resource];
2094	if (which == -1)
2095		return (EINVAL);
2096
2097	if (args->new != NULL) {
2098		/*
2099		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2100		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
2101		 * as INFINITY so we do not need a conversion even.
2102		 */
2103		error = copyin(args->new, &nrlim, sizeof(nrlim));
2104		if (error != 0)
2105			return (error);
2106	}
2107
2108	flags = PGET_HOLD | PGET_NOTWEXIT;
2109	if (args->new != NULL)
2110		flags |= PGET_CANDEBUG;
2111	else
2112		flags |= PGET_CANSEE;
2113	if (args->pid == 0) {
2114		p = td->td_proc;
2115		PHOLD(p);
2116	} else {
2117		error = pget(args->pid, flags, &p);
2118		if (error != 0)
2119			return (error);
2120	}
2121	if (args->old != NULL) {
2122		PROC_LOCK(p);
2123		lim_rlimit_proc(p, which, &rlim);
2124		PROC_UNLOCK(p);
2125		if (rlim.rlim_cur == RLIM_INFINITY)
2126			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2127		else
2128			lrlim.rlim_cur = rlim.rlim_cur;
2129		if (rlim.rlim_max == RLIM_INFINITY)
2130			lrlim.rlim_max = LINUX_RLIM_INFINITY;
2131		else
2132			lrlim.rlim_max = rlim.rlim_max;
2133		error = copyout(&lrlim, args->old, sizeof(lrlim));
2134		if (error != 0)
2135			goto out;
2136	}
2137
2138	if (args->new != NULL)
2139		error = kern_proc_setrlimit(td, p, which, &nrlim);
2140
2141 out:
2142	PRELE(p);
2143	return (error);
2144}
2145
/*
 * pselect6(2) emulation.  The sixth argument is a pointer to a
 * {sigset pointer, size} pair; the timeout, if present, is updated on
 * return with the time remaining, as Linux does.
 */
int
linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
{
	struct timeval utv, tv0, tv1, *tvp;
	struct l_pselect6arg lpse6;
	struct l_timespec lts;
	struct timespec uts;
	l_sigset_t l_ss;
	sigset_t *ssp;
	sigset_t ss;
	int error;

	ssp = NULL;
	if (args->sig != NULL) {
		error = copyin(args->sig, &lpse6, sizeof(lpse6));
		if (error != 0)
			return (error);
		if (lpse6.ss_len != sizeof(l_ss))
			return (EINVAL);
		if (lpse6.ss != 0) {
			error = copyin(PTRIN(lpse6.ss), &l_ss,
			    sizeof(l_ss));
			if (error != 0)
				return (error);
			linux_to_bsd_sigset(&l_ss, &ss);
			ssp = &ss;
		}
	}

	/*
	 * Currently glibc changes nanosecond number to microsecond.
	 * This mean losing precision but for now it is hardly seen.
	 */
	if (args->tsp != NULL) {
		error = copyin(args->tsp, &lts, sizeof(lts));
		if (error != 0)
			return (error);
		error = linux_to_native_timespec(&uts, &lts);
		if (error != 0)
			return (error);

		TIMESPEC_TO_TIMEVAL(&utv, &uts);
		if (itimerfix(&utv))
			return (EINVAL);

		/* Remember when we started, to compute the remainder. */
		microtime(&tv0);
		tvp = &utv;
	} else
		tvp = NULL;

	error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
	    args->exceptfds, tvp, ssp, LINUX_NFDBITS);

	if (error == 0 && args->tsp != NULL) {
		if (td->td_retval[0] != 0) {
			/*
			 * Compute how much time was left of the timeout,
			 * by subtracting the current time and the time
			 * before we started the call, and subtracting
			 * that result from the user-supplied value.
			 */

			microtime(&tv1);
			timevalsub(&tv1, &tv0);
			timevalsub(&utv, &tv1);
			if (utv.tv_sec < 0)
				timevalclear(&utv);
		} else
			timevalclear(&utv);

		TIMEVAL_TO_TIMESPEC(&utv, &uts);

		error = native_to_linux_timespec(&lts, &uts);
		if (error == 0)
			error = copyout(&lts, args->tsp, sizeof(lts));
	}

	return (error);
}
2225
/*
 * ppoll(2) emulation.  Like pselect6, the timeout is rewritten on
 * return with the time remaining.
 */
int
linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
{
	struct timespec ts0, ts1;
	struct l_timespec lts;
	struct timespec uts, *tsp;
	l_sigset_t l_ss;
	sigset_t *ssp;
	sigset_t ss;
	int error;

	if (args->sset != NULL) {
		if (args->ssize != sizeof(l_ss))
			return (EINVAL);
		error = copyin(args->sset, &l_ss, sizeof(l_ss));
		if (error)
			return (error);
		linux_to_bsd_sigset(&l_ss, &ss);
		ssp = &ss;
	} else
		ssp = NULL;
	if (args->tsp != NULL) {
		error = copyin(args->tsp, &lts, sizeof(lts));
		if (error)
			return (error);
		error = linux_to_native_timespec(&uts, &lts);
		if (error != 0)
			return (error);

		/* Remember when we started, to compute the remainder. */
		nanotime(&ts0);
		tsp = &uts;
	} else
		tsp = NULL;

	error = kern_poll(td, args->fds, args->nfds, tsp, ssp);

	if (error == 0 && args->tsp != NULL) {
		if (td->td_retval[0]) {
			/* Report the unexpired part of the timeout. */
			nanotime(&ts1);
			timespecsub(&ts1, &ts0, &ts1);
			timespecsub(&uts, &ts1, &uts);
			if (uts.tv_sec < 0)
				timespecclear(&uts);
		} else
			timespecclear(&uts);

		error = native_to_linux_timespec(&lts, &uts);
		if (error == 0)
			error = copyout(&lts, args->tsp, sizeof(lts));
	}

	return (error);
}
2279
/*
 * sched_rr_get_interval(2) emulation: report the round-robin quantum
 * of the target thread as a Linux timespec.
 */
int
linux_sched_rr_get_interval(struct thread *td,
    struct linux_sched_rr_get_interval_args *uap)
{
	struct timespec ts;
	struct l_timespec lts;
	struct thread *tdt;
	int error;

	/*
	 * According to man in case the invalid pid specified
	 * EINVAL should be returned.
	 */
	if (uap->pid < 0)
		return (EINVAL);

	/* linux_tdfind() returns with the target proc locked. */
	tdt = linux_tdfind(td, uap->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_rr_get_interval_td(td, tdt, &ts);
	PROC_UNLOCK(tdt->td_proc);
	if (error != 0)
		return (error);
	error = native_to_linux_timespec(&lts, &ts);
	if (error != 0)
		return (error);
	return (copyout(&lts, uap->interval, sizeof(lts)));
}
2309
2310/*
2311 * In case when the Linux thread is the initial thread in
2312 * the thread group thread id is equal to the process id.
2313 * Glibc depends on this magic (assert in pthread_getattr_np.c).
2314 */
/*
 * Look up the thread with Linux tid 'tid' (optionally constrained to
 * process 'pid').  On success the thread is returned with its process
 * locked; the caller must PROC_UNLOCK it.  Returns NULL when no
 * matching Linuxulator thread exists.
 */
struct thread *
linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
{
	struct linux_emuldata *em;
	struct thread *tdt;
	struct proc *p;

	tdt = NULL;
	if (tid == 0 || tid == td->td_tid) {
		/* Fast path: the caller itself. */
		tdt = td;
		PROC_LOCK(tdt->td_proc);
	} else if (tid > PID_MAX)
		/* Tids above PID_MAX can never collide with a pid. */
		tdt = tdfind(tid, pid);
	else {
		/*
		 * Initial thread where the tid equal to the pid.
		 */
		p = pfind(tid);
		if (p != NULL) {
			if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
				/*
				 * p is not a Linuxulator process.
				 */
				PROC_UNLOCK(p);
				return (NULL);
			}
			FOREACH_THREAD_IN_PROC(p, tdt) {
				em = em_find(tdt);
				if (tid == em->em_tid)
					/* p stays locked for the caller. */
					return (tdt);
			}
			PROC_UNLOCK(p);
		}
		return (NULL);
	}

	return (tdt);
}
2353
2354void
2355linux_to_bsd_waitopts(int options, int *bsdopts)
2356{
2357
2358	if (options & LINUX_WNOHANG)
2359		*bsdopts |= WNOHANG;
2360	if (options & LINUX_WUNTRACED)
2361		*bsdopts |= WUNTRACED;
2362	if (options & LINUX_WEXITED)
2363		*bsdopts |= WEXITED;
2364	if (options & LINUX_WCONTINUED)
2365		*bsdopts |= WCONTINUED;
2366	if (options & LINUX_WNOWAIT)
2367		*bsdopts |= WNOWAIT;
2368
2369	if (options & __WCLONE)
2370		*bsdopts |= WLINUXCLONE;
2371}
2372
2373int
2374linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2375{
2376	struct uio uio;
2377	struct iovec iov;
2378	int error;
2379
2380	if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2381		return (EINVAL);
2382	if (args->count > INT_MAX)
2383		args->count = INT_MAX;
2384
2385	iov.iov_base = args->buf;
2386	iov.iov_len = args->count;
2387
2388	uio.uio_iov = &iov;
2389	uio.uio_iovcnt = 1;
2390	uio.uio_resid = iov.iov_len;
2391	uio.uio_segflg = UIO_USERSPACE;
2392	uio.uio_rw = UIO_READ;
2393	uio.uio_td = td;
2394
2395	error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2396	if (error == 0)
2397		td->td_retval[0] = args->count - uio.uio_resid;
2398	return (error);
2399}
2400
2401int
2402linux_mincore(struct thread *td, struct linux_mincore_args *args)
2403{
2404
2405	/* Needs to be page-aligned */
2406	if (args->start & PAGE_MASK)
2407		return (EINVAL);
2408	return (kern_mincore(td, args->start, args->len, args->vec));
2409}
2410
/* Linux kernel log level prefix prepended to each message line. */
#define	SYSLOG_TAG	"<6>"

/*
 * syslog(2) emulation, READ_ALL action only: copy the kernel message
 * buffer to user space, inserting a "<6>" level tag at the start and
 * after each newline that does not already begin a tagged line.
 */
int
linux_syslog(struct thread *td, struct linux_syslog_args *args)
{
	char buf[128], *src, *dst;
	u_int seq;
	int buflen, error;

	if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
		linux_msg(td, "syslog unsupported type 0x%x", args->type);
		return (EINVAL);
	}

	if (args->len < 6) {
		/* Too small to hold even the tag; report nothing read. */
		td->td_retval[0] = 0;
		return (0);
	}

	error = priv_check(td, PRIV_MSGBUF);
	if (error)
		return (error);

	/* Prime the sequence cursor at the start of the message buffer. */
	mtx_lock(&msgbuf_lock);
	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
	mtx_unlock(&msgbuf_lock);

	dst = args->buf;
	error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
	/* The -1 is to skip the trailing '\0'. */
	dst += sizeof(SYSLOG_TAG) - 1;

	while (error == 0) {
		mtx_lock(&msgbuf_lock);
		buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
		mtx_unlock(&msgbuf_lock);

		if (buflen == 0)
			break;

		for (src = buf; src < buf + buflen && error == 0; src++) {
			if (*src == '\0')
				continue;

			/* Stop when the user buffer is full. */
			if (dst >= args->buf + args->len)
				goto out;

			error = copyout(src, dst, 1);
			dst++;

			if (*src == '\n' && *(src + 1) != '<' &&
			    dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
				/* Tag the next line if it is untagged. */
				error = copyout(&SYSLOG_TAG,
				    dst, sizeof(SYSLOG_TAG));
				dst += sizeof(SYSLOG_TAG) - 1;
			}
		}
	}
out:
	td->td_retval[0] = dst - args->buf;
	return (error);
}
2473
2474int
2475linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2476{
2477	int cpu, error, node;
2478
2479	cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2480	error = 0;
2481	node = cpuid_to_pcpu[cpu]->pc_domain;
2482
2483	if (args->cpu != NULL)
2484		error = copyout(&cpu, args->cpu, sizeof(l_int));
2485	if (args->node != NULL)
2486		error = copyout(&node, args->node, sizeof(l_int));
2487	return (error);
2488}
2489