1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD$");
41
42#include "opt_capsicum.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capsicum.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <fs/devfs/devfs.h>
91
92#include <ufs/ufs/quota.h>
93
94MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
95
96static int kern_chflagsat(struct thread *td, int fd, const char *path,
97    enum uio_seg pathseg, u_long flags, int atflag);
98static int setfflags(struct thread *td, struct vnode *, u_long);
99static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
100static int getutimens(const struct timespec *, enum uio_seg,
101    struct timespec *, int *);
102static int setutimes(struct thread *td, struct vnode *,
103    const struct timespec *, int, int);
104static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
105    struct thread *td);
106static int kern_fhlinkat(struct thread *td, int fd, const char *path,
107    enum uio_seg pathseg, fhandle_t *fhp);
108static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
109    size_t count, struct thread *td);
110static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
111    const char *path, enum uio_seg segflag);
112
113static uint64_t
114at2cnpflags(u_int at_flags, u_int mask)
115{
116	u_int64_t res;
117
118	MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) !=
119	    (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW));
120
121	res = 0;
122	at_flags &= mask;
123	if ((at_flags & AT_RESOLVE_BENEATH) != 0)
124		res |= RBENEATH;
125	if ((at_flags & AT_SYMLINK_FOLLOW) != 0)
126		res |= FOLLOW;
127	/* NOFOLLOW is pseudo flag */
128	if ((mask & AT_SYMLINK_NOFOLLOW) != 0) {
129		res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
130		    FOLLOW;
131	}
132	if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0)
133		res |= EMPTYPATH;
134	return (res);
135}
136
137int
138kern_sync(struct thread *td)
139{
140	struct mount *mp, *nmp;
141	int save;
142
143	mtx_lock(&mountlist_mtx);
144	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
145		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
146			nmp = TAILQ_NEXT(mp, mnt_list);
147			continue;
148		}
149		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
150		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
151			save = curthread_pflags_set(TDP_SYNCIO);
152			vfs_periodic(mp, MNT_NOWAIT);
153			VFS_SYNC(mp, MNT_NOWAIT);
154			curthread_pflags_restore(save);
155			vn_finished_write(mp);
156		}
157		mtx_lock(&mountlist_mtx);
158		nmp = TAILQ_NEXT(mp, mnt_list);
159		vfs_unbusy(mp);
160	}
161	mtx_unlock(&mountlist_mtx);
162	return (0);
163}
164
/*
 * Sync each mounted filesystem.  The system call takes no real
 * arguments and hands the work to kern_sync().
 */
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
	int     dummy;
};
#endif
/* ARGSUSED */
int
sys_sync(struct thread *td, struct sync_args *uap)
{
	int error;

	error = kern_sync(td);
	return (error);
}
180
181/*
182 * Change filesystem quotas.
183 */
184#ifndef _SYS_SYSPROTO_H_
185struct quotactl_args {
186	char *path;
187	int cmd;
188	int uid;
189	caddr_t arg;
190};
191#endif
192int
193sys_quotactl(struct thread *td, struct quotactl_args *uap)
194{
195	struct mount *mp;
196	struct nameidata nd;
197	int error;
198
199	AUDIT_ARG_CMD(uap->cmd);
200	AUDIT_ARG_UID(uap->uid);
201	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
202		return (EPERM);
203	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
204	    uap->path, td);
205	if ((error = namei(&nd)) != 0)
206		return (error);
207	NDFREE(&nd, NDF_ONLY_PNBUF);
208	mp = nd.ni_vp->v_mount;
209	vfs_ref(mp);
210	vput(nd.ni_vp);
211	error = vfs_busy(mp, 0);
212	if (error != 0) {
213		vfs_rel(mp);
214		return (error);
215	}
216	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
217
218	/*
219	 * Since quota on operation typically needs to open quota
220	 * file, the Q_QUOTAON handler needs to unbusy the mount point
221	 * before calling into namei.  Otherwise, unmount might be
222	 * started between two vfs_busy() invocations (first is our,
223	 * second is from mount point cross-walk code in lookup()),
224	 * causing deadlock.
225	 *
226	 * Require that Q_QUOTAON handles the vfs_busy() reference on
227	 * its own, always returning with ubusied mount point.
228	 */
229	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON &&
230	    (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF)
231		vfs_unbusy(mp);
232	vfs_rel(mp);
233	return (error);
234}
235
236/*
237 * Used by statfs conversion routines to scale the block size up if
238 * necessary so that all of the block counts are <= 'max_size'.  Note
239 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
240 * value of 'n'.
241 */
242void
243statfs_scale_blocks(struct statfs *sf, long max_size)
244{
245	uint64_t count;
246	int shift;
247
248	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
249
250	/*
251	 * Attempt to scale the block counts to give a more accurate
252	 * overview to userland of the ratio of free space to used
253	 * space.  To do this, find the largest block count and compute
254	 * a divisor that lets it fit into a signed integer <= max_size.
255	 */
256	if (sf->f_bavail < 0)
257		count = -sf->f_bavail;
258	else
259		count = sf->f_bavail;
260	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
261	if (count <= max_size)
262		return;
263
264	count >>= flsl(max_size);
265	shift = 0;
266	while (count > 0) {
267		shift++;
268		count >>=1;
269	}
270
271	sf->f_bsize <<= shift;
272	sf->f_blocks >>= shift;
273	sf->f_bfree >>= shift;
274	sf->f_bavail >>= shift;
275}
276
277static int
278kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
279{
280	int error;
281
282	if (mp == NULL)
283		return (EBADF);
284	error = vfs_busy(mp, 0);
285	vfs_rel(mp);
286	if (error != 0)
287		return (error);
288#ifdef MAC
289	error = mac_mount_check_stat(td->td_ucred, mp);
290	if (error != 0)
291		goto out;
292#endif
293	error = VFS_STATFS(mp, buf);
294	if (error != 0)
295		goto out;
296	if (priv_check_cred_vfs_generation(td->td_ucred)) {
297		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
298		prison_enforce_statfs(td->td_ucred, mp, buf);
299	}
300out:
301	vfs_unbusy(mp);
302	return (error);
303}
304
305/*
306 * Get filesystem statistics.
307 */
308#ifndef _SYS_SYSPROTO_H_
309struct statfs_args {
310	char *path;
311	struct statfs *buf;
312};
313#endif
314int
315sys_statfs(struct thread *td, struct statfs_args *uap)
316{
317	struct statfs *sfp;
318	int error;
319
320	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
321	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
322	if (error == 0)
323		error = copyout(sfp, uap->buf, sizeof(struct statfs));
324	free(sfp, M_STATFS);
325	return (error);
326}
327
328int
329kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg,
330    struct statfs *buf)
331{
332	struct mount *mp;
333	struct nameidata nd;
334	int error;
335
336	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
337	error = namei(&nd);
338	if (error != 0)
339		return (error);
340	mp = vfs_ref_from_vp(nd.ni_vp);
341	NDFREE_NOTHING(&nd);
342	vrele(nd.ni_vp);
343	return (kern_do_statfs(td, mp, buf));
344}
345
346/*
347 * Get filesystem statistics.
348 */
349#ifndef _SYS_SYSPROTO_H_
350struct fstatfs_args {
351	int fd;
352	struct statfs *buf;
353};
354#endif
355int
356sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
357{
358	struct statfs *sfp;
359	int error;
360
361	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
362	error = kern_fstatfs(td, uap->fd, sfp);
363	if (error == 0)
364		error = copyout(sfp, uap->buf, sizeof(struct statfs));
365	free(sfp, M_STATFS);
366	return (error);
367}
368
369int
370kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
371{
372	struct file *fp;
373	struct mount *mp;
374	struct vnode *vp;
375	int error;
376
377	AUDIT_ARG_FD(fd);
378	error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp);
379	if (error != 0)
380		return (error);
381	vp = fp->f_vnode;
382#ifdef AUDIT
383	if (AUDITING_TD(td)) {
384		vn_lock(vp, LK_SHARED | LK_RETRY);
385		AUDIT_ARG_VNODE1(vp);
386		VOP_UNLOCK(vp);
387	}
388#endif
389	mp = vfs_ref_from_vp(vp);
390	fdrop(fp, td);
391	return (kern_do_statfs(td, mp, buf));
392}
393
394/*
395 * Get statistics on all filesystems.
396 */
397#ifndef _SYS_SYSPROTO_H_
398struct getfsstat_args {
399	struct statfs *buf;
400	long bufsize;
401	int mode;
402};
403#endif
404int
405sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
406{
407	size_t count;
408	int error;
409
410	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
411		return (EINVAL);
412	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
413	    UIO_USERSPACE, uap->mode);
414	if (error == 0)
415		td->td_retval[0] = count;
416	return (error);
417}
418
419/*
420 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
421 *	The caller is responsible for freeing memory which will be allocated
422 *	in '*buf'.
423 */
424int
425kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
426    size_t *countp, enum uio_seg bufseg, int mode)
427{
428	struct mount *mp, *nmp;
429	struct statfs *sfsp, *sp, *sptmp, *tofree;
430	size_t count, maxcount;
431	int error;
432
433	switch (mode) {
434	case MNT_WAIT:
435	case MNT_NOWAIT:
436		break;
437	default:
438		if (bufseg == UIO_SYSSPACE)
439			*buf = NULL;
440		return (EINVAL);
441	}
442restart:
443	maxcount = bufsize / sizeof(struct statfs);
444	if (bufsize == 0) {
445		sfsp = NULL;
446		tofree = NULL;
447	} else if (bufseg == UIO_USERSPACE) {
448		sfsp = *buf;
449		tofree = NULL;
450	} else /* if (bufseg == UIO_SYSSPACE) */ {
451		count = 0;
452		mtx_lock(&mountlist_mtx);
453		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
454			count++;
455		}
456		mtx_unlock(&mountlist_mtx);
457		if (maxcount > count)
458			maxcount = count;
459		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
460		    M_STATFS, M_WAITOK);
461	}
462
463	count = 0;
464
465	/*
466	 * If there is no target buffer they only want the count.
467	 *
468	 * This could be TAILQ_FOREACH but it is open-coded to match the original
469	 * code below.
470	 */
471	if (sfsp == NULL) {
472		mtx_lock(&mountlist_mtx);
473		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
474			if (prison_canseemount(td->td_ucred, mp) != 0) {
475				nmp = TAILQ_NEXT(mp, mnt_list);
476				continue;
477			}
478#ifdef MAC
479			if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
480				nmp = TAILQ_NEXT(mp, mnt_list);
481				continue;
482			}
483#endif
484			count++;
485			nmp = TAILQ_NEXT(mp, mnt_list);
486		}
487		mtx_unlock(&mountlist_mtx);
488		*countp = count;
489		return (0);
490	}
491
492	/*
493	 * They want the entire thing.
494	 *
495	 * Short-circuit the corner case of no room for anything, avoids
496	 * relocking below.
497	 */
498	if (maxcount < 1) {
499		goto out;
500	}
501
502	mtx_lock(&mountlist_mtx);
503	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
504		if (prison_canseemount(td->td_ucred, mp) != 0) {
505			nmp = TAILQ_NEXT(mp, mnt_list);
506			continue;
507		}
508#ifdef MAC
509		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
510			nmp = TAILQ_NEXT(mp, mnt_list);
511			continue;
512		}
513#endif
514		if (mode == MNT_WAIT) {
515			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
516				/*
517				 * If vfs_busy() failed, and MBF_NOWAIT
518				 * wasn't passed, then the mp is gone.
519				 * Furthermore, because of MBF_MNTLSTLOCK,
520				 * the mountlist_mtx was dropped.  We have
521				 * no other choice than to start over.
522				 */
523				mtx_unlock(&mountlist_mtx);
524				free(tofree, M_STATFS);
525				goto restart;
526			}
527		} else {
528			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
529				nmp = TAILQ_NEXT(mp, mnt_list);
530				continue;
531			}
532		}
533		sp = &mp->mnt_stat;
534		/*
535		 * If MNT_NOWAIT is specified, do not refresh
536		 * the fsstat cache.
537		 */
538		if (mode != MNT_NOWAIT) {
539			error = VFS_STATFS(mp, sp);
540			if (error != 0) {
541				mtx_lock(&mountlist_mtx);
542				nmp = TAILQ_NEXT(mp, mnt_list);
543				vfs_unbusy(mp);
544				continue;
545			}
546		}
547		if (priv_check_cred_vfs_generation(td->td_ucred)) {
548			sptmp = malloc(sizeof(struct statfs), M_STATFS,
549			    M_WAITOK);
550			*sptmp = *sp;
551			sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
552			prison_enforce_statfs(td->td_ucred, mp, sptmp);
553			sp = sptmp;
554		} else
555			sptmp = NULL;
556		if (bufseg == UIO_SYSSPACE) {
557			bcopy(sp, sfsp, sizeof(*sp));
558			free(sptmp, M_STATFS);
559		} else /* if (bufseg == UIO_USERSPACE) */ {
560			error = copyout(sp, sfsp, sizeof(*sp));
561			free(sptmp, M_STATFS);
562			if (error != 0) {
563				vfs_unbusy(mp);
564				return (error);
565			}
566		}
567		sfsp++;
568		count++;
569
570		if (count == maxcount) {
571			vfs_unbusy(mp);
572			goto out;
573		}
574
575		mtx_lock(&mountlist_mtx);
576		nmp = TAILQ_NEXT(mp, mnt_list);
577		vfs_unbusy(mp);
578	}
579	mtx_unlock(&mountlist_mtx);
580out:
581	*countp = count;
582	return (0);
583}
584
585#ifdef COMPAT_FREEBSD4
586/*
587 * Get old format filesystem statistics.
588 */
589static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
590
591#ifndef _SYS_SYSPROTO_H_
592struct freebsd4_statfs_args {
593	char *path;
594	struct ostatfs *buf;
595};
596#endif
597int
598freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
599{
600	struct ostatfs osb;
601	struct statfs *sfp;
602	int error;
603
604	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
605	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
606	if (error == 0) {
607		freebsd4_cvtstatfs(sfp, &osb);
608		error = copyout(&osb, uap->buf, sizeof(osb));
609	}
610	free(sfp, M_STATFS);
611	return (error);
612}
613
614/*
615 * Get filesystem statistics.
616 */
617#ifndef _SYS_SYSPROTO_H_
618struct freebsd4_fstatfs_args {
619	int fd;
620	struct ostatfs *buf;
621};
622#endif
623int
624freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
625{
626	struct ostatfs osb;
627	struct statfs *sfp;
628	int error;
629
630	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
631	error = kern_fstatfs(td, uap->fd, sfp);
632	if (error == 0) {
633		freebsd4_cvtstatfs(sfp, &osb);
634		error = copyout(&osb, uap->buf, sizeof(osb));
635	}
636	free(sfp, M_STATFS);
637	return (error);
638}
639
640/*
641 * Get statistics on all filesystems.
642 */
643#ifndef _SYS_SYSPROTO_H_
644struct freebsd4_getfsstat_args {
645	struct ostatfs *buf;
646	long bufsize;
647	int mode;
648};
649#endif
650int
651freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
652{
653	struct statfs *buf, *sp;
654	struct ostatfs osb;
655	size_t count, size;
656	int error;
657
658	if (uap->bufsize < 0)
659		return (EINVAL);
660	count = uap->bufsize / sizeof(struct ostatfs);
661	if (count > SIZE_MAX / sizeof(struct statfs))
662		return (EINVAL);
663	size = count * sizeof(struct statfs);
664	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
665	    uap->mode);
666	if (error == 0)
667		td->td_retval[0] = count;
668	if (size != 0) {
669		sp = buf;
670		while (count != 0 && error == 0) {
671			freebsd4_cvtstatfs(sp, &osb);
672			error = copyout(&osb, uap->buf, sizeof(osb));
673			sp++;
674			uap->buf++;
675			count--;
676		}
677		free(buf, M_STATFS);
678	}
679	return (error);
680}
681
682/*
683 * Implement fstatfs() for (NFS) file handles.
684 */
685#ifndef _SYS_SYSPROTO_H_
686struct freebsd4_fhstatfs_args {
687	struct fhandle *u_fhp;
688	struct ostatfs *buf;
689};
690#endif
691int
692freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
693{
694	struct ostatfs osb;
695	struct statfs *sfp;
696	fhandle_t fh;
697	int error;
698
699	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
700	if (error != 0)
701		return (error);
702	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
703	error = kern_fhstatfs(td, fh, sfp);
704	if (error == 0) {
705		freebsd4_cvtstatfs(sfp, &osb);
706		error = copyout(&osb, uap->buf, sizeof(osb));
707	}
708	free(sfp, M_STATFS);
709	return (error);
710}
711
712/*
713 * Convert a new format statfs structure to an old format statfs structure.
714 */
715static void
716freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
717{
718
719	statfs_scale_blocks(nsp, LONG_MAX);
720	bzero(osp, sizeof(*osp));
721	osp->f_bsize = nsp->f_bsize;
722	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
723	osp->f_blocks = nsp->f_blocks;
724	osp->f_bfree = nsp->f_bfree;
725	osp->f_bavail = nsp->f_bavail;
726	osp->f_files = MIN(nsp->f_files, LONG_MAX);
727	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
728	osp->f_owner = nsp->f_owner;
729	osp->f_type = nsp->f_type;
730	osp->f_flags = nsp->f_flags;
731	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
732	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
733	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
734	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
735	strlcpy(osp->f_fstypename, nsp->f_fstypename,
736	    MIN(MFSNAMELEN, OMFSNAMELEN));
737	strlcpy(osp->f_mntonname, nsp->f_mntonname,
738	    MIN(MNAMELEN, OMNAMELEN));
739	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
740	    MIN(MNAMELEN, OMNAMELEN));
741	osp->f_fsid = nsp->f_fsid;
742}
743#endif /* COMPAT_FREEBSD4 */
744
745#if defined(COMPAT_FREEBSD11)
746/*
747 * Get old format filesystem statistics.
748 */
749static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
750
751int
752freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
753{
754	struct freebsd11_statfs osb;
755	struct statfs *sfp;
756	int error;
757
758	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
759	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
760	if (error == 0) {
761		freebsd11_cvtstatfs(sfp, &osb);
762		error = copyout(&osb, uap->buf, sizeof(osb));
763	}
764	free(sfp, M_STATFS);
765	return (error);
766}
767
768/*
769 * Get filesystem statistics.
770 */
771int
772freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
773{
774	struct freebsd11_statfs osb;
775	struct statfs *sfp;
776	int error;
777
778	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
779	error = kern_fstatfs(td, uap->fd, sfp);
780	if (error == 0) {
781		freebsd11_cvtstatfs(sfp, &osb);
782		error = copyout(&osb, uap->buf, sizeof(osb));
783	}
784	free(sfp, M_STATFS);
785	return (error);
786}
787
788/*
789 * Get statistics on all filesystems.
790 */
791int
792freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
793{
794	struct freebsd11_statfs osb;
795	struct statfs *buf, *sp;
796	size_t count, size;
797	int error;
798
799	count = uap->bufsize / sizeof(struct ostatfs);
800	size = count * sizeof(struct statfs);
801	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
802	    uap->mode);
803	if (error == 0)
804		td->td_retval[0] = count;
805	if (size > 0) {
806		sp = buf;
807		while (count > 0 && error == 0) {
808			freebsd11_cvtstatfs(sp, &osb);
809			error = copyout(&osb, uap->buf, sizeof(osb));
810			sp++;
811			uap->buf++;
812			count--;
813		}
814		free(buf, M_STATFS);
815	}
816	return (error);
817}
818
819/*
820 * Implement fstatfs() for (NFS) file handles.
821 */
822int
823freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
824{
825	struct freebsd11_statfs osb;
826	struct statfs *sfp;
827	fhandle_t fh;
828	int error;
829
830	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
831	if (error)
832		return (error);
833	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
834	error = kern_fhstatfs(td, fh, sfp);
835	if (error == 0) {
836		freebsd11_cvtstatfs(sfp, &osb);
837		error = copyout(&osb, uap->buf, sizeof(osb));
838	}
839	free(sfp, M_STATFS);
840	return (error);
841}
842
843/*
844 * Convert a new format statfs structure to an old format statfs structure.
845 */
846static void
847freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
848{
849
850	bzero(osp, sizeof(*osp));
851	osp->f_version = FREEBSD11_STATFS_VERSION;
852	osp->f_type = nsp->f_type;
853	osp->f_flags = nsp->f_flags;
854	osp->f_bsize = nsp->f_bsize;
855	osp->f_iosize = nsp->f_iosize;
856	osp->f_blocks = nsp->f_blocks;
857	osp->f_bfree = nsp->f_bfree;
858	osp->f_bavail = nsp->f_bavail;
859	osp->f_files = nsp->f_files;
860	osp->f_ffree = nsp->f_ffree;
861	osp->f_syncwrites = nsp->f_syncwrites;
862	osp->f_asyncwrites = nsp->f_asyncwrites;
863	osp->f_syncreads = nsp->f_syncreads;
864	osp->f_asyncreads = nsp->f_asyncreads;
865	osp->f_namemax = nsp->f_namemax;
866	osp->f_owner = nsp->f_owner;
867	osp->f_fsid = nsp->f_fsid;
868	strlcpy(osp->f_fstypename, nsp->f_fstypename,
869	    MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
870	strlcpy(osp->f_mntonname, nsp->f_mntonname,
871	    MIN(MNAMELEN, sizeof(osp->f_mntonname)));
872	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
873	    MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
874}
875#endif /* COMPAT_FREEBSD11 */
876
877/*
878 * Change current working directory to a given file descriptor.
879 */
880#ifndef _SYS_SYSPROTO_H_
881struct fchdir_args {
882	int	fd;
883};
884#endif
885int
886sys_fchdir(struct thread *td, struct fchdir_args *uap)
887{
888	struct vnode *vp, *tdp;
889	struct mount *mp;
890	struct file *fp;
891	int error;
892
893	AUDIT_ARG_FD(uap->fd);
894	error = getvnode_path(td, uap->fd, &cap_fchdir_rights,
895	    &fp);
896	if (error != 0)
897		return (error);
898	vp = fp->f_vnode;
899	vref(vp);
900	fdrop(fp, td);
901	vn_lock(vp, LK_SHARED | LK_RETRY);
902	AUDIT_ARG_VNODE1(vp);
903	error = change_dir(vp, td);
904	while (!error && (mp = vp->v_mountedhere) != NULL) {
905		if (vfs_busy(mp, 0))
906			continue;
907		error = VFS_ROOT(mp, LK_SHARED, &tdp);
908		vfs_unbusy(mp);
909		if (error != 0)
910			break;
911		vput(vp);
912		vp = tdp;
913	}
914	if (error != 0) {
915		vput(vp);
916		return (error);
917	}
918	VOP_UNLOCK(vp);
919	pwd_chdir(td, vp);
920	return (0);
921}
922
923/*
924 * Change current working directory (``.'').
925 */
926#ifndef _SYS_SYSPROTO_H_
927struct chdir_args {
928	char	*path;
929};
930#endif
931int
932sys_chdir(struct thread *td, struct chdir_args *uap)
933{
934
935	return (kern_chdir(td, uap->path, UIO_USERSPACE));
936}
937
938int
939kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg)
940{
941	struct nameidata nd;
942	int error;
943
944	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
945	    pathseg, path, td);
946	if ((error = namei(&nd)) != 0)
947		return (error);
948	if ((error = change_dir(nd.ni_vp, td)) != 0) {
949		vput(nd.ni_vp);
950		NDFREE_NOTHING(&nd);
951		return (error);
952	}
953	VOP_UNLOCK(nd.ni_vp);
954	NDFREE_NOTHING(&nd);
955	pwd_chdir(td, nd.ni_vp);
956	return (0);
957}
958
959/*
960 * Change notion of root (``/'') directory.
961 */
962#ifndef _SYS_SYSPROTO_H_
963struct chroot_args {
964	char	*path;
965};
966#endif
967int
968sys_chroot(struct thread *td, struct chroot_args *uap)
969{
970	struct nameidata nd;
971	int error;
972
973	error = priv_check(td, PRIV_VFS_CHROOT);
974	if (error != 0)
975		return (error);
976	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
977	    UIO_USERSPACE, uap->path, td);
978	error = namei(&nd);
979	if (error != 0)
980		goto error;
981	error = change_dir(nd.ni_vp, td);
982	if (error != 0)
983		goto e_vunlock;
984#ifdef MAC
985	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
986	if (error != 0)
987		goto e_vunlock;
988#endif
989	VOP_UNLOCK(nd.ni_vp);
990	error = pwd_chroot(td, nd.ni_vp);
991	vrele(nd.ni_vp);
992	NDFREE_NOTHING(&nd);
993	return (error);
994e_vunlock:
995	vput(nd.ni_vp);
996error:
997	NDFREE_NOTHING(&nd);
998	return (error);
999}
1000
1001/*
1002 * Common routine for chroot and chdir.  Callers must provide a locked vnode
1003 * instance.
1004 */
1005int
1006change_dir(struct vnode *vp, struct thread *td)
1007{
1008#ifdef MAC
1009	int error;
1010#endif
1011
1012	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
1013	if (vp->v_type != VDIR)
1014		return (ENOTDIR);
1015#ifdef MAC
1016	error = mac_vnode_check_chdir(td->td_ucred, vp);
1017	if (error != 0)
1018		return (error);
1019#endif
1020	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
1021}
1022
1023static __inline void
1024flags_to_rights(int flags, cap_rights_t *rightsp)
1025{
1026	if (flags & O_EXEC) {
1027		cap_rights_set_one(rightsp, CAP_FEXECVE);
1028		if (flags & O_PATH)
1029			return;
1030	} else {
1031		switch ((flags & O_ACCMODE)) {
1032		case O_RDONLY:
1033			cap_rights_set_one(rightsp, CAP_READ);
1034			break;
1035		case O_RDWR:
1036			cap_rights_set_one(rightsp, CAP_READ);
1037			/* FALLTHROUGH */
1038		case O_WRONLY:
1039			cap_rights_set_one(rightsp, CAP_WRITE);
1040			if (!(flags & (O_APPEND | O_TRUNC)))
1041				cap_rights_set_one(rightsp, CAP_SEEK);
1042			break;
1043		}
1044	}
1045
1046	if (flags & O_CREAT)
1047		cap_rights_set_one(rightsp, CAP_CREATE);
1048
1049	if (flags & O_TRUNC)
1050		cap_rights_set_one(rightsp, CAP_FTRUNCATE);
1051
1052	if (flags & (O_SYNC | O_FSYNC))
1053		cap_rights_set_one(rightsp, CAP_FSYNC);
1054
1055	if (flags & (O_EXLOCK | O_SHLOCK))
1056		cap_rights_set_one(rightsp, CAP_FLOCK);
1057}
1058
1059/*
1060 * Check permissions, allocate an open file structure, and call the device
1061 * open routine if any.
1062 */
1063#ifndef _SYS_SYSPROTO_H_
1064struct open_args {
1065	char	*path;
1066	int	flags;
1067	int	mode;
1068};
1069#endif
1070int
1071sys_open(struct thread *td, struct open_args *uap)
1072{
1073
1074	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1075	    uap->flags, uap->mode));
1076}
1077
1078#ifndef _SYS_SYSPROTO_H_
1079struct openat_args {
1080	int	fd;
1081	char	*path;
1082	int	flag;
1083	int	mode;
1084};
1085#endif
1086int
1087sys_openat(struct thread *td, struct openat_args *uap)
1088{
1089
1090	AUDIT_ARG_FD(uap->fd);
1091	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1092	    uap->mode));
1093}
1094
1095int
1096kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
1097    int flags, int mode)
1098{
1099	struct proc *p = td->td_proc;
1100	struct filedesc *fdp;
1101	struct pwddesc *pdp;
1102	struct file *fp;
1103	struct vnode *vp;
1104	struct nameidata nd;
1105	cap_rights_t rights;
1106	int cmode, error, indx;
1107
1108	indx = -1;
1109	fdp = p->p_fd;
1110	pdp = p->p_pd;
1111
1112	AUDIT_ARG_FFLAGS(flags);
1113	AUDIT_ARG_MODE(mode);
1114	cap_rights_init_one(&rights, CAP_LOOKUP);
1115	flags_to_rights(flags, &rights);
1116
1117	/*
1118	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1119	 * may be specified.  On the other hand, for O_PATH any mode
1120	 * except O_EXEC is ignored.
1121	 */
1122	if ((flags & O_PATH) != 0) {
1123		flags &= ~(O_CREAT | O_ACCMODE);
1124	} else if ((flags & O_EXEC) != 0) {
1125		if (flags & O_ACCMODE)
1126			return (EINVAL);
1127	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
1128		return (EINVAL);
1129	} else {
1130		flags = FFLAGS(flags);
1131	}
1132
1133	/*
1134	 * Allocate a file structure. The descriptor to reference it
1135	 * is allocated and used by finstall_refed() below.
1136	 */
1137	error = falloc_noinstall(td, &fp);
1138	if (error != 0)
1139		return (error);
1140	/* Set the flags early so the finit in devfs can pick them up. */
1141	fp->f_flag = flags & FMASK;
1142	cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT;
1143	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1144	    &rights, td);
1145	td->td_dupfd = -1;		/* XXX check for fdopen */
1146	error = vn_open(&nd, &flags, cmode, fp);
1147	if (error != 0) {
1148		/*
1149		 * If the vn_open replaced the method vector, something
1150		 * wonderous happened deep below and we just pass it up
1151		 * pretending we know what we do.
1152		 */
1153		if (error == ENXIO && fp->f_ops != &badfileops) {
1154			MPASS((flags & O_PATH) == 0);
1155			goto success;
1156		}
1157
1158		/*
1159		 * Handle special fdopen() case. bleh.
1160		 *
1161		 * Don't do this for relative (capability) lookups; we don't
1162		 * understand exactly what would happen, and we don't think
1163		 * that it ever should.
1164		 */
1165		if ((nd.ni_resflags & NIRES_STRICTREL) == 0 &&
1166		    (error == ENODEV || error == ENXIO) &&
1167		    td->td_dupfd >= 0) {
1168			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1169			    &indx);
1170			if (error == 0)
1171				goto success;
1172		}
1173
1174		goto bad;
1175	}
1176	td->td_dupfd = 0;
1177	NDFREE(&nd, NDF_ONLY_PNBUF);
1178	vp = nd.ni_vp;
1179
1180	/*
1181	 * Store the vnode, for any f_type. Typically, the vnode use
1182	 * count is decremented by direct call to vn_closefile() for
1183	 * files that switched type in the cdevsw fdopen() method.
1184	 */
1185	fp->f_vnode = vp;
1186
1187	/*
1188	 * If the file wasn't claimed by devfs bind it to the normal
1189	 * vnode operations here.
1190	 */
1191	if (fp->f_ops == &badfileops) {
1192		KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0,
1193		    ("Unexpected fifo fp %p vp %p", fp, vp));
1194		if ((flags & O_PATH) != 0) {
1195			finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED),
1196			    DTYPE_VNODE, NULL, &path_fileops);
1197			vhold(vp);
1198			vunref(vp);
1199		} else {
1200			finit_vnode(fp, flags, NULL, &vnops);
1201		}
1202	}
1203
1204	VOP_UNLOCK(vp);
1205	if (flags & O_TRUNC) {
1206		error = fo_truncate(fp, 0, td->td_ucred, td);
1207		if (error != 0)
1208			goto bad;
1209	}
1210success:
1211	/*
1212	 * If we haven't already installed the FD (for dupfdopen), do so now.
1213	 */
1214	if (indx == -1) {
1215		struct filecaps *fcaps;
1216
1217#ifdef CAPABILITIES
1218		if ((nd.ni_resflags & NIRES_STRICTREL) != 0)
1219			fcaps = &nd.ni_filecaps;
1220		else
1221#endif
1222			fcaps = NULL;
1223		error = finstall_refed(td, fp, &indx, flags, fcaps);
1224		/* On success finstall_refed() consumes fcaps. */
1225		if (error != 0) {
1226			filecaps_free(&nd.ni_filecaps);
1227			goto bad;
1228		}
1229	} else {
1230		filecaps_free(&nd.ni_filecaps);
1231		falloc_abort(td, fp);
1232	}
1233
1234	td->td_retval[0] = indx;
1235	return (0);
1236bad:
1237	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1238	falloc_abort(td, fp);
1239	return (error);
1240}
1241
1242#ifdef COMPAT_43
1243/*
1244 * Create a file.
1245 */
1246#ifndef _SYS_SYSPROTO_H_
1247struct ocreat_args {
1248	char	*path;
1249	int	mode;
1250};
1251#endif
1252int
1253ocreat(struct thread *td, struct ocreat_args *uap)
1254{
1255
1256	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1257	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1258}
1259#endif /* COMPAT_43 */
1260
1261/*
1262 * Create a special file.
1263 */
1264#ifndef _SYS_SYSPROTO_H_
1265struct mknodat_args {
1266	int	fd;
1267	char	*path;
1268	mode_t	mode;
1269	dev_t	dev;
1270};
1271#endif
1272int
1273sys_mknodat(struct thread *td, struct mknodat_args *uap)
1274{
1275
1276	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1277	    uap->dev));
1278}
1279
1280#if defined(COMPAT_FREEBSD11)
1281int
1282freebsd11_mknod(struct thread *td,
1283    struct freebsd11_mknod_args *uap)
1284{
1285
1286	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1287	    uap->mode, uap->dev));
1288}
1289
1290int
1291freebsd11_mknodat(struct thread *td,
1292    struct freebsd11_mknodat_args *uap)
1293{
1294
1295	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1296	    uap->dev));
1297}
1298#endif /* COMPAT_FREEBSD11 */
1299
/*
 * Create a character/block device node or a whiteout entry at path,
 * looked up relative to the directory descriptor fd.
 *
 * An S_IFIFO request with dev == 0 is forwarded to kern_mkfifoat();
 * every other file type is rejected with EINVAL.  Returns 0 on success,
 * EEXIST if the name already exists, or the error from the privilege
 * check, the lookup, the MAC check or the filesystem operation.
 */
int
kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
    int mode, dev_t dev)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	int error, whiteout = 0;

	AUDIT_ARG_MODE(mode);
	AUDIT_ARG_DEV(dev);
	/* Validate the file type and check privilege before any lookup. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		/* A device node must carry a device number. */
		if (error == 0 && dev == VNOVAL)
			error = EINVAL;
		break;
	case S_IFWHT:
		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
		break;
	case S_IFIFO:
		if (dev == 0)
			return (kern_mkfifoat(td, fd, path, pathseg, mode));
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error != 0)
		return (error);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, pathseg, path, fd, &cap_mknodat_rights,
	    td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/*
		 * The name already exists.  Release the path buffer and
		 * both vnodes; the parent is only vrele()'d when it is the
		 * same vnode as the target.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	} else {
		/*
		 * Build the attributes for the new node; the permission
		 * bits have the process pd_cmask bits cleared.
		 */
		VATTR_NULL(&vattr);
		vattr.va_mode = (mode & ALLPERMS) &
		    ~td->td_proc->p_pd->pd_cmask;
		vattr.va_rdev = dev;
		whiteout = 0;

		switch (mode & S_IFMT) {
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			whiteout = 1;
			break;
		default:
			/* Other types were rejected by the switch above. */
			panic("kern_mknod: invalid mode");
		}
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * The write could not be started with V_NOWAIT.  Drop the
		 * lookup state, retry with V_XSLEEP | PCATCH and, if that
		 * succeeds, redo the lookup from scratch.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
#ifdef MAC
	/* error is 0 here; whiteouts skip the MAC create check. */
	if (error == 0 && !whiteout)
		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
		    &nd.ni_cnd, &vattr);
#endif
	if (error == 0) {
		if (whiteout)
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
		else {
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
		}
	}
	/*
	 * Pass the parent and, only after a successful non-whiteout create,
	 * the new vnode to VOP_VPUT_PAIR().
	 */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL,
	    true);
	vn_finished_write(mp);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* ERELOOKUP from the operation restarts the whole lookup. */
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
1397
1398/*
1399 * Create a named pipe.
1400 */
1401#ifndef _SYS_SYSPROTO_H_
1402struct mkfifo_args {
1403	char	*path;
1404	int	mode;
1405};
1406#endif
1407int
1408sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
1409{
1410
1411	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1412	    uap->mode));
1413}
1414
1415#ifndef _SYS_SYSPROTO_H_
1416struct mkfifoat_args {
1417	int	fd;
1418	char	*path;
1419	mode_t	mode;
1420};
1421#endif
1422int
1423sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1424{
1425
1426	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1427	    uap->mode));
1428}
1429
/*
 * Create a FIFO at path, looked up relative to the directory descriptor
 * fd.  The permission bits come from mode with the process pd_cmask bits
 * cleared.
 *
 * Returns 0 on success, EEXIST if the name already exists, or the error
 * from the lookup, the MAC check or VOP_MKNOD().
 */
int
kern_mkfifoat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, int mode)
{
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights,
	    td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		/*
		 * The name already exists.  Release the path buffer and
		 * both vnodes; the parent is only vrele()'d when it is the
		 * same vnode as the target.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		return (EEXIST);
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * The write could not be started with V_NOWAIT.  Drop the
		 * lookup state, retry with V_XSLEEP | PCATCH and, if that
		 * succeeds, redo the lookup from scratch.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	/* The new vnode is only passed along when the create succeeded. */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
	vn_finished_write(mp);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* ERELOOKUP from the operation restarts the whole lookup. */
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
1483
1484/*
1485 * Make a hard file link.
1486 */
1487#ifndef _SYS_SYSPROTO_H_
1488struct link_args {
1489	char	*path;
1490	char	*link;
1491};
1492#endif
1493int
1494sys_link(struct thread *td, struct link_args *uap)
1495{
1496
1497	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1498	    UIO_USERSPACE, FOLLOW));
1499}
1500
1501#ifndef _SYS_SYSPROTO_H_
1502struct linkat_args {
1503	int	fd1;
1504	char	*path1;
1505	int	fd2;
1506	char	*path2;
1507	int	flag;
1508};
1509#endif
1510int
1511sys_linkat(struct thread *td, struct linkat_args *uap)
1512{
1513	int flag;
1514
1515	flag = uap->flag;
1516	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH |
1517	    AT_EMPTY_PATH)) != 0)
1518		return (EINVAL);
1519
1520	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1521	    UIO_USERSPACE, at2cnpflags(flag, AT_SYMLINK_FOLLOW |
1522	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH)));
1523}
1524
/*
 * Policy knobs consulted by can_hardlink().  Both default to 0 (no
 * check) and are exported read-write through SYSCTL_INT under the
 * _security_bsd node.  hardlink_check_uid has external linkage, while
 * hardlink_check_gid is private to this file.
 */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1535
1536static int
1537can_hardlink(struct vnode *vp, struct ucred *cred)
1538{
1539	struct vattr va;
1540	int error;
1541
1542	if (!hardlink_check_uid && !hardlink_check_gid)
1543		return (0);
1544
1545	error = VOP_GETATTR(vp, &va, cred);
1546	if (error != 0)
1547		return (error);
1548
1549	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1550		error = priv_check_cred(cred, PRIV_VFS_LINK);
1551		if (error != 0)
1552			return (error);
1553	}
1554
1555	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1556		error = priv_check_cred(cred, PRIV_VFS_LINK);
1557		if (error != 0)
1558			return (error);
1559	}
1560
1561	return (0);
1562}
1563
1564int
1565kern_linkat(struct thread *td, int fd1, int fd2, const char *path1,
1566    const char *path2, enum uio_seg segflag, int follow)
1567{
1568	struct nameidata nd;
1569	int error;
1570
1571	do {
1572		bwillwrite();
1573		NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag,
1574		    path1, fd1, &cap_linkat_source_rights, td);
1575		if ((error = namei(&nd)) != 0)
1576			return (error);
1577		NDFREE(&nd, NDF_ONLY_PNBUF);
1578		if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) {
1579			error = priv_check(td, PRIV_VFS_FHOPEN);
1580			if (error != 0) {
1581				vrele(nd.ni_vp);
1582				return (error);
1583			}
1584		}
1585		error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
1586	} while (error ==  EAGAIN || error == ERELOOKUP);
1587	return (error);
1588}
1589
/*
 * Create a hard link to vp at path, looked up relative to the directory
 * descriptor fd.
 *
 * vp is given up on every return path: through vrele() or vput() on the
 * early exits, or by being passed to VOP_VPUT_PAIR() after VOP_LINK().
 *
 * Returns EPERM for a directory source, EEXIST if the target name
 * exists, EXDEV if source and target directory are on different mounts,
 * and EAGAIN when the caller is expected to redo the whole operation
 * (kern_linkat() loops on it).
 */
static int
kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
    enum uio_seg segflag)
{
	struct nameidata nd;
	struct mount *mp;
	int error;

	if (vp->v_type == VDIR) {
		vrele(vp);
		return (EPERM);		/* POSIX */
	}
	NDINIT_ATRIGHTS(&nd, CREATE,
	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd,
	    &cap_linkat_target_rights, td);
	if ((error = namei(&nd)) == 0) {
		if (nd.ni_vp != NULL) {
			/* Target name already exists. */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			vrele(nd.ni_vp);
			vrele(vp);
			return (EEXIST);
		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
			/*
			 * Cross-device link.  No need to recheck
			 * vp->v_type, since it cannot change, except
			 * to VBAD.
			 */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vput(nd.ni_dvp);
			vrele(vp);
			return (EXDEV);
		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
			/* Policy checks, done with vp locked exclusively. */
			error = can_hardlink(vp, td->td_ucred);
#ifdef MAC
			if (error == 0)
				error = mac_vnode_check_link(td->td_ucred,
				    nd.ni_dvp, vp, &nd.ni_cnd);
#endif
			if (error != 0) {
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE(&nd, NDF_ONLY_PNBUF);
				return (error);
			}
			error = vn_start_write(vp, &mp, V_NOWAIT);
			if (error != 0) {
				/*
				 * The write could not be started with
				 * V_NOWAIT.  Drop everything, retry with
				 * V_XSLEEP | PCATCH and, if that succeeds,
				 * ask the caller to start over.
				 */
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE(&nd, NDF_ONLY_PNBUF);
				error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH);
				if (error != 0)
					return (error);
				return (EAGAIN);
			}
			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
			VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
			vn_finished_write(mp);
			NDFREE(&nd, NDF_ONLY_PNBUF);
			/* vp was handed to VOP_VPUT_PAIR(); skip vrele below. */
			vp = NULL;
		} else {
			/* Locking vp failed: have the caller retry. */
			vput(nd.ni_dvp);
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vrele(vp);
			return (EAGAIN);
		}
	}
	/* Reached with vp still set only when namei() failed. */
	if (vp != NULL)
		vrele(vp);
	return (error);
}
1665
1666/*
1667 * Make a symbolic link.
1668 */
1669#ifndef _SYS_SYSPROTO_H_
1670struct symlink_args {
1671	char	*path;
1672	char	*link;
1673};
1674#endif
1675int
1676sys_symlink(struct thread *td, struct symlink_args *uap)
1677{
1678
1679	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1680	    UIO_USERSPACE));
1681}
1682
1683#ifndef _SYS_SYSPROTO_H_
1684struct symlinkat_args {
1685	char	*path;
1686	int	fd;
1687	char	*path2;
1688};
1689#endif
1690int
1691sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1692{
1693
1694	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1695	    UIO_USERSPACE));
1696}
1697
/*
 * Create a symbolic link at path2, looked up relative to the directory
 * descriptor fd, whose target string is path1.
 *
 * segflg describes both path1 and path2.  When it is not UIO_SYSSPACE,
 * path1 is first copied with copyinstr() (at most MAXPATHLEN bytes) into
 * a buffer taken from namei_zone, which is freed again on every exit.
 *
 * Returns 0 on success, EEXIST if path2 already exists, or the error
 * from the copy, the lookup, the MAC check or VOP_SYMLINK().
 */
int
kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2,
    enum uio_seg segflg)
{
	struct mount *mp;
	struct vattr vattr;
	const char *syspath;
	char *tmppath;
	struct nameidata nd;
	int error;

	if (segflg == UIO_SYSSPACE) {
		syspath = path1;
	} else {
		/* tmppath is only assigned, and only freed, on this path. */
		tmppath = uma_zalloc(namei_zone, M_WAITOK);
		if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0)
			goto out;
		syspath = tmppath;
	}
	AUDIT_ARG_TEXT(syspath);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, segflg, path2, fd, &cap_symlinkat_rights,
	    td);
	if ((error = namei(&nd)) != 0)
		goto out;
	if (nd.ni_vp) {
		/*
		 * The name already exists.  Release the path buffer and
		 * both vnodes; the parent is only vrele()'d when it is the
		 * same vnode as the target.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		nd.ni_vp = NULL;
		error = EEXIST;
		goto out;
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * The write could not be started with V_NOWAIT.  Drop the
		 * lookup state, retry with V_XSLEEP | PCATCH and, if that
		 * succeeds, redo the lookup from scratch.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			goto out;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask;
#ifdef MAC
	/* va_type is only filled in for the MAC check. */
	vattr.va_type = VLNK;
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out2;
#endif
	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
#ifdef MAC
out2:
#endif
	/* The new vnode is only passed along when the create succeeded. */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
	vn_finished_write(mp);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* ERELOOKUP from the operation restarts the whole lookup. */
	if (error == ERELOOKUP)
		goto restart;
out:
	if (segflg != UIO_SYSSPACE)
		uma_zfree(namei_zone, tmppath);
	return (error);
}
1766
1767/*
1768 * Delete a whiteout from the filesystem.
1769 */
1770#ifndef _SYS_SYSPROTO_H_
1771struct undelete_args {
1772	char *path;
1773};
1774#endif
1775int
1776sys_undelete(struct thread *td, struct undelete_args *uap)
1777{
1778	struct mount *mp;
1779	struct nameidata nd;
1780	int error;
1781
1782restart:
1783	bwillwrite();
1784	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1785	    UIO_USERSPACE, uap->path, td);
1786	error = namei(&nd);
1787	if (error != 0)
1788		return (error);
1789
1790	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1791		NDFREE(&nd, NDF_ONLY_PNBUF);
1792		if (nd.ni_vp == nd.ni_dvp)
1793			vrele(nd.ni_dvp);
1794		else
1795			vput(nd.ni_dvp);
1796		if (nd.ni_vp)
1797			vrele(nd.ni_vp);
1798		return (EEXIST);
1799	}
1800	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1801		NDFREE(&nd, NDF_ONLY_PNBUF);
1802		vput(nd.ni_dvp);
1803		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1804			return (error);
1805		goto restart;
1806	}
1807	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1808	NDFREE(&nd, NDF_ONLY_PNBUF);
1809	vput(nd.ni_dvp);
1810	vn_finished_write(mp);
1811	if (error == ERELOOKUP)
1812		goto restart;
1813	return (error);
1814}
1815
1816/*
1817 * Delete a name from the filesystem.
1818 */
1819#ifndef _SYS_SYSPROTO_H_
1820struct unlink_args {
1821	char	*path;
1822};
1823#endif
1824int
1825sys_unlink(struct thread *td, struct unlink_args *uap)
1826{
1827
1828	return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
1829	    0, 0));
1830}
1831
1832static int
1833kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd,
1834    int flag, enum uio_seg pathseg, ino_t oldinum)
1835{
1836
1837	if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0)
1838		return (EINVAL);
1839
1840	if ((flag & AT_REMOVEDIR) != 0)
1841		return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0));
1842
1843	return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0));
1844}
1845
1846#ifndef _SYS_SYSPROTO_H_
1847struct unlinkat_args {
1848	int	fd;
1849	char	*path;
1850	int	flag;
1851};
1852#endif
1853int
1854sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1855{
1856
1857	return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag,
1858	    UIO_USERSPACE, 0));
1859}
1860
1861#ifndef _SYS_SYSPROTO_H_
1862struct funlinkat_args {
1863	int		dfd;
1864	const char	*path;
1865	int		fd;
1866	int		flag;
1867};
1868#endif
1869int
1870sys_funlinkat(struct thread *td, struct funlinkat_args *uap)
1871{
1872
1873	return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag,
1874	    UIO_USERSPACE, 0));
1875}
1876
/*
 * Remove the directory entry named by path, looked up relative to dfd.
 *
 * fd:      when not FD_NONE, the vnode found by the lookup must be the
 *          one behind this descriptor; otherwise the call fails with
 *          EDEADLK, or EBADF if the descriptor's vnode is doomed.
 * flag:    only AT_RESOLVE_BENEATH is translated into namei flags.
 * oldinum: when non-zero, the inode number reported by VOP_STAT() must
 *          equal it, otherwise the call fails with EIDRM.
 *
 * A directory is refused with EPERM when oldinum is 0, and a vnode
 * flagged VV_ROOT with EBUSY.  EINVAL from the lookup is reported as
 * EPERM.  Returns 0 on success or an errno value.
 */
int
kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
    enum uio_seg pathseg, int flag, ino_t oldinum)
{
	struct mount *mp;
	struct file *fp;
	struct vnode *vp;
	struct nameidata nd;
	struct stat sb;
	int error;

	fp = NULL;
	if (fd != FD_NONE) {
		error = getvnode_path(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			return (error);
	}

restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
	    pathseg, path, dfd, &cap_unlinkat_rights, td);
	if ((error = namei(&nd)) != 0) {
		if (error == EINVAL)
			error = EPERM;
		goto fdout;
	}
	vp = nd.ni_vp;
	/*
	 * Pre-removal checks; error is 0 on entry to this chain.
	 * NOTE(review): if VOP_STAT() fails, evaluation continues with the
	 * branches below, which may replace its error value.
	 */
	if (vp->v_type == VDIR && oldinum == 0) {
		error = EPERM;		/* POSIX */
	} else if (oldinum != 0 &&
	    ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
	    sb.st_ino != oldinum) {
		error = EIDRM;	/* Identifier removed */
	} else if (fp != NULL && fp->f_vnode != vp) {
		if (VN_IS_DOOMED(fp->f_vnode))
			error = EBADF;
		else
			error = EDEADLK;
	} else {
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 *
		 * XXX: can this only be a VDIR case?
		 */
		if (vp->v_vflag & VV_ROOT)
			error = EBUSY;
	}
	if (error == 0) {
		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
			/*
			 * The write could not be started with V_NOWAIT.
			 * Drop the lookup state, retry with
			 * V_XSLEEP | PCATCH and, if that succeeds, redo
			 * the lookup from scratch.
			 */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vput(nd.ni_dvp);
			if (vp == nd.ni_dvp)
				vrele(vp);
			else
				vput(vp);
			if ((error = vn_start_write(NULL, &mp,
			    V_XSLEEP | PCATCH)) != 0) {
				goto fdout;
			}
			goto restart;
		}
#ifdef MAC
		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
		    &nd.ni_cnd);
		if (error != 0)
			goto out;
#endif
		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
		vn_finished_write(mp);
	}
	/* Common release of the lookup state, on success and failure. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	/* ERELOOKUP from the operation restarts the whole lookup. */
	if (error == ERELOOKUP)
		goto restart;
fdout:
	/* Drop the file reference taken for the fd comparison, if any. */
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
1966
1967/*
1968 * Reposition read/write file offset.
1969 */
1970#ifndef _SYS_SYSPROTO_H_
1971struct lseek_args {
1972	int	fd;
1973	int	pad;
1974	off_t	offset;
1975	int	whence;
1976};
1977#endif
1978int
1979sys_lseek(struct thread *td, struct lseek_args *uap)
1980{
1981
1982	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1983}
1984
1985int
1986kern_lseek(struct thread *td, int fd, off_t offset, int whence)
1987{
1988	struct file *fp;
1989	int error;
1990
1991	AUDIT_ARG_FD(fd);
1992	error = fget(td, fd, &cap_seek_rights, &fp);
1993	if (error != 0)
1994		return (error);
1995	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1996	    fo_seek(fp, offset, whence, td) : ESPIPE;
1997	fdrop(fp, td);
1998	return (error);
1999}
2000
2001#if defined(COMPAT_43)
2002/*
2003 * Reposition read/write file offset.
2004 */
2005#ifndef _SYS_SYSPROTO_H_
2006struct olseek_args {
2007	int	fd;
2008	long	offset;
2009	int	whence;
2010};
2011#endif
2012int
2013olseek(struct thread *td, struct olseek_args *uap)
2014{
2015
2016	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
2017}
2018#endif /* COMPAT_43 */
2019
2020#if defined(COMPAT_FREEBSD6)
2021/* Version with the 'pad' argument */
2022int
2023freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
2024{
2025
2026	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
2027}
2028#endif
2029
2030/*
2031 * Check access permissions using passed credentials.
2032 */
2033static int
2034vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
2035     struct thread *td)
2036{
2037	accmode_t accmode;
2038	int error;
2039
2040	/* Flags == 0 means only check for existence. */
2041	if (user_flags == 0)
2042		return (0);
2043
2044	accmode = 0;
2045	if (user_flags & R_OK)
2046		accmode |= VREAD;
2047	if (user_flags & W_OK)
2048		accmode |= VWRITE;
2049	if (user_flags & X_OK)
2050		accmode |= VEXEC;
2051#ifdef MAC
2052	error = mac_vnode_check_access(cred, vp, accmode);
2053	if (error != 0)
2054		return (error);
2055#endif
2056	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2057		error = VOP_ACCESS(vp, accmode, cred, td);
2058	return (error);
2059}
2060
2061/*
2062 * Check access permissions using "real" credentials.
2063 */
2064#ifndef _SYS_SYSPROTO_H_
2065struct access_args {
2066	char	*path;
2067	int	amode;
2068};
2069#endif
2070int
2071sys_access(struct thread *td, struct access_args *uap)
2072{
2073
2074	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2075	    0, uap->amode));
2076}
2077
2078#ifndef _SYS_SYSPROTO_H_
2079struct faccessat_args {
2080	int	dirfd;
2081	char	*path;
2082	int	amode;
2083	int	flag;
2084}
2085#endif
2086int
2087sys_faccessat(struct thread *td, struct faccessat_args *uap)
2088{
2089
2090	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2091	    uap->amode));
2092}
2093
/*
 * Check whether path, looked up relative to the directory descriptor fd,
 * is accessible in the way described by amode.
 *
 * flag:  only AT_EACCESS, AT_RESOLVE_BENEATH and AT_EMPTY_PATH are
 *        accepted; anything else is EINVAL.
 * amode: F_OK, or a combination of R_OK, W_OK and X_OK; anything else
 *        is EINVAL.
 *
 * Unless AT_EACCESS is set, the check is made with cr_uid and
 * cr_groups[0] replaced by cr_ruid and cr_rgid.  Returns 0 when access
 * is granted, otherwise the error from the lookup or from vn_access().
 */
int
kern_accessat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, int flag, int amode)
{
	struct ucred *cred, *usecred;
	struct vnode *vp;
	struct nameidata nd;
	int error;

	if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0)
		return (EINVAL);
	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
		return (EINVAL);

	/*
	 * Create and modify a temporary credential instead of one that
	 * is potentially shared (if we need one).
	 */
	cred = td->td_ucred;
	if ((flag & AT_EACCESS) == 0 &&
	    ((cred->cr_uid != cred->cr_ruid ||
	    cred->cr_rgid != cred->cr_groups[0]))) {
		usecred = crdup(cred);
		usecred->cr_uid = cred->cr_ruid;
		usecred->cr_groups[0] = cred->cr_rgid;
		/* The lookup below runs with the temporary credential. */
		td->td_ucred = usecred;
	} else
		usecred = cred;
	AUDIT_ARG_VALUE(amode);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
	    AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH |
	    AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	error = vn_access(vp, amode, usecred, td);
	NDFREE_NOTHING(&nd);
	vput(vp);
out:
	/* Put the original credential back and free the temporary one. */
	if (usecred != cred) {
		td->td_ucred = cred;
		crfree(usecred);
	}
	return (error);
}
2140
2141/*
2142 * Check access permissions using "effective" credentials.
2143 */
2144#ifndef _SYS_SYSPROTO_H_
2145struct eaccess_args {
2146	char	*path;
2147	int	amode;
2148};
2149#endif
2150int
2151sys_eaccess(struct thread *td, struct eaccess_args *uap)
2152{
2153
2154	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2155	    AT_EACCESS, uap->amode));
2156}
2157
2158#if defined(COMPAT_43)
2159/*
2160 * Get file status; this version follows links.
2161 */
2162#ifndef _SYS_SYSPROTO_H_
2163struct ostat_args {
2164	char	*path;
2165	struct ostat *ub;
2166};
2167#endif
2168int
2169ostat(struct thread *td, struct ostat_args *uap)
2170{
2171	struct stat sb;
2172	struct ostat osb;
2173	int error;
2174
2175	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2176	    &sb, NULL);
2177	if (error != 0)
2178		return (error);
2179	cvtstat(&sb, &osb);
2180	return (copyout(&osb, uap->ub, sizeof (osb)));
2181}
2182
2183/*
2184 * Get file status; this version does not follow links.
2185 */
2186#ifndef _SYS_SYSPROTO_H_
2187struct olstat_args {
2188	char	*path;
2189	struct ostat *ub;
2190};
2191#endif
2192int
2193olstat(struct thread *td, struct olstat_args *uap)
2194{
2195	struct stat sb;
2196	struct ostat osb;
2197	int error;
2198
2199	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2200	    UIO_USERSPACE, &sb, NULL);
2201	if (error != 0)
2202		return (error);
2203	cvtstat(&sb, &osb);
2204	return (copyout(&osb, uap->ub, sizeof (osb)));
2205}
2206
2207/*
2208 * Convert from an old to a new stat structure.
2209 * XXX: many values are blindly truncated.
2210 */
2211void
2212cvtstat(struct stat *st, struct ostat *ost)
2213{
2214
2215	bzero(ost, sizeof(*ost));
2216	ost->st_dev = st->st_dev;
2217	ost->st_ino = st->st_ino;
2218	ost->st_mode = st->st_mode;
2219	ost->st_nlink = st->st_nlink;
2220	ost->st_uid = st->st_uid;
2221	ost->st_gid = st->st_gid;
2222	ost->st_rdev = st->st_rdev;
2223	ost->st_size = MIN(st->st_size, INT32_MAX);
2224	ost->st_atim = st->st_atim;
2225	ost->st_mtim = st->st_mtim;
2226	ost->st_ctim = st->st_ctim;
2227	ost->st_blksize = st->st_blksize;
2228	ost->st_blocks = st->st_blocks;
2229	ost->st_flags = st->st_flags;
2230	ost->st_gen = st->st_gen;
2231}
2232#endif /* COMPAT_43 */
2233
2234#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
/*
 * Selects what freebsd11_cvtstat() does when a value does not fit the
 * narrower field of struct freebsd11_stat:
 *   1 - fail the conversion with EOVERFLOW;
 *   2 - store the field maximum for st_ino and st_nlink (st_dev and
 *       st_rdev are left truncated);
 *   any other value - keep the truncated value.
 * Exported read-write through SYSCTL_INT under the _vfs node.
 */
int ino64_trunc_error;
SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
    &ino64_trunc_error, 0,
    "Error on truncation of device, file or inode number, or link count");
2239
/*
 * Convert a struct stat into a struct freebsd11_stat.
 *
 * st_dev, st_ino, st_nlink and st_rdev are assigned and then compared
 * with the source value to detect truncation; ino64_trunc_error decides
 * how a mismatch is handled.  Everything in *ost that follows
 * st_birthtim is zeroed at the end.
 *
 * Returns 0 on success, or EOVERFLOW when a value was truncated and
 * ino64_trunc_error is 1 (in which case *ost is only partly filled in).
 */
int
freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
{

	ost->st_dev = st->st_dev;
	if (ost->st_dev != st->st_dev) {
		switch (ino64_trunc_error) {
		default:
			/*
			 * Since dev_t is almost raw, don't clamp to the
			 * maximum for case 2, but ignore the error.
			 */
			break;
		case 1:
			return (EOVERFLOW);
		}
	}
	ost->st_ino = st->st_ino;
	if (ost->st_ino != st->st_ino) {
		switch (ino64_trunc_error) {
		default:
		case 0:
			break;
		case 1:
			return (EOVERFLOW);
		case 2:
			/* Clamp instead of keeping the truncated value. */
			ost->st_ino = UINT32_MAX;
			break;
		}
	}
	ost->st_mode = st->st_mode;
	ost->st_nlink = st->st_nlink;
	if (ost->st_nlink != st->st_nlink) {
		switch (ino64_trunc_error) {
		default:
		case 0:
			break;
		case 1:
			return (EOVERFLOW);
		case 2:
			/* Clamp instead of keeping the truncated value. */
			ost->st_nlink = UINT16_MAX;
			break;
		}
	}
	ost->st_uid = st->st_uid;
	ost->st_gid = st->st_gid;
	ost->st_rdev = st->st_rdev;
	if (ost->st_rdev != st->st_rdev) {
		/* Same policy as st_dev: never clamped. */
		switch (ino64_trunc_error) {
		default:
			break;
		case 1:
			return (EOVERFLOW);
		}
	}
	ost->st_atim = st->st_atim;
	ost->st_mtim = st->st_mtim;
	ost->st_ctim = st->st_ctim;
	ost->st_size = st->st_size;
	ost->st_blocks = st->st_blocks;
	ost->st_blksize = st->st_blksize;
	ost->st_flags = st->st_flags;
	ost->st_gen = st->st_gen;
	ost->st_lspare = 0;
	ost->st_birthtim = st->st_birthtim;
	/* Zero the bytes from the end of st_birthtim to the end of *ost. */
	bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
	    sizeof(*ost) - offsetof(struct freebsd11_stat,
	    st_birthtim) - sizeof(ost->st_birthtim));
	return (0);
}
2310
2311int
2312freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
2313{
2314	struct stat sb;
2315	struct freebsd11_stat osb;
2316	int error;
2317
2318	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2319	    &sb, NULL);
2320	if (error != 0)
2321		return (error);
2322	error = freebsd11_cvtstat(&sb, &osb);
2323	if (error == 0)
2324		error = copyout(&osb, uap->ub, sizeof(osb));
2325	return (error);
2326}
2327
2328int
2329freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
2330{
2331	struct stat sb;
2332	struct freebsd11_stat osb;
2333	int error;
2334
2335	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2336	    UIO_USERSPACE, &sb, NULL);
2337	if (error != 0)
2338		return (error);
2339	error = freebsd11_cvtstat(&sb, &osb);
2340	if (error == 0)
2341		error = copyout(&osb, uap->ub, sizeof(osb));
2342	return (error);
2343}
2344
2345int
2346freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
2347{
2348	struct fhandle fh;
2349	struct stat sb;
2350	struct freebsd11_stat osb;
2351	int error;
2352
2353	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
2354	if (error != 0)
2355		return (error);
2356	error = kern_fhstat(td, fh, &sb);
2357	if (error != 0)
2358		return (error);
2359	error = freebsd11_cvtstat(&sb, &osb);
2360	if (error == 0)
2361		error = copyout(&osb, uap->sb, sizeof(osb));
2362	return (error);
2363}
2364
2365int
2366freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
2367{
2368	struct stat sb;
2369	struct freebsd11_stat osb;
2370	int error;
2371
2372	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2373	    UIO_USERSPACE, &sb, NULL);
2374	if (error != 0)
2375		return (error);
2376	error = freebsd11_cvtstat(&sb, &osb);
2377	if (error == 0)
2378		error = copyout(&osb, uap->buf, sizeof(osb));
2379	return (error);
2380}
2381#endif	/* COMPAT_FREEBSD11 */
2382
2383/*
2384 * Get file status
2385 */
2386#ifndef _SYS_SYSPROTO_H_
2387struct fstatat_args {
2388	int	fd;
2389	char	*path;
2390	struct stat	*buf;
2391	int	flag;
2392}
2393#endif
2394int
2395sys_fstatat(struct thread *td, struct fstatat_args *uap)
2396{
2397	struct stat sb;
2398	int error;
2399
2400	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2401	    UIO_USERSPACE, &sb, NULL);
2402	if (error == 0)
2403		error = copyout(&sb, uap->buf, sizeof (sb));
2404	return (error);
2405}
2406
/*
 * Look up path relative to the directory descriptor fd and fill *sbp
 * with its status through VOP_STAT().
 *
 * flag: only AT_SYMLINK_NOFOLLOW, AT_RESOLVE_BENEATH and AT_EMPTY_PATH
 *       are accepted; anything else is EINVAL.
 * hook: optional callback, invoked with the vnode and sbp after a
 *       successful VOP_STAT() and before the vnode is vput().
 *
 * Returns 0 on success, otherwise the error from the lookup or from
 * VOP_STAT().
 */
int
kern_statat(struct thread *td, int flag, int fd, const char *path,
    enum uio_seg pathseg, struct stat *sbp,
    void (*hook)(struct vnode *vp, struct stat *sbp))
{
	struct nameidata nd;
	int error;

	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
	    AT_EMPTY_PATH)) != 0)
		return (EINVAL);

	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH |
	    AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF |
	    AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td);

	if ((error = namei(&nd)) != 0)
		return (error);
	error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
	if (error == 0) {
		if (__predict_false(hook != NULL))
			hook(nd.ni_vp, sbp);
	}
	NDFREE_NOTHING(&nd);
	vput(nd.ni_vp);
	/* The steps below run whether or not VOP_STAT() succeeded. */
#ifdef __STAT_TIME_T_EXT
	sbp->st_atim_ext = 0;
	sbp->st_mtim_ext = 0;
	sbp->st_ctim_ext = 0;
	sbp->st_btim_ext = 0;
#endif
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrstat_error(sbp, error);
#endif
	return (error);
}
2444
2445#if defined(COMPAT_FREEBSD11)
2446/*
2447 * Implementation of the NetBSD [l]stat() functions.
2448 */
2449void
2450freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
2451{
2452
2453	bzero(nsb, sizeof(*nsb));
2454	nsb->st_dev = sb->st_dev;
2455	nsb->st_ino = sb->st_ino;
2456	nsb->st_mode = sb->st_mode;
2457	nsb->st_nlink = sb->st_nlink;
2458	nsb->st_uid = sb->st_uid;
2459	nsb->st_gid = sb->st_gid;
2460	nsb->st_rdev = sb->st_rdev;
2461	nsb->st_atim = sb->st_atim;
2462	nsb->st_mtim = sb->st_mtim;
2463	nsb->st_ctim = sb->st_ctim;
2464	nsb->st_size = sb->st_size;
2465	nsb->st_blocks = sb->st_blocks;
2466	nsb->st_blksize = sb->st_blksize;
2467	nsb->st_flags = sb->st_flags;
2468	nsb->st_gen = sb->st_gen;
2469	nsb->st_birthtim = sb->st_birthtim;
2470}
2471
2472#ifndef _SYS_SYSPROTO_H_
2473struct freebsd11_nstat_args {
2474	char	*path;
2475	struct nstat *ub;
2476};
2477#endif
2478int
2479freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
2480{
2481	struct stat sb;
2482	struct nstat nsb;
2483	int error;
2484
2485	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2486	    &sb, NULL);
2487	if (error != 0)
2488		return (error);
2489	freebsd11_cvtnstat(&sb, &nsb);
2490	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2491}
2492
2493/*
2494 * NetBSD lstat.  Get file status; this version does not follow links.
2495 */
2496#ifndef _SYS_SYSPROTO_H_
2497struct freebsd11_nlstat_args {
2498	char	*path;
2499	struct nstat *ub;
2500};
2501#endif
2502int
2503freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
2504{
2505	struct stat sb;
2506	struct nstat nsb;
2507	int error;
2508
2509	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2510	    UIO_USERSPACE, &sb, NULL);
2511	if (error != 0)
2512		return (error);
2513	freebsd11_cvtnstat(&sb, &nsb);
2514	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2515}
2516#endif /* COMPAT_FREEBSD11 */
2517
2518/*
2519 * Get configurable pathname variables.
2520 */
2521#ifndef _SYS_SYSPROTO_H_
2522struct pathconf_args {
2523	char	*path;
2524	int	name;
2525};
2526#endif
2527int
2528sys_pathconf(struct thread *td, struct pathconf_args *uap)
2529{
2530	long value;
2531	int error;
2532
2533	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
2534	    &value);
2535	if (error == 0)
2536		td->td_retval[0] = value;
2537	return (error);
2538}
2539
2540#ifndef _SYS_SYSPROTO_H_
2541struct lpathconf_args {
2542	char	*path;
2543	int	name;
2544};
2545#endif
2546int
2547sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
2548{
2549	long value;
2550	int error;
2551
2552	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2553	    NOFOLLOW, &value);
2554	if (error == 0)
2555		td->td_retval[0] = value;
2556	return (error);
2557}
2558
2559int
2560kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg,
2561    int name, u_long flags, long *valuep)
2562{
2563	struct nameidata nd;
2564	int error;
2565
2566	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2567	    pathseg, path, td);
2568	if ((error = namei(&nd)) != 0)
2569		return (error);
2570	NDFREE_NOTHING(&nd);
2571
2572	error = VOP_PATHCONF(nd.ni_vp, name, valuep);
2573	vput(nd.ni_vp);
2574	return (error);
2575}
2576
2577/*
2578 * Return target name of a symbolic link.
2579 */
2580#ifndef _SYS_SYSPROTO_H_
2581struct readlink_args {
2582	char	*path;
2583	char	*buf;
2584	size_t	count;
2585};
2586#endif
2587int
2588sys_readlink(struct thread *td, struct readlink_args *uap)
2589{
2590
2591	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2592	    uap->buf, UIO_USERSPACE, uap->count));
2593}
2594#ifndef _SYS_SYSPROTO_H_
2595struct readlinkat_args {
2596	int	fd;
2597	char	*path;
2598	char	*buf;
2599	size_t	bufsize;
2600};
2601#endif
2602int
2603sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2604{
2605
2606	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2607	    uap->buf, UIO_USERSPACE, uap->bufsize));
2608}
2609
2610int
2611kern_readlinkat(struct thread *td, int fd, const char *path,
2612    enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count)
2613{
2614	struct vnode *vp;
2615	struct nameidata nd;
2616	int error;
2617
2618	if (count > IOSIZE_MAX)
2619		return (EINVAL);
2620
2621	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2622	    pathseg, path, fd, td);
2623
2624	if ((error = namei(&nd)) != 0)
2625		return (error);
2626	NDFREE_NOTHING(&nd);
2627	vp = nd.ni_vp;
2628
2629	error = kern_readlink_vp(vp, buf, bufseg, count, td);
2630	vput(vp);
2631
2632	return (error);
2633}
2634
2635/*
2636 * Helper function to readlink from a vnode
2637 */
2638static int
2639kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
2640    struct thread *td)
2641{
2642	struct iovec aiov;
2643	struct uio auio;
2644	int error;
2645
2646	ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
2647#ifdef MAC
2648	error = mac_vnode_check_readlink(td->td_ucred, vp);
2649	if (error != 0)
2650		return (error);
2651#endif
2652	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
2653		return (EINVAL);
2654
2655	aiov.iov_base = buf;
2656	aiov.iov_len = count;
2657	auio.uio_iov = &aiov;
2658	auio.uio_iovcnt = 1;
2659	auio.uio_offset = 0;
2660	auio.uio_rw = UIO_READ;
2661	auio.uio_segflg = bufseg;
2662	auio.uio_td = td;
2663	auio.uio_resid = count;
2664	error = VOP_READLINK(vp, &auio, td->td_ucred);
2665	td->td_retval[0] = count - auio.uio_resid;
2666	return (error);
2667}
2668
2669/*
2670 * Common implementation code for chflags() and fchflags().
2671 */
2672static int
2673setfflags(struct thread *td, struct vnode *vp, u_long flags)
2674{
2675	struct mount *mp;
2676	struct vattr vattr;
2677	int error;
2678
2679	/* We can't support the value matching VNOVAL. */
2680	if (flags == VNOVAL)
2681		return (EOPNOTSUPP);
2682
2683	/*
2684	 * Prevent non-root users from setting flags on devices.  When
2685	 * a device is reused, users can retain ownership of the device
2686	 * if they are allowed to set flags and programs assume that
2687	 * chown can't fail when done as root.
2688	 */
2689	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2690		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2691		if (error != 0)
2692			return (error);
2693	}
2694
2695	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2696		return (error);
2697	VATTR_NULL(&vattr);
2698	vattr.va_flags = flags;
2699	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2700#ifdef MAC
2701	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2702	if (error == 0)
2703#endif
2704		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2705	VOP_UNLOCK(vp);
2706	vn_finished_write(mp);
2707	return (error);
2708}
2709
2710/*
2711 * Change flags of a file given a path name.
2712 */
2713#ifndef _SYS_SYSPROTO_H_
2714struct chflags_args {
2715	const char *path;
2716	u_long	flags;
2717};
2718#endif
2719int
2720sys_chflags(struct thread *td, struct chflags_args *uap)
2721{
2722
2723	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2724	    uap->flags, 0));
2725}
2726
2727#ifndef _SYS_SYSPROTO_H_
2728struct chflagsat_args {
2729	int	fd;
2730	const char *path;
2731	u_long	flags;
2732	int	atflag;
2733}
2734#endif
2735int
2736sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2737{
2738
2739	if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
2740	    AT_EMPTY_PATH)) != 0)
2741		return (EINVAL);
2742
2743	return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE,
2744	    uap->flags, uap->atflag));
2745}
2746
2747/*
2748 * Same as chflags() but doesn't follow symlinks.
2749 */
2750#ifndef _SYS_SYSPROTO_H_
2751struct lchflags_args {
2752	const char *path;
2753	u_long flags;
2754};
2755#endif
2756int
2757sys_lchflags(struct thread *td, struct lchflags_args *uap)
2758{
2759
2760	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2761	    uap->flags, AT_SYMLINK_NOFOLLOW));
2762}
2763
2764static int
2765kern_chflagsat(struct thread *td, int fd, const char *path,
2766    enum uio_seg pathseg, u_long flags, int atflag)
2767{
2768	struct nameidata nd;
2769	int error;
2770
2771	AUDIT_ARG_FFLAGS(flags);
2772	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW |
2773	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
2774	    fd, &cap_fchflags_rights, td);
2775	if ((error = namei(&nd)) != 0)
2776		return (error);
2777	NDFREE_NOTHING(&nd);
2778	error = setfflags(td, nd.ni_vp, flags);
2779	vrele(nd.ni_vp);
2780	return (error);
2781}
2782
2783/*
2784 * Change flags of a file given a file descriptor.
2785 */
2786#ifndef _SYS_SYSPROTO_H_
2787struct fchflags_args {
2788	int	fd;
2789	u_long	flags;
2790};
2791#endif
2792int
2793sys_fchflags(struct thread *td, struct fchflags_args *uap)
2794{
2795	struct file *fp;
2796	int error;
2797
2798	AUDIT_ARG_FD(uap->fd);
2799	AUDIT_ARG_FFLAGS(uap->flags);
2800	error = getvnode(td, uap->fd, &cap_fchflags_rights,
2801	    &fp);
2802	if (error != 0)
2803		return (error);
2804#ifdef AUDIT
2805	if (AUDITING_TD(td)) {
2806		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2807		AUDIT_ARG_VNODE1(fp->f_vnode);
2808		VOP_UNLOCK(fp->f_vnode);
2809	}
2810#endif
2811	error = setfflags(td, fp->f_vnode, uap->flags);
2812	fdrop(fp, td);
2813	return (error);
2814}
2815
2816/*
2817 * Common implementation code for chmod(), lchmod() and fchmod().
2818 */
2819int
2820setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
2821{
2822	struct mount *mp;
2823	struct vattr vattr;
2824	int error;
2825
2826	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2827		return (error);
2828	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2829	VATTR_NULL(&vattr);
2830	vattr.va_mode = mode & ALLPERMS;
2831#ifdef MAC
2832	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2833	if (error == 0)
2834#endif
2835		error = VOP_SETATTR(vp, &vattr, cred);
2836	VOP_UNLOCK(vp);
2837	vn_finished_write(mp);
2838	return (error);
2839}
2840
2841/*
2842 * Change mode of a file given path name.
2843 */
2844#ifndef _SYS_SYSPROTO_H_
2845struct chmod_args {
2846	char	*path;
2847	int	mode;
2848};
2849#endif
2850int
2851sys_chmod(struct thread *td, struct chmod_args *uap)
2852{
2853
2854	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2855	    uap->mode, 0));
2856}
2857
2858#ifndef _SYS_SYSPROTO_H_
2859struct fchmodat_args {
2860	int	dirfd;
2861	char	*path;
2862	mode_t	mode;
2863	int	flag;
2864}
2865#endif
2866int
2867sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2868{
2869
2870	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
2871	    AT_EMPTY_PATH)) != 0)
2872		return (EINVAL);
2873
2874	return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE,
2875	    uap->mode, uap->flag));
2876}
2877
2878/*
2879 * Change mode of a file given path name (don't follow links.)
2880 */
2881#ifndef _SYS_SYSPROTO_H_
2882struct lchmod_args {
2883	char	*path;
2884	int	mode;
2885};
2886#endif
2887int
2888sys_lchmod(struct thread *td, struct lchmod_args *uap)
2889{
2890
2891	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2892	    uap->mode, AT_SYMLINK_NOFOLLOW));
2893}
2894
2895int
2896kern_fchmodat(struct thread *td, int fd, const char *path,
2897    enum uio_seg pathseg, mode_t mode, int flag)
2898{
2899	struct nameidata nd;
2900	int error;
2901
2902	AUDIT_ARG_MODE(mode);
2903	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
2904	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
2905	    fd, &cap_fchmod_rights, td);
2906	if ((error = namei(&nd)) != 0)
2907		return (error);
2908	NDFREE_NOTHING(&nd);
2909	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2910	vrele(nd.ni_vp);
2911	return (error);
2912}
2913
2914/*
2915 * Change mode of a file given a file descriptor.
2916 */
2917#ifndef _SYS_SYSPROTO_H_
2918struct fchmod_args {
2919	int	fd;
2920	int	mode;
2921};
2922#endif
2923int
2924sys_fchmod(struct thread *td, struct fchmod_args *uap)
2925{
2926	struct file *fp;
2927	int error;
2928
2929	AUDIT_ARG_FD(uap->fd);
2930	AUDIT_ARG_MODE(uap->mode);
2931
2932	error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
2933	if (error != 0)
2934		return (error);
2935	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2936	fdrop(fp, td);
2937	return (error);
2938}
2939
2940/*
2941 * Common implementation for chown(), lchown(), and fchown()
2942 */
2943int
2944setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
2945    gid_t gid)
2946{
2947	struct mount *mp;
2948	struct vattr vattr;
2949	int error;
2950
2951	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2952		return (error);
2953	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2954	VATTR_NULL(&vattr);
2955	vattr.va_uid = uid;
2956	vattr.va_gid = gid;
2957#ifdef MAC
2958	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2959	    vattr.va_gid);
2960	if (error == 0)
2961#endif
2962		error = VOP_SETATTR(vp, &vattr, cred);
2963	VOP_UNLOCK(vp);
2964	vn_finished_write(mp);
2965	return (error);
2966}
2967
2968/*
2969 * Set ownership given a path name.
2970 */
2971#ifndef _SYS_SYSPROTO_H_
2972struct chown_args {
2973	char	*path;
2974	int	uid;
2975	int	gid;
2976};
2977#endif
2978int
2979sys_chown(struct thread *td, struct chown_args *uap)
2980{
2981
2982	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
2983	    uap->gid, 0));
2984}
2985
2986#ifndef _SYS_SYSPROTO_H_
2987struct fchownat_args {
2988	int fd;
2989	const char * path;
2990	uid_t uid;
2991	gid_t gid;
2992	int flag;
2993};
2994#endif
2995int
2996sys_fchownat(struct thread *td, struct fchownat_args *uap)
2997{
2998
2999	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
3000	    AT_EMPTY_PATH)) != 0)
3001		return (EINVAL);
3002
3003	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3004	    uap->gid, uap->flag));
3005}
3006
3007int
3008kern_fchownat(struct thread *td, int fd, const char *path,
3009    enum uio_seg pathseg, int uid, int gid, int flag)
3010{
3011	struct nameidata nd;
3012	int error;
3013
3014	AUDIT_ARG_OWNER(uid, gid);
3015	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
3016	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
3017	    fd, &cap_fchown_rights, td);
3018
3019	if ((error = namei(&nd)) != 0)
3020		return (error);
3021	NDFREE_NOTHING(&nd);
3022	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3023	vrele(nd.ni_vp);
3024	return (error);
3025}
3026
3027/*
3028 * Set ownership given a path name, do not cross symlinks.
3029 */
3030#ifndef _SYS_SYSPROTO_H_
3031struct lchown_args {
3032	char	*path;
3033	int	uid;
3034	int	gid;
3035};
3036#endif
3037int
3038sys_lchown(struct thread *td, struct lchown_args *uap)
3039{
3040
3041	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3042	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
3043}
3044
3045/*
3046 * Set ownership given a file descriptor.
3047 */
3048#ifndef _SYS_SYSPROTO_H_
3049struct fchown_args {
3050	int	fd;
3051	int	uid;
3052	int	gid;
3053};
3054#endif
3055int
3056sys_fchown(struct thread *td, struct fchown_args *uap)
3057{
3058	struct file *fp;
3059	int error;
3060
3061	AUDIT_ARG_FD(uap->fd);
3062	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3063	error = fget(td, uap->fd, &cap_fchown_rights, &fp);
3064	if (error != 0)
3065		return (error);
3066	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3067	fdrop(fp, td);
3068	return (error);
3069}
3070
3071/*
3072 * Common implementation code for utimes(), lutimes(), and futimes().
3073 */
3074static int
3075getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
3076    struct timespec *tsp)
3077{
3078	struct timeval tv[2];
3079	const struct timeval *tvp;
3080	int error;
3081
3082	if (usrtvp == NULL) {
3083		vfs_timestamp(&tsp[0]);
3084		tsp[1] = tsp[0];
3085	} else {
3086		if (tvpseg == UIO_SYSSPACE) {
3087			tvp = usrtvp;
3088		} else {
3089			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3090				return (error);
3091			tvp = tv;
3092		}
3093
3094		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3095		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3096			return (EINVAL);
3097		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3098		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3099	}
3100	return (0);
3101}
3102
3103/*
3104 * Common implementation code for futimens(), utimensat().
3105 */
3106#define	UTIMENS_NULL	0x1
3107#define	UTIMENS_EXIT	0x2
3108static int
3109getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
3110    struct timespec *tsp, int *retflags)
3111{
3112	struct timespec tsnow;
3113	int error;
3114
3115	vfs_timestamp(&tsnow);
3116	*retflags = 0;
3117	if (usrtsp == NULL) {
3118		tsp[0] = tsnow;
3119		tsp[1] = tsnow;
3120		*retflags |= UTIMENS_NULL;
3121		return (0);
3122	}
3123	if (tspseg == UIO_SYSSPACE) {
3124		tsp[0] = usrtsp[0];
3125		tsp[1] = usrtsp[1];
3126	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
3127		return (error);
3128	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
3129		*retflags |= UTIMENS_EXIT;
3130	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
3131		*retflags |= UTIMENS_NULL;
3132	if (tsp[0].tv_nsec == UTIME_OMIT)
3133		tsp[0].tv_sec = VNOVAL;
3134	else if (tsp[0].tv_nsec == UTIME_NOW)
3135		tsp[0] = tsnow;
3136	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
3137		return (EINVAL);
3138	if (tsp[1].tv_nsec == UTIME_OMIT)
3139		tsp[1].tv_sec = VNOVAL;
3140	else if (tsp[1].tv_nsec == UTIME_NOW)
3141		tsp[1] = tsnow;
3142	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
3143		return (EINVAL);
3144
3145	return (0);
3146}
3147
3148/*
3149 * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
3150 * and utimensat().
3151 */
3152static int
3153setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
3154    int numtimes, int nullflag)
3155{
3156	struct mount *mp;
3157	struct vattr vattr;
3158	int error, setbirthtime;
3159
3160	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3161		return (error);
3162	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3163	setbirthtime = 0;
3164	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3165	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3166		setbirthtime = 1;
3167	VATTR_NULL(&vattr);
3168	vattr.va_atime = ts[0];
3169	vattr.va_mtime = ts[1];
3170	if (setbirthtime)
3171		vattr.va_birthtime = ts[1];
3172	if (numtimes > 2)
3173		vattr.va_birthtime = ts[2];
3174	if (nullflag)
3175		vattr.va_vaflags |= VA_UTIMES_NULL;
3176#ifdef MAC
3177	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3178	    vattr.va_mtime);
3179#endif
3180	if (error == 0)
3181		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3182	VOP_UNLOCK(vp);
3183	vn_finished_write(mp);
3184	return (error);
3185}
3186
3187/*
3188 * Set the access and modification times of a file.
3189 */
3190#ifndef _SYS_SYSPROTO_H_
3191struct utimes_args {
3192	char	*path;
3193	struct	timeval *tptr;
3194};
3195#endif
3196int
3197sys_utimes(struct thread *td, struct utimes_args *uap)
3198{
3199
3200	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3201	    uap->tptr, UIO_USERSPACE));
3202}
3203
3204#ifndef _SYS_SYSPROTO_H_
3205struct futimesat_args {
3206	int fd;
3207	const char * path;
3208	const struct timeval * times;
3209};
3210#endif
3211int
3212sys_futimesat(struct thread *td, struct futimesat_args *uap)
3213{
3214
3215	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3216	    uap->times, UIO_USERSPACE));
3217}
3218
3219int
3220kern_utimesat(struct thread *td, int fd, const char *path,
3221    enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg)
3222{
3223	struct nameidata nd;
3224	struct timespec ts[2];
3225	int error;
3226
3227	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3228		return (error);
3229	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3230	    &cap_futimes_rights, td);
3231
3232	if ((error = namei(&nd)) != 0)
3233		return (error);
3234	NDFREE_NOTHING(&nd);
3235	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3236	vrele(nd.ni_vp);
3237	return (error);
3238}
3239
3240/*
3241 * Set the access and modification times of a file.
3242 */
3243#ifndef _SYS_SYSPROTO_H_
3244struct lutimes_args {
3245	char	*path;
3246	struct	timeval *tptr;
3247};
3248#endif
3249int
3250sys_lutimes(struct thread *td, struct lutimes_args *uap)
3251{
3252
3253	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3254	    UIO_USERSPACE));
3255}
3256
3257int
3258kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg,
3259    struct timeval *tptr, enum uio_seg tptrseg)
3260{
3261	struct timespec ts[2];
3262	struct nameidata nd;
3263	int error;
3264
3265	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3266		return (error);
3267	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3268	if ((error = namei(&nd)) != 0)
3269		return (error);
3270	NDFREE_NOTHING(&nd);
3271	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3272	vrele(nd.ni_vp);
3273	return (error);
3274}
3275
3276/*
3277 * Set the access and modification times of a file.
3278 */
3279#ifndef _SYS_SYSPROTO_H_
3280struct futimes_args {
3281	int	fd;
3282	struct	timeval *tptr;
3283};
3284#endif
3285int
3286sys_futimes(struct thread *td, struct futimes_args *uap)
3287{
3288
3289	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3290}
3291
3292int
3293kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3294    enum uio_seg tptrseg)
3295{
3296	struct timespec ts[2];
3297	struct file *fp;
3298	int error;
3299
3300	AUDIT_ARG_FD(fd);
3301	error = getutimes(tptr, tptrseg, ts);
3302	if (error != 0)
3303		return (error);
3304	error = getvnode(td, fd, &cap_futimes_rights, &fp);
3305	if (error != 0)
3306		return (error);
3307#ifdef AUDIT
3308	if (AUDITING_TD(td)) {
3309		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3310		AUDIT_ARG_VNODE1(fp->f_vnode);
3311		VOP_UNLOCK(fp->f_vnode);
3312	}
3313#endif
3314	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3315	fdrop(fp, td);
3316	return (error);
3317}
3318
3319int
3320sys_futimens(struct thread *td, struct futimens_args *uap)
3321{
3322
3323	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3324}
3325
3326int
3327kern_futimens(struct thread *td, int fd, struct timespec *tptr,
3328    enum uio_seg tptrseg)
3329{
3330	struct timespec ts[2];
3331	struct file *fp;
3332	int error, flags;
3333
3334	AUDIT_ARG_FD(fd);
3335	error = getutimens(tptr, tptrseg, ts, &flags);
3336	if (error != 0)
3337		return (error);
3338	if (flags & UTIMENS_EXIT)
3339		return (0);
3340	error = getvnode(td, fd, &cap_futimes_rights, &fp);
3341	if (error != 0)
3342		return (error);
3343#ifdef AUDIT
3344	if (AUDITING_TD(td)) {
3345		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3346		AUDIT_ARG_VNODE1(fp->f_vnode);
3347		VOP_UNLOCK(fp->f_vnode);
3348	}
3349#endif
3350	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3351	fdrop(fp, td);
3352	return (error);
3353}
3354
3355int
3356sys_utimensat(struct thread *td, struct utimensat_args *uap)
3357{
3358
3359	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3360	    uap->times, UIO_USERSPACE, uap->flag));
3361}
3362
3363int
3364kern_utimensat(struct thread *td, int fd, const char *path,
3365    enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
3366    int flag)
3367{
3368	struct nameidata nd;
3369	struct timespec ts[2];
3370	int error, flags;
3371
3372	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
3373	    AT_EMPTY_PATH)) != 0)
3374		return (EINVAL);
3375
3376	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3377		return (error);
3378	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
3379	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1,
3380	    pathseg, path, fd, &cap_futimes_rights, td);
3381	if ((error = namei(&nd)) != 0)
3382		return (error);
3383	/*
3384	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3385	 * POSIX states:
3386	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3387	 * "Search permission is denied by a component of the path prefix."
3388	 */
3389	NDFREE_NOTHING(&nd);
3390	if ((flags & UTIMENS_EXIT) == 0)
3391		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3392	vrele(nd.ni_vp);
3393	return (error);
3394}
3395
3396/*
3397 * Truncate a file given its path name.
3398 */
3399#ifndef _SYS_SYSPROTO_H_
3400struct truncate_args {
3401	char	*path;
3402	int	pad;
3403	off_t	length;
3404};
3405#endif
3406int
3407sys_truncate(struct thread *td, struct truncate_args *uap)
3408{
3409
3410	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3411}
3412
/*
 * Common implementation of the path-based truncate system calls.
 *
 * Rejects a negative length with EINVAL, resolves path (FOLLOW) and sets
 * the file size to length through VOP_SETATTR().  A directory yields
 * EISDIR.  The whole sequence, including the lookup, is repeated for as
 * long as the result is ERELOOKUP.
 *
 * Returns 0 or an errno value from namei(), vn_start_write(), the MAC
 * check, vn_writechk(), VOP_ACCESS() or VOP_SETATTR().
 */
int
kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg,
    off_t length)
{
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	if (length < 0)
		return (EINVAL);
retry:
	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	/* Range write lock over 0..OFF_MAX, taken before the vnode lock. */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		/* Undo the range lock and the namei() reference. */
		vn_rangelock_unlock(vp, rl_cookie);
		vrele(vp);
		return (error);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
#ifdef MAC
	/* Empty body: a MAC denial only needs to leave its code in error. */
	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
	}
#endif
	else if ((error = vn_writechk(vp)) == 0 &&
	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
		/* Only the size attribute is set; everything else is VNOVAL. */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	}
	/* Release in reverse order of acquisition. */
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	vn_rangelock_unlock(vp, rl_cookie);
	vrele(vp);
	if (error == ERELOOKUP)
		goto retry;
	return (error);
}
3459
3460#if defined(COMPAT_43)
3461/*
3462 * Truncate a file given its path name.
3463 */
3464#ifndef _SYS_SYSPROTO_H_
3465struct otruncate_args {
3466	char	*path;
3467	long	length;
3468};
3469#endif
3470int
3471otruncate(struct thread *td, struct otruncate_args *uap)
3472{
3473
3474	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3475}
3476#endif /* COMPAT_43 */
3477
3478#if defined(COMPAT_FREEBSD6)
3479/* Versions with the pad argument */
3480int
3481freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3482{
3483
3484	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3485}
3486
3487int
3488freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3489{
3490
3491	return (kern_ftruncate(td, uap->fd, uap->length));
3492}
3493#endif
3494
/*
 * Common implementation of fsync() and fdatasync().
 *
 * Looks up the vnode behind fd, cleans the pages of its VM object (if it
 * has one) and then calls VOP_FSYNC() with MNT_WAIT when fullsync is
 * true, or VOP_FDATASYNC() otherwise.  The write-start/lock/sync sequence
 * is repeated for as long as the VOP returns ERELOOKUP.  The file
 * reference is always dropped before returning.
 *
 * Returns 0 or an errno value from getvnode(), vn_start_write() or the
 * sync VOP.
 */
int
kern_fsync(struct thread *td, int fd, bool fullsync)
{
	struct vnode *vp;
	struct mount *mp;
	struct file *fp;
	int error, lock_flags;

	AUDIT_ARG_FD(fd);
	error = getvnode(td, fd, &cap_fsync_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;
#if 0
	if (!fullsync)
		/* XXXKIB: complete outstanding aio writes */;
#endif
retry:
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error != 0)
		goto drop;
	/*
	 * A shared vnode lock is used when the mount reported by
	 * vn_start_write() allows shared writes, or, if no mount was
	 * reported, when the vnode's own mount does; otherwise the lock is
	 * exclusive.
	 */
	if (MNT_SHARED_WRITES(mp) ||
	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}
	vn_lock(vp, lock_flags | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	if (vp->v_object != NULL) {
		/*
		 * NOTE(review): start/end of 0/0 presumably means "the
		 * whole object" -- confirm against vm_object_page_clean().
		 */
		VM_OBJECT_WLOCK(vp->v_object);
		vm_object_page_clean(vp->v_object, 0, 0, 0);
		VM_OBJECT_WUNLOCK(vp->v_object);
	}
	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	if (error == ERELOOKUP)
		goto retry;
drop:
	fdrop(fp, td);
	return (error);
}
3538
3539/*
3540 * Sync an open file.
3541 */
3542#ifndef _SYS_SYSPROTO_H_
3543struct fsync_args {
3544	int	fd;
3545};
3546#endif
3547int
3548sys_fsync(struct thread *td, struct fsync_args *uap)
3549{
3550
3551	return (kern_fsync(td, uap->fd, true));
3552}
3553
3554int
3555sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3556{
3557
3558	return (kern_fsync(td, uap->fd, false));
3559}
3560
3561/*
3562 * Rename files.  Source and destination must either both be directories, or
3563 * both not be directories.  If target is a directory, it must be empty.
3564 */
3565#ifndef _SYS_SYSPROTO_H_
3566struct rename_args {
3567	char	*from;
3568	char	*to;
3569};
3570#endif
3571int
3572sys_rename(struct thread *td, struct rename_args *uap)
3573{
3574
3575	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3576	    uap->to, UIO_USERSPACE));
3577}
3578
3579#ifndef _SYS_SYSPROTO_H_
3580struct renameat_args {
3581	int	oldfd;
3582	char	*old;
3583	int	newfd;
3584	char	*new;
3585};
3586#endif
3587int
3588sys_renameat(struct thread *td, struct renameat_args *uap)
3589{
3590
3591	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3592	    UIO_USERSPACE));
3593}
3594
3595#ifdef MAC
/*
 * Source-side lookup for kern_renameat() when the MAC rename-from check
 * is enabled.
 *
 * Looks up "old" with parent and leaf locked so that
 * mac_vnode_check_rename_from() can inspect them, then unlocks both
 * again.  On success the references held in *fromnd are left for the
 * caller; on a MAC failure everything obtained by the lookup is released
 * here.  newfd and new are not used by this function.
 *
 * Returns 0 or the error from namei() / the MAC check.
 */
static int
kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd,
    const char *new, enum uio_seg pathseg, struct nameidata *fromnd)
{
	int error;

	NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td);
	if ((error = namei(fromnd)) != 0)
		return (error);
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp,
	    fromnd->ni_vp, &fromnd->ni_cnd);
	VOP_UNLOCK(fromnd->ni_dvp);
	/* Parent and leaf may be the same vnode; unlock it only once. */
	if (fromnd->ni_dvp != fromnd->ni_vp)
		VOP_UNLOCK(fromnd->ni_vp);
	if (error != 0) {
		/* Check denied: drop the path buffer and all references. */
		NDFREE(fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd->ni_dvp);
		vrele(fromnd->ni_vp);
		if (fromnd->ni_startdir)
			vrele(fromnd->ni_startdir);
	}
	return (error);
}
3620#endif
3621
/*
 * Common implementation of rename() and renameat().
 *
 * Looks up the source ("old" relative to oldfd) and the target ("new"
 * relative to newfd), applies type, capability and MAC checks, and hands
 * both lookups to VOP_RENAME().
 *
 * Returns 0 on success, and also when source and target resolve to the
 * same vnode (internally signalled with ERESTART).  Otherwise returns an
 * errno value, notably:
 *   EINVAL   - the source vnode is the target's parent directory, or the
 *              target lookup failed with EISDIR for a directory source;
 *   ENOTDIR  - source is a directory but the existing target is not;
 *   EISDIR   - existing target is a directory but the source is not.
 * The operation restarts from "again" when the result is ERELOOKUP, or
 * after the V_XSLEEP vn_start_write() call returns successfully.
 */
int
kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
    const char *new, enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	u_int64_t tondflags;
	int error;

again:
	bwillwrite();
#ifdef MAC
	/* With the MAC hook enabled the source lookup is done by the helper. */
	if (mac_vnode_check_rename_from_enabled()) {
		error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg,
		    &fromnd);
		if (error != 0)
			return (error);
	} else {
#endif
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
	    pathseg, old, oldfd, &cap_renameat_source_rights, td);
	if ((error = namei(&fromnd)) != 0)
		return (error);
#ifdef MAC
	}
#endif
	fvp = fromnd.ni_vp;
	tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2;
	if (fromnd.ni_vp->v_type == VDIR)
		tondflags |= WILLBEDIR;
	NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd,
	    &cap_renameat_target_rights, td);
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		/* Only the source side holds resources at this point. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	error = vn_start_write(fvp, &mp, V_NOWAIT);
	if (error != 0) {
		/*
		 * The non-blocking write start failed: release everything
		 * from both lookups, wait with V_XSLEEP, and start over.
		 */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp != NULL)
			vput(tvp);
		/* If tdvp == tvp the vput() above already unlocked it. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vrele(tond.ni_startdir);
		if (fromnd.ni_startdir != NULL)
			vrele(fromnd.ni_startdir);
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error != 0)
			return (error);
		goto again;
	}
	if (tvp != NULL) {
		/* Source and existing target must agree on being a directory. */
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
#ifdef CAPABILITIES
		if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
			/*
			 * If the target already exists we require CAP_UNLINKAT
			 * from 'newfd', when newfd was used for the lookup.
			 */
			error = cap_check(&tond.ni_filecaps.fc_rights,
			    &cap_unlinkat_rights);
			if (error != 0)
				goto out;
		}
#endif
	}
	/* The source vnode itself is the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = ERESTART;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (error == 0) {
		/*
		 * NOTE(review): no vnode is released here after
		 * VOP_RENAME(); presumably the VOP consumes the
		 * references and locks itself -- confirm against
		 * VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/* VOP_RENAME() was not called: release everything here. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp != NULL)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	/* ERESTART is the internal "same vnode, nothing to do" marker. */
	if (error == ERESTART)
		return (0);
	if (error == ERELOOKUP)
		goto again;
	return (error);
}
3751
3752/*
3753 * Make a directory file.
3754 */
3755#ifndef _SYS_SYSPROTO_H_
3756struct mkdir_args {
3757	char	*path;
3758	int	mode;
3759};
3760#endif
3761int
3762sys_mkdir(struct thread *td, struct mkdir_args *uap)
3763{
3764
3765	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3766	    uap->mode));
3767}
3768
3769#ifndef _SYS_SYSPROTO_H_
3770struct mkdirat_args {
3771	int	fd;
3772	char	*path;
3773	mode_t	mode;
3774};
3775#endif
3776int
3777sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3778{
3779
3780	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3781}
3782
/*
 * Common implementation of mkdir(2) and mkdirat(2).
 *
 * Looks up `path' (located in the address space named by `segflg') for
 * creation, starting at `fd', and asks the filesystem to create a
 * directory there.  The new directory's permission bits are `mode'
 * restricted to ACCESSPERMS with the bits of the process creation mask
 * (pd_cmask) cleared.
 *
 * Returns 0 on success or an errno value.  The whole operation, including
 * the lookup, is redone when a write to the mount cannot be started
 * immediately or when the filesystem returns ERELOOKUP.
 */
int
kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	/*
	 * CREATE lookup that returns the parent directory locked.
	 * NOTE(review): FAILIFEXISTS presumably makes namei() itself fail
	 * when the name already exists, which would explain why nd.ni_vp is
	 * not checked below -- confirm.
	 */
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR,
	    segflg, path, fd, &cap_mkdirat_rights, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Could not start the write without waiting: drop everything
		 * the lookup gave us, wait with no vnode held (interruptibly,
		 * PCATCH), and start over with a fresh lookup.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	/* Apply the process file creation mask to the requested mode. */
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/*
	 * The new vnode is only handed over on success; on failure only the
	 * parent directory is passed.
	 */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
	vn_finished_write(mp);
	/* The filesystem asked for the lookup to be repeated. */
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
3827
3828/*
3829 * Remove a directory file.
3830 */
3831#ifndef _SYS_SYSPROTO_H_
3832struct rmdir_args {
3833	char	*path;
3834};
3835#endif
3836int
3837sys_rmdir(struct thread *td, struct rmdir_args *uap)
3838{
3839
3840	return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
3841	    0));
3842}
3843
/*
 * Remove the directory `path', looked up starting at `dfd'.
 *
 * `fd' is either FD_NONE or a descriptor that must refer to the very
 * vnode the lookup finds; when it refers to a different vnode the call
 * fails with EBADF (descriptor's vnode is doomed) or EDEADLK.  Of `flag'
 * only AT_RESOLVE_BENEATH is translated into a lookup flag here.
 *
 * Fails with ENOTDIR if the target is not a directory, EINVAL if the
 * target is its own parent in the lookup result (rmdir "."), and EBUSY if
 * it is the root of a mounted filesystem.  The lookup is redone when a
 * write to the mount cannot be started immediately or when the filesystem
 * returns ERELOOKUP.
 */
int
kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,
    enum uio_seg pathseg, int flag)
{
	struct mount *mp;
	struct vnode *vp;
	struct file *fp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

	/* The file reference taken here is held across all restarts. */
	fp = NULL;
	if (fd != FD_NONE) {
		error = getvnode(td, fd, cap_rights_init_one(&rights,
		    CAP_LOOKUP), &fp);
		if (error != 0)
			return (error);
	}

restart:
	bwillwrite();
	/* DELETE lookup returning both parent and target locked. */
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
	    pathseg, path, dfd, &cap_unlinkat_rights, td);
	if ((error = namei(&nd)) != 0)
		goto fdout;
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}

	/*
	 * When the caller supplied a descriptor, the path must still name
	 * the vnode that descriptor refers to.
	 */
	if (fp != NULL && fp->f_vnode != vp) {
		if (VN_IS_DOOMED(fp->f_vnode))
			error = EBADF;
		else
			error = EDEADLK;
		goto out;
	}

#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error != 0)
		goto out;
#endif
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Could not start the write without waiting: release the
		 * lookup results, wait with no vnode held (interruptibly,
		 * PCATCH), then redo the lookup.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(vp);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			goto fdout;
		goto restart;
	}
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(vp);
	/*
	 * If parent and target are the same vnode, vput(vp) above already
	 * dropped its lock, so only the remaining reference is released.
	 */
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	if (error == ERELOOKUP)
		goto restart;
fdout:
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
3932
3933#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
3934int
3935freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
3936    long *basep, void (*func)(struct freebsd11_dirent *))
3937{
3938	struct freebsd11_dirent dstdp;
3939	struct dirent *dp, *edp;
3940	char *dirbuf;
3941	off_t base;
3942	ssize_t resid, ucount;
3943	int error;
3944
3945	/* XXX arbitrary sanity limit on `count'. */
3946	count = min(count, 64 * 1024);
3947
3948	dirbuf = malloc(count, M_TEMP, M_WAITOK);
3949
3950	error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
3951	    UIO_SYSSPACE);
3952	if (error != 0)
3953		goto done;
3954	if (basep != NULL)
3955		*basep = base;
3956
3957	ucount = 0;
3958	for (dp = (struct dirent *)dirbuf,
3959	    edp = (struct dirent *)&dirbuf[count - resid];
3960	    ucount < count && dp < edp; ) {
3961		if (dp->d_reclen == 0)
3962			break;
3963		MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
3964		if (dp->d_namlen >= sizeof(dstdp.d_name))
3965			continue;
3966		dstdp.d_type = dp->d_type;
3967		dstdp.d_namlen = dp->d_namlen;
3968		dstdp.d_fileno = dp->d_fileno;		/* truncate */
3969		if (dstdp.d_fileno != dp->d_fileno) {
3970			switch (ino64_trunc_error) {
3971			default:
3972			case 0:
3973				break;
3974			case 1:
3975				error = EOVERFLOW;
3976				goto done;
3977			case 2:
3978				dstdp.d_fileno = UINT32_MAX;
3979				break;
3980			}
3981		}
3982		dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
3983		    ((dp->d_namlen + 1 + 3) &~ 3);
3984		bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
3985		bzero(dstdp.d_name + dstdp.d_namlen,
3986		    dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
3987		    dstdp.d_namlen);
3988		MPASS(dstdp.d_reclen <= dp->d_reclen);
3989		MPASS(ucount + dstdp.d_reclen <= count);
3990		if (func != NULL)
3991			func(&dstdp);
3992		error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
3993		if (error != 0)
3994			break;
3995		dp = (struct dirent *)((char *)dp + dp->d_reclen);
3996		ucount += dstdp.d_reclen;
3997	}
3998
3999done:
4000	free(dirbuf, M_TEMP);
4001	if (error == 0)
4002		td->td_retval[0] = ucount;
4003	return (error);
4004}
4005#endif /* COMPAT */
4006
4007#ifdef COMPAT_43
/*
 * Per-record callback that kern_ogetdirentries() passes to
 * freebsd11_kern_getdirentries(): rewrites the d_type/d_namlen pair of an
 * already converted record in place, differently for each byte order.
 * NOTE(review): presumably the old consumer reads these two fields as a
 * single 16-bit d_namlen -- confirm against the old dirent layout.
 */
static void
ogetdirentries_cvt(struct freebsd11_dirent *dp)
{
#if (BYTE_ORDER == LITTLE_ENDIAN)
	/*
	 * The expected low byte of dp->d_namlen is our dp->d_type.
	 * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
	 */
	/* Order matters: move the length before clearing its old slot. */
	dp->d_type = dp->d_namlen;
	dp->d_namlen = 0;
#else
	/*
	 * The dp->d_type is the high byte of the expected dp->d_namlen,
	 * so must be zero'ed.
	 */
	dp->d_type = 0;
#endif
}
4026
4027/*
4028 * Read a block of directory entries in a filesystem independent format.
4029 */
4030#ifndef _SYS_SYSPROTO_H_
4031struct ogetdirentries_args {
4032	int	fd;
4033	char	*buf;
4034	u_int	count;
4035	long	*basep;
4036};
4037#endif
4038int
4039ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
4040{
4041	long loff;
4042	int error;
4043
4044	error = kern_ogetdirentries(td, uap, &loff);
4045	if (error == 0)
4046		error = copyout(&loff, uap->basep, sizeof(long));
4047	return (error);
4048}
4049
4050int
4051kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
4052    long *ploff)
4053{
4054	long base;
4055	int error;
4056
4057	/* XXX arbitrary sanity limit on `count'. */
4058	if (uap->count > 64 * 1024)
4059		return (EINVAL);
4060
4061	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
4062	    &base, ogetdirentries_cvt);
4063
4064	if (error == 0 && uap->basep != NULL)
4065		error = copyout(&base, uap->basep, sizeof(long));
4066
4067	return (error);
4068}
4069#endif /* COMPAT_43 */
4070
4071#if defined(COMPAT_FREEBSD11)
4072#ifndef _SYS_SYSPROTO_H_
4073struct freebsd11_getdirentries_args {
4074	int	fd;
4075	char	*buf;
4076	u_int	count;
4077	long	*basep;
4078};
4079#endif
4080int
4081freebsd11_getdirentries(struct thread *td,
4082    struct freebsd11_getdirentries_args *uap)
4083{
4084	long base;
4085	int error;
4086
4087	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
4088	    &base, NULL);
4089
4090	if (error == 0 && uap->basep != NULL)
4091		error = copyout(&base, uap->basep, sizeof(long));
4092	return (error);
4093}
4094
4095int
4096freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
4097{
4098	struct freebsd11_getdirentries_args ap;
4099
4100	ap.fd = uap->fd;
4101	ap.buf = uap->buf;
4102	ap.count = uap->count;
4103	ap.basep = NULL;
4104	return (freebsd11_getdirentries(td, &ap));
4105}
4106#endif /* COMPAT_FREEBSD11 */
4107
4108/*
4109 * Read a block of directory entries in a filesystem independent format.
4110 */
4111int
4112sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
4113{
4114	off_t base;
4115	int error;
4116
4117	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4118	    NULL, UIO_USERSPACE);
4119	if (error != 0)
4120		return (error);
4121	if (uap->basep != NULL)
4122		error = copyout(&base, uap->basep, sizeof(off_t));
4123	return (error);
4124}
4125
/*
 * Read directory entries from descriptor `fd' into `buf' (`count' bytes,
 * in the address space named by `bufseg') with VOP_READDIR().
 *
 * On success *basep receives the offset the read started from, *residp
 * (when `residp' is not NULL) the number of bytes of `buf' left unused,
 * and td_retval[0] the number of bytes transferred.  On failure an errno
 * value is returned and *basep / *residp are not written.
 *
 * Fails with EINVAL when `count' exceeds IOSIZE_MAX or the vnode is not a
 * directory, and with EBADF when the descriptor is not open for reading.
 */
int
kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
    off_t *basep, ssize_t *residp, enum uio_seg bufseg)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	off_t loff;
	int error, eofflag;
	off_t foffset;

	AUDIT_ARG_FD(fd);
	if (count > IOSIZE_MAX)
		return (EINVAL);
	/*
	 * Set once, ahead of the unionread label: the retry below is only
	 * taken when uio_resid still equals count.
	 */
	auio.uio_resid = count;
	error = getvnode(td, fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	/* Held until the foffset_unlock() at `fail'. */
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto fail;
	}
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = bufseg;
	auio.uio_td = td;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	/* Remember where this read starts; that is what *basep reports. */
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error == 0)
#endif
		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
		    NULL);
	/* Picked up even on error; it is passed to foffset_unlock(). */
	foffset = auio.uio_offset;
	if (error != 0) {
		VOP_UNLOCK(vp);
		goto fail;
	}
	/*
	 * Nothing was read from the root of a union mount: switch the open
	 * file over to the covered vnode and read that from offset 0.
	 */
	if (count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;

		vp = vp->v_mount->mnt_vnodecovered;
		/* Reference the new vnode before dropping the old one. */
		VREF(vp);
		fp->f_vnode = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp);
	*basep = loff;
	if (residp != NULL)
		*residp = auio.uio_resid;
	td->td_retval[0] = count - auio.uio_resid;
fail:
	/* Common exit for success and failure alike. */
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	return (error);
}
4199
4200/*
4201 * Set the mode mask for creation of filesystem nodes.
4202 */
4203#ifndef _SYS_SYSPROTO_H_
4204struct umask_args {
4205	int	newmask;
4206};
4207#endif
4208int
4209sys_umask(struct thread *td, struct umask_args *uap)
4210{
4211	struct pwddesc *pdp;
4212
4213	pdp = td->td_proc->p_pd;
4214	PWDDESC_XLOCK(pdp);
4215	td->td_retval[0] = pdp->pd_cmask;
4216	pdp->pd_cmask = uap->newmask & ALLPERMS;
4217	PWDDESC_XUNLOCK(pdp);
4218	return (0);
4219}
4220
4221/*
4222 * Void all references to file by ripping underlying filesystem away from
4223 * vnode.
4224 */
4225#ifndef _SYS_SYSPROTO_H_
4226struct revoke_args {
4227	char	*path;
4228};
4229#endif
4230int
4231sys_revoke(struct thread *td, struct revoke_args *uap)
4232{
4233	struct vnode *vp;
4234	struct vattr vattr;
4235	struct nameidata nd;
4236	int error;
4237
4238	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4239	    uap->path, td);
4240	if ((error = namei(&nd)) != 0)
4241		return (error);
4242	vp = nd.ni_vp;
4243	NDFREE_NOTHING(&nd);
4244	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4245		error = EINVAL;
4246		goto out;
4247	}
4248#ifdef MAC
4249	error = mac_vnode_check_revoke(td->td_ucred, vp);
4250	if (error != 0)
4251		goto out;
4252#endif
4253	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4254	if (error != 0)
4255		goto out;
4256	if (td->td_ucred->cr_uid != vattr.va_uid) {
4257		error = priv_check(td, PRIV_VFS_ADMIN);
4258		if (error != 0)
4259			goto out;
4260	}
4261	if (devfs_usecount(vp) > 0)
4262		VOP_REVOKE(vp, REVOKEALL);
4263out:
4264	vput(vp);
4265	return (error);
4266}
4267
4268/*
4269 * This variant of getvnode() allows O_PATH files.  Caller should
4270 * ensure that returned file and vnode are only used for compatible
4271 * semantics.
4272 */
4273int
4274getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
4275    struct file **fpp)
4276{
4277	struct file *fp;
4278	int error;
4279
4280	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp);
4281	if (error != 0)
4282		return (error);
4283
4284	/*
4285	 * The file could be not of the vnode type, or it may be not
4286	 * yet fully initialized, in which case the f_vnode pointer
4287	 * may be set, but f_ops is still badfileops.  E.g.,
4288	 * devfs_open() transiently create such situation to
4289	 * facilitate csw d_fdopen().
4290	 *
4291	 * Dupfdopen() handling in kern_openat() installs the
4292	 * half-baked file into the process descriptor table, allowing
4293	 * other thread to dereference it. Guard against the race by
4294	 * checking f_ops.
4295	 */
4296	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4297		fdrop(fp, td);
4298		return (EINVAL);
4299	}
4300
4301	*fpp = fp;
4302	return (0);
4303}
4304
4305/*
4306 * Convert a user file descriptor to a kernel file entry and check
4307 * that, if it is a capability, the correct rights are present.
4308 * A reference on the file entry is held upon returning.
4309 */
4310int
4311getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
4312{
4313	int error;
4314
4315	error = getvnode_path(td, fd, rightsp, fpp);
4316
4317	/*
4318	 * Filter out O_PATH file descriptors, most getvnode() callers
4319	 * do not call fo_ methods.
4320	 */
4321	if (error == 0 && (*fpp)->f_ops == &path_fileops) {
4322		fdrop(*fpp, td);
4323		error = EBADF;
4324	}
4325
4326	return (error);
4327}
4328
4329/*
4330 * Get an (NFS) file handle.
4331 */
4332#ifndef _SYS_SYSPROTO_H_
4333struct lgetfh_args {
4334	char *fname;
4335	fhandle_t *fhp;
4336};
4337#endif
4338int
4339sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
4340{
4341
4342	return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
4343	    UIO_USERSPACE, uap->fhp, UIO_USERSPACE));
4344}
4345
4346#ifndef _SYS_SYSPROTO_H_
4347struct getfh_args {
4348	char *fname;
4349	fhandle_t *fhp;
4350};
4351#endif
4352int
4353sys_getfh(struct thread *td, struct getfh_args *uap)
4354{
4355
4356	return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
4357	    uap->fhp, UIO_USERSPACE));
4358}
4359
4360/*
4361 * syscall for the rpc.lockd to use to translate an open descriptor into
4362 * a NFS file handle.
4363 *
4364 * warning: do not remove the priv_check() call or this becomes one giant
4365 * security hole.
4366 */
4367#ifndef _SYS_SYSPROTO_H_
4368struct getfhat_args {
4369	int fd;
4370	char *path;
4371	fhandle_t *fhp;
4372	int flags;
4373};
4374#endif
4375int
4376sys_getfhat(struct thread *td, struct getfhat_args *uap)
4377{
4378
4379	if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
4380		return (EINVAL);
4381	return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
4382	    uap->fhp, UIO_USERSPACE));
4383}
4384
4385int
4386kern_getfhat(struct thread *td, int flags, int fd, const char *path,
4387    enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg)
4388{
4389	struct nameidata nd;
4390	fhandle_t fh;
4391	struct vnode *vp;
4392	int error;
4393
4394	error = priv_check(td, PRIV_VFS_GETFH);
4395	if (error != 0)
4396		return (error);
4397	NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW |
4398	    AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path,
4399	    fd, td);
4400	error = namei(&nd);
4401	if (error != 0)
4402		return (error);
4403	NDFREE_NOTHING(&nd);
4404	vp = nd.ni_vp;
4405	bzero(&fh, sizeof(fh));
4406	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4407	error = VOP_VPTOFH(vp, &fh.fh_fid);
4408	vput(vp);
4409	if (error == 0) {
4410		if (fhseg == UIO_USERSPACE)
4411			error = copyout(&fh, fhp, sizeof (fh));
4412		else
4413			memcpy(fhp, &fh, sizeof(fh));
4414	}
4415	return (error);
4416}
4417
4418#ifndef _SYS_SYSPROTO_H_
4419struct fhlink_args {
4420	fhandle_t *fhp;
4421	const char *to;
4422};
4423#endif
4424int
4425sys_fhlink(struct thread *td, struct fhlink_args *uap)
4426{
4427
4428	return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
4429}
4430
4431#ifndef _SYS_SYSPROTO_H_
4432struct fhlinkat_args {
4433	fhandle_t *fhp;
4434	int tofd;
4435	const char *to;
4436};
4437#endif
4438int
4439sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
4440{
4441
4442	return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
4443}
4444
4445static int
4446kern_fhlinkat(struct thread *td, int fd, const char *path,
4447    enum uio_seg pathseg, fhandle_t *fhp)
4448{
4449	fhandle_t fh;
4450	struct mount *mp;
4451	struct vnode *vp;
4452	int error;
4453
4454	error = priv_check(td, PRIV_VFS_GETFH);
4455	if (error != 0)
4456		return (error);
4457	error = copyin(fhp, &fh, sizeof(fh));
4458	if (error != 0)
4459		return (error);
4460	do {
4461		bwillwrite();
4462		if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4463			return (ESTALE);
4464		error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
4465		vfs_unbusy(mp);
4466		if (error != 0)
4467			return (error);
4468		VOP_UNLOCK(vp);
4469		error = kern_linkat_vp(td, vp, fd, path, pathseg);
4470	} while (error == EAGAIN || error == ERELOOKUP);
4471	return (error);
4472}
4473
4474#ifndef _SYS_SYSPROTO_H_
4475struct fhreadlink_args {
4476	fhandle_t *fhp;
4477	char *buf;
4478	size_t bufsize;
4479};
4480#endif
4481int
4482sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
4483{
4484	fhandle_t fh;
4485	struct mount *mp;
4486	struct vnode *vp;
4487	int error;
4488
4489	error = priv_check(td, PRIV_VFS_GETFH);
4490	if (error != 0)
4491		return (error);
4492	if (uap->bufsize > IOSIZE_MAX)
4493		return (EINVAL);
4494	error = copyin(uap->fhp, &fh, sizeof(fh));
4495	if (error != 0)
4496		return (error);
4497	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4498		return (ESTALE);
4499	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
4500	vfs_unbusy(mp);
4501	if (error != 0)
4502		return (error);
4503	error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
4504	vput(vp);
4505	return (error);
4506}
4507
4508/*
4509 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4510 * open descriptor.
4511 *
4512 * warning: do not remove the priv_check() call or this becomes one giant
4513 * security hole.
4514 */
4515#ifndef _SYS_SYSPROTO_H_
4516struct fhopen_args {
4517	const struct fhandle *u_fhp;
4518	int flags;
4519};
4520#endif
4521int
4522sys_fhopen(struct thread *td, struct fhopen_args *uap)
4523{
4524	return (kern_fhopen(td, uap->u_fhp, uap->flags));
4525}
4526
/*
 * Open the file identified by the user supplied file handle `u_fhp' with
 * open flags `flags' and install it in the descriptor table.
 *
 * Requires PRIV_VFS_FHOPEN.  The open must request reading and/or writing
 * and must not carry O_CREAT, otherwise EINVAL is returned; ESTALE is
 * returned when the handle's filesystem cannot be found.  td_retval[0] is
 * set to the new descriptor index on success and to -1 on every failure
 * that reaches the `bad' label.
 */
int
kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags)
{
	struct mount *mp;
	struct vnode *vp;
	struct fhandle fhp;
	struct file *fp;
	int fmode, error;
	int indx;

	error = priv_check(td, PRIV_VFS_FHOPEN);
	if (error != 0)
		return (error);
	/* Stays -1 unless finstall() below assigns a descriptor. */
	indx = -1;
	fmode = FFLAGS(flags);
	/* why not allow a non-read/write open for our lockd? */
	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
		return (EINVAL);
	error = copyin(u_fhp, &fhp, sizeof(fhp));
	if (error != 0)
		return(error);
	/* find the mount point */
	mp = vfs_busyfs(&fhp.fh_fsid);
	if (mp == NULL)
		return (ESTALE);
	/* now give me my vnode, it gets returned to me locked */
	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
	vfs_unbusy(mp);
	if (error != 0)
		return (error);

	error = falloc_noinstall(td, &fp);
	if (error != 0) {
		vput(vp);
		return (error);
	}
	/*
	 * An extra reference on `fp' has been held for us by
	 * falloc_noinstall().
	 */

#ifdef INVARIANTS
	/* Sentinel checked by the KASSERT in the failure path below. */
	td->td_dupfd = -1;
#endif
	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
	if (error != 0) {
		KASSERT(fp->f_ops == &badfileops,
		    ("VOP_OPEN in fhopen() set f_ops"));
		KASSERT(td->td_dupfd < 0,
		    ("fhopen() encountered fdopen()"));

		/* The open failed: unlock and release the vnode ourselves. */
		vput(vp);
		goto bad;
	}
#ifdef INVARIANTS
	td->td_dupfd = 0;
#endif
	/*
	 * Attach the vnode to the file and drop only its lock; the vnode
	 * itself is not released on this path.
	 */
	fp->f_vnode = vp;
	finit_vnode(fp, fmode, NULL, &vnops);
	VOP_UNLOCK(vp);
	if ((fmode & O_TRUNC) != 0) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error != 0)
			goto bad;
	}

	error = finstall(td, fp, &indx, fmode, NULL);
bad:
	/* Drop the reference obtained from falloc_noinstall(). */
	fdrop(fp, td);
	td->td_retval[0] = indx;
	return (error);
}
4599
4600/*
4601 * Stat an (NFS) file handle.
4602 */
4603#ifndef _SYS_SYSPROTO_H_
4604struct fhstat_args {
4605	struct fhandle *u_fhp;
4606	struct stat *sb;
4607};
4608#endif
4609int
4610sys_fhstat(struct thread *td, struct fhstat_args *uap)
4611{
4612	struct stat sb;
4613	struct fhandle fh;
4614	int error;
4615
4616	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4617	if (error != 0)
4618		return (error);
4619	error = kern_fhstat(td, fh, &sb);
4620	if (error == 0)
4621		error = copyout(&sb, uap->sb, sizeof(sb));
4622	return (error);
4623}
4624
4625int
4626kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4627{
4628	struct mount *mp;
4629	struct vnode *vp;
4630	int error;
4631
4632	error = priv_check(td, PRIV_VFS_FHSTAT);
4633	if (error != 0)
4634		return (error);
4635	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4636		return (ESTALE);
4637	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4638	vfs_unbusy(mp);
4639	if (error != 0)
4640		return (error);
4641	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
4642	vput(vp);
4643	return (error);
4644}
4645
4646/*
4647 * Implement fstatfs() for (NFS) file handles.
4648 */
4649#ifndef _SYS_SYSPROTO_H_
4650struct fhstatfs_args {
4651	struct fhandle *u_fhp;
4652	struct statfs *buf;
4653};
4654#endif
4655int
4656sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
4657{
4658	struct statfs *sfp;
4659	fhandle_t fh;
4660	int error;
4661
4662	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4663	if (error != 0)
4664		return (error);
4665	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
4666	error = kern_fhstatfs(td, fh, sfp);
4667	if (error == 0)
4668		error = copyout(sfp, uap->buf, sizeof(*sfp));
4669	free(sfp, M_STATFS);
4670	return (error);
4671}
4672
4673int
4674kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4675{
4676	struct mount *mp;
4677	struct vnode *vp;
4678	int error;
4679
4680	error = priv_check(td, PRIV_VFS_FHSTATFS);
4681	if (error != 0)
4682		return (error);
4683	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4684		return (ESTALE);
4685	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4686	if (error != 0) {
4687		vfs_unbusy(mp);
4688		return (error);
4689	}
4690	vput(vp);
4691	error = prison_canseemount(td->td_ucred, mp);
4692	if (error != 0)
4693		goto out;
4694#ifdef MAC
4695	error = mac_mount_check_stat(td->td_ucred, mp);
4696	if (error != 0)
4697		goto out;
4698#endif
4699	error = VFS_STATFS(mp, buf);
4700out:
4701	vfs_unbusy(mp);
4702	return (error);
4703}
4704
4705/*
4706 * Unlike madvise(2), we do not make a best effort to remember every
4707 * possible caching hint.  Instead, we remember the last setting with
4708 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4709 * region of any current setting.
4710 */
4711int
4712kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4713    int advice)
4714{
4715	struct fadvise_info *fa, *new;
4716	struct file *fp;
4717	struct vnode *vp;
4718	off_t end;
4719	int error;
4720
4721	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4722		return (EINVAL);
4723	AUDIT_ARG_VALUE(advice);
4724	switch (advice) {
4725	case POSIX_FADV_SEQUENTIAL:
4726	case POSIX_FADV_RANDOM:
4727	case POSIX_FADV_NOREUSE:
4728		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4729		break;
4730	case POSIX_FADV_NORMAL:
4731	case POSIX_FADV_WILLNEED:
4732	case POSIX_FADV_DONTNEED:
4733		new = NULL;
4734		break;
4735	default:
4736		return (EINVAL);
4737	}
4738	/* XXX: CAP_POSIX_FADVISE? */
4739	AUDIT_ARG_FD(fd);
4740	error = fget(td, fd, &cap_no_rights, &fp);
4741	if (error != 0)
4742		goto out;
4743	AUDIT_ARG_FILE(td->td_proc, fp);
4744	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4745		error = ESPIPE;
4746		goto out;
4747	}
4748	if (fp->f_type != DTYPE_VNODE) {
4749		error = ENODEV;
4750		goto out;
4751	}
4752	vp = fp->f_vnode;
4753	if (vp->v_type != VREG) {
4754		error = ENODEV;
4755		goto out;
4756	}
4757	if (len == 0)
4758		end = OFF_MAX;
4759	else
4760		end = offset + len - 1;
4761	switch (advice) {
4762	case POSIX_FADV_SEQUENTIAL:
4763	case POSIX_FADV_RANDOM:
4764	case POSIX_FADV_NOREUSE:
4765		/*
4766		 * Try to merge any existing non-standard region with
4767		 * this new region if possible, otherwise create a new
4768		 * non-standard region for this request.
4769		 */
4770		mtx_pool_lock(mtxpool_sleep, fp);
4771		fa = fp->f_advice;
4772		if (fa != NULL && fa->fa_advice == advice &&
4773		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4774		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4775		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4776			if (offset < fa->fa_start)
4777				fa->fa_start = offset;
4778			if (end > fa->fa_end)
4779				fa->fa_end = end;
4780		} else {
4781			new->fa_advice = advice;
4782			new->fa_start = offset;
4783			new->fa_end = end;
4784			fp->f_advice = new;
4785			new = fa;
4786		}
4787		mtx_pool_unlock(mtxpool_sleep, fp);
4788		break;
4789	case POSIX_FADV_NORMAL:
4790		/*
4791		 * If a the "normal" region overlaps with an existing
4792		 * non-standard region, trim or remove the
4793		 * non-standard region.
4794		 */
4795		mtx_pool_lock(mtxpool_sleep, fp);
4796		fa = fp->f_advice;
4797		if (fa != NULL) {
4798			if (offset <= fa->fa_start && end >= fa->fa_end) {
4799				new = fa;
4800				fp->f_advice = NULL;
4801			} else if (offset <= fa->fa_start &&
4802			    end >= fa->fa_start)
4803				fa->fa_start = end + 1;
4804			else if (offset <= fa->fa_end && end >= fa->fa_end)
4805				fa->fa_end = offset - 1;
4806			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4807				/*
4808				 * If the "normal" region is a middle
4809				 * portion of the existing
4810				 * non-standard region, just remove
4811				 * the whole thing rather than picking
4812				 * one side or the other to
4813				 * preserve.
4814				 */
4815				new = fa;
4816				fp->f_advice = NULL;
4817			}
4818		}
4819		mtx_pool_unlock(mtxpool_sleep, fp);
4820		break;
4821	case POSIX_FADV_WILLNEED:
4822	case POSIX_FADV_DONTNEED:
4823		error = VOP_ADVISE(vp, offset, end, advice);
4824		break;
4825	}
4826out:
4827	if (fp != NULL)
4828		fdrop(fp, td);
4829	free(new, M_FADVISE);
4830	return (error);
4831}
4832
4833int
4834sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4835{
4836	int error;
4837
4838	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4839	    uap->advice);
4840	return (kern_posix_error(td, error));
4841}
4842
/*
 * Copy up to `len' bytes from descriptor `infd' to descriptor `outfd'.
 *
 * `inoffp' / `outoffp' point at the offsets to use and are advanced by
 * the copy; a NULL pointer selects the file's own f_offset instead.
 * `flags' must be 0 (EINVAL otherwise) and `len' is clipped to SSIZE_MAX.
 * The output file must be open for writing without FAPPEND and the input
 * file open for reading (EBADF otherwise).  When both descriptors refer
 * to the same vnode the two byte ranges must not overlap (EINVAL).
 *
 * td_retval[0] is always set to the number of bytes copied (0 when the
 * copy was not attempted).  If the copy ends with EINTR or ERESTART both
 * offsets are reset to their initial values.
 */
int
kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
    off_t *outoffp, size_t len, unsigned int flags)
{
	struct file *infp, *outfp;
	struct vnode *invp, *outvp;
	int error;
	size_t retlen;
	void *rl_rcookie, *rl_wcookie;
	off_t savinoff, savoutoff;

	/*
	 * Everything the `out' path inspects starts in a known state;
	 * savinoff == -1 means "offsets not captured yet".
	 */
	infp = outfp = NULL;
	rl_rcookie = rl_wcookie = NULL;
	savinoff = -1;
	error = 0;
	retlen = 0;

	if (flags != 0) {
		error = EINVAL;
		goto out;
	}
	if (len > SSIZE_MAX)
		/*
		 * Although the len argument is size_t, the return argument
		 * is ssize_t (which is signed).  Therefore a size that won't
		 * fit in ssize_t can't be returned.
		 */
		len = SSIZE_MAX;

	/* Get the file structures for the file descriptors. */
	error = fget_read(td, infd, &cap_read_rights, &infp);
	if (error != 0)
		goto out;
	if (infp->f_ops == &badfileops) {
		error = EBADF;
		goto out;
	}
	if (infp->f_vnode == NULL) {
		error = EINVAL;
		goto out;
	}
	error = fget_write(td, outfd, &cap_write_rights, &outfp);
	if (error != 0)
		goto out;
	if (outfp->f_ops == &badfileops) {
		error = EBADF;
		goto out;
	}
	if (outfp->f_vnode == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Set the offset pointers to the correct place. */
	if (inoffp == NULL)
		inoffp = &infp->f_offset;
	if (outoffp == NULL)
		outoffp = &outfp->f_offset;
	/* Remembered so the offsets can be restored at `out'. */
	savinoff = *inoffp;
	savoutoff = *outoffp;

	invp = infp->f_vnode;
	outvp = outfp->f_vnode;
	/* Sanity check the f_flag bits. */
	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
	    (infp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto out;
	}

	/* If len == 0, just return 0. */
	if (len == 0)
		goto out;

	/*
	 * If infp and outfp refer to the same file, the byte ranges cannot
	 * overlap.
	 */
	if (invp == outvp && ((savinoff <= savoutoff && savinoff + len >
	    savoutoff) || (savinoff > savoutoff && savoutoff + len >
	    savinoff))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Range lock the byte ranges for both invp and outvp.  The write
	 * lock is taken first and the read lock is only tried; if the try
	 * fails the write lock is dropped, the read lock is waited for and
	 * released again, and both are retried.  This way the code never
	 * sleeps on the second lock while holding the first.
	 */
	for (;;) {
		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
		    len);
		rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
		    len);
		if (rl_rcookie != NULL)
			break;
		vn_rangelock_unlock(outvp, rl_wcookie);
		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
		vn_rangelock_unlock(invp, rl_rcookie);
	}

	/* In: bytes requested; out: bytes actually copied. */
	retlen = len;
	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
	    flags, infp->f_cred, outfp->f_cred, td);
out:
	if (rl_rcookie != NULL)
		vn_rangelock_unlock(invp, rl_rcookie);
	if (rl_wcookie != NULL)
		vn_rangelock_unlock(outvp, rl_wcookie);
	/* Put the offsets back if the copy ended with EINTR/ERESTART. */
	if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
		*inoffp = savinoff;
		*outoffp = savoutoff;
	}
	if (outfp != NULL)
		fdrop(outfp, td);
	if (infp != NULL)
		fdrop(infp, td);
	td->td_retval[0] = retlen;
	return (error);
}
4960
4961int
4962sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
4963{
4964	off_t inoff, outoff, *inoffp, *outoffp;
4965	int error;
4966
4967	inoffp = outoffp = NULL;
4968	if (uap->inoffp != NULL) {
4969		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
4970		if (error != 0)
4971			return (error);
4972		inoffp = &inoff;
4973	}
4974	if (uap->outoffp != NULL) {
4975		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
4976		if (error != 0)
4977			return (error);
4978		outoffp = &outoff;
4979	}
4980	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
4981	    outoffp, uap->len, uap->flags);
4982	if (error == 0 && uap->inoffp != NULL)
4983		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
4984	if (error == 0 && uap->outoffp != NULL)
4985		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
4986	return (error);
4987}
4988