1/*	$NetBSD: vfs_syscalls.c,v 1.561 2023/09/09 18:34:44 ad Exp $	*/
2
3/*-
4 * Copyright (c) 2008, 2009, 2019, 2020, 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 1989, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66 */
67
68/*
69 * Virtual File System System Calls
70 */
71
72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.561 2023/09/09 18:34:44 ad Exp $");
74
75#ifdef _KERNEL_OPT
76#include "opt_fileassoc.h"
77#include "veriexec.h"
78#endif
79
80#include <sys/param.h>
81#include <sys/systm.h>
82#include <sys/namei.h>
83#include <sys/filedesc.h>
84#include <sys/kernel.h>
85#include <sys/file.h>
86#include <sys/fcntl.h>
87#include <sys/stat.h>
88#include <sys/vnode.h>
89#include <sys/mount.h>
90#include <sys/fstrans.h>
91#include <sys/proc.h>
92#include <sys/uio.h>
93#include <sys/kmem.h>
94#include <sys/dirent.h>
95#include <sys/sysctl.h>
96#include <sys/syscallargs.h>
97#include <sys/vfs_syscalls.h>
98#include <sys/quota.h>
99#include <sys/quotactl.h>
100#include <sys/ktrace.h>
101#ifdef FILEASSOC
102#include <sys/fileassoc.h>
103#endif /* FILEASSOC */
104#include <sys/extattr.h>
105#include <sys/verified_exec.h>
106#include <sys/kauth.h>
107#include <sys/atomic.h>
108#include <sys/module.h>
109#include <sys/buf.h>
110#include <sys/event.h>
111#include <sys/compat_stub.h>
112
113#include <miscfs/genfs/genfs.h>
114#include <miscfs/specfs/specdev.h>
115
116#include <nfs/rpcv2.h>
117#include <nfs/nfsproto.h>
118#include <nfs/nfs.h>
119#include <nfs/nfs_var.h>
120
121/* XXX this shouldn't be here */
122#ifndef OFF_T_MAX
123#define OFF_T_MAX __type_max(off_t)
124#endif
125
126static int change_flags(struct vnode *, u_long, struct lwp *);
127static int change_mode(struct vnode *, int, struct lwp *);
128static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131    enum uio_seg);
132static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134    enum uio_seg);
135static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136    enum uio_seg, int);
137static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138    size_t, register_t *);
139static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140
141static int fd_nameiat(struct lwp *, int, struct nameidata *);
142static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143    namei_simple_flags_t, struct vnode **);
144
145/*
146 * This table is used to maintain compatibility with 4.3BSD
147 * and NetBSD 0.9 mount syscalls - and possibly other systems.
148 * Note, the order is important!
149 *
150 * Do not modify this table. It should only contain filesystems
151 * supported by NetBSD 0.9 and 4.3BSD.
152 */
153const char * const mountcompatnames[] = {
154	NULL,		/* 0 = MOUNT_NONE */
155	MOUNT_FFS,	/* 1 = MOUNT_UFS */
156	MOUNT_NFS,	/* 2 */
157	MOUNT_MFS,	/* 3 */
158	MOUNT_MSDOS,	/* 4 */
159	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
160	MOUNT_FDESC,	/* 6 */
161	MOUNT_KERNFS,	/* 7 */
162	NULL,		/* 8 = MOUNT_DEVFS */
163	MOUNT_AFS,	/* 9 */
164};
165
166const u_int nmountcompatnames = __arraycount(mountcompatnames);
167
168/*
169 * Filter event method for EVFILT_FS.
170 */
171static struct klist fs_klist;
172static kmutex_t fs_klist_lock;
173
174CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
175CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
176
177void
178vfs_evfilt_fs_init(void)
179{
180	klist_init(&fs_klist);
181	mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
182}
183
184static int
185filt_fsattach(struct knote *kn)
186{
187	mutex_enter(&fs_klist_lock);
188	kn->kn_flags |= EV_CLEAR;
189	klist_insert(&fs_klist, kn);
190	mutex_exit(&fs_klist_lock);
191
192	return 0;
193}
194
195static void
196filt_fsdetach(struct knote *kn)
197{
198	mutex_enter(&fs_klist_lock);
199	klist_remove(&fs_klist, kn);
200	mutex_exit(&fs_klist_lock);
201}
202
203static int
204filt_fs(struct knote *kn, long hint)
205{
206	int rv;
207
208	if (hint & NOTE_SUBMIT) {
209		KASSERT(mutex_owned(&fs_klist_lock));
210		kn->kn_fflags |= hint & ~NOTE_SUBMIT;
211	} else {
212		mutex_enter(&fs_klist_lock);
213	}
214
215	rv = (kn->kn_fflags != 0);
216
217	if ((hint & NOTE_SUBMIT) == 0) {
218		mutex_exit(&fs_klist_lock);
219	}
220
221	return rv;
222}
223
224/* referenced in kern_event.c */
225const struct filterops fs_filtops = {
226	.f_flags = FILTEROP_MPSAFE,
227	.f_attach = filt_fsattach,
228	.f_detach = filt_fsdetach,
229	.f_event = filt_fs,
230};
231
232static int
233fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
234{
235	file_t *dfp;
236	int error;
237
238	if (fdat != AT_FDCWD) {
239		if ((error = fd_getvnode(fdat, &dfp)) != 0)
240			goto out;
241
242		NDAT(ndp, dfp->f_vnode);
243	}
244
245	error = namei(ndp);
246
247	if (fdat != AT_FDCWD)
248		fd_putfile(fdat);
249out:
250	return error;
251}
252
253static int
254fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
255    namei_simple_flags_t sflags, struct vnode **vp_ret)
256{
257	file_t *dfp;
258	struct vnode *dvp;
259	int error;
260
261	if (fdat != AT_FDCWD) {
262		if ((error = fd_getvnode(fdat, &dfp)) != 0)
263			goto out;
264
265		dvp = dfp->f_vnode;
266	} else {
267		dvp = NULL;
268	}
269
270	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
271
272	if (fdat != AT_FDCWD)
273		fd_putfile(fdat);
274out:
275	return error;
276}
277
278static int
279open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
280{
281	int error;
282
283	fp->f_flag = flags & FMASK;
284	fp->f_type = DTYPE_VNODE;
285	fp->f_ops = &vnops;
286	fp->f_vnode = vp;
287
288	if (flags & (O_EXLOCK | O_SHLOCK)) {
289		struct flock lf;
290		int type;
291
292		lf.l_whence = SEEK_SET;
293		lf.l_start = 0;
294		lf.l_len = 0;
295		if (flags & O_EXLOCK)
296			lf.l_type = F_WRLCK;
297		else
298			lf.l_type = F_RDLCK;
299		type = F_FLOCK;
300		if ((flags & FNONBLOCK) == 0)
301			type |= F_WAIT;
302		VOP_UNLOCK(vp);
303		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
304		if (error) {
305			(void) vn_close(vp, fp->f_flag, fp->f_cred);
306			fd_abort(l->l_proc, fp, indx);
307			return error;
308		}
309		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
310		atomic_or_uint(&fp->f_flag, FHASLOCK);
311	}
312	if (flags & O_CLOEXEC)
313		fd_set_exclose(l, indx, true);
314	return 0;
315}
316
317static int
318mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
319    void *data, size_t *data_len)
320{
321	struct mount *mp;
322	int error = 0, saved_flags;
323
324	mp = vp->v_mount;
325	saved_flags = mp->mnt_flag;
326
327	/* We can operate only on VV_ROOT nodes. */
328	if ((vp->v_vflag & VV_ROOT) == 0) {
329		error = EINVAL;
330		goto out;
331	}
332
333	/*
334	 * We only allow the filesystem to be reloaded if it
335	 * is currently mounted read-only.  Additionally, we
336	 * prevent read-write to read-only downgrades.
337	 */
338	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
339	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
340	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
341		error = EOPNOTSUPP;	/* Needs translation */
342		goto out;
343	}
344
345	/*
346	 * Enabling MNT_UNION requires a covered mountpoint and
347	 * must not happen on the root mount.
348	 */
349	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
350		error = EOPNOTSUPP;
351		goto out;
352	}
353
354	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
355	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
356	if (error)
357		goto out;
358
359	error = vfs_suspend(mp, 0);
360	if (error)
361		goto out;
362
363	mutex_enter(mp->mnt_updating);
364
365	mp->mnt_flag &= ~MNT_OP_FLAGS;
366	mp->mnt_flag |= flags & MNT_OP_FLAGS;
367
368	/*
369	 * Set the mount level flags.
370	 */
371	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
372		if ((flags & MNT_RDONLY))
373			mp->mnt_iflag |= IMNT_WANTRDONLY;
374		else
375			mp->mnt_iflag |= IMNT_WANTRDWR;
376	}
377	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
378	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
379	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
380		mp->mnt_flag &= ~MNT_RDONLY;
381
382	error = VFS_MOUNT(mp, path, data, data_len);
383
384	if (error && data != NULL) {
385		int error2;
386
387		/*
388		 * Update failed; let's try and see if it was an
389		 * export request.  For compat with 3.0 and earlier.
390		 */
391		error2 = vfs_hooks_reexport(mp, path, data);
392
393		/*
394		 * Only update error code if the export request was
395		 * understood but some problem occurred while
396		 * processing it.
397		 */
398		if (error2 != EJUSTRETURN)
399			error = error2;
400	}
401
402	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
403		mp->mnt_flag |= MNT_RDONLY;
404	if (error)
405		mp->mnt_flag = saved_flags;
406	mp->mnt_flag &= ~MNT_OP_FLAGS;
407	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
408	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
409		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
410			vfs_syncer_add_to_worklist(mp);
411	} else {
412		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
413			vfs_syncer_remove_from_worklist(mp);
414	}
415	mutex_exit(mp->mnt_updating);
416	vfs_resume(mp);
417
418	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
419	    (flags & MNT_EXTATTR)) {
420		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
421				   NULL, 0, NULL) != 0) {
422			printf("%s: failed to start extattr, error = %d",
423			       mp->mnt_stat.f_mntonname, error);
424			mp->mnt_flag &= ~MNT_EXTATTR;
425		}
426	}
427
428	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
429	    !(flags & MNT_EXTATTR)) {
430		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
431				   NULL, 0, NULL) != 0) {
432			printf("%s: failed to stop extattr, error = %d",
433			       mp->mnt_stat.f_mntonname, error);
434			mp->mnt_flag |= MNT_RDONLY;
435		}
436	}
437 out:
438	return (error);
439}
440
441static int
442mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
443    struct vfsops **vfsops)
444{
445	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
446	int error;
447
448	if (type_seg == UIO_USERSPACE) {
449		/* Copy file-system type from userspace.  */
450		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
451	} else {
452		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
453		KASSERT(error == 0);
454	}
455
456	if (error) {
457		/*
458		 * Historically, filesystem types were identified by numbers.
459		 * If we get an integer for the filesystem type instead of a
460		 * string, we check to see if it matches one of the historic
461		 * filesystem types.
462		 */
463		u_long fsindex = (u_long)fstype;
464		if (fsindex >= nmountcompatnames ||
465		    mountcompatnames[fsindex] == NULL)
466			return ENODEV;
467		strlcpy(fstypename, mountcompatnames[fsindex],
468		    sizeof(fstypename));
469	}
470
471	/* Accept `ufs' as an alias for `ffs', for compatibility. */
472	if (strcmp(fstypename, "ufs") == 0)
473		fstypename[0] = 'f';
474
475	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
476		return 0;
477
478	/* If we can autoload a vfs module, try again */
479	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
480
481	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
482		return 0;
483
484	return ENODEV;
485}
486
487static int
488mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
489    void *data, size_t *data_len)
490{
491	struct mount *mp;
492	int error;
493
494	/* If MNT_GETARGS is specified, it should be the only flag. */
495	if (flags & ~MNT_GETARGS)
496		return EINVAL;
497
498	mp = vp->v_mount;
499
500	/* XXX: probably some notion of "can see" here if we want isolation. */
501	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
502	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
503	if (error)
504		return error;
505
506	if ((vp->v_vflag & VV_ROOT) == 0)
507		return EINVAL;
508
509	if (vfs_busy(mp))
510		return EPERM;
511
512	mutex_enter(mp->mnt_updating);
513	mp->mnt_flag &= ~MNT_OP_FLAGS;
514	mp->mnt_flag |= MNT_GETARGS;
515	error = VFS_MOUNT(mp, path, data, data_len);
516	mp->mnt_flag &= ~MNT_OP_FLAGS;
517	mutex_exit(mp->mnt_updating);
518
519	vfs_unbusy(mp);
520	return (error);
521}
522
523int
524sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
525{
526	/* {
527		syscallarg(const char *) type;
528		syscallarg(const char *) path;
529		syscallarg(int) flags;
530		syscallarg(void *) data;
531		syscallarg(size_t) data_len;
532	} */
533
534	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
535	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
536	    SCARG(uap, data_len), retval);
537}
538
539int
540do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
541    const char *path, int flags, void *data, enum uio_seg data_seg,
542    size_t data_len, register_t *retval)
543{
544	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
545	struct vnode *vp;
546	void *data_buf = data;
547	bool vfsopsrele = false;
548	size_t alloc_sz = 0;
549	int error;
550
551	/*
552	 * Get vnode to be covered
553	 */
554	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
555	if (error != 0) {
556		vp = NULL;
557		goto done;
558	}
559
560	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
561		vfsops = vp->v_mount->mnt_op;
562	} else {
563		/* 'type' is userspace */
564		error = mount_get_vfsops(type, type_seg, &vfsops);
565		if (error != 0)
566			goto done;
567		vfsopsrele = true;
568	}
569
570	/*
571	 * We allow data to be NULL, even for userspace. Some fs's don't need
572	 * it. The others will handle NULL.
573	 */
574	if (data != NULL && data_seg == UIO_USERSPACE) {
575		if (data_len == 0) {
576			/* No length supplied, use default for filesystem */
577			data_len = vfsops->vfs_min_mount_data;
578
579			/*
580			 * Hopefully a longer buffer won't make copyin() fail.
581			 * For compatibility with 3.0 and earlier.
582			 */
583			if (flags & MNT_UPDATE
584			    && data_len < sizeof (struct mnt_export_args30))
585				data_len = sizeof (struct mnt_export_args30);
586		}
587		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
588			error = EINVAL;
589			goto done;
590		}
591		alloc_sz = data_len;
592		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
593
594		/* NFS needs the buffer even for mnt_getargs .... */
595		error = copyin(data, data_buf, data_len);
596		if (error != 0)
597			goto done;
598	}
599
600	if (flags & MNT_GETARGS) {
601		if (data_len == 0) {
602			error = EINVAL;
603			goto done;
604		}
605		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
606		if (error != 0)
607			goto done;
608		if (data_seg == UIO_USERSPACE)
609			error = copyout(data_buf, data, data_len);
610		*retval = data_len;
611	} else if (flags & MNT_UPDATE) {
612		error = mount_update(l, vp, path, flags, data_buf, &data_len);
613	} else {
614		/* Locking is handled internally in mount_domount(). */
615		KASSERT(vfsopsrele == true);
616		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
617		    &data_len);
618		vfsopsrele = false;
619	}
620	if (!error) {
621		mutex_enter(&fs_klist_lock);
622		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
623		mutex_exit(&fs_klist_lock);
624	}
625
626    done:
627	if (vfsopsrele)
628		vfs_delref(vfsops);
629    	if (vp != NULL) {
630	    	vrele(vp);
631	}
632	if (data_buf != data)
633		kmem_free(data_buf, alloc_sz);
634	return (error);
635}
636
637/*
638 * Unmount a file system.
639 *
640 * Note: unmount takes a path to the vnode mounted on as argument,
641 * not special file (as before).
642 */
643/* ARGSUSED */
644int
645sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
646{
647	/* {
648		syscallarg(const char *) path;
649		syscallarg(int) flags;
650	} */
651	struct vnode *vp;
652	struct mount *mp;
653	int error;
654	struct pathbuf *pb;
655	struct nameidata nd;
656
657	error = pathbuf_copyin(SCARG(uap, path), &pb);
658	if (error) {
659		return error;
660	}
661
662	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
663	if ((error = namei(&nd)) != 0) {
664		pathbuf_destroy(pb);
665		return error;
666	}
667	vp = nd.ni_vp;
668	pathbuf_destroy(pb);
669
670	mp = vp->v_mount;
671	vfs_ref(mp);
672	VOP_UNLOCK(vp);
673
674	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
675	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
676	if (error) {
677		vrele(vp);
678		vfs_rele(mp);
679		return (error);
680	}
681
682	/*
683	 * Don't allow unmounting the root file system.
684	 */
685	if (mp->mnt_flag & MNT_ROOTFS) {
686		vrele(vp);
687		vfs_rele(mp);
688		return (EINVAL);
689	}
690
691	/*
692	 * Must be the root of the filesystem
693	 */
694	if ((vp->v_vflag & VV_ROOT) == 0) {
695		vrele(vp);
696		vfs_rele(mp);
697		return (EINVAL);
698	}
699
700	vrele(vp);
701	error = dounmount(mp, SCARG(uap, flags), l);
702	vfs_rele(mp);
703	if (!error) {
704		mutex_enter(&fs_klist_lock);
705		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
706		mutex_exit(&fs_klist_lock);
707	}
708	return error;
709}
710
711/*
712 * Sync each mounted filesystem.
713 */
#if defined(DEBUG)
/* When non-zero, do_sys_sync() calls vfs_bufstats() after syncing. */
int syncprt = 0;
/* Exposes syncprt under the name "syncprt" through struct ctldebug. */
struct ctldebug debug0 = { "syncprt", &syncprt };
#endif /* DEBUG */
718
719void
720do_sys_sync(struct lwp *l)
721{
722	mount_iterator_t *iter;
723	struct mount *mp;
724	int asyncflag;
725
726	mountlist_iterator_init(&iter);
727	while ((mp = mountlist_iterator_next(iter)) != NULL) {
728		mutex_enter(mp->mnt_updating);
729		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
730			asyncflag = mp->mnt_flag & MNT_ASYNC;
731			mp->mnt_flag &= ~MNT_ASYNC;
732			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
733			if (asyncflag)
734				 mp->mnt_flag |= MNT_ASYNC;
735		}
736		mutex_exit(mp->mnt_updating);
737	}
738	mountlist_iterator_destroy(iter);
739#ifdef DEBUG
740	if (syncprt)
741		vfs_bufstats();
742#endif /* DEBUG */
743}
744
745static bool
746sync_vnode_filter(void *cookie, vnode_t *vp)
747{
748
749	if (vp->v_numoutput > 0) {
750		++*(int *)cookie;
751	}
752	return false;
753}
754
755int
756vfs_syncwait(void)
757{
758	int nbusy, nbusy_prev, iter;
759	struct vnode_iterator *vniter;
760	mount_iterator_t *mpiter;
761	struct mount *mp;
762
763	for (nbusy_prev = 0, iter = 0; iter < 20;) {
764		nbusy = 0;
765		mountlist_iterator_init(&mpiter);
766		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
767			vnode_t *vp __diagused;
768			vfs_vnode_iterator_init(mp, &vniter);
769			vp = vfs_vnode_iterator_next(vniter,
770			    sync_vnode_filter, &nbusy);
771			KASSERT(vp == NULL);
772			vfs_vnode_iterator_destroy(vniter);
773		}
774		mountlist_iterator_destroy(mpiter);
775
776		if (nbusy == 0)
777			break;
778		if (nbusy_prev == 0)
779			nbusy_prev = nbusy;
780		printf("%d ", nbusy);
781		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
782		if (nbusy >= nbusy_prev) /* we didn't flush anything */
783			iter++;
784		else
785			nbusy_prev = nbusy;
786	}
787
788	if (nbusy) {
789#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
790		printf("giving up\nPrinting vnodes for busy buffers\n");
791		mountlist_iterator_init(&mpiter);
792		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
793			vnode_t *vp;
794			vfs_vnode_iterator_init(mp, &vniter);
795			vp = vfs_vnode_iterator_next(vniter,
796			    NULL, NULL);
797			mutex_enter(vp->v_interlock);
798			if (vp->v_numoutput > 0)
799				vprint(NULL, vp);
800			mutex_exit(vp->v_interlock);
801			vrele(vp);
802			vfs_vnode_iterator_destroy(vniter);
803		}
804		mountlist_iterator_destroy(mpiter);
805#endif
806	}
807
808	return nbusy;
809}
810
811/* ARGSUSED */
812int
813sys_sync(struct lwp *l, const void *v, register_t *retval)
814{
815	do_sys_sync(l);
816	return (0);
817}
818
819
820/*
821 * Access or change filesystem quotas.
822 *
823 * (this is really 14 different calls bundled into one)
824 */
825
826static int
827do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
828{
829	struct quotastat info_k;
830	int error;
831
832	/* ensure any padding bytes are cleared */
833	memset(&info_k, 0, sizeof(info_k));
834
835	error = vfs_quotactl_stat(mp, &info_k);
836	if (error) {
837		return error;
838	}
839
840	return copyout(&info_k, info_u, sizeof(info_k));
841}
842
843static int
844do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
845    struct quotaidtypestat *info_u)
846{
847	struct quotaidtypestat info_k;
848	int error;
849
850	/* ensure any padding bytes are cleared */
851	memset(&info_k, 0, sizeof(info_k));
852
853	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
854	if (error) {
855		return error;
856	}
857
858	return copyout(&info_k, info_u, sizeof(info_k));
859}
860
861static int
862do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
863    struct quotaobjtypestat *info_u)
864{
865	struct quotaobjtypestat info_k;
866	int error;
867
868	/* ensure any padding bytes are cleared */
869	memset(&info_k, 0, sizeof(info_k));
870
871	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
872	if (error) {
873		return error;
874	}
875
876	return copyout(&info_k, info_u, sizeof(info_k));
877}
878
879static int
880do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
881    struct quotaval *val_u)
882{
883	struct quotakey key_k;
884	struct quotaval val_k;
885	int error;
886
887	/* ensure any padding bytes are cleared */
888	memset(&val_k, 0, sizeof(val_k));
889
890	error = copyin(key_u, &key_k, sizeof(key_k));
891	if (error) {
892		return error;
893	}
894
895	error = vfs_quotactl_get(mp, &key_k, &val_k);
896	if (error) {
897		return error;
898	}
899
900	return copyout(&val_k, val_u, sizeof(val_k));
901}
902
903static int
904do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
905    const struct quotaval *val_u)
906{
907	struct quotakey key_k;
908	struct quotaval val_k;
909	int error;
910
911	error = copyin(key_u, &key_k, sizeof(key_k));
912	if (error) {
913		return error;
914	}
915
916	error = copyin(val_u, &val_k, sizeof(val_k));
917	if (error) {
918		return error;
919	}
920
921	return vfs_quotactl_put(mp, &key_k, &val_k);
922}
923
924static int
925do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
926{
927	struct quotakey key_k;
928	int error;
929
930	error = copyin(key_u, &key_k, sizeof(key_k));
931	if (error) {
932		return error;
933	}
934
935	return vfs_quotactl_del(mp, &key_k);
936}
937
938static int
939do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
940{
941	struct quotakcursor cursor_k;
942	int error;
943
944	/* ensure any padding bytes are cleared */
945	memset(&cursor_k, 0, sizeof(cursor_k));
946
947	error = vfs_quotactl_cursoropen(mp, &cursor_k);
948	if (error) {
949		return error;
950	}
951
952	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
953}
954
955static int
956do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
957{
958	struct quotakcursor cursor_k;
959	int error;
960
961	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
962	if (error) {
963		return error;
964	}
965
966	return vfs_quotactl_cursorclose(mp, &cursor_k);
967}
968
969static int
970do_sys_quotactl_cursorskipidtype(struct mount *mp,
971    struct quotakcursor *cursor_u, int idtype)
972{
973	struct quotakcursor cursor_k;
974	int error;
975
976	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
977	if (error) {
978		return error;
979	}
980
981	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
982	if (error) {
983		return error;
984	}
985
986	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
987}
988
989static int
990do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
991    struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
992    unsigned *ret_u)
993{
994#define CGET_STACK_MAX 8
995	struct quotakcursor cursor_k;
996	struct quotakey stackkeys[CGET_STACK_MAX];
997	struct quotaval stackvals[CGET_STACK_MAX];
998	struct quotakey *keys_k;
999	struct quotaval *vals_k;
1000	unsigned ret_k;
1001	int error;
1002
1003	if (maxnum > 128) {
1004		maxnum = 128;
1005	}
1006
1007	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1008	if (error) {
1009		return error;
1010	}
1011
1012	if (maxnum <= CGET_STACK_MAX) {
1013		keys_k = stackkeys;
1014		vals_k = stackvals;
1015		/* ensure any padding bytes are cleared */
1016		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1017		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1018	} else {
1019		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1020		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1021	}
1022
1023	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1024				       &ret_k);
1025	if (error) {
1026		goto fail;
1027	}
1028
1029	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1030	if (error) {
1031		goto fail;
1032	}
1033
1034	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1035	if (error) {
1036		goto fail;
1037	}
1038
1039	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1040	if (error) {
1041		goto fail;
1042	}
1043
1044	/* do last to maximize the chance of being able to recover a failure */
1045	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1046
1047fail:
1048	if (keys_k != stackkeys) {
1049		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1050	}
1051	if (vals_k != stackvals) {
1052		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1053	}
1054	return error;
1055}
1056
1057static int
1058do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1059    int *ret_u)
1060{
1061	struct quotakcursor cursor_k;
1062	int ret_k;
1063	int error;
1064
1065	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1066	if (error) {
1067		return error;
1068	}
1069
1070	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1071	if (error) {
1072		return error;
1073	}
1074
1075	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1076	if (error) {
1077		return error;
1078	}
1079
1080	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1081}
1082
1083static int
1084do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1085{
1086	struct quotakcursor cursor_k;
1087	int error;
1088
1089	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1090	if (error) {
1091		return error;
1092	}
1093
1094	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1095	if (error) {
1096		return error;
1097	}
1098
1099	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1100}
1101
1102static int
1103do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1104{
1105	char *path_k;
1106	int error;
1107
1108	/* XXX this should probably be a struct pathbuf */
1109	path_k = PNBUF_GET();
1110	error = copyin(path_u, path_k, PATH_MAX);
1111	if (error) {
1112		PNBUF_PUT(path_k);
1113		return error;
1114	}
1115
1116	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1117
1118	PNBUF_PUT(path_k);
1119	return error;
1120}
1121
/*
 * QUOTACTL_QUOTAOFF: turn quotas off for id type idtype on mp.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int error;

	error = vfs_quotactl_quotaoff(mp, idtype);
	return error;
}
1127
1128int
1129do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1130{
1131	struct mount *mp;
1132	struct vnode *vp;
1133	int error;
1134
1135	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1136	if (error != 0)
1137		return (error);
1138	mp = vp->v_mount;
1139
1140	switch (args->qc_op) {
1141	    case QUOTACTL_STAT:
1142		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1143		break;
1144	    case QUOTACTL_IDTYPESTAT:
1145		error = do_sys_quotactl_idtypestat(mp,
1146				args->u.idtypestat.qc_idtype,
1147				args->u.idtypestat.qc_info);
1148		break;
1149	    case QUOTACTL_OBJTYPESTAT:
1150		error = do_sys_quotactl_objtypestat(mp,
1151				args->u.objtypestat.qc_objtype,
1152				args->u.objtypestat.qc_info);
1153		break;
1154	    case QUOTACTL_GET:
1155		error = do_sys_quotactl_get(mp,
1156				args->u.get.qc_key,
1157				args->u.get.qc_val);
1158		break;
1159	    case QUOTACTL_PUT:
1160		error = do_sys_quotactl_put(mp,
1161				args->u.put.qc_key,
1162				args->u.put.qc_val);
1163		break;
1164	    case QUOTACTL_DEL:
1165		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1166		break;
1167	    case QUOTACTL_CURSOROPEN:
1168		error = do_sys_quotactl_cursoropen(mp,
1169				args->u.cursoropen.qc_cursor);
1170		break;
1171	    case QUOTACTL_CURSORCLOSE:
1172		error = do_sys_quotactl_cursorclose(mp,
1173				args->u.cursorclose.qc_cursor);
1174		break;
1175	    case QUOTACTL_CURSORSKIPIDTYPE:
1176		error = do_sys_quotactl_cursorskipidtype(mp,
1177				args->u.cursorskipidtype.qc_cursor,
1178				args->u.cursorskipidtype.qc_idtype);
1179		break;
1180	    case QUOTACTL_CURSORGET:
1181		error = do_sys_quotactl_cursorget(mp,
1182				args->u.cursorget.qc_cursor,
1183				args->u.cursorget.qc_keys,
1184				args->u.cursorget.qc_vals,
1185				args->u.cursorget.qc_maxnum,
1186				args->u.cursorget.qc_ret);
1187		break;
1188	    case QUOTACTL_CURSORATEND:
1189		error = do_sys_quotactl_cursoratend(mp,
1190				args->u.cursoratend.qc_cursor,
1191				args->u.cursoratend.qc_ret);
1192		break;
1193	    case QUOTACTL_CURSORREWIND:
1194		error = do_sys_quotactl_cursorrewind(mp,
1195				args->u.cursorrewind.qc_cursor);
1196		break;
1197	    case QUOTACTL_QUOTAON:
1198		error = do_sys_quotactl_quotaon(mp,
1199				args->u.quotaon.qc_idtype,
1200				args->u.quotaon.qc_quotafile);
1201		break;
1202	    case QUOTACTL_QUOTAOFF:
1203		error = do_sys_quotactl_quotaoff(mp,
1204				args->u.quotaoff.qc_idtype);
1205		break;
1206	    default:
1207		error = EINVAL;
1208		break;
1209	}
1210
1211	vrele(vp);
1212	return error;
1213}
1214
1215/* ARGSUSED */
1216int
1217sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1218    register_t *retval)
1219{
1220	/* {
1221		syscallarg(const char *) path;
1222		syscallarg(struct quotactl_args *) args;
1223	} */
1224	struct quotactl_args args;
1225	int error;
1226
1227	error = copyin(SCARG(uap, args), &args, sizeof(args));
1228	if (error) {
1229		return error;
1230	}
1231
1232	return do_sys_quotactl(SCARG(uap, path), &args);
1233}
1234
1235int
1236dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1237    int root)
1238{
1239	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1240	bool chrooted;
1241	int error = 0;
1242
1243	KASSERT(l == curlwp);
1244
1245	/*
1246	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1247	 * since it would imply chroots can be escaped.  Just make sure this
1248	 * routine is self-consistent.
1249	 */
1250	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1251
1252	/*
1253	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1254	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1255	 * overrides MNT_NOWAIT.
1256	 */
1257	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1258	    (flags != MNT_WAIT && flags != 0)) {
1259		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1260	} else {
1261		/* Get the filesystem stats now */
1262		memset(sp, 0, sizeof(*sp));
1263		if ((error = VFS_STATVFS(mp, sp)) != 0)
1264			return error;
1265		if (!chrooted)
1266			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1267	}
1268
1269	if (chrooted) {
1270		size_t len;
1271		char *bp;
1272		char c;
1273		char *path = PNBUF_GET();
1274
1275		bp = path + MAXPATHLEN;
1276		*--bp = '\0';
1277		rw_enter(&cwdi->cwdi_lock, RW_READER);
1278		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1279		    MAXPATHLEN / 2, 0, l);
1280		rw_exit(&cwdi->cwdi_lock);
1281		if (error) {
1282			PNBUF_PUT(path);
1283			return error;
1284		}
1285		len = strlen(bp);
1286		if (len != 1) {
1287			/*
1288			 * for mount points that are below our root, we can see
1289			 * them, so we fix up the pathname and return them. The
1290			 * rest we cannot see, so we don't allow viewing the
1291			 * data.
1292			 */
1293			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1294			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1295				(void)strlcpy(sp->f_mntonname,
1296				    c == '\0' ? "/" : &sp->f_mntonname[len],
1297				    sizeof(sp->f_mntonname));
1298			} else {
1299				if (root)
1300					(void)strlcpy(sp->f_mntonname, "/",
1301					    sizeof(sp->f_mntonname));
1302				else
1303					error = EPERM;
1304			}
1305		}
1306		PNBUF_PUT(path);
1307	}
1308	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1309	return error;
1310}
1311
1312/*
1313 * Get filesystem statistics by path.
1314 */
1315int
1316do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1317{
1318	struct mount *mp;
1319	int error;
1320	struct vnode *vp;
1321
1322	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1323	if (error != 0)
1324		return error;
1325	mp = vp->v_mount;
1326	error = dostatvfs(mp, sb, l, flags, 1);
1327	vrele(vp);
1328	return error;
1329}
1330
1331/* ARGSUSED */
1332int
1333sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1334{
1335	/* {
1336		syscallarg(const char *) path;
1337		syscallarg(struct statvfs *) buf;
1338		syscallarg(int) flags;
1339	} */
1340	struct statvfs *sb;
1341	int error;
1342
1343	sb = STATVFSBUF_GET();
1344	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1345	if (error == 0)
1346		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1347	STATVFSBUF_PUT(sb);
1348	return error;
1349}
1350
1351/*
1352 * Get filesystem statistics by fd.
1353 */
1354int
1355do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1356{
1357	file_t *fp;
1358	struct mount *mp;
1359	int error;
1360
1361	/* fd_getvnode() will use the descriptor for us */
1362	if ((error = fd_getvnode(fd, &fp)) != 0)
1363		return (error);
1364	mp = fp->f_vnode->v_mount;
1365	error = dostatvfs(mp, sb, curlwp, flags, 1);
1366	fd_putfile(fd);
1367	return error;
1368}
1369
1370/* ARGSUSED */
1371int
1372sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1373{
1374	/* {
1375		syscallarg(int) fd;
1376		syscallarg(struct statvfs *) buf;
1377		syscallarg(int) flags;
1378	} */
1379	struct statvfs *sb;
1380	int error;
1381
1382	sb = STATVFSBUF_GET();
1383	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1384	if (error == 0)
1385		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1386	STATVFSBUF_PUT(sb);
1387	return error;
1388}
1389
1390
1391/*
1392 * Get statistics on all filesystems.
1393 */
1394int
1395do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1396    int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1397    register_t *retval)
1398{
1399	int root = 0;
1400	mount_iterator_t *iter;
1401	struct proc *p = l->l_proc;
1402	struct mount *mp;
1403	struct statvfs *sb;
1404	size_t count, maxcount;
1405	int error = 0;
1406
1407	sb = STATVFSBUF_GET();
1408	maxcount = bufsize / entry_sz;
1409	count = 0;
1410	mountlist_iterator_init(&iter);
1411	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1412		if (sfsp && count < maxcount) {
1413			error = dostatvfs(mp, sb, l, flags, 0);
1414			if (error) {
1415				error = 0;
1416				continue;
1417			}
1418			error = copyfn(sb, sfsp, entry_sz);
1419			if (error)
1420				goto out;
1421			sfsp = (char *)sfsp + entry_sz;
1422			root |= strcmp(sb->f_mntonname, "/") == 0;
1423		}
1424		count++;
1425	}
1426
1427	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1428		/*
1429		 * fake a root entry
1430		 */
1431		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1432		    sb, l, flags, 1);
1433		if (error != 0)
1434			goto out;
1435		if (sfsp) {
1436			error = copyfn(sb, sfsp, entry_sz);
1437			if (error != 0)
1438				goto out;
1439		}
1440		count++;
1441	}
1442	if (sfsp && count > maxcount)
1443		*retval = maxcount;
1444	else
1445		*retval = count;
1446out:
1447	mountlist_iterator_destroy(iter);
1448	STATVFSBUF_PUT(sb);
1449	return error;
1450}
1451
1452int
1453sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1454    register_t *retval)
1455{
1456	/* {
1457		syscallarg(struct statvfs *) buf;
1458		syscallarg(size_t) bufsize;
1459		syscallarg(int) flags;
1460	} */
1461
1462	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1463	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1464}
1465
/*
 * Change current working directory to a given file descriptor.
 *
 * The vnode behind fd must be a directory on which l's credentials pass
 * the VOP_ACCESS(VEXEC) check (ENOTDIR or the access error otherwise).
 * While a file system is mounted on the vnode, the walk moves on to the
 * root vnode of that mounted file system.  If the process has a root
 * directory set, the final vnode must satisfy vn_isunder(); otherwise
 * EPERM is returned and the working directory is left alone.
 *
 * Returns 0 on success or an errno value.  retval is not used here.
 */
int
do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
{
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi;
	struct vnode *vp, *tdp;
	struct mount *mp;
	file_t *fp;
	int error;

	/* fd_getvnode() will use the descriptor for us */
	if ((error = fd_getvnode(fd, &fp)) != 0)
		return error;
	vp = fp->f_vnode;

	/* Take our own reference; it ends up in cwdi_cdir on success. */
	vref(vp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	if (vp->v_type != VDIR)
		error = ENOTDIR;
	else
		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
	if (error) {
		vput(vp);
		goto out;
	}
	/*
	 * Step across mount points: swap the covered vnode for the root
	 * of whatever is mounted on it.  The covered vnode is released
	 * with vput() before asking for the root, whether or not
	 * vfs_busy() succeeded.
	 */
	while ((mp = vp->v_mountedhere) != NULL) {
		error = vfs_busy(mp);
		vput(vp);
		if (error != 0)
			goto out;
		error = VFS_ROOT(mp, LK_SHARED, &tdp);
		vfs_unbusy(mp);
		if (error)
			goto out;
		vp = tdp;
	}
	VOP_UNLOCK(vp);

	/*
	 * Disallow changing to a directory not under the process's
	 * current root directory (if there is one).
	 */
	cwdi = p->p_cwdi;
	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
		vrele(vp);
		error = EPERM;	/* operation not permitted */
	} else {
		/* Drop the old working directory and install the new one. */
		vrele(cwdi->cwdi_cdir);
		cwdi->cwdi_cdir = vp;
	}
	rw_exit(&cwdi->cwdi_lock);

out:
	fd_putfile(fd);
	return error;
}
1526
1527/*
1528 * Change current working directory to a given file descriptor.
1529 */
1530/* ARGSUSED */
1531int
1532sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1533{
1534	/* {
1535		syscallarg(int) fd;
1536	} */
1537	return do_sys_fchdir(l, SCARG(uap, fd), retval);
1538}
1539
1540/*
1541 * Change this process's notion of the root directory to a given file
1542 * descriptor.
1543 */
1544int
1545sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1546{
1547	struct vnode	*vp;
1548	file_t	*fp;
1549	int		 error, fd = SCARG(uap, fd);
1550
1551	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1552 	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1553		return error;
1554	/* fd_getvnode() will use the descriptor for us */
1555	if ((error = fd_getvnode(fd, &fp)) != 0)
1556		return error;
1557	vp = fp->f_vnode;
1558	vn_lock(vp, LK_SHARED | LK_RETRY);
1559	if (vp->v_type != VDIR)
1560		error = ENOTDIR;
1561	else
1562		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1563	VOP_UNLOCK(vp);
1564	if (error)
1565		goto out;
1566	vref(vp);
1567	change_root(vp);
1568
1569 out:
1570	fd_putfile(fd);
1571	return (error);
1572}
1573
1574/*
1575 * Change current working directory (``.'').
1576 */
1577int
1578do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
1579    register_t *retval)
1580{
1581	struct proc *p = l->l_proc;
1582	struct cwdinfo * cwdi;
1583	int error;
1584	struct vnode *vp;
1585
1586	if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
1587		return error;
1588	cwdi = p->p_cwdi;
1589	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1590	vrele(cwdi->cwdi_cdir);
1591	cwdi->cwdi_cdir = vp;
1592	rw_exit(&cwdi->cwdi_lock);
1593	return 0;
1594}
1595
1596/*
1597 * Change current working directory (``.'').
1598 */
1599/* ARGSUSED */
1600int
1601sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1602{
1603	/* {
1604		syscallarg(const char *) path;
1605	} */
1606	return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
1607}
1608
1609/*
1610 * Change notion of root (``/'') directory.
1611 */
1612/* ARGSUSED */
1613int
1614sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1615{
1616	/* {
1617		syscallarg(const char *) path;
1618	} */
1619	int error;
1620	struct vnode *vp;
1621
1622	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1623	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1624		return (error);
1625
1626	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1627	if (error == 0)
1628		change_root(vp);
1629	return error;
1630}
1631
1632/*
1633 * Common routine for chroot and fchroot.
1634 * NB: callers need to properly authorize the change root operation.
1635 */
1636void
1637change_root(struct vnode *vp)
1638{
1639	kauth_cred_t ncred;
1640	struct lwp *l = curlwp;
1641	struct proc *p = l->l_proc;
1642	struct cwdinfo *cwdi = p->p_cwdi;
1643
1644	ncred = kauth_cred_alloc();
1645
1646	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1647	if (cwdi->cwdi_rdir != NULL)
1648		vrele(cwdi->cwdi_rdir);
1649	cwdi->cwdi_rdir = vp;
1650
1651	/*
1652	 * Prevent escaping from chroot by putting the root under
1653	 * the working directory.  Silently chdir to / if we aren't
1654	 * already there.
1655	 */
1656	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1657		/*
1658		 * XXX would be more failsafe to change directory to a
1659		 * deadfs node here instead
1660		 */
1661		vrele(cwdi->cwdi_cdir);
1662		vref(vp);
1663		cwdi->cwdi_cdir = vp;
1664	}
1665	rw_exit(&cwdi->cwdi_lock);
1666
1667	/* Get a write lock on the process credential. */
1668	proc_crmod_enter();
1669
1670	kauth_cred_clone(p->p_cred, ncred);
1671	kauth_proc_chroot(ncred, p->p_cwdi);
1672
1673	/* Broadcast our credentials to the process and other LWPs. */
1674 	proc_crmod_leave(ncred, p->p_cred, true);
1675}
1676
1677/*
1678 * Common routine for chroot and chdir.
1679 * XXX "where" should be enum uio_seg
1680 */
1681int
1682chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1683{
1684	struct pathbuf *pb;
1685	struct nameidata nd;
1686	int error;
1687
1688	error = pathbuf_maybe_copyin(path, where, &pb);
1689	if (error) {
1690		return error;
1691	}
1692	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1693	if ((error = namei(&nd)) != 0) {
1694		pathbuf_destroy(pb);
1695		return error;
1696	}
1697	*vpp = nd.ni_vp;
1698	pathbuf_destroy(pb);
1699
1700	if ((*vpp)->v_type != VDIR)
1701		error = ENOTDIR;
1702	else
1703		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1704
1705	if (error)
1706		vput(*vpp);
1707	else
1708		VOP_UNLOCK(*vpp);
1709	return (error);
1710}
1711
1712/*
1713 * Internals of sys_open - path has already been converted into a pathbuf
1714 * (so we can easily reuse this function from other parts of the kernel,
1715 * like posix_spawn post-processing).
1716 */
1717int
1718do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1719	int open_mode, int *fd)
1720{
1721	struct proc *p = l->l_proc;
1722	struct cwdinfo *cwdi = p->p_cwdi;
1723	file_t *fp;
1724	struct vnode *vp;
1725	int dupfd;
1726	bool dupfd_move;
1727	int flags, cmode;
1728	int indx, error;
1729
1730	if (open_flags & O_SEARCH) {
1731		open_flags &= ~(int)O_SEARCH;
1732	}
1733
1734	/*
1735	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1736	 * may be specified.
1737	 */
1738	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1739		return EINVAL;
1740
1741	flags = FFLAGS(open_flags);
1742	if ((flags & (FREAD | FWRITE)) == 0)
1743		return EINVAL;
1744
1745	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1746		return error;
1747	}
1748
1749	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1750	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1751
1752	error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1753	    &vp, &dupfd_move, &dupfd);
1754	if (error != 0) {
1755		fd_abort(p, fp, indx);
1756		return error;
1757	}
1758
1759	if (vp == NULL) {
1760		fd_abort(p, fp, indx);
1761		error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1762		if (error)
1763			return error;
1764		*fd = indx;
1765	} else {
1766		error = open_setfp(l, fp, vp, indx, flags);
1767		if (error)
1768			return error;
1769		VOP_UNLOCK(vp);
1770		*fd = indx;
1771		fd_affix(p, fp, indx);
1772	}
1773
1774	return 0;
1775}
1776
1777int
1778fd_open(const char *path, int open_flags, int open_mode, int *fd)
1779{
1780	struct pathbuf *pb;
1781	int error, oflags;
1782
1783	oflags = FFLAGS(open_flags);
1784	if ((oflags & (FREAD | FWRITE)) == 0)
1785		return EINVAL;
1786
1787	pb = pathbuf_create(path);
1788	if (pb == NULL)
1789		return ENOMEM;
1790
1791	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1792	pathbuf_destroy(pb);
1793
1794	return error;
1795}
1796
1797static int
1798do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1799    int mode, int *fd)
1800{
1801	file_t *dfp = NULL;
1802	struct vnode *dvp = NULL;
1803	struct pathbuf *pb;
1804	const char *pathstring = NULL;
1805	int error;
1806
1807	if (path == NULL) {
1808		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1809		if (error == ENOSYS)
1810			goto no_compat;
1811		if (error)
1812			return error;
1813	} else {
1814no_compat:
1815		error = pathbuf_copyin(path, &pb);
1816		if (error)
1817			return error;
1818	}
1819
1820	pathstring = pathbuf_stringcopy_get(pb);
1821
1822	/*
1823	 * fdat is ignored if:
1824	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1825	 * 2) if path is absolute, then fdat is useless.
1826	 */
1827	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1828		/* fd_getvnode() will use the descriptor for us */
1829		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1830			goto out;
1831
1832		dvp = dfp->f_vnode;
1833	}
1834
1835	error = do_open(l, dvp, pb, flags, mode, fd);
1836
1837	if (dfp != NULL)
1838		fd_putfile(fdat);
1839out:
1840	pathbuf_stringcopy_put(pb, pathstring);
1841	pathbuf_destroy(pb);
1842	return error;
1843}
1844
1845int
1846sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1847{
1848	/* {
1849		syscallarg(const char *) path;
1850		syscallarg(int) flags;
1851		syscallarg(int) mode;
1852	} */
1853	int error;
1854	int fd;
1855
1856	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1857			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1858
1859	if (error == 0)
1860		*retval = fd;
1861
1862	return error;
1863}
1864
1865int
1866sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1867{
1868	/* {
1869		syscallarg(int) fd;
1870		syscallarg(const char *) path;
1871		syscallarg(int) oflags;
1872		syscallarg(int) mode;
1873	} */
1874	int error;
1875	int fd;
1876
1877	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1878			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1879
1880	if (error == 0)
1881		*retval = fd;
1882
1883	return error;
1884}
1885
1886static void
1887vfs__fhfree(fhandle_t *fhp)
1888{
1889	size_t fhsize;
1890
1891	fhsize = FHANDLE_SIZE(fhp);
1892	kmem_free(fhp, fhsize);
1893}
1894
1895/*
1896 * vfs_composefh: compose a filehandle.
1897 */
1898
1899int
1900vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1901{
1902	struct mount *mp;
1903	struct fid *fidp;
1904	int error;
1905	size_t needfhsize;
1906	size_t fidsize;
1907
1908	mp = vp->v_mount;
1909	fidp = NULL;
1910	if (*fh_size < FHANDLE_SIZE_MIN) {
1911		fidsize = 0;
1912	} else {
1913		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1914		if (fhp != NULL) {
1915			memset(fhp, 0, *fh_size);
1916			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1917			fidp = &fhp->fh_fid;
1918		}
1919	}
1920	error = VFS_VPTOFH(vp, fidp, &fidsize);
1921	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1922	if (error == 0 && *fh_size < needfhsize) {
1923		error = E2BIG;
1924	}
1925	*fh_size = needfhsize;
1926	return error;
1927}
1928
1929int
1930vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1931{
1932	struct mount *mp;
1933	fhandle_t *fhp;
1934	size_t fhsize;
1935	size_t fidsize;
1936	int error;
1937
1938	mp = vp->v_mount;
1939	fidsize = 0;
1940	error = VFS_VPTOFH(vp, NULL, &fidsize);
1941	KASSERT(error != 0);
1942	if (error != E2BIG) {
1943		goto out;
1944	}
1945	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1946	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1947	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1948	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1949	if (error == 0) {
1950		KASSERT(FHANDLE_SIZE(fhp) == fhsize);
1951		KASSERT(FHANDLE_FILEID(fhp)->fid_len == fidsize);
1952		*fhpp = fhp;
1953	} else {
1954		kmem_free(fhp, fhsize);
1955	}
1956out:
1957	return error;
1958}
1959
1960void
1961vfs_composefh_free(fhandle_t *fhp)
1962{
1963
1964	vfs__fhfree(fhp);
1965}
1966
1967/*
1968 * vfs_fhtovp: lookup a vnode by a filehandle.
1969 */
1970
1971int
1972vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1973{
1974	struct mount *mp;
1975	int error;
1976
1977	*vpp = NULL;
1978	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1979	if (mp == NULL) {
1980		error = ESTALE;
1981		goto out;
1982	}
1983	if (mp->mnt_op->vfs_fhtovp == NULL) {
1984		error = EOPNOTSUPP;
1985		goto out;
1986	}
1987	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1988out:
1989	return error;
1990}
1991
1992/*
1993 * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1994 * the needed size.
1995 */
1996
1997int
1998vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1999{
2000	fhandle_t *fhp;
2001	int error;
2002
2003	if (fhsize > FHANDLE_SIZE_MAX) {
2004		return EINVAL;
2005	}
2006	if (fhsize < FHANDLE_SIZE_MIN) {
2007		return EINVAL;
2008	}
2009again:
2010	fhp = kmem_alloc(fhsize, KM_SLEEP);
2011	error = copyin(ufhp, fhp, fhsize);
2012	if (error == 0) {
2013		/* XXX this check shouldn't be here */
2014		if (FHANDLE_SIZE(fhp) == fhsize) {
2015			*fhpp = fhp;
2016			return 0;
2017		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
2018			/*
2019			 * a kludge for nfsv2 padded handles.
2020			 */
2021			size_t sz;
2022
2023			sz = FHANDLE_SIZE(fhp);
2024			kmem_free(fhp, fhsize);
2025			fhsize = sz;
2026			goto again;
2027		} else {
2028			/*
2029			 * userland told us wrong size.
2030			 */
2031		    	error = EINVAL;
2032		}
2033	}
2034	kmem_free(fhp, fhsize);
2035	return error;
2036}
2037
2038void
2039vfs_copyinfh_free(fhandle_t *fhp)
2040{
2041
2042	vfs__fhfree(fhp);
2043}
2044
2045/*
2046 * Get file handle system call
2047 */
2048int
2049sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
2050{
2051	/* {
2052		syscallarg(char *) fname;
2053		syscallarg(fhandle_t *) fhp;
2054		syscallarg(size_t *) fh_size;
2055	} */
2056	struct vnode *vp;
2057	fhandle_t *fh;
2058	int error;
2059	struct pathbuf *pb;
2060	struct nameidata nd;
2061	size_t sz;
2062	size_t usz;
2063
2064	/*
2065	 * Must be super user
2066	 */
2067	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2068	    0, NULL, NULL, NULL);
2069	if (error)
2070		return (error);
2071
2072	error = pathbuf_copyin(SCARG(uap, fname), &pb);
2073	if (error) {
2074		return error;
2075	}
2076	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2077	error = namei(&nd);
2078	if (error) {
2079		pathbuf_destroy(pb);
2080		return error;
2081	}
2082	vp = nd.ni_vp;
2083	pathbuf_destroy(pb);
2084
2085	error = vfs_composefh_alloc(vp, &fh);
2086	vput(vp);
2087	if (error != 0) {
2088		return error;
2089	}
2090	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2091	if (error != 0) {
2092		goto out;
2093	}
2094	sz = FHANDLE_SIZE(fh);
2095	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2096	if (error != 0) {
2097		goto out;
2098	}
2099	if (usz >= sz) {
2100		error = copyout(fh, SCARG(uap, fhp), sz);
2101	} else {
2102		error = E2BIG;
2103	}
2104out:
2105	vfs_composefh_free(fh);
2106	return (error);
2107}
2108
/*
 * Open a file given a file handle.
 *
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * ufhp/fhsize describe the user-space file handle and oflags are the
 * open(2)-style flags.  Requires the KAUTH_SYSTEM_FILEHANDLE
 * authorization.  EINVAL is returned when the flags request neither
 * reading nor writing or include O_CREAT, and EOPNOTSUPP when the handle
 * names a socket.  On success the new descriptor number is stored in
 * *retval and 0 is returned.
 */

int
dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
    register_t *retval)
{
	file_t *fp;
	struct vnode *vp = NULL;
	kauth_cred_t cred = l->l_cred;
	file_t *nfp;
	int indx, error;
	struct vattr va;
	fhandle_t *fh;
	int flags;
	proc_t *p;

	p = curproc;

	/*
	 * Must be super user
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
	    0, NULL, NULL, NULL)))
		return (error);

	/* O_SEARCH is dropped before the flags are interpreted. */
	if (oflags & O_SEARCH) {
		oflags &= ~(int)O_SEARCH;
	}

	flags = FFLAGS(oflags);
	if ((flags & (FREAD | FWRITE)) == 0)
		return (EINVAL);
	/* A file handle can only name an existing file. */
	if ((flags & O_CREAT))
		return (EINVAL);
	if ((error = fd_allocfile(&nfp, &indx)) != 0)
		return (error);
	fp = nfp;
	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
	if (error != 0) {
		goto bad;
	}
	/* The handle copy is only needed for the vnode lookup. */
	error = vfs_fhtovp(fh, &vp);
	vfs_copyinfh_free(fh);
	if (error != 0) {
		goto bad;
	}

	/* Now do an effective vn_open */

	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	error = vn_openchk(vp, cred, flags);
	if (error != 0)
		goto bad;
	if (flags & O_TRUNC) {
		VOP_UNLOCK(vp);			/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
		/* Truncate by setting only the size attribute to zero. */
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
		goto bad;
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

	/* done with modified vn_open, now finish what sys_open does. */
	/*
	 * NOTE(review): the error return below skips the cleanup at "bad";
	 * presumably open_setfp() cleans up after itself on failure --
	 * confirm against its definition.
	 */
	if ((error = open_setfp(l, fp, vp, indx, flags)))
		return error;

	VOP_UNLOCK(vp);
	*retval = indx;
	fd_affix(p, fp, indx);
	return (0);

bad:
	/* Give back the reserved descriptor and any vnode we hold. */
	fd_abort(p, fp, indx);
	if (vp != NULL)
		vput(vp);
	if (error == EDUPFD || error == EMOVEFD) {
		/* XXX should probably close curlwp->l_dupfd */
		error = EOPNOTSUPP;
	}
	return (error);
}
2206
2207int
2208sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2209{
2210	/* {
2211		syscallarg(const void *) fhp;
2212		syscallarg(size_t) fh_size;
2213		syscallarg(int) flags;
2214	} */
2215
2216	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2217	    SCARG(uap, flags), retval);
2218}
2219
2220int
2221do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2222{
2223	int error;
2224	fhandle_t *fh;
2225	struct vnode *vp;
2226
2227	/*
2228	 * Must be super user
2229	 */
2230	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2231	    0, NULL, NULL, NULL)))
2232		return (error);
2233
2234	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2235	if (error != 0)
2236		return error;
2237
2238	error = vfs_fhtovp(fh, &vp);
2239	vfs_copyinfh_free(fh);
2240	if (error != 0)
2241		return error;
2242
2243	error = vn_stat(vp, sb);
2244	vput(vp);
2245	return error;
2246}
2247
2248
2249/* ARGSUSED */
2250int
2251sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2252{
2253	/* {
2254		syscallarg(const void *) fhp;
2255		syscallarg(size_t) fh_size;
2256		syscallarg(struct stat *) sb;
2257	} */
2258	struct stat sb;
2259	int error;
2260
2261	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2262	if (error)
2263		return error;
2264	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2265}
2266
2267int
2268do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2269    int flags)
2270{
2271	fhandle_t *fh;
2272	struct mount *mp;
2273	struct vnode *vp;
2274	int error;
2275
2276	/*
2277	 * Must be super user
2278	 */
2279	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2280	    0, NULL, NULL, NULL)))
2281		return error;
2282
2283	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2284	if (error != 0)
2285		return error;
2286
2287	error = vfs_fhtovp(fh, &vp);
2288	vfs_copyinfh_free(fh);
2289	if (error != 0)
2290		return error;
2291
2292	mp = vp->v_mount;
2293	error = dostatvfs(mp, sb, l, flags, 1);
2294	vput(vp);
2295	return error;
2296}
2297
2298/* ARGSUSED */
2299int
2300sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2301{
2302	/* {
2303		syscallarg(const void *) fhp;
2304		syscallarg(size_t) fh_size;
2305		syscallarg(struct statvfs *) buf;
2306		syscallarg(int)	flags;
2307	} */
2308	struct statvfs *sb = STATVFSBUF_GET();
2309	int error;
2310
2311	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2312	    SCARG(uap, flags));
2313	if (error == 0)
2314		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2315	STATVFSBUF_PUT(sb);
2316	return error;
2317}
2318
2319int
2320do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2321    dev_t dev)
2322{
2323
2324	/*
2325	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2326	 * in mode and dev=0.
2327	 *
2328	 * In all the other cases it's implementation defined behavior.
2329	 */
2330
2331	if ((mode & S_IFIFO) && dev == 0)
2332		return do_sys_mkfifoat(l, fdat, pathname, mode);
2333	else
2334		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2335		    UIO_USERSPACE);
2336}
2337
2338/*
2339 * Create a special file.
2340 */
2341/* ARGSUSED */
2342int
2343sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2344    register_t *retval)
2345{
2346	/* {
2347		syscallarg(const char *) path;
2348		syscallarg(mode_t) mode;
2349		syscallarg(dev_t) dev;
2350	} */
2351	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2352	    SCARG(uap, mode), SCARG(uap, dev));
2353}
2354
2355int
2356sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2357    register_t *retval)
2358{
2359	/* {
2360		syscallarg(int) fd;
2361		syscallarg(const char *) path;
2362		syscallarg(mode_t) mode;
2363		syscallarg(int) pad;
2364		syscallarg(dev_t) dev;
2365	} */
2366
2367	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2368	    SCARG(uap, mode), SCARG(uap, dev));
2369}
2370
2371int
2372do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2373    enum uio_seg seg)
2374{
2375	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2376}
2377
/*
 * Create a file system node of the type encoded in mode.
 *
 * pathname lives in the address space named by seg and is resolved
 * relative to fdat by fd_nameiat().  The file type bits of mode choose
 * the operation: S_IFCHR, S_IFBLK and S_IFMT (VBAD) go through
 * VOP_MKNOD(), S_IFWHT through VOP_WHITEOUT(), S_IFREG through
 * VOP_CREATE(); any other type is EINVAL.  dev becomes the device number
 * of the new node, except for regular files.
 *
 * Requires the KAUTH_SYSTEM_MKNOD authorization.  Returns 0, EEXIST when
 * the name already exists, or another errno value.
 */
int
do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
    dev_t dev, enum uio_seg seg)
{
	struct proc *p = l->l_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error, optype;
	struct pathbuf *pb;
	struct nameidata nd;
	const char *pathstring;

	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
	    0, NULL, NULL, NULL)) != 0)
		return (error);

	/* Default operation; the type switch below may override it. */
	optype = VOP_MKNOD_DESCOFFSET;

	error = pathbuf_maybe_copyin(pathname, seg, &pb);
	if (error) {
		return error;
	}
	/* The string copy is only consumed by the veriexec check below. */
	pathstring = pathbuf_stringcopy_get(pb);
	if (pathstring == NULL) {
		pathbuf_destroy(pb);
		return ENOMEM;
	}

	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);

	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (vp != NULL)
		error = EEXIST;
	else {
		vattr_null(&vattr);
		/* We will read cwdi->cwdi_cmask unlocked. */
		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
		vattr.va_rdev = dev;

		switch (mode & S_IFMT) {
		case S_IFMT:	/* used by badsect to flag bad sectors */
			vattr.va_type = VBAD;
			break;
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			optype = VOP_WHITEOUT_DESCOFFSET;
			break;
		case S_IFREG:
#if NVERIEXEC > 0
			error = veriexec_openchk(l, nd.ni_vp, pathstring,
			    O_CREAT);
#endif /* NVERIEXEC > 0 */
			vattr.va_type = VREG;
			vattr.va_rdev = VNOVAL;
			optype = VOP_CREATE_DESCOFFSET;
			break;
		default:
			error = EINVAL;
			break;
		}

		/* A node made with VOP_MKNOD() needs a real device number. */
		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
		    vattr.va_rdev == VNOVAL)
			error = EINVAL;
	}

	if (!error) {
		/*
		 * Every case finishes with vput() on the parent directory;
		 * a node returned by VOP_MKNOD() or VOP_CREATE() is let go
		 * with vrele().
		 */
		switch (optype) {
		case VOP_WHITEOUT_DESCOFFSET:
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
			if (error)
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			vput(nd.ni_dvp);
			break;

		case VOP_MKNOD_DESCOFFSET:
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vrele(nd.ni_vp);
			vput(nd.ni_dvp);
			break;

		case VOP_CREATE_DESCOFFSET:
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vrele(nd.ni_vp);
			vput(nd.ni_dvp);
			break;
		}
	} else {
		/*
		 * Nothing will be created: abort the pending operation and
		 * let go of the parent directory and of any existing
		 * target vnode.  When parent and target are the same
		 * vnode the parent is released with vrele() rather than
		 * vput().
		 */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp)
			vrele(vp);
	}
out:
	pathbuf_stringcopy_put(pb, pathstring);
	pathbuf_destroy(pb);
	return (error);
}
2491
2492/*
2493 * Create a named pipe.
2494 */
2495/* ARGSUSED */
2496int
2497sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2498{
2499	/* {
2500		syscallarg(const char *) path;
2501		syscallarg(int) mode;
2502	} */
2503	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2504}
2505
2506int
2507sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2508    register_t *retval)
2509{
2510	/* {
2511		syscallarg(int) fd;
2512		syscallarg(const char *) path;
2513		syscallarg(int) mode;
2514	} */
2515
2516	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2517	    SCARG(uap, mode));
2518}
2519
2520static int
2521do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2522{
2523	struct proc *p = l->l_proc;
2524	struct vattr vattr;
2525	int error;
2526	struct pathbuf *pb;
2527	struct nameidata nd;
2528
2529	error = pathbuf_copyin(path, &pb);
2530	if (error) {
2531		return error;
2532	}
2533	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2534
2535	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2536		pathbuf_destroy(pb);
2537		return error;
2538	}
2539	if (nd.ni_vp != NULL) {
2540		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2541		if (nd.ni_dvp == nd.ni_vp)
2542			vrele(nd.ni_dvp);
2543		else
2544			vput(nd.ni_dvp);
2545		vrele(nd.ni_vp);
2546		pathbuf_destroy(pb);
2547		return (EEXIST);
2548	}
2549	vattr_null(&vattr);
2550	vattr.va_type = VFIFO;
2551	/* We will read cwdi->cwdi_cmask unlocked. */
2552	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2553	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2554	if (error == 0)
2555		vrele(nd.ni_vp);
2556	vput(nd.ni_dvp);
2557	pathbuf_destroy(pb);
2558	return (error);
2559}
2560
2561/*
2562 * Make a hard file link.
2563 */
2564/* ARGSUSED */
2565int
2566do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2567    const char *link, int follow, register_t *retval)
2568{
2569	struct vnode *vp;
2570	struct pathbuf *linkpb;
2571	struct nameidata nd;
2572	namei_simple_flags_t ns_flags;
2573	int error;
2574
2575	if (follow & AT_SYMLINK_FOLLOW)
2576		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2577	else
2578		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2579
2580	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2581	if (error != 0)
2582		return (error);
2583	error = pathbuf_copyin(link, &linkpb);
2584	if (error) {
2585		goto out1;
2586	}
2587	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2588	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2589		goto out2;
2590	if (nd.ni_vp) {
2591		error = EEXIST;
2592		goto abortop;
2593	}
2594	/* Prevent hard links on directories. */
2595	if (vp->v_type == VDIR) {
2596		error = EPERM;
2597		goto abortop;
2598	}
2599	/* Prevent cross-mount operation. */
2600	if (nd.ni_dvp->v_mount != vp->v_mount) {
2601		error = EXDEV;
2602		goto abortop;
2603	}
2604	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2605	VOP_UNLOCK(nd.ni_dvp);
2606	vrele(nd.ni_dvp);
2607out2:
2608	pathbuf_destroy(linkpb);
2609out1:
2610	vrele(vp);
2611	return (error);
2612abortop:
2613	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2614	if (nd.ni_dvp == nd.ni_vp)
2615		vrele(nd.ni_dvp);
2616	else
2617		vput(nd.ni_dvp);
2618	if (nd.ni_vp != NULL)
2619		vrele(nd.ni_vp);
2620	goto out2;
2621}
2622
2623int
2624sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2625{
2626	/* {
2627		syscallarg(const char *) path;
2628		syscallarg(const char *) link;
2629	} */
2630	const char *path = SCARG(uap, path);
2631	const char *link = SCARG(uap, link);
2632
2633	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2634	    AT_SYMLINK_FOLLOW, retval);
2635}
2636
2637int
2638sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2639    register_t *retval)
2640{
2641	/* {
2642		syscallarg(int) fd1;
2643		syscallarg(const char *) name1;
2644		syscallarg(int) fd2;
2645		syscallarg(const char *) name2;
2646		syscallarg(int) flags;
2647	} */
2648	int fd1 = SCARG(uap, fd1);
2649	const char *name1 = SCARG(uap, name1);
2650	int fd2 = SCARG(uap, fd2);
2651	const char *name2 = SCARG(uap, name2);
2652	int follow;
2653
2654	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2655
2656	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2657}
2658
2659
2660int
2661do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2662{
2663	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2664}
2665
/*
 * Common code for symlink creation.
 *
 * `patharg' is the string stored in the new symbolic link (it is what
 * gets passed to VOP_SYMLINK() and traced as "symlink-target");
 * `link' is the name of the link to create, looked up relative to
 * `fdat'.  `seg' selects whether both strings are copied in from user
 * space (UIO_USERSPACE) or taken directly from kernel memory.
 * `l' may be NULL only when fdat is AT_FDCWD (asserted below).
 *
 * Returns 0 on success, EEXIST if `link' already resolves to a vnode,
 * ENOMEM if pathbuf_create() fails, or the error from copyinstr(),
 * pathbuf_copyin(), fd_nameiat() or VOP_SYMLINK().
 */
static int
do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
    const char *link, enum uio_seg seg)
{
	struct proc *p = curproc;
	struct vattr vattr;
	char *path;
	int error;
	size_t len;
	struct pathbuf *linkpb;
	struct nameidata nd;

	KASSERT(l != NULL || fdat == AT_FDCWD);

	/* Scratch buffer for the link target; released at out1. */
	path = PNBUF_GET();
	if (seg == UIO_USERSPACE) {
		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
			goto out1;
		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
			goto out1;
	} else {
		/* Kernel string: len counts the terminating NUL. */
		len = strlen(patharg) + 1;
		KASSERT(len <= MAXPATHLEN);
		memcpy(path, patharg, len);
		linkpb = pathbuf_create(link);
		if (linkpb == NULL) {
			error = ENOMEM;
			goto out1;
		}
	}
	/* Trace the target without its terminating NUL. */
	ktrkuser("symlink-target", path, len - 1);

	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
		goto out2;
	if (nd.ni_vp) {
		/* The link name is already in use: back out. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		error = EEXIST;
		goto out2;
	}
	vattr_null(&vattr);
	vattr.va_type = VLNK;
	/* We will read cwdi->cwdi_cmask unlocked. */
	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
	if (error == 0)
		vrele(nd.ni_vp);
	vput(nd.ni_dvp);
out2:
	pathbuf_destroy(linkpb);
out1:
	PNBUF_PUT(path);
	return (error);
}
2725
2726/*
2727 * Make a symbolic link.
2728 */
2729/* ARGSUSED */
2730int
2731sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2732{
2733	/* {
2734		syscallarg(const char *) path;
2735		syscallarg(const char *) link;
2736	} */
2737
2738	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2739	    UIO_USERSPACE);
2740}
2741
2742int
2743sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2744    register_t *retval)
2745{
2746	/* {
2747		syscallarg(const char *) path1;
2748		syscallarg(int) fd;
2749		syscallarg(const char *) path2;
2750	} */
2751
2752	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2753	    SCARG(uap, path2), UIO_USERSPACE);
2754}
2755
2756/*
2757 * Delete a whiteout from the filesystem.
2758 */
2759/* ARGSUSED */
2760int
2761sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2762{
2763	/* {
2764		syscallarg(const char *) path;
2765	} */
2766	int error;
2767	struct pathbuf *pb;
2768	struct nameidata nd;
2769
2770	error = pathbuf_copyin(SCARG(uap, path), &pb);
2771	if (error) {
2772		return error;
2773	}
2774
2775	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2776	error = namei(&nd);
2777	if (error) {
2778		pathbuf_destroy(pb);
2779		return (error);
2780	}
2781
2782	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2783		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2784		if (nd.ni_dvp == nd.ni_vp)
2785			vrele(nd.ni_dvp);
2786		else
2787			vput(nd.ni_dvp);
2788		if (nd.ni_vp)
2789			vrele(nd.ni_vp);
2790		pathbuf_destroy(pb);
2791		return (EEXIST);
2792	}
2793	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2794		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2795	vput(nd.ni_dvp);
2796	pathbuf_destroy(pb);
2797	return (error);
2798}
2799
2800/*
2801 * Delete a name from the filesystem.
2802 */
2803/* ARGSUSED */
2804int
2805sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2806{
2807	/* {
2808		syscallarg(const char *) path;
2809	} */
2810
2811	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2812}
2813
2814int
2815sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2816    register_t *retval)
2817{
2818	/* {
2819		syscallarg(int) fd;
2820		syscallarg(const char *) path;
2821		syscallarg(int) flag;
2822	} */
2823
2824	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2825	    SCARG(uap, flag), UIO_USERSPACE);
2826}
2827
2828int
2829do_sys_unlink(const char *arg, enum uio_seg seg)
2830{
2831	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2832}
2833
/*
 * Common code for unlink(2), unlinkat(2) and do_sys_unlink().
 *
 * Looks up `arg' relative to `fdat' for deletion and removes it with
 * VOP_RMDIR() (directories, only when AT_REMOVEDIR is set in `flags')
 * or VOP_REMOVE() (everything else).  `seg' says whether `arg' is a
 * user or kernel string.  `l' may be NULL only when fdat is AT_FDCWD
 * (asserted below).
 *
 * Returns 0 or an errno value:
 *	EBUSY	the target is a filesystem root or a directory with
 *		something mounted on it
 *	EINVAL	the target is the same vnode as its parent directory
 *	EPERM	the target is a directory and AT_REMOVEDIR is not set
 *	ENOTDIR	AT_REMOVEDIR is set but the target is not a directory
 *	ENOMEM	the path string copy could not be obtained
 * plus anything reported by the lookup, veriexec or the VOP itself.
 */
static int
do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
    enum uio_seg seg)
{
	struct vnode *vp;
	int error;
	struct pathbuf *pb;
	struct nameidata nd;
	const char *pathstring;

	KASSERT(l != NULL || fdat == AT_FDCWD);

	error = pathbuf_maybe_copyin(arg, seg, &pb);
	if (error) {
		return error;
	}
	/* String form of the path; only consumed by the veriexec check. */
	pathstring = pathbuf_stringcopy_get(pb);
	if (pathstring == NULL) {
		pathbuf_destroy(pb);
		return ENOMEM;
	}

	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if ((vp->v_vflag & VV_ROOT) != 0) {
		error = EBUSY;
		goto abort;
	}

	/* Neither can a directory that is currently a mount point. */
	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto abort;
	}

	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto abort;
	}

	/*
	 * AT_REMOVEDIR is required to remove a directory
	 */
	if (vp->v_type == VDIR) {
		if (!(flags & AT_REMOVEDIR)) {
			error = EPERM;
			goto abort;
		} else {
			/* Only the parent is released here, not vp. */
			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
			vput(nd.ni_dvp);
			goto out;
		}
	}

	/*
	 * Starting here we only deal with non directories.
	 */
	if (flags & AT_REMOVEDIR) {
		error = ENOTDIR;
		goto abort;
	}

#if NVERIEXEC > 0
	/* Handle remove requests for veriexec entries. */
	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
		goto abort;
	}
#endif /* NVERIEXEC > 0 */

#ifdef FILEASSOC
	/* Result deliberately ignored. */
	(void)fileassoc_file_delete(vp);
#endif /* FILEASSOC */
	/* As with VOP_RMDIR() above, only the parent is released here. */
	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vput(nd.ni_dvp);
	goto out;

abort:
	/* No VOP was issued: abort the lookup and drop both vnodes. */
	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	vput(vp);

out:
	pathbuf_stringcopy_put(pb, pathstring);
	pathbuf_destroy(pb);
	return (error);
}
2931
2932/*
2933 * Reposition read/write file offset.
2934 */
2935int
2936sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2937{
2938	/* {
2939		syscallarg(int) fd;
2940		syscallarg(int) pad;
2941		syscallarg(off_t) offset;
2942		syscallarg(int) whence;
2943	} */
2944	file_t *fp;
2945	int error, fd;
2946
2947	switch (SCARG(uap, whence)) {
2948	case SEEK_CUR:
2949	case SEEK_END:
2950	case SEEK_SET:
2951		break;
2952	default:
2953		return EINVAL;
2954	}
2955
2956	fd = SCARG(uap, fd);
2957
2958	if ((fp = fd_getfile(fd)) == NULL)
2959		return (EBADF);
2960
2961	if (fp->f_ops->fo_seek == NULL) {
2962		error = ESPIPE;
2963		goto out;
2964	}
2965
2966	error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
2967	    SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
2968 out:
2969 	fd_putfile(fd);
2970	return (error);
2971}
2972
2973/*
2974 * Positional read system call.
2975 */
2976int
2977sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2978{
2979	/* {
2980		syscallarg(int) fd;
2981		syscallarg(void *) buf;
2982		syscallarg(size_t) nbyte;
2983		syscallarg(off_t) offset;
2984	} */
2985	file_t *fp;
2986	off_t offset;
2987	int error, fd = SCARG(uap, fd);
2988
2989	if ((fp = fd_getfile(fd)) == NULL)
2990		return (EBADF);
2991
2992	if ((fp->f_flag & FREAD) == 0) {
2993		fd_putfile(fd);
2994		return (EBADF);
2995	}
2996
2997	if (fp->f_ops->fo_seek == NULL) {
2998		error = ESPIPE;
2999		goto out;
3000	}
3001
3002	offset = SCARG(uap, offset);
3003	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3004	if (error)
3005		goto out;
3006
3007	/* dofileread() will unuse the descriptor for us */
3008	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3009	    &offset, 0, retval));
3010
3011 out:
3012	fd_putfile(fd);
3013	return (error);
3014}
3015
3016/*
3017 * Positional scatter read system call.
3018 */
3019int
3020sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
3021{
3022	/* {
3023		syscallarg(int) fd;
3024		syscallarg(const struct iovec *) iovp;
3025		syscallarg(int) iovcnt;
3026		syscallarg(off_t) offset;
3027	} */
3028	off_t offset = SCARG(uap, offset);
3029
3030	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3031	    SCARG(uap, iovcnt), &offset, 0, retval);
3032}
3033
3034/*
3035 * Positional write system call.
3036 */
3037int
3038sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
3039{
3040	/* {
3041		syscallarg(int) fd;
3042		syscallarg(const void *) buf;
3043		syscallarg(size_t) nbyte;
3044		syscallarg(off_t) offset;
3045	} */
3046	file_t *fp;
3047	off_t offset;
3048	int error, fd = SCARG(uap, fd);
3049
3050	if ((fp = fd_getfile(fd)) == NULL)
3051		return (EBADF);
3052
3053	if ((fp->f_flag & FWRITE) == 0) {
3054		fd_putfile(fd);
3055		return (EBADF);
3056	}
3057
3058	if (fp->f_ops->fo_seek == NULL) {
3059		error = ESPIPE;
3060		goto out;
3061	}
3062
3063	offset = SCARG(uap, offset);
3064	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3065	if (error)
3066		goto out;
3067
3068	/* dofilewrite() will unuse the descriptor for us */
3069	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3070	    &offset, 0, retval));
3071
3072 out:
3073	fd_putfile(fd);
3074	return (error);
3075}
3076
3077/*
3078 * Positional gather write system call.
3079 */
3080int
3081sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
3082{
3083	/* {
3084		syscallarg(int) fd;
3085		syscallarg(const struct iovec *) iovp;
3086		syscallarg(int) iovcnt;
3087		syscallarg(off_t) offset;
3088	} */
3089	off_t offset = SCARG(uap, offset);
3090
3091	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3092	    SCARG(uap, iovcnt), &offset, 0, retval);
3093}
3094
3095/*
3096 * Check access permissions.
3097 */
3098int
3099sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
3100{
3101	/* {
3102		syscallarg(const char *) path;
3103		syscallarg(int) flags;
3104	} */
3105
3106	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3107	     SCARG(uap, flags), 0);
3108}
3109
/*
 * Common code for access(2) and faccessat(2).
 *
 * `mode' is a combination of R_OK, W_OK and X_OK (0 == F_OK, existence
 * check only); any other bit yields EINVAL.  `flags' is examined for
 * AT_SYMLINK_NOFOLLOW (do not follow a trailing symlink) and
 * AT_EACCESS (use the caller's credentials unchanged).
 *
 * Without AT_EACCESS the check runs with a private copy of the
 * caller's credentials whose effective uid/gid are replaced by the
 * values of kauth_cred_getuid()/kauth_cred_getgid(); that copy is
 * freed again before returning.
 *
 * Returns 0 if the requested access is permitted, otherwise an errno
 * value from the lookup, VOP_ACCESS() or vn_writechk().
 */
int
do_sys_accessat(struct lwp *l, int fdat, const char *path,
    int mode, int flags)
{
	kauth_cred_t cred;
	struct vnode *vp;
	int error, nd_flag, vmode;
	struct pathbuf *pb;
	struct nameidata nd;

	/* The "mode == 0" test below relies on F_OK being zero. */
	CTASSERT(F_OK == 0);
	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
		/* nonsense mode */
		return EINVAL;
	}

	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		nd_flag &= ~FOLLOW;

	error = pathbuf_copyin(path, &pb);
	if (error)
		return error;

	NDINIT(&nd, LOOKUP, nd_flag, pb);

	/* Override default credentials */
	if (!(flags & AT_EACCESS)) {
		cred = kauth_cred_dup(l->l_cred);
		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
	} else
		cred = l->l_cred;
	/* The lookup itself is also performed with these credentials. */
	nd.ni_cnd.cn_cred = cred;

	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
		pathbuf_destroy(pb);
		goto out;
	}
	vp = nd.ni_vp;
	pathbuf_destroy(pb);

	/* mode == 0 (F_OK) means only check for existence. */
	if (mode) {
		/* Translate the user-visible bits into vnode access bits. */
		vmode = 0;
		if (mode & R_OK)
			vmode |= VREAD;
		if (mode & W_OK)
			vmode |= VWRITE;
		if (mode & X_OK)
			vmode |= VEXEC;

		error = VOP_ACCESS(vp, vmode, cred);
		if (!error && (vmode & VWRITE))
			error = vn_writechk(vp);
	}
	vput(vp);
out:
	/* Only the duplicated credential is ours to free. */
	if (!(flags & AT_EACCESS))
		kauth_cred_free(cred);
	return (error);
}
3172
3173int
3174sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3175    register_t *retval)
3176{
3177	/* {
3178		syscallarg(int) fd;
3179		syscallarg(const char *) path;
3180		syscallarg(int) amode;
3181		syscallarg(int) flag;
3182	} */
3183
3184	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3185	     SCARG(uap, amode), SCARG(uap, flag));
3186}
3187
3188/*
3189 * Common code for all sys_stat functions, including compat versions.
3190 */
3191int
3192do_sys_stat(const char *userpath, unsigned int nd_flag,
3193    struct stat *sb)
3194{
3195	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3196}
3197
3198int
3199do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3200    unsigned int nd_flag, struct stat *sb)
3201{
3202	int error;
3203	struct pathbuf *pb;
3204	struct nameidata nd;
3205
3206	KASSERT(l != NULL || fdat == AT_FDCWD);
3207
3208	error = pathbuf_copyin(userpath, &pb);
3209	if (error) {
3210		return error;
3211	}
3212
3213	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3214
3215	error = fd_nameiat(l, fdat, &nd);
3216	if (error != 0) {
3217		pathbuf_destroy(pb);
3218		return error;
3219	}
3220	error = vn_stat(nd.ni_vp, sb);
3221	vput(nd.ni_vp);
3222	pathbuf_destroy(pb);
3223	return error;
3224}
3225
3226/*
3227 * Get file status; this version follows links.
3228 */
3229/* ARGSUSED */
3230int
3231sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3232{
3233	/* {
3234		syscallarg(const char *) path;
3235		syscallarg(struct stat *) ub;
3236	} */
3237	struct stat sb;
3238	int error;
3239
3240	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3241	if (error)
3242		return error;
3243	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3244}
3245
3246/*
3247 * Get file status; this version does not follow links.
3248 */
3249/* ARGSUSED */
3250int
3251sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3252{
3253	/* {
3254		syscallarg(const char *) path;
3255		syscallarg(struct stat *) ub;
3256	} */
3257	struct stat sb;
3258	int error;
3259
3260	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3261	if (error)
3262		return error;
3263	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3264}
3265
3266int
3267sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3268    register_t *retval)
3269{
3270	/* {
3271		syscallarg(int) fd;
3272		syscallarg(const char *) path;
3273		syscallarg(struct stat *) buf;
3274		syscallarg(int) flag;
3275	} */
3276	unsigned int nd_flag;
3277	struct stat sb;
3278	int error;
3279
3280	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3281		nd_flag = NOFOLLOW;
3282	else
3283		nd_flag = FOLLOW;
3284
3285	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3286	    &sb);
3287	if (error)
3288		return error;
3289	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3290}
3291
3292static int
3293kern_pathconf(register_t *retval, const char *path, int name, int flag)
3294{
3295	int error;
3296	struct pathbuf *pb;
3297	struct nameidata nd;
3298
3299	error = pathbuf_copyin(path, &pb);
3300	if (error) {
3301		return error;
3302	}
3303	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3304	if ((error = namei(&nd)) != 0) {
3305		pathbuf_destroy(pb);
3306		return error;
3307	}
3308	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3309	vput(nd.ni_vp);
3310	pathbuf_destroy(pb);
3311	return error;
3312}
3313
3314/*
3315 * Get configurable pathname variables.
3316 */
3317/* ARGSUSED */
3318int
3319sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3320    register_t *retval)
3321{
3322	/* {
3323		syscallarg(const char *) path;
3324		syscallarg(int) name;
3325	} */
3326	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3327	    FOLLOW);
3328}
3329
3330/* ARGSUSED */
3331int
3332sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3333    register_t *retval)
3334{
3335	/* {
3336		syscallarg(const char *) path;
3337		syscallarg(int) name;
3338	} */
3339	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3340	    NOFOLLOW);
3341}
3342
3343/*
3344 * Return target name of a symbolic link.
3345 */
3346/* ARGSUSED */
3347int
3348sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3349    register_t *retval)
3350{
3351	/* {
3352		syscallarg(const char *) path;
3353		syscallarg(char *) buf;
3354		syscallarg(size_t) count;
3355	} */
3356	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3357	    SCARG(uap, buf), SCARG(uap, count), retval);
3358}
3359
3360static int
3361do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3362    size_t count, register_t *retval)
3363{
3364	struct vnode *vp;
3365	struct iovec aiov;
3366	struct uio auio;
3367	int error;
3368	struct pathbuf *pb;
3369	struct nameidata nd;
3370
3371	error = pathbuf_copyin(path, &pb);
3372	if (error) {
3373		return error;
3374	}
3375	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
3376	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3377		pathbuf_destroy(pb);
3378		return error;
3379	}
3380	vp = nd.ni_vp;
3381	pathbuf_destroy(pb);
3382	if (vp->v_type != VLNK)
3383		error = EINVAL;
3384	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3385	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3386		aiov.iov_base = buf;
3387		aiov.iov_len = count;
3388		auio.uio_iov = &aiov;
3389		auio.uio_iovcnt = 1;
3390		auio.uio_offset = 0;
3391		auio.uio_rw = UIO_READ;
3392		KASSERT(l == curlwp);
3393		auio.uio_vmspace = l->l_proc->p_vmspace;
3394		auio.uio_resid = count;
3395		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3396			*retval = count - auio.uio_resid;
3397	}
3398	vput(vp);
3399	return (error);
3400}
3401
3402int
3403sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3404    register_t *retval)
3405{
3406	/* {
3407		syscallarg(int) fd;
3408		syscallarg(const char *) path;
3409		syscallarg(char *) buf;
3410		syscallarg(size_t) bufsize;
3411	} */
3412
3413	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3414	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3415}
3416
3417/*
3418 * Change flags of a file given a path name.
3419 */
3420/* ARGSUSED */
3421int
3422sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3423{
3424	/* {
3425		syscallarg(const char *) path;
3426		syscallarg(u_long) flags;
3427	} */
3428	struct vnode *vp;
3429	int error;
3430
3431	error = namei_simple_user(SCARG(uap, path),
3432				NSM_FOLLOW_TRYEMULROOT, &vp);
3433	if (error != 0)
3434		return (error);
3435	error = change_flags(vp, SCARG(uap, flags), l);
3436	vput(vp);
3437	return (error);
3438}
3439
3440/*
3441 * Change flags of a file given a file descriptor.
3442 */
3443/* ARGSUSED */
3444int
3445sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3446{
3447	/* {
3448		syscallarg(int) fd;
3449		syscallarg(u_long) flags;
3450	} */
3451	struct vnode *vp;
3452	file_t *fp;
3453	int error;
3454
3455	/* fd_getvnode() will use the descriptor for us */
3456	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3457		return (error);
3458	vp = fp->f_vnode;
3459	error = change_flags(vp, SCARG(uap, flags), l);
3460	VOP_UNLOCK(vp);
3461	fd_putfile(SCARG(uap, fd));
3462	return (error);
3463}
3464
3465/*
3466 * Change flags of a file given a path name; this version does
3467 * not follow links.
3468 */
3469int
3470sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3471{
3472	/* {
3473		syscallarg(const char *) path;
3474		syscallarg(u_long) flags;
3475	} */
3476	struct vnode *vp;
3477	int error;
3478
3479	error = namei_simple_user(SCARG(uap, path),
3480				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3481	if (error != 0)
3482		return (error);
3483	error = change_flags(vp, SCARG(uap, flags), l);
3484	vput(vp);
3485	return (error);
3486}
3487
3488/*
3489 * Common routine to change flags of a file.
3490 */
3491int
3492change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3493{
3494	struct vattr vattr;
3495	int error;
3496
3497	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3498
3499	vattr_null(&vattr);
3500	vattr.va_flags = flags;
3501	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3502
3503	return (error);
3504}
3505
3506/*
3507 * Change mode of a file given path name; this version follows links.
3508 */
3509/* ARGSUSED */
3510int
3511sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3512{
3513	/* {
3514		syscallarg(const char *) path;
3515		syscallarg(int) mode;
3516	} */
3517	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3518			      SCARG(uap, mode), 0);
3519}
3520
3521int
3522do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3523{
3524	int error;
3525	struct vnode *vp;
3526	namei_simple_flags_t ns_flag;
3527
3528	if (flags & AT_SYMLINK_NOFOLLOW)
3529		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3530	else
3531		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3532
3533	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3534	if (error != 0)
3535		return error;
3536
3537	error = change_mode(vp, mode, l);
3538
3539	vrele(vp);
3540
3541	return (error);
3542}
3543
3544/*
3545 * Change mode of a file given a file descriptor.
3546 */
3547/* ARGSUSED */
3548int
3549sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3550{
3551	/* {
3552		syscallarg(int) fd;
3553		syscallarg(int) mode;
3554	} */
3555	file_t *fp;
3556	int error;
3557
3558	/* fd_getvnode() will use the descriptor for us */
3559	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3560		return (error);
3561	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3562	fd_putfile(SCARG(uap, fd));
3563	return (error);
3564}
3565
3566int
3567sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3568    register_t *retval)
3569{
3570	/* {
3571		syscallarg(int) fd;
3572		syscallarg(const char *) path;
3573		syscallarg(int) mode;
3574		syscallarg(int) flag;
3575	} */
3576
3577	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3578			      SCARG(uap, mode), SCARG(uap, flag));
3579}
3580
3581/*
3582 * Change mode of a file given path name; this version does not follow links.
3583 */
3584/* ARGSUSED */
3585int
3586sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3587{
3588	/* {
3589		syscallarg(const char *) path;
3590		syscallarg(int) mode;
3591	} */
3592	int error;
3593	struct vnode *vp;
3594
3595	error = namei_simple_user(SCARG(uap, path),
3596				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3597	if (error != 0)
3598		return (error);
3599
3600	error = change_mode(vp, SCARG(uap, mode), l);
3601
3602	vrele(vp);
3603	return (error);
3604}
3605
3606/*
3607 * Common routine to set mode given a vnode.
3608 */
3609static int
3610change_mode(struct vnode *vp, int mode, struct lwp *l)
3611{
3612	struct vattr vattr;
3613	int error;
3614
3615	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3616	vattr_null(&vattr);
3617	vattr.va_mode = mode & ALLPERMS;
3618	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3619	VOP_UNLOCK(vp);
3620	return (error);
3621}
3622
3623/*
3624 * Set ownership given a path name; this version follows links.
3625 */
3626/* ARGSUSED */
3627int
3628sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3629{
3630	/* {
3631		syscallarg(const char *) path;
3632		syscallarg(uid_t) uid;
3633		syscallarg(gid_t) gid;
3634	} */
3635	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3636			      SCARG(uap, gid), 0);
3637}
3638
3639int
3640do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3641   gid_t gid, int flags)
3642{
3643	int error;
3644	struct vnode *vp;
3645	namei_simple_flags_t ns_flag;
3646
3647	if (flags & AT_SYMLINK_NOFOLLOW)
3648		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3649	else
3650		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3651
3652	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3653	if (error != 0)
3654		return error;
3655
3656	error = change_owner(vp, uid, gid, l, 0);
3657
3658	vrele(vp);
3659
3660	return (error);
3661}
3662
3663/*
3664 * Set ownership given a path name; this version follows links.
3665 * Provides POSIX semantics.
3666 */
3667/* ARGSUSED */
3668int
3669sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3670{
3671	/* {
3672		syscallarg(const char *) path;
3673		syscallarg(uid_t) uid;
3674		syscallarg(gid_t) gid;
3675	} */
3676	int error;
3677	struct vnode *vp;
3678
3679	error = namei_simple_user(SCARG(uap, path),
3680				NSM_FOLLOW_TRYEMULROOT, &vp);
3681	if (error != 0)
3682		return (error);
3683
3684	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3685
3686	vrele(vp);
3687	return (error);
3688}
3689
3690/*
3691 * Set ownership given a file descriptor.
3692 */
3693/* ARGSUSED */
3694int
3695sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3696{
3697	/* {
3698		syscallarg(int) fd;
3699		syscallarg(uid_t) uid;
3700		syscallarg(gid_t) gid;
3701	} */
3702	int error;
3703	file_t *fp;
3704
3705	/* fd_getvnode() will use the descriptor for us */
3706	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3707		return (error);
3708	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3709	    l, 0);
3710	fd_putfile(SCARG(uap, fd));
3711	return (error);
3712}
3713
3714int
3715sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3716    register_t *retval)
3717{
3718	/* {
3719		syscallarg(int) fd;
3720		syscallarg(const char *) path;
3721		syscallarg(uid_t) owner;
3722		syscallarg(gid_t) group;
3723		syscallarg(int) flag;
3724	} */
3725
3726	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3727			      SCARG(uap, owner), SCARG(uap, group),
3728			      SCARG(uap, flag));
3729}
3730
3731/*
3732 * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3733 */
3734/* ARGSUSED */
3735int
3736sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3737{
3738	/* {
3739		syscallarg(int) fd;
3740		syscallarg(uid_t) uid;
3741		syscallarg(gid_t) gid;
3742	} */
3743	int error;
3744	file_t *fp;
3745
3746	/* fd_getvnode() will use the descriptor for us */
3747	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3748		return (error);
3749	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3750	    l, 1);
3751	fd_putfile(SCARG(uap, fd));
3752	return (error);
3753}
3754
3755/*
3756 * Set ownership given a path name; this version does not follow links.
3757 */
3758/* ARGSUSED */
3759int
3760sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3761{
3762	/* {
3763		syscallarg(const char *) path;
3764		syscallarg(uid_t) uid;
3765		syscallarg(gid_t) gid;
3766	} */
3767	int error;
3768	struct vnode *vp;
3769
3770	error = namei_simple_user(SCARG(uap, path),
3771				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3772	if (error != 0)
3773		return (error);
3774
3775	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3776
3777	vrele(vp);
3778	return (error);
3779}
3780
3781/*
3782 * Set ownership given a path name; this version does not follow links.
3783 * Provides POSIX/XPG semantics.
3784 */
3785/* ARGSUSED */
3786int
3787sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3788{
3789	/* {
3790		syscallarg(const char *) path;
3791		syscallarg(uid_t) uid;
3792		syscallarg(gid_t) gid;
3793	} */
3794	int error;
3795	struct vnode *vp;
3796
3797	error = namei_simple_user(SCARG(uap, path),
3798				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3799	if (error != 0)
3800		return (error);
3801
3802	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3803
3804	vrele(vp);
3805	return (error);
3806}
3807
3808/*
3809 * Common routine to set ownership given a vnode.
3810 */
3811static int
3812change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3813    int posix_semantics)
3814{
3815	struct vattr vattr;
3816	mode_t newmode;
3817	int error;
3818
3819	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3820	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3821		goto out;
3822
3823#define CHANGED(x) ((int)(x) != -1)
3824	newmode = vattr.va_mode;
3825	if (posix_semantics) {
3826		/*
3827		 * POSIX/XPG semantics: if the caller is not the super-user,
3828		 * clear set-user-id and set-group-id bits.  Both POSIX and
3829		 * the XPG consider the behaviour for calls by the super-user
3830		 * implementation-defined; we leave the set-user-id and set-
3831		 * group-id settings intact in that case.
3832		 */
3833		if (vattr.va_mode & S_ISUID) {
3834			if (kauth_authorize_vnode(l->l_cred,
3835			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3836				newmode &= ~S_ISUID;
3837		}
3838		if (vattr.va_mode & S_ISGID) {
3839			if (kauth_authorize_vnode(l->l_cred,
3840			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3841				newmode &= ~S_ISGID;
3842		}
3843	} else {
3844		/*
3845		 * NetBSD semantics: when changing owner and/or group,
3846		 * clear the respective bit(s).
3847		 */
3848		if (CHANGED(uid))
3849			newmode &= ~S_ISUID;
3850		if (CHANGED(gid))
3851			newmode &= ~S_ISGID;
3852	}
3853	/* Update va_mode iff altered. */
3854	if (vattr.va_mode == newmode)
3855		newmode = VNOVAL;
3856
3857	vattr_null(&vattr);
3858	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3859	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3860	vattr.va_mode = newmode;
3861	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3862#undef CHANGED
3863
3864out:
3865	VOP_UNLOCK(vp);
3866	return (error);
3867}
3868
3869/*
3870 * Set the access and modification times given a path name; this
3871 * version follows links.
3872 */
3873/* ARGSUSED */
3874int
3875sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3876    register_t *retval)
3877{
3878	/* {
3879		syscallarg(const char *) path;
3880		syscallarg(const struct timeval *) tptr;
3881	} */
3882
3883	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3884	    SCARG(uap, tptr), UIO_USERSPACE);
3885}
3886
3887/*
3888 * Set the access and modification times given a file descriptor.
3889 */
3890/* ARGSUSED */
3891int
3892sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3893    register_t *retval)
3894{
3895	/* {
3896		syscallarg(int) fd;
3897		syscallarg(const struct timeval *) tptr;
3898	} */
3899	int error;
3900	file_t *fp;
3901
3902	/* fd_getvnode() will use the descriptor for us */
3903	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3904		return (error);
3905	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3906	    UIO_USERSPACE);
3907	fd_putfile(SCARG(uap, fd));
3908	return (error);
3909}
3910
3911int
3912sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3913    register_t *retval)
3914{
3915	/* {
3916		syscallarg(int) fd;
3917		syscallarg(const struct timespec *) tptr;
3918	} */
3919	int error;
3920	file_t *fp;
3921
3922	/* fd_getvnode() will use the descriptor for us */
3923	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3924		return (error);
3925	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3926	    SCARG(uap, tptr), UIO_USERSPACE);
3927	fd_putfile(SCARG(uap, fd));
3928	return (error);
3929}
3930
3931/*
3932 * Set the access and modification times given a path name; this
3933 * version does not follow links.
3934 */
3935int
3936sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3937    register_t *retval)
3938{
3939	/* {
3940		syscallarg(const char *) path;
3941		syscallarg(const struct timeval *) tptr;
3942	} */
3943
3944	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3945	    SCARG(uap, tptr), UIO_USERSPACE);
3946}
3947
3948int
3949sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3950    register_t *retval)
3951{
3952	/* {
3953		syscallarg(int) fd;
3954		syscallarg(const char *) path;
3955		syscallarg(const struct timespec *) tptr;
3956		syscallarg(int) flag;
3957	} */
3958	int follow;
3959	const struct timespec *tptr;
3960	int error;
3961
3962	tptr = SCARG(uap, tptr);
3963	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3964
3965	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3966	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3967
3968	return error;
3969}
3970
3971/*
3972 * Common routine to set access and modification times given a vnode.
3973 */
3974int
3975do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3976    const struct timespec *tptr, enum uio_seg seg)
3977{
3978	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3979}
3980
3981int
3982do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3983    const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3984{
3985	struct vattr vattr;
3986	int error, dorele = 0;
3987	namei_simple_flags_t sflags;
3988	bool vanull, setbirthtime;
3989	struct timespec ts[2];
3990
3991	KASSERT(l != NULL || fdat == AT_FDCWD);
3992
3993	/*
3994	 * I have checked all callers and they pass either FOLLOW,
3995	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3996	 * is 0. More to the point, they don't pass anything else.
3997	 * Let's keep it that way at least until the namei interfaces
3998	 * are fully sanitized.
3999	 */
4000	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
4001	sflags = (flag == FOLLOW) ?
4002		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
4003
4004	if (tptr == NULL) {
4005		vanull = true;
4006		nanotime(&ts[0]);
4007		ts[1] = ts[0];
4008	} else {
4009		vanull = false;
4010		if (seg != UIO_SYSSPACE) {
4011			error = copyin(tptr, ts, sizeof (ts));
4012			if (error != 0)
4013				return error;
4014		} else {
4015			ts[0] = tptr[0];
4016			ts[1] = tptr[1];
4017		}
4018	}
4019
4020	if (ts[0].tv_nsec == UTIME_NOW) {
4021		nanotime(&ts[0]);
4022		if (ts[1].tv_nsec == UTIME_NOW) {
4023			vanull = true;
4024			ts[1] = ts[0];
4025		}
4026	} else if (ts[1].tv_nsec == UTIME_NOW)
4027		nanotime(&ts[1]);
4028
4029	if (vp == NULL) {
4030		/* note: SEG describes TPTR, not PATH; PATH is always user */
4031		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4032		if (error != 0)
4033			return error;
4034		dorele = 1;
4035	}
4036
4037	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4038	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4039	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
4040	vattr_null(&vattr);
4041
4042	if (ts[0].tv_nsec != UTIME_OMIT)
4043		vattr.va_atime = ts[0];
4044
4045	if (ts[1].tv_nsec != UTIME_OMIT) {
4046		vattr.va_mtime = ts[1];
4047		if (setbirthtime)
4048			vattr.va_birthtime = ts[1];
4049	}
4050
4051	if (vanull)
4052		vattr.va_vaflags |= VA_UTIMES_NULL;
4053	error = VOP_SETATTR(vp, &vattr, l->l_cred);
4054	VOP_UNLOCK(vp);
4055
4056	if (dorele != 0)
4057		vrele(vp);
4058
4059	return error;
4060}
4061
4062int
4063do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4064    const struct timeval *tptr, enum uio_seg seg)
4065{
4066	struct timespec ts[2];
4067	struct timespec *tsptr = NULL;
4068	int error;
4069
4070	if (tptr != NULL) {
4071		struct timeval tv[2];
4072
4073		if (seg != UIO_SYSSPACE) {
4074			error = copyin(tptr, tv, sizeof(tv));
4075			if (error != 0)
4076				return error;
4077			tptr = tv;
4078		}
4079
4080		if ((tptr[0].tv_usec == UTIME_NOW) ||
4081		    (tptr[0].tv_usec == UTIME_OMIT))
4082			ts[0].tv_nsec = tptr[0].tv_usec;
4083		else {
4084			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4085				return EINVAL;
4086
4087			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4088		}
4089
4090		if ((tptr[1].tv_usec == UTIME_NOW) ||
4091		    (tptr[1].tv_usec == UTIME_OMIT))
4092			ts[1].tv_nsec = tptr[1].tv_usec;
4093		else {
4094			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4095				return EINVAL;
4096
4097			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4098		}
4099
4100		tsptr = &ts[0];
4101	}
4102
4103	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4104}
4105
4106/*
4107 * Truncate a file given its path name.
4108 */
4109/* ARGSUSED */
4110int
4111sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
4112{
4113	/* {
4114		syscallarg(const char *) path;
4115		syscallarg(int) pad;
4116		syscallarg(off_t) length;
4117	} */
4118	struct vnode *vp;
4119	struct vattr vattr;
4120	int error;
4121
4122	if (SCARG(uap, length) < 0)
4123		return EINVAL;
4124
4125	error = namei_simple_user(SCARG(uap, path),
4126				NSM_FOLLOW_TRYEMULROOT, &vp);
4127	if (error != 0)
4128		return (error);
4129	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4130	if (vp->v_type == VDIR)
4131		error = EISDIR;
4132	else if ((error = vn_writechk(vp)) == 0 &&
4133	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4134		vattr_null(&vattr);
4135		vattr.va_size = SCARG(uap, length);
4136		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4137	}
4138	vput(vp);
4139	return (error);
4140}
4141
4142/*
4143 * Truncate a file given a file descriptor.
4144 */
4145/* ARGSUSED */
4146int
4147sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
4148{
4149	/* {
4150		syscallarg(int) fd;
4151		syscallarg(int) pad;
4152		syscallarg(off_t) length;
4153	} */
4154	file_t *fp;
4155	int error, fd = SCARG(uap, fd);
4156
4157	fp = fd_getfile(fd);
4158	if (fp == NULL)
4159		return EBADF;
4160	if (fp->f_ops->fo_truncate == NULL)
4161		error = EOPNOTSUPP;
4162	else
4163		error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
4164
4165	fd_putfile(fd);
4166	return error;
4167}
4168
4169/*
4170 * Sync an open file.
4171 */
4172/* ARGSUSED */
4173int
4174sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4175{
4176	/* {
4177		syscallarg(int) fd;
4178	} */
4179	struct vnode *vp;
4180	file_t *fp;
4181	int error;
4182
4183	/* fd_getvnode() will use the descriptor for us */
4184	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4185		return (error);
4186	vp = fp->f_vnode;
4187	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4188	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4189	VOP_UNLOCK(vp);
4190	fd_putfile(SCARG(uap, fd));
4191	return (error);
4192}
4193
4194/*
4195 * Sync a range of file data.  API modeled after that found in AIX.
4196 *
4197 * FDATASYNC indicates that we need only save enough metadata to be able
4198 * to re-read the written data.
4199 */
4200/* ARGSUSED */
4201int
4202sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4203{
4204	/* {
4205		syscallarg(int) fd;
4206		syscallarg(int) flags;
4207		syscallarg(off_t) start;
4208		syscallarg(off_t) length;
4209	} */
4210	struct vnode *vp;
4211	file_t *fp;
4212	int flags, nflags;
4213	off_t s, e, len;
4214	int error;
4215
4216	/* fd_getvnode() will use the descriptor for us */
4217	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4218		return (error);
4219
4220	if ((fp->f_flag & FWRITE) == 0) {
4221		error = EBADF;
4222		goto out;
4223	}
4224
4225	flags = SCARG(uap, flags);
4226	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4227	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4228		error = EINVAL;
4229		goto out;
4230	}
4231	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4232	if (flags & FDATASYNC)
4233		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4234	else
4235		nflags = FSYNC_WAIT;
4236	if (flags & FDISKSYNC)
4237		nflags |= FSYNC_CACHE;
4238
4239	len = SCARG(uap, length);
4240	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4241	if (len) {
4242		s = SCARG(uap, start);
4243		if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4244			error = EINVAL;
4245			goto out;
4246		}
4247		e = s + len;
4248		KASSERT(s <= e);
4249	} else {
4250		e = 0;
4251		s = 0;
4252	}
4253
4254	vp = fp->f_vnode;
4255	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4256	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4257	VOP_UNLOCK(vp);
4258out:
4259	fd_putfile(SCARG(uap, fd));
4260	return (error);
4261}
4262
4263/*
4264 * Sync the data of an open file.
4265 */
4266/* ARGSUSED */
4267int
4268sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4269{
4270	/* {
4271		syscallarg(int) fd;
4272	} */
4273	struct vnode *vp;
4274	file_t *fp;
4275	int error;
4276
4277	/* fd_getvnode() will use the descriptor for us */
4278	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4279		return (error);
4280	vp = fp->f_vnode;
4281	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4282	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4283	VOP_UNLOCK(vp);
4284	fd_putfile(SCARG(uap, fd));
4285	return (error);
4286}
4287
4288/*
4289 * Rename files, (standard) BSD semantics frontend.
4290 */
4291/* ARGSUSED */
4292int
4293sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4294{
4295	/* {
4296		syscallarg(const char *) from;
4297		syscallarg(const char *) to;
4298	} */
4299
4300	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4301	    SCARG(uap, to), UIO_USERSPACE, 0));
4302}
4303
4304int
4305sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4306    register_t *retval)
4307{
4308	/* {
4309		syscallarg(int) fromfd;
4310		syscallarg(const char *) from;
4311		syscallarg(int) tofd;
4312		syscallarg(const char *) to;
4313	} */
4314
4315	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4316	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4317}
4318
4319/*
4320 * Rename files, POSIX semantics frontend.
4321 */
4322/* ARGSUSED */
4323int
4324sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4325{
4326	/* {
4327		syscallarg(const char *) from;
4328		syscallarg(const char *) to;
4329	} */
4330
4331	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4332	    SCARG(uap, to), UIO_USERSPACE, 1));
4333}
4334
4335/*
4336 * Rename files.  Source and destination must either both be directories,
4337 * or both not be directories.  If target is a directory, it must be empty.
4338 * If `from' and `to' refer to the same object, the value of the `retain'
4339 * argument is used to determine whether `from' will be
4340 *
4341 * (retain == 0)	deleted unless `from' and `to' refer to the same
4342 *			object in the file system's name space (BSD).
4343 * (retain == 1)	always retained (POSIX).
4344 *
4345 * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4346 */
4347int
4348do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4349{
4350	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4351}
4352
4353static int
4354do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4355    const char *to, enum uio_seg seg, int retain)
4356{
4357	struct pathbuf *fpb, *tpb;
4358	struct nameidata fnd, tnd;
4359	struct vnode *fdvp, *fvp;
4360	struct vnode *tdvp, *tvp;
4361	struct mount *mp, *tmp;
4362	int error;
4363
4364	KASSERT(l != NULL || fromfd == AT_FDCWD);
4365	KASSERT(l != NULL || tofd == AT_FDCWD);
4366
4367	error = pathbuf_maybe_copyin(from, seg, &fpb);
4368	if (error)
4369		goto out0;
4370	KASSERT(fpb != NULL);
4371
4372	error = pathbuf_maybe_copyin(to, seg, &tpb);
4373	if (error)
4374		goto out1;
4375	KASSERT(tpb != NULL);
4376
4377	/*
4378	 * Lookup from.
4379	 *
4380	 * XXX LOCKPARENT is wrong because we don't actually want it
4381	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4382	 * insane, so for the time being we need to leave it like this.
4383	 */
4384	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4385	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4386		goto out2;
4387
4388	/*
4389	 * Pull out the important results of the lookup, fdvp and fvp.
4390	 * Of course, fvp is bogus because we're about to unlock fdvp.
4391	 */
4392	fdvp = fnd.ni_dvp;
4393	fvp = fnd.ni_vp;
4394	mp = fdvp->v_mount;
4395	KASSERT(fdvp != NULL);
4396	KASSERT(fvp != NULL);
4397	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4398	/*
4399	 * Bracket the operation with fstrans_start()/fstrans_done().
4400	 *
4401	 * Inside the bracket this file system cannot be unmounted so
4402	 * a vnode on this file system cannot change its v_mount.
4403	 * A vnode on another file system may still change to dead mount.
4404	 */
4405	fstrans_start(mp);
4406
4407	/*
4408	 * Make sure neither fdvp nor fvp is locked.
4409	 */
4410	if (fdvp != fvp)
4411		VOP_UNLOCK(fdvp);
4412	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4413	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4414
4415	/*
4416	 * Reject renaming `.' and `..'.  Can't do this until after
4417	 * namei because we need namei's parsing to find the final
4418	 * component name.  (namei should just leave us with the final
4419	 * component name and not look it up itself, but anyway...)
4420	 *
4421	 * This was here before because we used to relookup from
4422	 * instead of to and relookup requires the caller to check
4423	 * this, but now file systems may depend on this check, so we
4424	 * must retain it until the file systems are all rototilled.
4425	 */
4426	if (((fnd.ni_cnd.cn_namelen == 1) &&
4427		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4428	    ((fnd.ni_cnd.cn_namelen == 2) &&
4429		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4430		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4431		error = EINVAL;	/* XXX EISDIR?  */
4432		goto abort0;
4433	}
4434
4435	/*
4436	 * Lookup to.
4437	 *
4438	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4439	 * fvp here to decide whether to add CREATEDIR is a load of
4440	 * bollocks because fvp might be the wrong node by now, since
4441	 * fdvp is unlocked.
4442	 *
4443	 * XXX Why not pass CREATEDIR always?
4444	 */
4445	NDINIT(&tnd, RENAME,
4446	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4447		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4448	    tpb);
4449	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4450		goto abort0;
4451
4452	/*
4453	 * Pull out the important results of the lookup, tdvp and tvp.
4454	 * Of course, tvp is bogus because we're about to unlock tdvp.
4455	 */
4456	tdvp = tnd.ni_dvp;
4457	tvp = tnd.ni_vp;
4458	KASSERT(tdvp != NULL);
4459	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4460
4461	if (fvp->v_type == VDIR)
4462		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4463	/*
4464	 * Make sure neither tdvp nor tvp is locked.
4465	 */
4466	if (tdvp != tvp)
4467		VOP_UNLOCK(tdvp);
4468	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4469	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4470
4471	/*
4472	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4473	 * these, which is why we must do this here.  Once upon a time
4474	 * we relooked up from instead of to, and consequently didn't
4475	 * need this check, but now that we relookup to instead of
4476	 * from, we need this; and we shall need it forever forward
4477	 * until the VOP_RENAME protocol changes, because file systems
4478	 * will no doubt begin to depend on this check.
4479	 */
4480	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4481		error = EISDIR;
4482		goto abort1;
4483	}
4484	if ((tnd.ni_cnd.cn_namelen == 2) &&
4485	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4486	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4487		error = EINVAL;
4488		goto abort1;
4489	}
4490
4491	/*
4492	 * Make sure the mount points match.  Although we don't hold
4493	 * any vnode locks, the v_mount on fdvp file system are stable.
4494	 *
4495	 * Unmounting another file system at an inopportune moment may
4496	 * cause tdvp to disappear and change its v_mount to dead.
4497	 *
4498	 * So in either case different v_mount means cross-device rename.
4499	 */
4500	KASSERT(mp != NULL);
4501	tmp = tdvp->v_mount;
4502
4503	if (mp != tmp) {
4504		error = EXDEV;
4505		goto abort1;
4506	}
4507
4508	/*
4509	 * Take the vfs rename lock to avoid cross-directory screw cases.
4510	 * Nothing is locked currently, so taking this lock is safe.
4511	 */
4512	error = VFS_RENAMELOCK_ENTER(mp);
4513	if (error)
4514		goto abort1;
4515
4516	/*
4517	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4518	 * and nothing is locked except for the vfs rename lock.
4519	 *
4520	 * The next step is a little rain dance to conform to the
4521	 * insane lock protocol, even though it does nothing to ward
4522	 * off race conditions.
4523	 *
4524	 * We need tdvp and tvp to be locked.  However, because we have
4525	 * unlocked tdvp in order to hold no locks while we take the
4526	 * vfs rename lock, tvp may be wrong here, and we can't safely
4527	 * lock it even if the sensible file systems will just unlock
4528	 * it straight away.  Consequently, we must lock tdvp and then
4529	 * relookup tvp to get it locked.
4530	 *
4531	 * Finally, because the VOP_RENAME protocol is brain-damaged
4532	 * and various file systems insanely depend on the semantics of
4533	 * this brain damage, the lookup of to must be the last lookup
4534	 * before VOP_RENAME.
4535	 */
4536	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4537	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4538	if (error)
4539		goto abort2;
4540
4541	/*
4542	 * Drop the old tvp and pick up the new one -- which might be
4543	 * the same, but that doesn't matter to us.  After this, tdvp
4544	 * and tvp should both be locked.
4545	 */
4546	if (tvp != NULL)
4547		vrele(tvp);
4548	tvp = tnd.ni_vp;
4549	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4550	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4551
4552	/*
4553	 * The old do_sys_rename had various consistency checks here
4554	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4555	 * will become bogus soon in any sensible file system, so the
4556	 * only purpose in putting these checks here is to give lip
4557	 * service to these screw cases and to acknowledge that they
4558	 * exist, not actually to handle them, but here you go
4559	 * anyway...
4560	 */
4561
4562	/*
4563	 * Acknowledge that directories and non-directories aren't
4564	 * supposed to mix.
4565	 */
4566	if (tvp != NULL) {
4567		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4568			error = ENOTDIR;
4569			goto abort3;
4570		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4571			error = EISDIR;
4572			goto abort3;
4573		}
4574	}
4575
4576	/*
4577	 * Acknowledge some random screw case, among the dozens that
4578	 * might arise.
4579	 */
4580	if (fvp == tdvp) {
4581		error = EINVAL;
4582		goto abort3;
4583	}
4584
4585	/*
4586	 * Acknowledge that POSIX has a wacky screw case.
4587	 *
4588	 * XXX Eventually the retain flag needs to be passed on to
4589	 * VOP_RENAME.
4590	 */
4591	if (fvp == tvp) {
4592		if (retain) {
4593			error = 0;
4594			goto abort3;
4595		} else if ((fdvp == tdvp) &&
4596		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4597		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4598			fnd.ni_cnd.cn_namelen))) {
4599			error = 0;
4600			goto abort3;
4601		}
4602	}
4603
4604	/*
4605	 * Make sure veriexec can screw us up.  (But a race can screw
4606	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4607	 * bogus.)
4608	 */
4609#if NVERIEXEC > 0
4610	{
4611		char *f1, *f2;
4612		size_t f1_len;
4613		size_t f2_len;
4614
4615		f1_len = fnd.ni_cnd.cn_namelen + 1;
4616		f1 = kmem_alloc(f1_len, KM_SLEEP);
4617		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4618
4619		f2_len = tnd.ni_cnd.cn_namelen + 1;
4620		f2 = kmem_alloc(f2_len, KM_SLEEP);
4621		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4622
4623		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4624
4625		kmem_free(f1, f1_len);
4626		kmem_free(f2, f2_len);
4627
4628		if (error)
4629			goto abort3;
4630	}
4631#endif /* NVERIEXEC > 0 */
4632
4633	/*
4634	 * All ready.  Incant the rename vop.
4635	 */
4636	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4637	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4638	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4639	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4640	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4641
4642	/*
4643	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4644	 * tdvp and tvp.  But we can't assert any of that.
4645	 */
4646	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4647	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4648	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4649	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4650
4651	/*
4652	 * So all we have left to do is to drop the rename lock and
4653	 * destroy the pathbufs.
4654	 */
4655	VFS_RENAMELOCK_EXIT(mp);
4656	fstrans_done(mp);
4657	goto out2;
4658
4659abort3:	if ((tvp != NULL) && (tvp != tdvp))
4660		VOP_UNLOCK(tvp);
4661abort2:	VOP_UNLOCK(tdvp);
4662	VFS_RENAMELOCK_EXIT(mp);
4663abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4664	vrele(tdvp);
4665	if (tvp != NULL)
4666		vrele(tvp);
4667abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4668	vrele(fdvp);
4669	vrele(fvp);
4670	fstrans_done(mp);
4671out2:	pathbuf_destroy(tpb);
4672out1:	pathbuf_destroy(fpb);
4673out0:	return error;
4674}
4675
4676/*
4677 * Make a directory file.
4678 */
4679/* ARGSUSED */
4680int
4681sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4682{
4683	/* {
4684		syscallarg(const char *) path;
4685		syscallarg(int) mode;
4686	} */
4687
4688	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4689	    SCARG(uap, mode), UIO_USERSPACE);
4690}
4691
4692int
4693sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4694    register_t *retval)
4695{
4696	/* {
4697		syscallarg(int) fd;
4698		syscallarg(const char *) path;
4699		syscallarg(int) mode;
4700	} */
4701
4702	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4703	    SCARG(uap, mode), UIO_USERSPACE);
4704}
4705
4706
4707int
4708do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4709{
4710	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4711}
4712
4713static int
4714do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4715    enum uio_seg seg)
4716{
4717	struct proc *p = curlwp->l_proc;
4718	struct vnode *vp;
4719	struct vattr vattr;
4720	int error;
4721	struct pathbuf *pb;
4722	struct nameidata nd;
4723
4724	KASSERT(l != NULL || fdat == AT_FDCWD);
4725
4726	/* XXX bollocks, should pass in a pathbuf */
4727	error = pathbuf_maybe_copyin(path, seg, &pb);
4728	if (error) {
4729		return error;
4730	}
4731
4732	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4733
4734	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4735		pathbuf_destroy(pb);
4736		return (error);
4737	}
4738	vp = nd.ni_vp;
4739	if (vp != NULL) {
4740		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4741		if (nd.ni_dvp == vp)
4742			vrele(nd.ni_dvp);
4743		else
4744			vput(nd.ni_dvp);
4745		vrele(vp);
4746		pathbuf_destroy(pb);
4747		return (EEXIST);
4748	}
4749	vattr_null(&vattr);
4750	vattr.va_type = VDIR;
4751	/* We will read cwdi->cwdi_cmask unlocked. */
4752	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4753	nd.ni_cnd.cn_flags |= WILLBEDIR;
4754	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4755	if (!error)
4756		vrele(nd.ni_vp);
4757	vput(nd.ni_dvp);
4758	pathbuf_destroy(pb);
4759	return (error);
4760}
4761
4762/*
4763 * Remove a directory file.
4764 */
4765/* ARGSUSED */
4766int
4767sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4768{
4769	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4770	    AT_REMOVEDIR, UIO_USERSPACE);
4771}
4772
4773/*
4774 * Read a block of directory entries in a file system independent format.
4775 */
4776int
4777sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4778{
4779	/* {
4780		syscallarg(int) fd;
4781		syscallarg(char *) buf;
4782		syscallarg(size_t) count;
4783	} */
4784	file_t *fp;
4785	int error, done;
4786
4787	/* fd_getvnode() will use the descriptor for us */
4788	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4789		return (error);
4790	if ((fp->f_flag & FREAD) == 0) {
4791		error = EBADF;
4792		goto out;
4793	}
4794	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4795			SCARG(uap, count), &done, l, 0, 0);
4796	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4797	*retval = done;
4798 out:
4799	fd_putfile(SCARG(uap, fd));
4800	return (error);
4801}
4802
4803/*
4804 * Set the mode mask for creation of filesystem nodes.
4805 */
4806int
4807sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4808{
4809	/* {
4810		syscallarg(mode_t) newmask;
4811	} */
4812
4813	/*
4814	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4815	 * serialization with those reads is required.  It's important to
4816	 * return a coherent answer for the caller of umask() though, and
4817	 * the atomic operation accomplishes that.
4818	 */
4819	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4820	    SCARG(uap, newmask) & ALLPERMS);
4821
4822	return (0);
4823}
4824
4825int
4826dorevoke(struct vnode *vp, kauth_cred_t cred)
4827{
4828	struct vattr vattr;
4829	int error, fs_decision;
4830
4831	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4832	error = VOP_GETATTR(vp, &vattr, cred);
4833	VOP_UNLOCK(vp);
4834	if (error != 0)
4835		return error;
4836	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4837	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4838	    fs_decision);
4839	if (!error)
4840		VOP_REVOKE(vp, REVOKEALL);
4841	return (error);
4842}
4843
4844/*
4845 * Void all references to file by ripping underlying filesystem
4846 * away from vnode.
4847 */
4848/* ARGSUSED */
4849int
4850sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4851{
4852	/* {
4853		syscallarg(const char *) path;
4854	} */
4855	struct vnode *vp;
4856	int error;
4857
4858	error = namei_simple_user(SCARG(uap, path),
4859				NSM_FOLLOW_TRYEMULROOT, &vp);
4860	if (error != 0)
4861		return (error);
4862	error = dorevoke(vp, l->l_cred);
4863	vrele(vp);
4864	return (error);
4865}
4866
4867/*
4868 * Allocate backing store for a file, filling a hole without having to
4869 * explicitly write anything out.
4870 */
4871/* ARGSUSED */
4872int
4873sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4874		register_t *retval)
4875{
4876	/* {
4877		syscallarg(int) fd;
4878		syscallarg(off_t) pos;
4879		syscallarg(off_t) len;
4880	} */
4881	int fd;
4882	off_t pos, len;
4883	struct file *fp;
4884	struct vnode *vp;
4885	int error;
4886
4887	fd = SCARG(uap, fd);
4888	pos = SCARG(uap, pos);
4889	len = SCARG(uap, len);
4890
4891	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4892		*retval = EINVAL;
4893		return 0;
4894	}
4895
4896	error = fd_getvnode(fd, &fp);
4897	if (error) {
4898		*retval = error;
4899		return 0;
4900	}
4901	if ((fp->f_flag & FWRITE) == 0) {
4902		error = EBADF;
4903		goto fail;
4904	}
4905	vp = fp->f_vnode;
4906
4907	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4908	if (vp->v_type == VDIR) {
4909		error = EISDIR;
4910	} else {
4911		error = VOP_FALLOCATE(vp, pos, len);
4912	}
4913	VOP_UNLOCK(vp);
4914
4915fail:
4916	fd_putfile(fd);
4917	*retval = error;
4918	return 0;
4919}
4920
4921/*
4922 * Deallocate backing store for a file, creating a hole. Also used for
4923 * invoking TRIM on disks.
4924 */
4925/* ARGSUSED */
4926int
4927sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4928		register_t *retval)
4929{
4930	/* {
4931		syscallarg(int) fd;
4932		syscallarg(off_t) pos;
4933		syscallarg(off_t) len;
4934	} */
4935	int fd;
4936	off_t pos, len;
4937	struct file *fp;
4938	struct vnode *vp;
4939	int error;
4940
4941	fd = SCARG(uap, fd);
4942	pos = SCARG(uap, pos);
4943	len = SCARG(uap, len);
4944
4945	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4946		return EINVAL;
4947	}
4948
4949	error = fd_getvnode(fd, &fp);
4950	if (error) {
4951		return error;
4952	}
4953	if ((fp->f_flag & FWRITE) == 0) {
4954		error = EBADF;
4955		goto fail;
4956	}
4957	vp = fp->f_vnode;
4958
4959	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4960	if (vp->v_type == VDIR) {
4961		error = EISDIR;
4962	} else {
4963		error = VOP_FDISCARD(vp, pos, len);
4964	}
4965	VOP_UNLOCK(vp);
4966
4967fail:
4968	fd_putfile(fd);
4969	return error;
4970}
4971