1/*	$NetBSD: vfs_syscalls.c,v 1.449.2.2 2012/05/19 15:01:35 riz Exp $	*/
2
3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 1989, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66 */
67
68/*
69 * Virtual File System System Calls
70 */
71
72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.449.2.2 2012/05/19 15:01:35 riz Exp $");
74
75#ifdef _KERNEL_OPT
76#include "opt_fileassoc.h"
77#include "veriexec.h"
78#endif
79
80#include <sys/param.h>
81#include <sys/systm.h>
82#include <sys/namei.h>
83#include <sys/filedesc.h>
84#include <sys/kernel.h>
85#include <sys/file.h>
86#include <sys/fcntl.h>
87#include <sys/stat.h>
88#include <sys/vnode.h>
89#include <sys/mount.h>
90#include <sys/proc.h>
91#include <sys/uio.h>
92#include <sys/kmem.h>
93#include <sys/dirent.h>
94#include <sys/sysctl.h>
95#include <sys/syscallargs.h>
96#include <sys/vfs_syscalls.h>
97#include <sys/quota.h>
98#include <sys/quotactl.h>
99#include <sys/ktrace.h>
100#ifdef FILEASSOC
101#include <sys/fileassoc.h>
102#endif /* FILEASSOC */
103#include <sys/extattr.h>
104#include <sys/verified_exec.h>
105#include <sys/kauth.h>
106#include <sys/atomic.h>
107#include <sys/module.h>
108#include <sys/buf.h>
109
110#include <miscfs/genfs/genfs.h>
111#include <miscfs/syncfs/syncfs.h>
112#include <miscfs/specfs/specdev.h>
113
114#include <nfs/rpcv2.h>
115#include <nfs/nfsproto.h>
116#include <nfs/nfs.h>
117#include <nfs/nfs_var.h>
118
/*
 * Prototypes for file-local (static) helpers.  Their definitions are not
 * part of this section of the file.
 */
static int change_flags(struct vnode *, u_long, struct lwp *);
static int change_mode(struct vnode *, int, struct lwp *l);
static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
static int do_open(lwp_t *, struct pathbuf *, int, int, int *);
123
124/*
125 * This table is used to maintain compatibility with 4.3BSD
126 * and NetBSD 0.9 mount syscalls - and possibly other systems.
127 * Note, the order is important!
128 *
129 * Do not modify this table. It should only contain filesystems
130 * supported by NetBSD 0.9 and 4.3BSD.
131 */
const char * const mountcompatnames[] = {
	NULL,		/* 0 = MOUNT_NONE */
	MOUNT_FFS,	/* 1 = MOUNT_UFS */
	MOUNT_NFS,	/* 2 */
	MOUNT_MFS,	/* 3 */
	MOUNT_MSDOS,	/* 4 */
	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
	MOUNT_FDESC,	/* 6 */
	MOUNT_KERNFS,	/* 7 */
	NULL,		/* 8 = MOUNT_DEVFS */
	MOUNT_AFS,	/* 9 */
};

/*
 * Number of slots in mountcompatnames[] (NULL slots included).
 * mount_get_vfsops() uses it as the upper bound when a numeric
 * file-system type is looked up in the table.
 */
const int nmountcompatnames = __arraycount(mountcompatnames);
146
147static int
148open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
149{
150	int error;
151
152	fp->f_flag = flags & FMASK;
153	fp->f_type = DTYPE_VNODE;
154	fp->f_ops = &vnops;
155	fp->f_data = vp;
156
157	if (flags & (O_EXLOCK | O_SHLOCK)) {
158		struct flock lf;
159		int type;
160
161		lf.l_whence = SEEK_SET;
162		lf.l_start = 0;
163		lf.l_len = 0;
164		if (flags & O_EXLOCK)
165			lf.l_type = F_WRLCK;
166		else
167			lf.l_type = F_RDLCK;
168		type = F_FLOCK;
169		if ((flags & FNONBLOCK) == 0)
170			type |= F_WAIT;
171		VOP_UNLOCK(vp);
172		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
173		if (error) {
174			(void) vn_close(vp, fp->f_flag, fp->f_cred);
175			fd_abort(l->l_proc, fp, indx);
176			return error;
177		}
178		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
179		atomic_or_uint(&fp->f_flag, FHASLOCK);
180	}
181	if (flags & O_CLOEXEC)
182		fd_set_exclose(l, indx, true);
183	return 0;
184}
185
186static int
187mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
188    void *data, size_t *data_len)
189{
190	struct mount *mp;
191	int error = 0, saved_flags;
192
193	mp = vp->v_mount;
194	saved_flags = mp->mnt_flag;
195
196	/* We can operate only on VV_ROOT nodes. */
197	if ((vp->v_vflag & VV_ROOT) == 0) {
198		error = EINVAL;
199		goto out;
200	}
201
202	/*
203	 * We only allow the filesystem to be reloaded if it
204	 * is currently mounted read-only.  Additionally, we
205	 * prevent read-write to read-only downgrades.
206	 */
207	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
208	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
209	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
210		error = EOPNOTSUPP;	/* Needs translation */
211		goto out;
212	}
213
214	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
215	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
216	if (error)
217		goto out;
218
219	if (vfs_busy(mp, NULL)) {
220		error = EPERM;
221		goto out;
222	}
223
224	mutex_enter(&mp->mnt_updating);
225
226	mp->mnt_flag &= ~MNT_OP_FLAGS;
227	mp->mnt_flag |= flags & MNT_OP_FLAGS;
228
229	/*
230	 * Set the mount level flags.
231	 */
232	if (flags & MNT_RDONLY)
233		mp->mnt_flag |= MNT_RDONLY;
234	else if (mp->mnt_flag & MNT_RDONLY)
235		mp->mnt_iflag |= IMNT_WANTRDWR;
236	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
237	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
238	error = VFS_MOUNT(mp, path, data, data_len);
239
240	if (error && data != NULL) {
241		int error2;
242
243		/*
244		 * Update failed; let's try and see if it was an
245		 * export request.  For compat with 3.0 and earlier.
246		 */
247		error2 = vfs_hooks_reexport(mp, path, data);
248
249		/*
250		 * Only update error code if the export request was
251		 * understood but some problem occurred while
252		 * processing it.
253		 */
254		if (error2 != EJUSTRETURN)
255			error = error2;
256	}
257
258	if (mp->mnt_iflag & IMNT_WANTRDWR)
259		mp->mnt_flag &= ~MNT_RDONLY;
260	if (error)
261		mp->mnt_flag = saved_flags;
262	mp->mnt_flag &= ~MNT_OP_FLAGS;
263	mp->mnt_iflag &= ~IMNT_WANTRDWR;
264	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
265		if (mp->mnt_syncer == NULL)
266			error = vfs_allocate_syncvnode(mp);
267	} else {
268		if (mp->mnt_syncer != NULL)
269			vfs_deallocate_syncvnode(mp);
270	}
271	mutex_exit(&mp->mnt_updating);
272	vfs_unbusy(mp, false, NULL);
273
274	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
275	    (flags & MNT_EXTATTR)) {
276		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
277				   NULL, 0, NULL) != 0) {
278			printf("%s: failed to start extattr, error = %d",
279			       mp->mnt_stat.f_mntonname, error);
280			mp->mnt_flag &= ~MNT_EXTATTR;
281		}
282	}
283
284	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
285	    !(flags & MNT_EXTATTR)) {
286		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
287				   NULL, 0, NULL) != 0) {
288			printf("%s: failed to stop extattr, error = %d",
289			       mp->mnt_stat.f_mntonname, error);
290			mp->mnt_flag |= MNT_RDONLY;
291		}
292	}
293 out:
294	return (error);
295}
296
297static int
298mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
299{
300	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
301	int error;
302
303	/* Copy file-system type from userspace.  */
304	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
305	if (error) {
306		/*
307		 * Historically, filesystem types were identified by numbers.
308		 * If we get an integer for the filesystem type instead of a
309		 * string, we check to see if it matches one of the historic
310		 * filesystem types.
311		 */
312		u_long fsindex = (u_long)fstype;
313		if (fsindex >= nmountcompatnames ||
314		    mountcompatnames[fsindex] == NULL)
315			return ENODEV;
316		strlcpy(fstypename, mountcompatnames[fsindex],
317		    sizeof(fstypename));
318	}
319
320	/* Accept `ufs' as an alias for `ffs', for compatibility. */
321	if (strcmp(fstypename, "ufs") == 0)
322		fstypename[0] = 'f';
323
324	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
325		return 0;
326
327	/* If we can autoload a vfs module, try again */
328	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
329
330	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
331		return 0;
332
333	return ENODEV;
334}
335
336static int
337mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
338    void *data, size_t *data_len)
339{
340	struct mount *mp;
341	int error;
342
343	/* If MNT_GETARGS is specified, it should be the only flag. */
344	if (flags & ~MNT_GETARGS)
345		return EINVAL;
346
347	mp = vp->v_mount;
348
349	/* XXX: probably some notion of "can see" here if we want isolation. */
350	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
351	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
352	if (error)
353		return error;
354
355	if ((vp->v_vflag & VV_ROOT) == 0)
356		return EINVAL;
357
358	if (vfs_busy(mp, NULL))
359		return EPERM;
360
361	mutex_enter(&mp->mnt_updating);
362	mp->mnt_flag &= ~MNT_OP_FLAGS;
363	mp->mnt_flag |= MNT_GETARGS;
364	error = VFS_MOUNT(mp, path, data, data_len);
365	mp->mnt_flag &= ~MNT_OP_FLAGS;
366	mutex_exit(&mp->mnt_updating);
367
368	vfs_unbusy(mp, false, NULL);
369	return (error);
370}
371
372int
373sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
374{
375	/* {
376		syscallarg(const char *) type;
377		syscallarg(const char *) path;
378		syscallarg(int) flags;
379		syscallarg(void *) data;
380		syscallarg(size_t) data_len;
381	} */
382
383	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
384	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
385	    SCARG(uap, data_len), retval);
386}
387
388int
389do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
390    const char *path, int flags, void *data, enum uio_seg data_seg,
391    size_t data_len, register_t *retval)
392{
393	struct vnode *vp;
394	void *data_buf = data;
395	bool vfsopsrele = false;
396	size_t alloc_sz = 0;
397	int error;
398
399	/* XXX: The calling convention of this routine is totally bizarre */
400	if (vfsops)
401		vfsopsrele = true;
402
403	/*
404	 * Get vnode to be covered
405	 */
406	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
407	if (error != 0) {
408		vp = NULL;
409		goto done;
410	}
411
412	if (vfsops == NULL) {
413		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
414			vfsops = vp->v_mount->mnt_op;
415		} else {
416			/* 'type' is userspace */
417			error = mount_get_vfsops(type, &vfsops);
418			if (error != 0)
419				goto done;
420			vfsopsrele = true;
421		}
422	}
423
424	/*
425	 * We allow data to be NULL, even for userspace. Some fs's don't need
426	 * it. The others will handle NULL.
427	 */
428	if (data != NULL && data_seg == UIO_USERSPACE) {
429		if (data_len == 0) {
430			/* No length supplied, use default for filesystem */
431			data_len = vfsops->vfs_min_mount_data;
432
433			/*
434			 * Hopefully a longer buffer won't make copyin() fail.
435			 * For compatibility with 3.0 and earlier.
436			 */
437			if (flags & MNT_UPDATE
438			    && data_len < sizeof (struct mnt_export_args30))
439				data_len = sizeof (struct mnt_export_args30);
440		}
441		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
442			error = EINVAL;
443			goto done;
444		}
445		alloc_sz = data_len;
446		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
447
448		/* NFS needs the buffer even for mnt_getargs .... */
449		error = copyin(data, data_buf, data_len);
450		if (error != 0)
451			goto done;
452	}
453
454	if (flags & MNT_GETARGS) {
455		if (data_len == 0) {
456			error = EINVAL;
457			goto done;
458		}
459		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
460		if (error != 0)
461			goto done;
462		if (data_seg == UIO_USERSPACE)
463			error = copyout(data_buf, data, data_len);
464		*retval = data_len;
465	} else if (flags & MNT_UPDATE) {
466		error = mount_update(l, vp, path, flags, data_buf, &data_len);
467	} else {
468		/* Locking is handled internally in mount_domount(). */
469		KASSERT(vfsopsrele == true);
470		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
471		    &data_len);
472		vfsopsrele = false;
473	}
474
475    done:
476	if (vfsopsrele)
477		vfs_delref(vfsops);
478    	if (vp != NULL) {
479	    	vrele(vp);
480	}
481	if (data_buf != data)
482		kmem_free(data_buf, alloc_sz);
483	return (error);
484}
485
486/*
487 * Unmount a file system.
488 *
489 * Note: unmount takes a path to the vnode mounted on as argument,
490 * not special file (as before).
491 */
492/* ARGSUSED */
493int
494sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
495{
496	/* {
497		syscallarg(const char *) path;
498		syscallarg(int) flags;
499	} */
500	struct vnode *vp;
501	struct mount *mp;
502	int error;
503	struct pathbuf *pb;
504	struct nameidata nd;
505
506	error = pathbuf_copyin(SCARG(uap, path), &pb);
507	if (error) {
508		return error;
509	}
510
511	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
512	if ((error = namei(&nd)) != 0) {
513		pathbuf_destroy(pb);
514		return error;
515	}
516	vp = nd.ni_vp;
517	pathbuf_destroy(pb);
518
519	mp = vp->v_mount;
520	atomic_inc_uint(&mp->mnt_refcnt);
521	VOP_UNLOCK(vp);
522
523	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
524	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
525	if (error) {
526		vrele(vp);
527		vfs_destroy(mp);
528		return (error);
529	}
530
531	/*
532	 * Don't allow unmounting the root file system.
533	 */
534	if (mp->mnt_flag & MNT_ROOTFS) {
535		vrele(vp);
536		vfs_destroy(mp);
537		return (EINVAL);
538	}
539
540	/*
541	 * Must be the root of the filesystem
542	 */
543	if ((vp->v_vflag & VV_ROOT) == 0) {
544		vrele(vp);
545		vfs_destroy(mp);
546		return (EINVAL);
547	}
548
549	vrele(vp);
550	error = dounmount(mp, SCARG(uap, flags), l);
551	vfs_destroy(mp);
552	return error;
553}
554
555/*
556 * Sync each mounted filesystem.
557 */
#ifdef DEBUG
/* When non-zero, do_sys_sync() calls vfs_bufstats() once it has finished. */
int syncprt = 0;
/* Pairs the name "syncprt" with the address of the variable above. */
struct ctldebug debug0 = { "syncprt", &syncprt };
#endif
562
563void
564do_sys_sync(struct lwp *l)
565{
566	struct mount *mp, *nmp;
567	int asyncflag;
568
569	mutex_enter(&mountlist_lock);
570	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
571	     mp = nmp) {
572		if (vfs_busy(mp, &nmp)) {
573			continue;
574		}
575		mutex_enter(&mp->mnt_updating);
576		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
577			asyncflag = mp->mnt_flag & MNT_ASYNC;
578			mp->mnt_flag &= ~MNT_ASYNC;
579			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
580			if (asyncflag)
581				 mp->mnt_flag |= MNT_ASYNC;
582		}
583		mutex_exit(&mp->mnt_updating);
584		vfs_unbusy(mp, false, &nmp);
585	}
586	mutex_exit(&mountlist_lock);
587#ifdef DEBUG
588	if (syncprt)
589		vfs_bufstats();
590#endif /* DEBUG */
591}
592
593/* ARGSUSED */
594int
595sys_sync(struct lwp *l, const void *v, register_t *retval)
596{
597	do_sys_sync(l);
598	return (0);
599}
600
601
602/*
603 * Access or change filesystem quotas.
604 *
605 * (this is really 14 different calls bundled into one)
606 */
607
608static int
609do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
610{
611	struct quotastat info_k;
612	int error;
613
614	/* ensure any padding bytes are cleared */
615	memset(&info_k, 0, sizeof(info_k));
616
617	error = vfs_quotactl_stat(mp, &info_k);
618	if (error) {
619		return error;
620	}
621
622	return copyout(&info_k, info_u, sizeof(info_k));
623}
624
625static int
626do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
627    struct quotaidtypestat *info_u)
628{
629	struct quotaidtypestat info_k;
630	int error;
631
632	/* ensure any padding bytes are cleared */
633	memset(&info_k, 0, sizeof(info_k));
634
635	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
636	if (error) {
637		return error;
638	}
639
640	return copyout(&info_k, info_u, sizeof(info_k));
641}
642
643static int
644do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
645    struct quotaobjtypestat *info_u)
646{
647	struct quotaobjtypestat info_k;
648	int error;
649
650	/* ensure any padding bytes are cleared */
651	memset(&info_k, 0, sizeof(info_k));
652
653	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
654	if (error) {
655		return error;
656	}
657
658	return copyout(&info_k, info_u, sizeof(info_k));
659}
660
661static int
662do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
663    struct quotaval *val_u)
664{
665	struct quotakey key_k;
666	struct quotaval val_k;
667	int error;
668
669	/* ensure any padding bytes are cleared */
670	memset(&val_k, 0, sizeof(val_k));
671
672	error = copyin(key_u, &key_k, sizeof(key_k));
673	if (error) {
674		return error;
675	}
676
677	error = vfs_quotactl_get(mp, &key_k, &val_k);
678	if (error) {
679		return error;
680	}
681
682	return copyout(&val_k, val_u, sizeof(val_k));
683}
684
685static int
686do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
687    const struct quotaval *val_u)
688{
689	struct quotakey key_k;
690	struct quotaval val_k;
691	int error;
692
693	error = copyin(key_u, &key_k, sizeof(key_k));
694	if (error) {
695		return error;
696	}
697
698	error = copyin(val_u, &val_k, sizeof(val_k));
699	if (error) {
700		return error;
701	}
702
703	return vfs_quotactl_put(mp, &key_k, &val_k);
704}
705
706static int
707do_sys_quotactl_delete(struct mount *mp, const struct quotakey *key_u)
708{
709	struct quotakey key_k;
710	int error;
711
712	error = copyin(key_u, &key_k, sizeof(key_k));
713	if (error) {
714		return error;
715	}
716
717	return vfs_quotactl_delete(mp, &key_k);
718}
719
720static int
721do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
722{
723	struct quotakcursor cursor_k;
724	int error;
725
726	/* ensure any padding bytes are cleared */
727	memset(&cursor_k, 0, sizeof(cursor_k));
728
729	error = vfs_quotactl_cursoropen(mp, &cursor_k);
730	if (error) {
731		return error;
732	}
733
734	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
735}
736
737static int
738do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
739{
740	struct quotakcursor cursor_k;
741	int error;
742
743	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
744	if (error) {
745		return error;
746	}
747
748	return vfs_quotactl_cursorclose(mp, &cursor_k);
749}
750
751static int
752do_sys_quotactl_cursorskipidtype(struct mount *mp,
753    struct quotakcursor *cursor_u, int idtype)
754{
755	struct quotakcursor cursor_k;
756	int error;
757
758	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
759	if (error) {
760		return error;
761	}
762
763	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
764	if (error) {
765		return error;
766	}
767
768	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
769}
770
771static int
772do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
773    struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
774    unsigned *ret_u)
775{
776#define CGET_STACK_MAX 8
777	struct quotakcursor cursor_k;
778	struct quotakey stackkeys[CGET_STACK_MAX];
779	struct quotaval stackvals[CGET_STACK_MAX];
780	struct quotakey *keys_k;
781	struct quotaval *vals_k;
782	unsigned ret_k;
783	int error;
784
785	if (maxnum > 128) {
786		maxnum = 128;
787	}
788
789	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
790	if (error) {
791		return error;
792	}
793
794	if (maxnum <= CGET_STACK_MAX) {
795		keys_k = stackkeys;
796		vals_k = stackvals;
797		/* ensure any padding bytes are cleared */
798		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
799		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
800	} else {
801		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
802		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
803	}
804
805	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
806				       &ret_k);
807	if (error) {
808		goto fail;
809	}
810
811	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
812	if (error) {
813		goto fail;
814	}
815
816	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
817	if (error) {
818		goto fail;
819	}
820
821	error = copyout(&ret_k, ret_u, sizeof(ret_k));
822	if (error) {
823		goto fail;
824	}
825
826	/* do last to maximize the chance of being able to recover a failure */
827	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
828
829fail:
830	if (keys_k != stackkeys) {
831		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
832	}
833	if (vals_k != stackvals) {
834		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
835	}
836	return error;
837}
838
839static int
840do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
841    int *ret_u)
842{
843	struct quotakcursor cursor_k;
844	int ret_k;
845	int error;
846
847	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
848	if (error) {
849		return error;
850	}
851
852	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
853	if (error) {
854		return error;
855	}
856
857	error = copyout(&ret_k, ret_u, sizeof(ret_k));
858	if (error) {
859		return error;
860	}
861
862	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
863}
864
865static int
866do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
867{
868	struct quotakcursor cursor_k;
869	int error;
870
871	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
872	if (error) {
873		return error;
874	}
875
876	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
877	if (error) {
878		return error;
879	}
880
881	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
882}
883
884static int
885do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
886{
887	char *path_k;
888	int error;
889
890	/* XXX this should probably be a struct pathbuf */
891	path_k = PNBUF_GET();
892	error = copyin(path_u, path_k, PATH_MAX);
893	if (error) {
894		PNBUF_PUT(path_k);
895		return error;
896	}
897
898	error = vfs_quotactl_quotaon(mp, idtype, path_k);
899
900	PNBUF_PUT(path_k);
901	return error;
902}
903
/*
 * QUOTACTL_QUOTAOFF: nothing to copy in or out; simply forwards to
 * vfs_quotactl_quotaoff() and returns its result.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int rv;

	rv = vfs_quotactl_quotaoff(mp, idtype);
	return rv;
}
909
910int
911do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
912{
913	struct mount *mp;
914	struct vnode *vp;
915	int error;
916
917	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
918	if (error != 0)
919		return (error);
920	mp = vp->v_mount;
921
922	switch (args->qc_op) {
923	    case QUOTACTL_STAT:
924		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
925		break;
926	    case QUOTACTL_IDTYPESTAT:
927		error = do_sys_quotactl_idtypestat(mp,
928				args->u.idtypestat.qc_idtype,
929				args->u.idtypestat.qc_info);
930		break;
931	    case QUOTACTL_OBJTYPESTAT:
932		error = do_sys_quotactl_objtypestat(mp,
933				args->u.objtypestat.qc_objtype,
934				args->u.objtypestat.qc_info);
935		break;
936	    case QUOTACTL_GET:
937		error = do_sys_quotactl_get(mp,
938				args->u.get.qc_key,
939				args->u.get.qc_val);
940		break;
941	    case QUOTACTL_PUT:
942		error = do_sys_quotactl_put(mp,
943				args->u.put.qc_key,
944				args->u.put.qc_val);
945		break;
946	    case QUOTACTL_DELETE:
947		error = do_sys_quotactl_delete(mp, args->u.delete.qc_key);
948		break;
949	    case QUOTACTL_CURSOROPEN:
950		error = do_sys_quotactl_cursoropen(mp,
951				args->u.cursoropen.qc_cursor);
952		break;
953	    case QUOTACTL_CURSORCLOSE:
954		error = do_sys_quotactl_cursorclose(mp,
955				args->u.cursorclose.qc_cursor);
956		break;
957	    case QUOTACTL_CURSORSKIPIDTYPE:
958		error = do_sys_quotactl_cursorskipidtype(mp,
959				args->u.cursorskipidtype.qc_cursor,
960				args->u.cursorskipidtype.qc_idtype);
961		break;
962	    case QUOTACTL_CURSORGET:
963		error = do_sys_quotactl_cursorget(mp,
964				args->u.cursorget.qc_cursor,
965				args->u.cursorget.qc_keys,
966				args->u.cursorget.qc_vals,
967				args->u.cursorget.qc_maxnum,
968				args->u.cursorget.qc_ret);
969		break;
970	    case QUOTACTL_CURSORATEND:
971		error = do_sys_quotactl_cursoratend(mp,
972				args->u.cursoratend.qc_cursor,
973				args->u.cursoratend.qc_ret);
974		break;
975	    case QUOTACTL_CURSORREWIND:
976		error = do_sys_quotactl_cursorrewind(mp,
977				args->u.cursorrewind.qc_cursor);
978		break;
979	    case QUOTACTL_QUOTAON:
980		error = do_sys_quotactl_quotaon(mp,
981				args->u.quotaon.qc_idtype,
982				args->u.quotaon.qc_quotafile);
983		break;
984	    case QUOTACTL_QUOTAOFF:
985		error = do_sys_quotactl_quotaoff(mp,
986				args->u.quotaoff.qc_idtype);
987		break;
988	    default:
989		error = EINVAL;
990		break;
991	}
992
993	vrele(vp);
994	return error;
995}
996
997/* ARGSUSED */
998int
999sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1000    register_t *retval)
1001{
1002	/* {
1003		syscallarg(const char *) path;
1004		syscallarg(struct quotactl_args *) args;
1005	} */
1006	struct quotactl_args args;
1007	int error;
1008
1009	error = copyin(SCARG(uap, args), &args, sizeof(args));
1010	if (error) {
1011		return error;
1012	}
1013
1014	return do_sys_quotactl(SCARG(uap, path), &args);
1015}
1016
1017int
1018dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1019    int root)
1020{
1021	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1022	int error = 0;
1023
1024	/*
1025	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1026	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1027	 * overrides MNT_NOWAIT.
1028	 */
1029	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1030	    (flags != MNT_WAIT && flags != 0)) {
1031		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1032		goto done;
1033	}
1034
1035	/* Get the filesystem stats now */
1036	memset(sp, 0, sizeof(*sp));
1037	if ((error = VFS_STATVFS(mp, sp)) != 0) {
1038		return error;
1039	}
1040
1041	if (cwdi->cwdi_rdir == NULL)
1042		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1043done:
1044	if (cwdi->cwdi_rdir != NULL) {
1045		size_t len;
1046		char *bp;
1047		char c;
1048		char *path = PNBUF_GET();
1049
1050		bp = path + MAXPATHLEN;
1051		*--bp = '\0';
1052		rw_enter(&cwdi->cwdi_lock, RW_READER);
1053		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1054		    MAXPATHLEN / 2, 0, l);
1055		rw_exit(&cwdi->cwdi_lock);
1056		if (error) {
1057			PNBUF_PUT(path);
1058			return error;
1059		}
1060		len = strlen(bp);
1061		if (len != 1) {
1062			/*
1063			 * for mount points that are below our root, we can see
1064			 * them, so we fix up the pathname and return them. The
1065			 * rest we cannot see, so we don't allow viewing the
1066			 * data.
1067			 */
1068			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1069			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1070				(void)strlcpy(sp->f_mntonname,
1071				    c == '\0' ? "/" : &sp->f_mntonname[len],
1072				    sizeof(sp->f_mntonname));
1073			} else {
1074				if (root)
1075					(void)strlcpy(sp->f_mntonname, "/",
1076					    sizeof(sp->f_mntonname));
1077				else
1078					error = EPERM;
1079			}
1080		}
1081		PNBUF_PUT(path);
1082	}
1083	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1084	return error;
1085}
1086
1087/*
1088 * Get filesystem statistics by path.
1089 */
1090int
1091do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1092{
1093	struct mount *mp;
1094	int error;
1095	struct vnode *vp;
1096
1097	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1098	if (error != 0)
1099		return error;
1100	mp = vp->v_mount;
1101	error = dostatvfs(mp, sb, l, flags, 1);
1102	vrele(vp);
1103	return error;
1104}
1105
1106/* ARGSUSED */
1107int
1108sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
1109{
1110	/* {
1111		syscallarg(const char *) path;
1112		syscallarg(struct statvfs *) buf;
1113		syscallarg(int) flags;
1114	} */
1115	struct statvfs *sb;
1116	int error;
1117
1118	sb = STATVFSBUF_GET();
1119	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1120	if (error == 0)
1121		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1122	STATVFSBUF_PUT(sb);
1123	return error;
1124}
1125
1126/*
1127 * Get filesystem statistics by fd.
1128 */
1129int
1130do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1131{
1132	file_t *fp;
1133	struct mount *mp;
1134	int error;
1135
1136	/* fd_getvnode() will use the descriptor for us */
1137	if ((error = fd_getvnode(fd, &fp)) != 0)
1138		return (error);
1139	mp = ((struct vnode *)fp->f_data)->v_mount;
1140	error = dostatvfs(mp, sb, curlwp, flags, 1);
1141	fd_putfile(fd);
1142	return error;
1143}
1144
1145/* ARGSUSED */
1146int
1147sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
1148{
1149	/* {
1150		syscallarg(int) fd;
1151		syscallarg(struct statvfs *) buf;
1152		syscallarg(int) flags;
1153	} */
1154	struct statvfs *sb;
1155	int error;
1156
1157	sb = STATVFSBUF_GET();
1158	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1159	if (error == 0)
1160		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1161	STATVFSBUF_PUT(sb);
1162	return error;
1163}
1164
1165
1166/*
1167 * Get statistics on all filesystems.
1168 */
1169int
1170do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1171    int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1172    register_t *retval)
1173{
1174	int root = 0;
1175	struct proc *p = l->l_proc;
1176	struct mount *mp, *nmp;
1177	struct statvfs *sb;
1178	size_t count, maxcount;
1179	int error = 0;
1180
1181	sb = STATVFSBUF_GET();
1182	maxcount = bufsize / entry_sz;
1183	mutex_enter(&mountlist_lock);
1184	count = 0;
1185	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1186	     mp = nmp) {
1187		if (vfs_busy(mp, &nmp)) {
1188			continue;
1189		}
1190		if (sfsp && count < maxcount) {
1191			error = dostatvfs(mp, sb, l, flags, 0);
1192			if (error) {
1193				vfs_unbusy(mp, false, &nmp);
1194				error = 0;
1195				continue;
1196			}
1197			error = copyfn(sb, sfsp, entry_sz);
1198			if (error) {
1199				vfs_unbusy(mp, false, NULL);
1200				goto out;
1201			}
1202			sfsp = (char *)sfsp + entry_sz;
1203			root |= strcmp(sb->f_mntonname, "/") == 0;
1204		}
1205		count++;
1206		vfs_unbusy(mp, false, &nmp);
1207	}
1208	mutex_exit(&mountlist_lock);
1209
1210	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1211		/*
1212		 * fake a root entry
1213		 */
1214		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1215		    sb, l, flags, 1);
1216		if (error != 0)
1217			goto out;
1218		if (sfsp) {
1219			error = copyfn(sb, sfsp, entry_sz);
1220			if (error != 0)
1221				goto out;
1222		}
1223		count++;
1224	}
1225	if (sfsp && count > maxcount)
1226		*retval = maxcount;
1227	else
1228		*retval = count;
1229out:
1230	STATVFSBUF_PUT(sb);
1231	return error;
1232}
1233
/*
 * getvfsstat system call: thin wrapper that passes the user buffer, its
 * size and the flags to do_sys_getvfsstat(), with copyout() as the copy
 * routine and sizeof(struct statvfs) as the per-entry size.
 */
int
sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct statvfs *) buf;
		syscallarg(size_t) bufsize;
		syscallarg(int) flags;
	} */

	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
}
1246
/*
 * Change current working directory to a given file descriptor.
 *
 * The descriptor must refer to a directory on which the caller has
 * VEXEC access (ENOTDIR / the VOP_ACCESS error otherwise).  If other
 * filesystems are mounted on that directory, the root of the topmost
 * one is used instead.  When the process has a root directory set and
 * the target is not under it, EPERM is returned.
 */
/* ARGSUSED */
int
sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
	} */
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi;
	struct vnode *vp, *tdp;
	struct mount *mp;
	file_t *fp;
	int error, fd;

	/* fd_getvnode() will use the descriptor for us */
	fd = SCARG(uap, fd);
	if ((error = fd_getvnode(fd, &fp)) != 0)
		return (error);
	vp = fp->f_data;

	/*
	 * Take our own reference: vp either ends up stored in cwdi_cdir
	 * or is released again on the error paths below.
	 */
	vref(vp);
	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type != VDIR)
		error = ENOTDIR;
	else
		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
	if (error) {
		vput(vp);
		goto out;
	}
	/*
	 * While something is mounted here, replace vp with the root vnode
	 * of the mounted filesystem.  The covered vnode is released with
	 * vput() before VFS_ROOT() is called; the mount is kept busy
	 * across that call.
	 * NOTE(review): VFS_ROOT() presumably returns tdp locked and
	 * referenced, matching the VOP_UNLOCK() below -- confirm.
	 */
	while ((mp = vp->v_mountedhere) != NULL) {
		error = vfs_busy(mp, NULL);
		vput(vp);
		if (error != 0)
			goto out;
		error = VFS_ROOT(mp, &tdp);
		vfs_unbusy(mp, false, NULL);
		if (error)
			goto out;
		vp = tdp;
	}
	VOP_UNLOCK(vp);

	/*
	 * Disallow changing to a directory not under the process's
	 * current root directory (if there is one).
	 */
	cwdi = p->p_cwdi;
	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
		vrele(vp);
		error = EPERM;	/* operation not permitted */
	} else {
		/* Swap in the new directory, dropping the old reference. */
		vrele(cwdi->cwdi_cdir);
		cwdi->cwdi_cdir = vp;
	}
	rw_exit(&cwdi->cwdi_lock);

 out:
	/* Release the descriptor use taken by fd_getvnode(). */
	fd_putfile(fd);
	return (error);
}
1312
1313/*
1314 * Change this process's notion of the root directory to a given file
1315 * descriptor.
1316 */
1317int
1318sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1319{
1320	struct proc *p = l->l_proc;
1321	struct vnode	*vp;
1322	file_t	*fp;
1323	int		 error, fd = SCARG(uap, fd);
1324
1325	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1326 	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1327		return error;
1328	/* fd_getvnode() will use the descriptor for us */
1329	if ((error = fd_getvnode(fd, &fp)) != 0)
1330		return error;
1331	vp = fp->f_data;
1332	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1333	if (vp->v_type != VDIR)
1334		error = ENOTDIR;
1335	else
1336		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1337	VOP_UNLOCK(vp);
1338	if (error)
1339		goto out;
1340	vref(vp);
1341
1342	change_root(p->p_cwdi, vp, l);
1343
1344 out:
1345	fd_putfile(fd);
1346	return (error);
1347}
1348
1349/*
1350 * Change current working directory (``.'').
1351 */
1352/* ARGSUSED */
1353int
1354sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1355{
1356	/* {
1357		syscallarg(const char *) path;
1358	} */
1359	struct proc *p = l->l_proc;
1360	struct cwdinfo *cwdi;
1361	int error;
1362	struct vnode *vp;
1363
1364	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1365				  &vp, l)) != 0)
1366		return (error);
1367	cwdi = p->p_cwdi;
1368	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1369	vrele(cwdi->cwdi_cdir);
1370	cwdi->cwdi_cdir = vp;
1371	rw_exit(&cwdi->cwdi_lock);
1372	return (0);
1373}
1374
1375/*
1376 * Change notion of root (``/'') directory.
1377 */
1378/* ARGSUSED */
1379int
1380sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1381{
1382	/* {
1383		syscallarg(const char *) path;
1384	} */
1385	struct proc *p = l->l_proc;
1386	int error;
1387	struct vnode *vp;
1388
1389	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1390	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1391		return (error);
1392	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1393				  &vp, l)) != 0)
1394		return (error);
1395
1396	change_root(p->p_cwdi, vp, l);
1397
1398	return (0);
1399}
1400
1401/*
1402 * Common routine for chroot and fchroot.
1403 * NB: callers need to properly authorize the change root operation.
1404 */
1405void
1406change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1407{
1408
1409	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1410	if (cwdi->cwdi_rdir != NULL)
1411		vrele(cwdi->cwdi_rdir);
1412	cwdi->cwdi_rdir = vp;
1413
1414	/*
1415	 * Prevent escaping from chroot by putting the root under
1416	 * the working directory.  Silently chdir to / if we aren't
1417	 * already there.
1418	 */
1419	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1420		/*
1421		 * XXX would be more failsafe to change directory to a
1422		 * deadfs node here instead
1423		 */
1424		vrele(cwdi->cwdi_cdir);
1425		vref(vp);
1426		cwdi->cwdi_cdir = vp;
1427	}
1428	rw_exit(&cwdi->cwdi_lock);
1429}
1430
1431/*
1432 * Common routine for chroot and chdir.
1433 * XXX "where" should be enum uio_seg
1434 */
1435int
1436chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1437{
1438	struct pathbuf *pb;
1439	struct nameidata nd;
1440	int error;
1441
1442	error = pathbuf_maybe_copyin(path, where, &pb);
1443	if (error) {
1444		return error;
1445	}
1446	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1447	if ((error = namei(&nd)) != 0) {
1448		pathbuf_destroy(pb);
1449		return error;
1450	}
1451	*vpp = nd.ni_vp;
1452	pathbuf_destroy(pb);
1453
1454	if ((*vpp)->v_type != VDIR)
1455		error = ENOTDIR;
1456	else
1457		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1458
1459	if (error)
1460		vput(*vpp);
1461	else
1462		VOP_UNLOCK(*vpp);
1463	return (error);
1464}
1465
1466/*
1467 * Internals of sys_open - path has already been converted into a pathbuf
1468 * (so we can easily reuse this function from other parts of the kernel,
1469 * like posix_spawn post-processing).
1470 */
1471static int
1472do_open(lwp_t *l, struct pathbuf *pb, int open_flags, int open_mode, int *fd)
1473{
1474	struct proc *p = l->l_proc;
1475	struct cwdinfo *cwdi = p->p_cwdi;
1476	file_t *fp;
1477	struct vnode *vp;
1478	int flags, cmode;
1479	int indx, error;
1480	struct nameidata nd;
1481
1482	flags = FFLAGS(open_flags);
1483	if ((flags & (FREAD | FWRITE)) == 0)
1484		return EINVAL;
1485
1486	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1487		return error;
1488	}
1489
1490	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1491	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1492	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1493	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1494	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1495		fd_abort(p, fp, indx);
1496		if ((error == EDUPFD || error == EMOVEFD) &&
1497		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1498		    (error =
1499			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1500			*fd = indx;
1501			return 0;
1502		}
1503		if (error == ERESTART)
1504			error = EINTR;
1505		return error;
1506	}
1507
1508	l->l_dupfd = 0;
1509	vp = nd.ni_vp;
1510
1511	if ((error = open_setfp(l, fp, vp, indx, flags)))
1512		return error;
1513
1514	VOP_UNLOCK(vp);
1515	*fd = indx;
1516	fd_affix(p, fp, indx);
1517	return 0;
1518}
1519
1520int
1521fd_open(const char *path, int open_flags, int open_mode, int *fd)
1522{
1523	struct pathbuf *pb;
1524	int error, oflags;
1525
1526	oflags = FFLAGS(open_flags);
1527	if ((oflags & (FREAD | FWRITE)) == 0)
1528		return EINVAL;
1529
1530	pb = pathbuf_create(path);
1531	if (pb == NULL)
1532		return ENOMEM;
1533
1534	error = do_open(curlwp, pb, open_flags, open_mode, fd);
1535	pathbuf_destroy(pb);
1536
1537	return error;
1538}
1539
1540/*
1541 * Check permissions, allocate an open file structure,
1542 * and call the device open routine if any.
1543 */
1544int
1545sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1546{
1547	/* {
1548		syscallarg(const char *) path;
1549		syscallarg(int) flags;
1550		syscallarg(int) mode;
1551	} */
1552	struct pathbuf *pb;
1553	int result, flags, error;
1554
1555	flags = FFLAGS(SCARG(uap, flags));
1556	if ((flags & (FREAD | FWRITE)) == 0)
1557		return EINVAL;
1558
1559	error = pathbuf_copyin(SCARG(uap, path), &pb);
1560	if (error)
1561		return error;
1562
1563	error = do_open(l, pb, SCARG(uap, flags), SCARG(uap, mode), &result);
1564	pathbuf_destroy(pb);
1565
1566	*retval = result;
1567	return error;
1568}
1569
/*
 * openat system call: not implemented here; always fails with ENOSYS.
 */
int
sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const char *) path;
		syscallarg(int) flags;
		syscallarg(int) mode;
	} */

	return ENOSYS;
}
1582
1583static void
1584vfs__fhfree(fhandle_t *fhp)
1585{
1586	size_t fhsize;
1587
1588	if (fhp == NULL) {
1589		return;
1590	}
1591	fhsize = FHANDLE_SIZE(fhp);
1592	kmem_free(fhp, fhsize);
1593}
1594
1595/*
1596 * vfs_composefh: compose a filehandle.
1597 */
1598
1599int
1600vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1601{
1602	struct mount *mp;
1603	struct fid *fidp;
1604	int error;
1605	size_t needfhsize;
1606	size_t fidsize;
1607
1608	mp = vp->v_mount;
1609	fidp = NULL;
1610	if (*fh_size < FHANDLE_SIZE_MIN) {
1611		fidsize = 0;
1612	} else {
1613		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1614		if (fhp != NULL) {
1615			memset(fhp, 0, *fh_size);
1616			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1617			fidp = &fhp->fh_fid;
1618		}
1619	}
1620	error = VFS_VPTOFH(vp, fidp, &fidsize);
1621	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1622	if (error == 0 && *fh_size < needfhsize) {
1623		error = E2BIG;
1624	}
1625	*fh_size = needfhsize;
1626	return error;
1627}
1628
1629int
1630vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1631{
1632	struct mount *mp;
1633	fhandle_t *fhp;
1634	size_t fhsize;
1635	size_t fidsize;
1636	int error;
1637
1638	*fhpp = NULL;
1639	mp = vp->v_mount;
1640	fidsize = 0;
1641	error = VFS_VPTOFH(vp, NULL, &fidsize);
1642	KASSERT(error != 0);
1643	if (error != E2BIG) {
1644		goto out;
1645	}
1646	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1647	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1648	if (fhp == NULL) {
1649		error = ENOMEM;
1650		goto out;
1651	}
1652	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1653	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1654	if (error == 0) {
1655		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1656		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1657		*fhpp = fhp;
1658	} else {
1659		kmem_free(fhp, fhsize);
1660	}
1661out:
1662	return error;
1663}
1664
/*
 * vfs_composefh_free: release a handle obtained from
 * vfs_composefh_alloc().  Simply forwards to vfs__fhfree().
 */
void
vfs_composefh_free(fhandle_t *fhp)
{

	vfs__fhfree(fhp);
}
1671
1672/*
1673 * vfs_fhtovp: lookup a vnode by a filehandle.
1674 */
1675
1676int
1677vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1678{
1679	struct mount *mp;
1680	int error;
1681
1682	*vpp = NULL;
1683	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1684	if (mp == NULL) {
1685		error = ESTALE;
1686		goto out;
1687	}
1688	if (mp->mnt_op->vfs_fhtovp == NULL) {
1689		error = EOPNOTSUPP;
1690		goto out;
1691	}
1692	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1693out:
1694	return error;
1695}
1696
1697/*
1698 * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1699 * the needed size.
1700 */
1701
1702int
1703vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1704{
1705	fhandle_t *fhp;
1706	int error;
1707
1708	*fhpp = NULL;
1709	if (fhsize > FHANDLE_SIZE_MAX) {
1710		return EINVAL;
1711	}
1712	if (fhsize < FHANDLE_SIZE_MIN) {
1713		return EINVAL;
1714	}
1715again:
1716	fhp = kmem_alloc(fhsize, KM_SLEEP);
1717	if (fhp == NULL) {
1718		return ENOMEM;
1719	}
1720	error = copyin(ufhp, fhp, fhsize);
1721	if (error == 0) {
1722		/* XXX this check shouldn't be here */
1723		if (FHANDLE_SIZE(fhp) == fhsize) {
1724			*fhpp = fhp;
1725			return 0;
1726		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1727			/*
1728			 * a kludge for nfsv2 padded handles.
1729			 */
1730			size_t sz;
1731
1732			sz = FHANDLE_SIZE(fhp);
1733			kmem_free(fhp, fhsize);
1734			fhsize = sz;
1735			goto again;
1736		} else {
1737			/*
1738			 * userland told us wrong size.
1739			 */
1740		    	error = EINVAL;
1741		}
1742	}
1743	kmem_free(fhp, fhsize);
1744	return error;
1745}
1746
/*
 * vfs_copyinfh_free: release a handle obtained from
 * vfs_copyinfh_alloc().  Simply forwards to vfs__fhfree().
 */
void
vfs_copyinfh_free(fhandle_t *fhp)
{

	vfs__fhfree(fhp);
}
1753
1754/*
1755 * Get file handle system call
1756 */
1757int
1758sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1759{
1760	/* {
1761		syscallarg(char *) fname;
1762		syscallarg(fhandle_t *) fhp;
1763		syscallarg(size_t *) fh_size;
1764	} */
1765	struct vnode *vp;
1766	fhandle_t *fh;
1767	int error;
1768	struct pathbuf *pb;
1769	struct nameidata nd;
1770	size_t sz;
1771	size_t usz;
1772
1773	/*
1774	 * Must be super user
1775	 */
1776	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1777	    0, NULL, NULL, NULL);
1778	if (error)
1779		return (error);
1780
1781	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1782	if (error) {
1783		return error;
1784	}
1785	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1786	error = namei(&nd);
1787	if (error) {
1788		pathbuf_destroy(pb);
1789		return error;
1790	}
1791	vp = nd.ni_vp;
1792	pathbuf_destroy(pb);
1793
1794	error = vfs_composefh_alloc(vp, &fh);
1795	vput(vp);
1796	if (error != 0) {
1797		goto out;
1798	}
1799	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1800	if (error != 0) {
1801		goto out;
1802	}
1803	sz = FHANDLE_SIZE(fh);
1804	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1805	if (error != 0) {
1806		goto out;
1807	}
1808	if (usz >= sz) {
1809		error = copyout(fh, SCARG(uap, fhp), sz);
1810	} else {
1811		error = E2BIG;
1812	}
1813out:
1814	vfs_composefh_free(fh);
1815	return (error);
1816}
1817
1818/*
1819 * Open a file given a file handle.
1820 *
1821 * Check permissions, allocate an open file structure,
1822 * and call the device open routine if any.
1823 */
1824
1825int
1826dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1827    register_t *retval)
1828{
1829	file_t *fp;
1830	struct vnode *vp = NULL;
1831	kauth_cred_t cred = l->l_cred;
1832	file_t *nfp;
1833	int indx, error = 0;
1834	struct vattr va;
1835	fhandle_t *fh;
1836	int flags;
1837	proc_t *p;
1838
1839	p = curproc;
1840
1841	/*
1842	 * Must be super user
1843	 */
1844	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1845	    0, NULL, NULL, NULL)))
1846		return (error);
1847
1848	flags = FFLAGS(oflags);
1849	if ((flags & (FREAD | FWRITE)) == 0)
1850		return (EINVAL);
1851	if ((flags & O_CREAT))
1852		return (EINVAL);
1853	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1854		return (error);
1855	fp = nfp;
1856	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1857	if (error != 0) {
1858		goto bad;
1859	}
1860	error = vfs_fhtovp(fh, &vp);
1861	if (error != 0) {
1862		goto bad;
1863	}
1864
1865	/* Now do an effective vn_open */
1866
1867	if (vp->v_type == VSOCK) {
1868		error = EOPNOTSUPP;
1869		goto bad;
1870	}
1871	error = vn_openchk(vp, cred, flags);
1872	if (error != 0)
1873		goto bad;
1874	if (flags & O_TRUNC) {
1875		VOP_UNLOCK(vp);			/* XXX */
1876		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1877		vattr_null(&va);
1878		va.va_size = 0;
1879		error = VOP_SETATTR(vp, &va, cred);
1880		if (error)
1881			goto bad;
1882	}
1883	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1884		goto bad;
1885	if (flags & FWRITE) {
1886		mutex_enter(vp->v_interlock);
1887		vp->v_writecount++;
1888		mutex_exit(vp->v_interlock);
1889	}
1890
1891	/* done with modified vn_open, now finish what sys_open does. */
1892	if ((error = open_setfp(l, fp, vp, indx, flags)))
1893		return error;
1894
1895	VOP_UNLOCK(vp);
1896	*retval = indx;
1897	fd_affix(p, fp, indx);
1898	vfs_copyinfh_free(fh);
1899	return (0);
1900
1901bad:
1902	fd_abort(p, fp, indx);
1903	if (vp != NULL)
1904		vput(vp);
1905	vfs_copyinfh_free(fh);
1906	return (error);
1907}
1908
/*
 * fhopen system call: thin wrapper that unpacks the syscall arguments
 * and forwards them to dofhopen().
 */
int
sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) fhp;
		syscallarg(size_t) fh_size;
		syscallarg(int) flags;
	} */

	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
	    SCARG(uap, flags), retval);
}
1921
1922int
1923do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1924{
1925	int error;
1926	fhandle_t *fh;
1927	struct vnode *vp;
1928
1929	/*
1930	 * Must be super user
1931	 */
1932	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1933	    0, NULL, NULL, NULL)))
1934		return (error);
1935
1936	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1937	if (error != 0)
1938		return error;
1939
1940	error = vfs_fhtovp(fh, &vp);
1941	vfs_copyinfh_free(fh);
1942	if (error != 0)
1943		return error;
1944
1945	error = vn_stat(vp, sb);
1946	vput(vp);
1947	return error;
1948}
1949
1950
1951/* ARGSUSED */
1952int
1953sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1954{
1955	/* {
1956		syscallarg(const void *) fhp;
1957		syscallarg(size_t) fh_size;
1958		syscallarg(struct stat *) sb;
1959	} */
1960	struct stat sb;
1961	int error;
1962
1963	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1964	if (error)
1965		return error;
1966	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1967}
1968
1969int
1970do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1971    int flags)
1972{
1973	fhandle_t *fh;
1974	struct mount *mp;
1975	struct vnode *vp;
1976	int error;
1977
1978	/*
1979	 * Must be super user
1980	 */
1981	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1982	    0, NULL, NULL, NULL)))
1983		return error;
1984
1985	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1986	if (error != 0)
1987		return error;
1988
1989	error = vfs_fhtovp(fh, &vp);
1990	vfs_copyinfh_free(fh);
1991	if (error != 0)
1992		return error;
1993
1994	mp = vp->v_mount;
1995	error = dostatvfs(mp, sb, l, flags, 1);
1996	vput(vp);
1997	return error;
1998}
1999
2000/* ARGSUSED */
2001int
2002sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
2003{
2004	/* {
2005		syscallarg(const void *) fhp;
2006		syscallarg(size_t) fh_size;
2007		syscallarg(struct statvfs *) buf;
2008		syscallarg(int)	flags;
2009	} */
2010	struct statvfs *sb = STATVFSBUF_GET();
2011	int error;
2012
2013	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2014	    SCARG(uap, flags));
2015	if (error == 0)
2016		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2017	STATVFSBUF_PUT(sb);
2018	return error;
2019}
2020
/*
 * Create a special file.
 *
 * mknod system call: thin wrapper that forwards the user-space path,
 * mode and device number to do_sys_mknod().
 */
/* ARGSUSED */
int
sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) path;
		syscallarg(mode_t) mode;
		syscallarg(dev_t) dev;
	} */
	return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
	    SCARG(uap, dev), retval, UIO_USERSPACE);
}
2037
/*
 * mknodat system call: not implemented here; always fails with ENOSYS.
 */
int
sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const char *) path;
		syscallarg(mode_t) mode;
		syscallarg(uint32_t) dev;
	} */

	return ENOSYS;
}
2051
/*
 * Create a filesystem node at pathname; backend for the mknod system
 * call.
 *
 * Requires KAUTH_SYSTEM_MKNOD authorization.  The S_IFMT bits of mode
 * select the operation: character/block devices and the "bad sector"
 * type go through VOP_MKNOD(), S_IFWHT through VOP_WHITEOUT(), and
 * S_IFREG through VOP_CREATE(); any other type yields EINVAL.  EEXIST
 * is returned when the name already exists.  seg tells whether
 * pathname lives in user or kernel space.  retval is not used here.
 */
int
do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
    register_t *retval, enum uio_seg seg)
{
	struct proc *p = l->l_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error, optype;
	struct pathbuf *pb;
	struct nameidata nd;
	const char *pathstring;

	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
	    0, NULL, NULL, NULL)) != 0)
		return (error);

	/* Default operation; the type switch below may override it. */
	optype = VOP_MKNOD_DESCOFFSET;

	error = pathbuf_maybe_copyin(pathname, seg, &pb);
	if (error) {
		return error;
	}
	/* String form of the path, handed to veriexec_openchk() below. */
	pathstring = pathbuf_stringcopy_get(pb);
	if (pathstring == NULL) {
		pathbuf_destroy(pb);
		return ENOMEM;
	}

	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (vp != NULL)
		error = EEXIST;
	else {
		vattr_null(&vattr);
		/* We will read cwdi->cwdi_cmask unlocked. */
		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
		vattr.va_rdev = dev;

		switch (mode & S_IFMT) {
		case S_IFMT:	/* used by badsect to flag bad sectors */
			vattr.va_type = VBAD;
			break;
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			optype = VOP_WHITEOUT_DESCOFFSET;
			break;
		case S_IFREG:
#if NVERIEXEC > 0
			error = veriexec_openchk(l, nd.ni_vp, pathstring,
			    O_CREAT);
#endif /* NVERIEXEC > 0 */
			vattr.va_type = VREG;
			vattr.va_rdev = VNOVAL;
			optype = VOP_CREATE_DESCOFFSET;
			break;
		default:
			error = EINVAL;
			break;
		}
	}
	/* A node created with VOP_MKNOD() needs a real device number. */
	if (error == 0 && optype == VOP_MKNOD_DESCOFFSET
	    && vattr.va_rdev == VNOVAL)
		error = EINVAL;
	if (!error) {
		switch (optype) {
		case VOP_WHITEOUT_DESCOFFSET:
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
			if (error)
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			vput(nd.ni_dvp);
			break;

		/*
		 * NOTE(review): unlike the whiteout case, the parent is not
		 * released here; presumably VOP_MKNOD()/VOP_CREATE() dispose
		 * of nd.ni_dvp themselves -- confirm.
		 */
		case VOP_MKNOD_DESCOFFSET:
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vput(nd.ni_vp);
			break;

		case VOP_CREATE_DESCOFFSET:
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vput(nd.ni_vp);
			break;
		}
	} else {
		/* Back out of the lookup: release parent and any existing node. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp)
			vrele(vp);
	}
out:
	pathbuf_stringcopy_put(pb, pathstring);
	pathbuf_destroy(pb);
	return (error);
}
2160
2161/*
2162 * Create a named pipe.
2163 */
2164/* ARGSUSED */
2165int
2166sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2167{
2168	/* {
2169		syscallarg(const char *) path;
2170		syscallarg(int) mode;
2171	} */
2172	struct proc *p = l->l_proc;
2173	struct vattr vattr;
2174	int error;
2175	struct pathbuf *pb;
2176	struct nameidata nd;
2177
2178	error = pathbuf_copyin(SCARG(uap, path), &pb);
2179	if (error) {
2180		return error;
2181	}
2182	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2183	if ((error = namei(&nd)) != 0) {
2184		pathbuf_destroy(pb);
2185		return error;
2186	}
2187	if (nd.ni_vp != NULL) {
2188		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2189		if (nd.ni_dvp == nd.ni_vp)
2190			vrele(nd.ni_dvp);
2191		else
2192			vput(nd.ni_dvp);
2193		vrele(nd.ni_vp);
2194		pathbuf_destroy(pb);
2195		return (EEXIST);
2196	}
2197	vattr_null(&vattr);
2198	vattr.va_type = VFIFO;
2199	/* We will read cwdi->cwdi_cmask unlocked. */
2200	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2201	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2202	if (error == 0)
2203		vput(nd.ni_vp);
2204	pathbuf_destroy(pb);
2205	return (error);
2206}
2207
/*
 * mkfifoat system call: not implemented here; always fails with ENOSYS.
 */
int
sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const char *) path;
		syscallarg(int) mode;
	} */

	return ENOSYS;
}
2220/*
2221 * Make a hard file link.
2222 */
2223/* ARGSUSED */
2224static int
2225do_sys_link(struct lwp *l, const char *path, const char *link,
2226	    int follow, register_t *retval)
2227{
2228	struct vnode *vp;
2229	struct pathbuf *linkpb;
2230	struct nameidata nd;
2231	namei_simple_flags_t namei_simple_flags;
2232	int error;
2233
2234	if (follow)
2235		namei_simple_flags = NSM_FOLLOW_TRYEMULROOT;
2236	else
2237		namei_simple_flags =  NSM_NOFOLLOW_TRYEMULROOT;
2238
2239	error = namei_simple_user(path, namei_simple_flags, &vp);
2240	if (error != 0)
2241		return (error);
2242	error = pathbuf_copyin(link, &linkpb);
2243	if (error) {
2244		goto out1;
2245	}
2246	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2247	if ((error = namei(&nd)) != 0)
2248		goto out2;
2249	if (nd.ni_vp) {
2250		error = EEXIST;
2251		goto abortop;
2252	}
2253	/* Prevent hard links on directories. */
2254	if (vp->v_type == VDIR) {
2255		error = EPERM;
2256		goto abortop;
2257	}
2258	/* Prevent cross-mount operation. */
2259	if (nd.ni_dvp->v_mount != vp->v_mount) {
2260		error = EXDEV;
2261		goto abortop;
2262	}
2263	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2264out2:
2265	pathbuf_destroy(linkpb);
2266out1:
2267	vrele(vp);
2268	return (error);
2269abortop:
2270	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2271	if (nd.ni_dvp == nd.ni_vp)
2272		vrele(nd.ni_dvp);
2273	else
2274		vput(nd.ni_dvp);
2275	if (nd.ni_vp != NULL)
2276		vrele(nd.ni_vp);
2277	goto out2;
2278}
2279
2280int
2281sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2282{
2283	/* {
2284		syscallarg(const char *) path;
2285		syscallarg(const char *) link;
2286	} */
2287	const char *path = SCARG(uap, path);
2288	const char *link = SCARG(uap, link);
2289
2290	return do_sys_link(l, path, link, 1, retval);
2291}
2292
2293int
2294sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2295    register_t *retval)
2296{
2297	/* {
2298		syscallarg(int) fd1;
2299		syscallarg(const char *) name1;
2300		syscallarg(int) fd2;
2301		syscallarg(const char *) name2;
2302		syscallarg(int) flags;
2303	} */
2304	const char *name1 = SCARG(uap, name1);
2305	const char *name2 = SCARG(uap, name2);
2306	int follow;
2307
2308	/*
2309	 * Specified fd1 and fd2 are not yet implemented
2310	 */
2311	if ((SCARG(uap, fd1) != AT_FDCWD) || (SCARG(uap, fd2) != AT_FDCWD))
2312		return ENOSYS;
2313
2314	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2315
2316	return do_sys_link(l, name1, name2, follow, retval);
2317}
2318
2319
2320int
2321do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2322{
2323	struct proc *p = curproc;
2324	struct vattr vattr;
2325	char *path;
2326	int error;
2327	struct pathbuf *linkpb;
2328	struct nameidata nd;
2329
2330	path = PNBUF_GET();
2331	if (seg == UIO_USERSPACE) {
2332		if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != 0)
2333			goto out1;
2334		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2335			goto out1;
2336	} else {
2337		KASSERT(strlen(patharg) < MAXPATHLEN);
2338		strcpy(path, patharg);
2339		linkpb = pathbuf_create(link);
2340		if (linkpb == NULL) {
2341			error = ENOMEM;
2342			goto out1;
2343		}
2344	}
2345	ktrkuser("symlink-target", path, strlen(path));
2346
2347	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2348	if ((error = namei(&nd)) != 0)
2349		goto out2;
2350	if (nd.ni_vp) {
2351		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2352		if (nd.ni_dvp == nd.ni_vp)
2353			vrele(nd.ni_dvp);
2354		else
2355			vput(nd.ni_dvp);
2356		vrele(nd.ni_vp);
2357		error = EEXIST;
2358		goto out2;
2359	}
2360	vattr_null(&vattr);
2361	vattr.va_type = VLNK;
2362	/* We will read cwdi->cwdi_cmask unlocked. */
2363	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2364	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2365	if (error == 0)
2366		vput(nd.ni_vp);
2367out2:
2368	pathbuf_destroy(linkpb);
2369out1:
2370	PNBUF_PUT(path);
2371	return (error);
2372}
2373
/*
 * Make a symbolic link.
 *
 * symlink system call: thin wrapper that forwards both user-space
 * strings to do_sys_symlink().
 */
/* ARGSUSED */
int
sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
{
	/* {
		syscallarg(const char *) path;
		syscallarg(const char *) link;
	} */

	return do_sys_symlink(SCARG(uap, path), SCARG(uap, link),
	    UIO_USERSPACE);
}
2389
/*
 * symlinkat system call: not implemented here; always fails with ENOSYS.
 */
int
sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const char *) path;
		syscallarg(const char *) link;
	} */

	return ENOSYS;
}
2402
2403/*
2404 * Delete a whiteout from the filesystem.
2405 */
2406/* ARGSUSED */
2407int
2408sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2409{
2410	/* {
2411		syscallarg(const char *) path;
2412	} */
2413	int error;
2414	struct pathbuf *pb;
2415	struct nameidata nd;
2416
2417	error = pathbuf_copyin(SCARG(uap, path), &pb);
2418	if (error) {
2419		return error;
2420	}
2421
2422	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2423	error = namei(&nd);
2424	if (error) {
2425		pathbuf_destroy(pb);
2426		return (error);
2427	}
2428
2429	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2430		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2431		if (nd.ni_dvp == nd.ni_vp)
2432			vrele(nd.ni_dvp);
2433		else
2434			vput(nd.ni_dvp);
2435		if (nd.ni_vp)
2436			vrele(nd.ni_vp);
2437		pathbuf_destroy(pb);
2438		return (EEXIST);
2439	}
2440	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2441		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2442	vput(nd.ni_dvp);
2443	pathbuf_destroy(pb);
2444	return (error);
2445}
2446
/*
 * Delete a name from the filesystem.
 *
 * unlink system call: thin wrapper that forwards the user-space path to
 * do_sys_unlink().
 */
/* ARGSUSED */
int
sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
{
	/* {
		syscallarg(const char *) path;
	} */

	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
}
2460
/*
 * unlinkat system call: not implemented here; always fails with ENOSYS.
 */
int
sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const char *) path;
	} */

	return ENOSYS;
}
2472
2473int
2474do_sys_unlink(const char *arg, enum uio_seg seg)
2475{
2476	struct vnode *vp;
2477	int error;
2478	struct pathbuf *pb;
2479	struct nameidata nd;
2480	const char *pathstring;
2481
2482	error = pathbuf_maybe_copyin(arg, seg, &pb);
2483	if (error) {
2484		return error;
2485	}
2486	pathstring = pathbuf_stringcopy_get(pb);
2487	if (pathstring == NULL) {
2488		pathbuf_destroy(pb);
2489		return ENOMEM;
2490	}
2491
2492	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2493	if ((error = namei(&nd)) != 0)
2494		goto out;
2495	vp = nd.ni_vp;
2496
2497	/*
2498	 * The root of a mounted filesystem cannot be deleted.
2499	 */
2500	if (vp->v_vflag & VV_ROOT) {
2501		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2502		if (nd.ni_dvp == vp)
2503			vrele(nd.ni_dvp);
2504		else
2505			vput(nd.ni_dvp);
2506		vput(vp);
2507		error = EBUSY;
2508		goto out;
2509	}
2510
2511#if NVERIEXEC > 0
2512	/* Handle remove requests for veriexec entries. */
2513	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2514		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2515		if (nd.ni_dvp == vp)
2516			vrele(nd.ni_dvp);
2517		else
2518			vput(nd.ni_dvp);
2519		vput(vp);
2520		goto out;
2521	}
2522#endif /* NVERIEXEC > 0 */
2523
2524#ifdef FILEASSOC
2525	(void)fileassoc_file_delete(vp);
2526#endif /* FILEASSOC */
2527	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2528out:
2529	pathbuf_stringcopy_put(pb, pathstring);
2530	pathbuf_destroy(pb);
2531	return (error);
2532}
2533
2534/*
2535 * Reposition read/write file offset.
2536 */
2537int
2538sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2539{
2540	/* {
2541		syscallarg(int) fd;
2542		syscallarg(int) pad;
2543		syscallarg(off_t) offset;
2544		syscallarg(int) whence;
2545	} */
2546	kauth_cred_t cred = l->l_cred;
2547	file_t *fp;
2548	struct vnode *vp;
2549	struct vattr vattr;
2550	off_t newoff;
2551	int error, fd;
2552
2553	fd = SCARG(uap, fd);
2554
2555	if ((fp = fd_getfile(fd)) == NULL)
2556		return (EBADF);
2557
2558	vp = fp->f_data;
2559	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2560		error = ESPIPE;
2561		goto out;
2562	}
2563
2564	switch (SCARG(uap, whence)) {
2565	case SEEK_CUR:
2566		newoff = fp->f_offset + SCARG(uap, offset);
2567		break;
2568	case SEEK_END:
2569		vn_lock(vp, LK_SHARED | LK_RETRY);
2570		error = VOP_GETATTR(vp, &vattr, cred);
2571		VOP_UNLOCK(vp);
2572		if (error) {
2573			goto out;
2574		}
2575		newoff = SCARG(uap, offset) + vattr.va_size;
2576		break;
2577	case SEEK_SET:
2578		newoff = SCARG(uap, offset);
2579		break;
2580	default:
2581		error = EINVAL;
2582		goto out;
2583	}
2584	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2585		*(off_t *)retval = fp->f_offset = newoff;
2586	}
2587 out:
2588 	fd_putfile(fd);
2589	return (error);
2590}
2591
2592/*
2593 * Positional read system call.
2594 */
2595int
2596sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2597{
2598	/* {
2599		syscallarg(int) fd;
2600		syscallarg(void *) buf;
2601		syscallarg(size_t) nbyte;
2602		syscallarg(off_t) offset;
2603	} */
2604	file_t *fp;
2605	struct vnode *vp;
2606	off_t offset;
2607	int error, fd = SCARG(uap, fd);
2608
2609	if ((fp = fd_getfile(fd)) == NULL)
2610		return (EBADF);
2611
2612	if ((fp->f_flag & FREAD) == 0) {
2613		fd_putfile(fd);
2614		return (EBADF);
2615	}
2616
2617	vp = fp->f_data;
2618	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2619		error = ESPIPE;
2620		goto out;
2621	}
2622
2623	offset = SCARG(uap, offset);
2624
2625	/*
2626	 * XXX This works because no file systems actually
2627	 * XXX take any action on the seek operation.
2628	 */
2629	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2630		goto out;
2631
2632	/* dofileread() will unuse the descriptor for us */
2633	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2634	    &offset, 0, retval));
2635
2636 out:
2637	fd_putfile(fd);
2638	return (error);
2639}
2640
2641/*
2642 * Positional scatter read system call.
2643 */
2644int
2645sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2646{
2647	/* {
2648		syscallarg(int) fd;
2649		syscallarg(const struct iovec *) iovp;
2650		syscallarg(int) iovcnt;
2651		syscallarg(off_t) offset;
2652	} */
2653	off_t offset = SCARG(uap, offset);
2654
2655	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2656	    SCARG(uap, iovcnt), &offset, 0, retval);
2657}
2658
2659/*
2660 * Positional write system call.
2661 */
2662int
2663sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2664{
2665	/* {
2666		syscallarg(int) fd;
2667		syscallarg(const void *) buf;
2668		syscallarg(size_t) nbyte;
2669		syscallarg(off_t) offset;
2670	} */
2671	file_t *fp;
2672	struct vnode *vp;
2673	off_t offset;
2674	int error, fd = SCARG(uap, fd);
2675
2676	if ((fp = fd_getfile(fd)) == NULL)
2677		return (EBADF);
2678
2679	if ((fp->f_flag & FWRITE) == 0) {
2680		fd_putfile(fd);
2681		return (EBADF);
2682	}
2683
2684	vp = fp->f_data;
2685	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2686		error = ESPIPE;
2687		goto out;
2688	}
2689
2690	offset = SCARG(uap, offset);
2691
2692	/*
2693	 * XXX This works because no file systems actually
2694	 * XXX take any action on the seek operation.
2695	 */
2696	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2697		goto out;
2698
2699	/* dofilewrite() will unuse the descriptor for us */
2700	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2701	    &offset, 0, retval));
2702
2703 out:
2704	fd_putfile(fd);
2705	return (error);
2706}
2707
2708/*
2709 * Positional gather write system call.
2710 */
2711int
2712sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2713{
2714	/* {
2715		syscallarg(int) fd;
2716		syscallarg(const struct iovec *) iovp;
2717		syscallarg(int) iovcnt;
2718		syscallarg(off_t) offset;
2719	} */
2720	off_t offset = SCARG(uap, offset);
2721
2722	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2723	    SCARG(uap, iovcnt), &offset, 0, retval);
2724}
2725
2726/*
2727 * Check access permissions.
2728 */
2729int
2730sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2731{
2732	/* {
2733		syscallarg(const char *) path;
2734		syscallarg(int) flags;
2735	} */
2736	kauth_cred_t cred;
2737	struct vnode *vp;
2738	int error, flags;
2739	struct pathbuf *pb;
2740	struct nameidata nd;
2741
2742	CTASSERT(F_OK == 0);
2743	if ((SCARG(uap, flags) & ~(R_OK | W_OK | X_OK)) != 0) {
2744		/* nonsense flags */
2745		return EINVAL;
2746	}
2747
2748	error = pathbuf_copyin(SCARG(uap, path), &pb);
2749	if (error) {
2750		return error;
2751	}
2752	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2753
2754	/* Override default credentials */
2755	cred = kauth_cred_dup(l->l_cred);
2756	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2757	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2758	nd.ni_cnd.cn_cred = cred;
2759
2760	if ((error = namei(&nd)) != 0) {
2761		pathbuf_destroy(pb);
2762		goto out;
2763	}
2764	vp = nd.ni_vp;
2765	pathbuf_destroy(pb);
2766
2767	/* Flags == 0 means only check for existence. */
2768	if (SCARG(uap, flags)) {
2769		flags = 0;
2770		if (SCARG(uap, flags) & R_OK)
2771			flags |= VREAD;
2772		if (SCARG(uap, flags) & W_OK)
2773			flags |= VWRITE;
2774		if (SCARG(uap, flags) & X_OK)
2775			flags |= VEXEC;
2776
2777		error = VOP_ACCESS(vp, flags, cred);
2778		if (!error && (flags & VWRITE))
2779			error = vn_writechk(vp);
2780	}
2781	vput(vp);
2782out:
2783	kauth_cred_free(cred);
2784	return (error);
2785}
2786
2787int
2788sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
2789    register_t *retval)
2790{
2791	/* {
2792		syscallarg(int) fd;
2793		syscallarg(const char *) path;
2794		syscallarg(int) amode;
2795		syscallarg(int) flag;
2796	} */
2797
2798	return ENOSYS;
2799}
2800
2801/*
2802 * Common code for all sys_stat functions, including compat versions.
2803 */
2804int
2805do_sys_stat(const char *userpath, unsigned int nd_flags, struct stat *sb)
2806{
2807	int error;
2808	struct pathbuf *pb;
2809	struct nameidata nd;
2810
2811	error = pathbuf_copyin(userpath, &pb);
2812	if (error) {
2813		return error;
2814	}
2815	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT, pb);
2816	error = namei(&nd);
2817	if (error != 0) {
2818		pathbuf_destroy(pb);
2819		return error;
2820	}
2821	error = vn_stat(nd.ni_vp, sb);
2822	vput(nd.ni_vp);
2823	pathbuf_destroy(pb);
2824	return error;
2825}
2826
2827/*
2828 * Get file status; this version follows links.
2829 */
2830/* ARGSUSED */
2831int
2832sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2833{
2834	/* {
2835		syscallarg(const char *) path;
2836		syscallarg(struct stat *) ub;
2837	} */
2838	struct stat sb;
2839	int error;
2840
2841	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2842	if (error)
2843		return error;
2844	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2845}
2846
2847/*
2848 * Get file status; this version does not follow links.
2849 */
2850/* ARGSUSED */
2851int
2852sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2853{
2854	/* {
2855		syscallarg(const char *) path;
2856		syscallarg(struct stat *) ub;
2857	} */
2858	struct stat sb;
2859	int error;
2860
2861	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2862	if (error)
2863		return error;
2864	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2865}
2866
2867int
2868sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
2869    register_t *retval)
2870{
2871	/* {
2872		syscallarg(int) fd;
2873		syscallarg(const char *) path;
2874		syscallarg(struct stat *) ub;
2875		syscallarg(int) flag;
2876	} */
2877
2878	return ENOSYS;
2879}
2880/*
2881 * Get configurable pathname variables.
2882 */
2883/* ARGSUSED */
2884int
2885sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2886{
2887	/* {
2888		syscallarg(const char *) path;
2889		syscallarg(int) name;
2890	} */
2891	int error;
2892	struct pathbuf *pb;
2893	struct nameidata nd;
2894
2895	error = pathbuf_copyin(SCARG(uap, path), &pb);
2896	if (error) {
2897		return error;
2898	}
2899	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2900	if ((error = namei(&nd)) != 0) {
2901		pathbuf_destroy(pb);
2902		return (error);
2903	}
2904	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2905	vput(nd.ni_vp);
2906	pathbuf_destroy(pb);
2907	return (error);
2908}
2909
2910/*
2911 * Return target name of a symbolic link.
2912 */
2913/* ARGSUSED */
2914int
2915sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2916{
2917	/* {
2918		syscallarg(const char *) path;
2919		syscallarg(char *) buf;
2920		syscallarg(size_t) count;
2921	} */
2922	struct vnode *vp;
2923	struct iovec aiov;
2924	struct uio auio;
2925	int error;
2926	struct pathbuf *pb;
2927	struct nameidata nd;
2928
2929	error = pathbuf_copyin(SCARG(uap, path), &pb);
2930	if (error) {
2931		return error;
2932	}
2933	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2934	if ((error = namei(&nd)) != 0) {
2935		pathbuf_destroy(pb);
2936		return error;
2937	}
2938	vp = nd.ni_vp;
2939	pathbuf_destroy(pb);
2940	if (vp->v_type != VLNK)
2941		error = EINVAL;
2942	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2943	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2944		aiov.iov_base = SCARG(uap, buf);
2945		aiov.iov_len = SCARG(uap, count);
2946		auio.uio_iov = &aiov;
2947		auio.uio_iovcnt = 1;
2948		auio.uio_offset = 0;
2949		auio.uio_rw = UIO_READ;
2950		KASSERT(l == curlwp);
2951		auio.uio_vmspace = l->l_proc->p_vmspace;
2952		auio.uio_resid = SCARG(uap, count);
2953		error = VOP_READLINK(vp, &auio, l->l_cred);
2954	}
2955	vput(vp);
2956	*retval = SCARG(uap, count) - auio.uio_resid;
2957	return (error);
2958}
2959
2960int
2961sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
2962    register_t *retval)
2963{
2964	/* {
2965		syscallarg(int) fd;
2966		syscallarg(const char *) path;
2967		syscallarg(char *) buf;
2968		syscallarg(size_t) count;
2969	} */
2970
2971	return ENOSYS;
2972}
2973
2974/*
2975 * Change flags of a file given a path name.
2976 */
2977/* ARGSUSED */
2978int
2979sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2980{
2981	/* {
2982		syscallarg(const char *) path;
2983		syscallarg(u_long) flags;
2984	} */
2985	struct vnode *vp;
2986	int error;
2987
2988	error = namei_simple_user(SCARG(uap, path),
2989				NSM_FOLLOW_TRYEMULROOT, &vp);
2990	if (error != 0)
2991		return (error);
2992	error = change_flags(vp, SCARG(uap, flags), l);
2993	vput(vp);
2994	return (error);
2995}
2996
2997/*
2998 * Change flags of a file given a file descriptor.
2999 */
3000/* ARGSUSED */
3001int
3002sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3003{
3004	/* {
3005		syscallarg(int) fd;
3006		syscallarg(u_long) flags;
3007	} */
3008	struct vnode *vp;
3009	file_t *fp;
3010	int error;
3011
3012	/* fd_getvnode() will use the descriptor for us */
3013	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3014		return (error);
3015	vp = fp->f_data;
3016	error = change_flags(vp, SCARG(uap, flags), l);
3017	VOP_UNLOCK(vp);
3018	fd_putfile(SCARG(uap, fd));
3019	return (error);
3020}
3021
3022/*
3023 * Change flags of a file given a path name; this version does
3024 * not follow links.
3025 */
3026int
3027sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3028{
3029	/* {
3030		syscallarg(const char *) path;
3031		syscallarg(u_long) flags;
3032	} */
3033	struct vnode *vp;
3034	int error;
3035
3036	error = namei_simple_user(SCARG(uap, path),
3037				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3038	if (error != 0)
3039		return (error);
3040	error = change_flags(vp, SCARG(uap, flags), l);
3041	vput(vp);
3042	return (error);
3043}
3044
3045/*
3046 * Common routine to change flags of a file.
3047 */
3048int
3049change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3050{
3051	struct vattr vattr;
3052	int error;
3053
3054	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3055	/*
3056	 * Non-superusers cannot change the flags on devices, even if they
3057	 * own them.
3058	 */
3059	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
3060		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3061			goto out;
3062		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
3063			error = EINVAL;
3064			goto out;
3065		}
3066	}
3067	vattr_null(&vattr);
3068	vattr.va_flags = flags;
3069	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3070out:
3071	return (error);
3072}
3073
3074/*
3075 * Change mode of a file given path name; this version follows links.
3076 */
3077/* ARGSUSED */
3078int
3079sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3080{
3081	/* {
3082		syscallarg(const char *) path;
3083		syscallarg(int) mode;
3084	} */
3085	int error;
3086	struct vnode *vp;
3087
3088	error = namei_simple_user(SCARG(uap, path),
3089				NSM_FOLLOW_TRYEMULROOT, &vp);
3090	if (error != 0)
3091		return (error);
3092
3093	error = change_mode(vp, SCARG(uap, mode), l);
3094
3095	vrele(vp);
3096	return (error);
3097}
3098
3099/*
3100 * Change mode of a file given a file descriptor.
3101 */
3102/* ARGSUSED */
3103int
3104sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3105{
3106	/* {
3107		syscallarg(int) fd;
3108		syscallarg(int) mode;
3109	} */
3110	file_t *fp;
3111	int error;
3112
3113	/* fd_getvnode() will use the descriptor for us */
3114	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3115		return (error);
3116	error = change_mode(fp->f_data, SCARG(uap, mode), l);
3117	fd_putfile(SCARG(uap, fd));
3118	return (error);
3119}
3120
3121int
3122sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3123    register_t *retval)
3124{
3125	/* {
3126		syscallarg(int) fd;
3127		syscallarg(const char *) path;
3128		syscallarg(int) mode;
3129		syscallarg(int) flag;
3130	} */
3131
3132	return ENOSYS;
3133}
3134
3135/*
3136 * Change mode of a file given path name; this version does not follow links.
3137 */
3138/* ARGSUSED */
3139int
3140sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3141{
3142	/* {
3143		syscallarg(const char *) path;
3144		syscallarg(int) mode;
3145	} */
3146	int error;
3147	struct vnode *vp;
3148
3149	error = namei_simple_user(SCARG(uap, path),
3150				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3151	if (error != 0)
3152		return (error);
3153
3154	error = change_mode(vp, SCARG(uap, mode), l);
3155
3156	vrele(vp);
3157	return (error);
3158}
3159
3160/*
3161 * Common routine to set mode given a vnode.
3162 */
3163static int
3164change_mode(struct vnode *vp, int mode, struct lwp *l)
3165{
3166	struct vattr vattr;
3167	int error;
3168
3169	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3170	vattr_null(&vattr);
3171	vattr.va_mode = mode & ALLPERMS;
3172	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3173	VOP_UNLOCK(vp);
3174	return (error);
3175}
3176
3177/*
3178 * Set ownership given a path name; this version follows links.
3179 */
3180/* ARGSUSED */
3181int
3182sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3183{
3184	/* {
3185		syscallarg(const char *) path;
3186		syscallarg(uid_t) uid;
3187		syscallarg(gid_t) gid;
3188	} */
3189	int error;
3190	struct vnode *vp;
3191
3192	error = namei_simple_user(SCARG(uap, path),
3193				NSM_FOLLOW_TRYEMULROOT, &vp);
3194	if (error != 0)
3195		return (error);
3196
3197	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3198
3199	vrele(vp);
3200	return (error);
3201}
3202
3203/*
3204 * Set ownership given a path name; this version follows links.
3205 * Provides POSIX semantics.
3206 */
3207/* ARGSUSED */
3208int
3209sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3210{
3211	/* {
3212		syscallarg(const char *) path;
3213		syscallarg(uid_t) uid;
3214		syscallarg(gid_t) gid;
3215	} */
3216	int error;
3217	struct vnode *vp;
3218
3219	error = namei_simple_user(SCARG(uap, path),
3220				NSM_FOLLOW_TRYEMULROOT, &vp);
3221	if (error != 0)
3222		return (error);
3223
3224	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3225
3226	vrele(vp);
3227	return (error);
3228}
3229
3230/*
3231 * Set ownership given a file descriptor.
3232 */
3233/* ARGSUSED */
3234int
3235sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3236{
3237	/* {
3238		syscallarg(int) fd;
3239		syscallarg(uid_t) uid;
3240		syscallarg(gid_t) gid;
3241	} */
3242	int error;
3243	file_t *fp;
3244
3245	/* fd_getvnode() will use the descriptor for us */
3246	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3247		return (error);
3248	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
3249	    l, 0);
3250	fd_putfile(SCARG(uap, fd));
3251	return (error);
3252}
3253
3254int
3255sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3256    register_t *retval)
3257{
3258	/* {
3259		syscallarg(int) fd;
3260		syscallarg(const char *) path;
3261		syscallarg(uid_t) uid;
3262		syscallarg(gid_t) gid;
3263		syscallarg(int) flag;
3264	} */
3265
3266	return ENOSYS;
3267}
3268
3269/*
3270 * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3271 */
3272/* ARGSUSED */
3273int
3274sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3275{
3276	/* {
3277		syscallarg(int) fd;
3278		syscallarg(uid_t) uid;
3279		syscallarg(gid_t) gid;
3280	} */
3281	int error;
3282	file_t *fp;
3283
3284	/* fd_getvnode() will use the descriptor for us */
3285	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3286		return (error);
3287	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
3288	    l, 1);
3289	fd_putfile(SCARG(uap, fd));
3290	return (error);
3291}
3292
3293/*
3294 * Set ownership given a path name; this version does not follow links.
3295 */
3296/* ARGSUSED */
3297int
3298sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3299{
3300	/* {
3301		syscallarg(const char *) path;
3302		syscallarg(uid_t) uid;
3303		syscallarg(gid_t) gid;
3304	} */
3305	int error;
3306	struct vnode *vp;
3307
3308	error = namei_simple_user(SCARG(uap, path),
3309				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3310	if (error != 0)
3311		return (error);
3312
3313	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3314
3315	vrele(vp);
3316	return (error);
3317}
3318
3319/*
3320 * Set ownership given a path name; this version does not follow links.
3321 * Provides POSIX/XPG semantics.
3322 */
3323/* ARGSUSED */
3324int
3325sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3326{
3327	/* {
3328		syscallarg(const char *) path;
3329		syscallarg(uid_t) uid;
3330		syscallarg(gid_t) gid;
3331	} */
3332	int error;
3333	struct vnode *vp;
3334
3335	error = namei_simple_user(SCARG(uap, path),
3336				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3337	if (error != 0)
3338		return (error);
3339
3340	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3341
3342	vrele(vp);
3343	return (error);
3344}
3345
3346/*
3347 * Common routine to set ownership given a vnode.
3348 */
3349static int
3350change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3351    int posix_semantics)
3352{
3353	struct vattr vattr;
3354	mode_t newmode;
3355	int error;
3356
3357	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3358	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3359		goto out;
3360
3361#define CHANGED(x) ((int)(x) != -1)
3362	newmode = vattr.va_mode;
3363	if (posix_semantics) {
3364		/*
3365		 * POSIX/XPG semantics: if the caller is not the super-user,
3366		 * clear set-user-id and set-group-id bits.  Both POSIX and
3367		 * the XPG consider the behaviour for calls by the super-user
3368		 * implementation-defined; we leave the set-user-id and set-
3369		 * group-id settings intact in that case.
3370		 */
3371		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
3372				      NULL) != 0)
3373			newmode &= ~(S_ISUID | S_ISGID);
3374	} else {
3375		/*
3376		 * NetBSD semantics: when changing owner and/or group,
3377		 * clear the respective bit(s).
3378		 */
3379		if (CHANGED(uid))
3380			newmode &= ~S_ISUID;
3381		if (CHANGED(gid))
3382			newmode &= ~S_ISGID;
3383	}
3384	/* Update va_mode iff altered. */
3385	if (vattr.va_mode == newmode)
3386		newmode = VNOVAL;
3387
3388	vattr_null(&vattr);
3389	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3390	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3391	vattr.va_mode = newmode;
3392	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3393#undef CHANGED
3394
3395out:
3396	VOP_UNLOCK(vp);
3397	return (error);
3398}
3399
3400/*
3401 * Set the access and modification times given a path name; this
3402 * version follows links.
3403 */
3404/* ARGSUSED */
3405int
3406sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3407    register_t *retval)
3408{
3409	/* {
3410		syscallarg(const char *) path;
3411		syscallarg(const struct timeval *) tptr;
3412	} */
3413
3414	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3415	    SCARG(uap, tptr), UIO_USERSPACE);
3416}
3417
3418/*
3419 * Set the access and modification times given a file descriptor.
3420 */
3421/* ARGSUSED */
3422int
3423sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3424    register_t *retval)
3425{
3426	/* {
3427		syscallarg(int) fd;
3428		syscallarg(const struct timeval *) tptr;
3429	} */
3430	int error;
3431	file_t *fp;
3432
3433	/* fd_getvnode() will use the descriptor for us */
3434	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3435		return (error);
3436	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
3437	    UIO_USERSPACE);
3438	fd_putfile(SCARG(uap, fd));
3439	return (error);
3440}
3441
3442int
3443sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3444    register_t *retval)
3445{
3446	/* {
3447		syscallarg(int) fd;
3448		syscallarg(const struct timespec *) tptr;
3449	} */
3450	int error;
3451	file_t *fp;
3452
3453	/* fd_getvnode() will use the descriptor for us */
3454	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3455		return (error);
3456	error = do_sys_utimens(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
3457	    UIO_USERSPACE);
3458	fd_putfile(SCARG(uap, fd));
3459	return (error);
3460}
3461
3462/*
3463 * Set the access and modification times given a path name; this
3464 * version does not follow links.
3465 */
3466int
3467sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3468    register_t *retval)
3469{
3470	/* {
3471		syscallarg(const char *) path;
3472		syscallarg(const struct timeval *) tptr;
3473	} */
3474
3475	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3476	    SCARG(uap, tptr), UIO_USERSPACE);
3477}
3478
3479int
3480sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3481    register_t *retval)
3482{
3483	/* {
3484		syscallarg(int) fd;
3485		syscallarg(const char *) path;
3486		syscallarg(const struct timespec *) tptr;
3487		syscallarg(int) flag;
3488	} */
3489	int follow;
3490	const struct timespec *tptr;
3491
3492	/*
3493	 * Specified fd is not yet implemented
3494	 */
3495	if (SCARG(uap, fd) != AT_FDCWD)
3496		return ENOSYS;
3497
3498	tptr = SCARG(uap, tptr);
3499	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3500
3501	return do_sys_utimens(l, NULL, SCARG(uap, path), follow,
3502	    tptr, UIO_USERSPACE);
3503}
3504
3505/*
3506 * Common routine to set access and modification times given a vnode.
3507 */
3508int
3509do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3510    const struct timespec *tptr, enum uio_seg seg)
3511{
3512	struct vattr vattr;
3513	int error, dorele = 0;
3514	namei_simple_flags_t sflags;
3515
3516	bool vanull, setbirthtime;
3517	struct timespec ts[2];
3518
3519	/*
3520	 * I have checked all callers and they pass either FOLLOW,
3521	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3522	 * is 0. More to the point, they don't pass anything else.
3523	 * Let's keep it that way at least until the namei interfaces
3524	 * are fully sanitized.
3525	 */
3526	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3527	sflags = (flag == FOLLOW) ?
3528		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3529
3530	if (tptr == NULL) {
3531		vanull = true;
3532		nanotime(&ts[0]);
3533		ts[1] = ts[0];
3534	} else {
3535		vanull = false;
3536		if (seg != UIO_SYSSPACE) {
3537			error = copyin(tptr, ts, sizeof (ts));
3538			if (error != 0)
3539				return error;
3540		} else {
3541			ts[0] = tptr[0];
3542			ts[1] = tptr[1];
3543		}
3544	}
3545
3546	if (ts[0].tv_nsec == UTIME_NOW) {
3547		nanotime(&ts[0]);
3548		if (ts[1].tv_nsec == UTIME_NOW) {
3549			vanull = true;
3550			ts[1] = ts[0];
3551		}
3552	} else if (ts[1].tv_nsec == UTIME_NOW)
3553		nanotime(&ts[1]);
3554
3555	if (vp == NULL) {
3556		/* note: SEG describes TPTR, not PATH; PATH is always user */
3557		error = namei_simple_user(path, sflags, &vp);
3558		if (error != 0)
3559			return error;
3560		dorele = 1;
3561	}
3562
3563	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3564	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3565	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3566	vattr_null(&vattr);
3567
3568	if (ts[0].tv_nsec != UTIME_OMIT)
3569		vattr.va_atime = ts[0];
3570
3571	if (ts[1].tv_nsec != UTIME_OMIT) {
3572		vattr.va_mtime = ts[1];
3573		if (setbirthtime)
3574			vattr.va_birthtime = ts[1];
3575	}
3576
3577	if (vanull)
3578		vattr.va_vaflags |= VA_UTIMES_NULL;
3579	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3580	VOP_UNLOCK(vp);
3581
3582	if (dorele != 0)
3583		vrele(vp);
3584
3585	return error;
3586}
3587
3588int
3589do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3590    const struct timeval *tptr, enum uio_seg seg)
3591{
3592	struct timespec ts[2];
3593	struct timespec *tsptr = NULL;
3594	int error;
3595
3596	if (tptr != NULL) {
3597		struct timeval tv[2];
3598
3599		if (seg != UIO_SYSSPACE) {
3600			error = copyin(tptr, tv, sizeof (tv));
3601			if (error != 0)
3602				return error;
3603			tptr = tv;
3604		}
3605
3606		if ((tv[0].tv_usec == UTIME_NOW) ||
3607		    (tv[0].tv_usec == UTIME_OMIT))
3608			ts[0].tv_nsec = tv[0].tv_usec;
3609		else
3610			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3611
3612		if ((tv[1].tv_usec == UTIME_NOW) ||
3613		    (tv[1].tv_usec == UTIME_OMIT))
3614			ts[1].tv_nsec = tv[1].tv_usec;
3615		else
3616			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3617
3618		tsptr = &ts[0];
3619	}
3620
3621	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3622}
3623
3624/*
3625 * Truncate a file given its path name.
3626 */
3627/* ARGSUSED */
3628int
3629sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3630{
3631	/* {
3632		syscallarg(const char *) path;
3633		syscallarg(int) pad;
3634		syscallarg(off_t) length;
3635	} */
3636	struct vnode *vp;
3637	struct vattr vattr;
3638	int error;
3639
3640	error = namei_simple_user(SCARG(uap, path),
3641				NSM_FOLLOW_TRYEMULROOT, &vp);
3642	if (error != 0)
3643		return (error);
3644	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3645	if (vp->v_type == VDIR)
3646		error = EISDIR;
3647	else if ((error = vn_writechk(vp)) == 0 &&
3648	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3649		vattr_null(&vattr);
3650		vattr.va_size = SCARG(uap, length);
3651		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3652	}
3653	vput(vp);
3654	return (error);
3655}
3656
3657/*
3658 * Truncate a file given a file descriptor.
3659 */
3660/* ARGSUSED */
3661int
3662sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3663{
3664	/* {
3665		syscallarg(int) fd;
3666		syscallarg(int) pad;
3667		syscallarg(off_t) length;
3668	} */
3669	struct vattr vattr;
3670	struct vnode *vp;
3671	file_t *fp;
3672	int error;
3673
3674	/* fd_getvnode() will use the descriptor for us */
3675	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3676		return (error);
3677	if ((fp->f_flag & FWRITE) == 0) {
3678		error = EINVAL;
3679		goto out;
3680	}
3681	vp = fp->f_data;
3682	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3683	if (vp->v_type == VDIR)
3684		error = EISDIR;
3685	else if ((error = vn_writechk(vp)) == 0) {
3686		vattr_null(&vattr);
3687		vattr.va_size = SCARG(uap, length);
3688		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3689	}
3690	VOP_UNLOCK(vp);
3691 out:
3692	fd_putfile(SCARG(uap, fd));
3693	return (error);
3694}
3695
3696/*
3697 * Sync an open file.
3698 */
3699/* ARGSUSED */
3700int
3701sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3702{
3703	/* {
3704		syscallarg(int) fd;
3705	} */
3706	struct vnode *vp;
3707	file_t *fp;
3708	int error;
3709
3710	/* fd_getvnode() will use the descriptor for us */
3711	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3712		return (error);
3713	vp = fp->f_data;
3714	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3715	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3716	VOP_UNLOCK(vp);
3717	fd_putfile(SCARG(uap, fd));
3718	return (error);
3719}
3720
3721/*
3722 * Sync a range of file data.  API modeled after that found in AIX.
3723 *
3724 * FDATASYNC indicates that we need only save enough metadata to be able
3725 * to re-read the written data.  Note we duplicate AIX's requirement that
3726 * the file be open for writing.
3727 */
3728/* ARGSUSED */
3729int
3730sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3731{
3732	/* {
3733		syscallarg(int) fd;
3734		syscallarg(int) flags;
3735		syscallarg(off_t) start;
3736		syscallarg(off_t) length;
3737	} */
3738	struct vnode *vp;
3739	file_t *fp;
3740	int flags, nflags;
3741	off_t s, e, len;
3742	int error;
3743
3744	/* fd_getvnode() will use the descriptor for us */
3745	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3746		return (error);
3747
3748	if ((fp->f_flag & FWRITE) == 0) {
3749		error = EBADF;
3750		goto out;
3751	}
3752
3753	flags = SCARG(uap, flags);
3754	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3755	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3756		error = EINVAL;
3757		goto out;
3758	}
3759	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3760	if (flags & FDATASYNC)
3761		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3762	else
3763		nflags = FSYNC_WAIT;
3764	if (flags & FDISKSYNC)
3765		nflags |= FSYNC_CACHE;
3766
3767	len = SCARG(uap, length);
3768	/* If length == 0, we do the whole file, and s = e = 0 will do that */
3769	if (len) {
3770		s = SCARG(uap, start);
3771		e = s + len;
3772		if (e < s) {
3773			error = EINVAL;
3774			goto out;
3775		}
3776	} else {
3777		e = 0;
3778		s = 0;
3779	}
3780
3781	vp = fp->f_data;
3782	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3783	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3784	VOP_UNLOCK(vp);
3785out:
3786	fd_putfile(SCARG(uap, fd));
3787	return (error);
3788}
3789
3790/*
3791 * Sync the data of an open file.
3792 */
3793/* ARGSUSED */
3794int
3795sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3796{
3797	/* {
3798		syscallarg(int) fd;
3799	} */
3800	struct vnode *vp;
3801	file_t *fp;
3802	int error;
3803
3804	/* fd_getvnode() will use the descriptor for us */
3805	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3806		return (error);
3807	if ((fp->f_flag & FWRITE) == 0) {
3808		fd_putfile(SCARG(uap, fd));
3809		return (EBADF);
3810	}
3811	vp = fp->f_data;
3812	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3813	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3814	VOP_UNLOCK(vp);
3815	fd_putfile(SCARG(uap, fd));
3816	return (error);
3817}
3818
3819/*
3820 * Rename files, (standard) BSD semantics frontend.
3821 */
3822/* ARGSUSED */
3823int
3824sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3825{
3826	/* {
3827		syscallarg(const char *) from;
3828		syscallarg(const char *) to;
3829	} */
3830
3831	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3832}
3833
3834int
3835sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
3836    register_t *retval)
3837{
3838	/* {
3839		syscallarg(int) fromfd;
3840		syscallarg(const char *) from;
3841		syscallarg(int) tofd;
3842		syscallarg(const char *) to;
3843	} */
3844
3845	return ENOSYS;
3846}
3847
3848/*
3849 * Rename files, POSIX semantics frontend.
3850 */
3851/* ARGSUSED */
3852int
3853sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3854{
3855	/* {
3856		syscallarg(const char *) from;
3857		syscallarg(const char *) to;
3858	} */
3859
3860	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3861}
3862
/*
 * Rename files.  Source and destination must either both be directories,
 * or both not be directories.  If target is a directory, it must be empty.
 * If `from' and `to' refer to the same object, the value of the `retain'
 * argument is used to determine whether `from' will be
 *
 * (retain == 0)	deleted unless `from' and `to' refer to the same
 *			object in the file system's name space (BSD).
 * (retain == 1)	always retained (POSIX).
 *
 * Arguments:
 *	from, to	source and destination paths.
 *	seg		address space the two paths live in; handed to
 *			pathbuf_maybe_copyin().
 *	retain		see above.
 *
 * Returns 0 on success (including the "same object, nothing to do" case,
 * which is tracked internally as error == -1) or an errno value.
 *
 * The per-file-system rename lock (VFS_RENAMELOCK_ENTER/EXIT) of the
 * source's mount is held from just after the first lookup of the source
 * until the operation has been carried out or aborted.
 */
int
do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
{
	struct vnode *tvp, *fvp, *tdvp;
	struct pathbuf *frompb, *topb;
	struct nameidata fromnd, tond;
	struct mount *fs;
	int error;

	/* Get both path names; on failure undo whatever was obtained. */
	error = pathbuf_maybe_copyin(from, seg, &frompb);
	if (error) {
		return error;
	}
	error = pathbuf_maybe_copyin(to, seg, &topb);
	if (error) {
		pathbuf_destroy(frompb);
		return error;
	}

	/* First lookup of the source, mainly to learn which mount it is on. */
	NDINIT(&fromnd, DELETE, LOCKPARENT | TRYEMULROOT | INRENAME,
	    frompb);
	if ((error = namei(&fromnd)) != 0) {
		pathbuf_destroy(frompb);
		pathbuf_destroy(topb);
		return (error);
	}
	/* Unlock the parent, unless parent and source are the same vnode. */
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_dvp);
	fvp = fromnd.ni_vp;

	/* Serialize against other renames on the source's file system. */
	fs = fvp->v_mount;
	error = VFS_RENAMELOCK_ENTER(fs);
	if (error) {
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}

	/*
	 * close, partially, yet another race - ideally we should only
	 * go as far as getting fromnd.ni_dvp before getting the per-fs
	 * lock, and then continue to get fromnd.ni_vp, but we can't do
	 * that with namei as it stands.
	 *
	 * This still won't prevent rmdir from nuking fromnd.ni_vp
	 * under us. The real fix is to get the locks in the right
	 * order and do the lookups in the right places, but that's a
	 * major rototill.
	 *
	 * Note: this logic (as well as this whole function) is cloned
	 * in nfs_serv.c. Proceed accordingly.
	 */
	/* Drop the source vnode; it is looked up again under the lock. */
	vrele(fvp);
	/* Refuse to rename "." or "..". */
	if ((fromnd.ni_cnd.cn_namelen == 1 &&
	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
	    (fromnd.ni_cnd.cn_namelen == 2 &&
	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
		error = EINVAL;
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		goto out1;
	}
	/* Second lookup of the source, now with the rename lock held. */
	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd, 0);
	if (error) {
		VOP_UNLOCK(fromnd.ni_dvp);
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		goto out1;
	}
	/* Unlock source and parent; the references on both are kept. */
	VOP_UNLOCK(fromnd.ni_vp);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_dvp);
	fvp = fromnd.ni_vp;

	/*
	 * Look up the destination.  CREATEDIR is requested only when the
	 * source is a directory.
	 */
	NDINIT(&tond, RENAME,
	    LOCKPARENT | LOCKLEAF | NOCACHE | TRYEMULROOT
	      | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
	    topb);
	if ((error = namei(&tond)) != 0) {
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target must be of the same kind as the source. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}

	/*
	 * The source may not be the directory the target lives in.
	 * NOTE(review): if fvp also equals tvp, the block below may
	 * overwrite this EINVAL with -1 -- confirm that is intended.
	 */
	if (fvp == tdvp)
		error = EINVAL;

	/*
	 * Source and destination refer to the same object.
	 *
	 * error == -1 is an internal "nothing to do" marker: it takes the
	 * abort path at out: and is reported to the caller as success.
	 * With retain == 0 that only happens when the parent directory and
	 * the final name component are identical as well.
	 */
	if (fvp == tvp) {
		if (retain)
			error = -1;
		else if (fromnd.ni_dvp == tdvp &&
		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
		    !memcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
		          fromnd.ni_cnd.cn_namelen))
			error = -1;
	}
	/*
	 * Prevent cross-mount operation.
	 */
	if (error == 0) {
		if (tond.ni_dvp->v_mount != fromnd.ni_dvp->v_mount) {
			error = EXDEV;
		}
	}
#if NVERIEXEC > 0
	/*
	 * Give veriexec NUL-terminated copies of the two final path
	 * components; a non-zero result prevents the rename below.
	 */
	if (!error) {
		char *f1, *f2;
		size_t f1_len;
		size_t f2_len;

		f1_len = fromnd.ni_cnd.cn_namelen + 1;
		f1 = kmem_alloc(f1_len, KM_SLEEP);
		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);

		f2_len = tond.ni_cnd.cn_namelen + 1;
		f2 = kmem_alloc(f2_len, KM_SLEEP);
		strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);

		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);

		kmem_free(f1, f1_len);
		kmem_free(f2, f2_len);
	}
#endif /* NVERIEXEC > 0 */

out:
	if (!error) {
		/*
		 * No vnode is released here on this path; presumably
		 * VOP_RENAME() disposes of all four itself -- TODO confirm.
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		VFS_RENAMELOCK_EXIT(fs);
	} else {
		/*
		 * Abort: undo the destination lookup first (target parent
		 * and target are unlocked and released, with only one
		 * unlock when they are the same vnode), drop the rename
		 * lock, then undo the source lookup (references only; the
		 * source side was unlocked above).
		 */
		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
out1:
	pathbuf_destroy(frompb);
	pathbuf_destroy(topb);
	/* Map the internal "nothing to do" marker to success. */
	return (error == -1 ? 0 : error);
}
4043
4044/*
4045 * Make a directory file.
4046 */
4047/* ARGSUSED */
4048int
4049sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4050{
4051	/* {
4052		syscallarg(const char *) path;
4053		syscallarg(int) mode;
4054	} */
4055
4056	return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE);
4057}
4058
4059int
4060sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4061    register_t *retval)
4062{
4063	/* {
4064		syscallarg(int) fd;
4065		syscallarg(const char *) path;
4066		syscallarg(int) mode;
4067	} */
4068
4069	return ENOSYS;
4070}
4071
4072
4073int
4074do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4075{
4076	struct proc *p = curlwp->l_proc;
4077	struct vnode *vp;
4078	struct vattr vattr;
4079	int error;
4080	struct pathbuf *pb;
4081	struct nameidata nd;
4082
4083	/* XXX bollocks, should pass in a pathbuf */
4084	error = pathbuf_maybe_copyin(path, seg, &pb);
4085	if (error) {
4086		return error;
4087	}
4088
4089	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4090	if ((error = namei(&nd)) != 0) {
4091		pathbuf_destroy(pb);
4092		return (error);
4093	}
4094	vp = nd.ni_vp;
4095	if (vp != NULL) {
4096		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4097		if (nd.ni_dvp == vp)
4098			vrele(nd.ni_dvp);
4099		else
4100			vput(nd.ni_dvp);
4101		vrele(vp);
4102		pathbuf_destroy(pb);
4103		return (EEXIST);
4104	}
4105	vattr_null(&vattr);
4106	vattr.va_type = VDIR;
4107	/* We will read cwdi->cwdi_cmask unlocked. */
4108	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4109	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4110	if (!error)
4111		vput(nd.ni_vp);
4112	pathbuf_destroy(pb);
4113	return (error);
4114}
4115
4116/*
4117 * Remove a directory file.
4118 */
4119/* ARGSUSED */
4120int
4121sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4122{
4123	/* {
4124		syscallarg(const char *) path;
4125	} */
4126	struct vnode *vp;
4127	int error;
4128	struct pathbuf *pb;
4129	struct nameidata nd;
4130
4131	error = pathbuf_copyin(SCARG(uap, path), &pb);
4132	if (error) {
4133		return error;
4134	}
4135	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
4136	if ((error = namei(&nd)) != 0) {
4137		pathbuf_destroy(pb);
4138		return error;
4139	}
4140	vp = nd.ni_vp;
4141	if (vp->v_type != VDIR) {
4142		error = ENOTDIR;
4143		goto out;
4144	}
4145	/*
4146	 * No rmdir "." please.
4147	 */
4148	if (nd.ni_dvp == vp) {
4149		error = EINVAL;
4150		goto out;
4151	}
4152	/*
4153	 * The root of a mounted filesystem cannot be deleted.
4154	 */
4155	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
4156		error = EBUSY;
4157		goto out;
4158	}
4159	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
4160	pathbuf_destroy(pb);
4161	return (error);
4162
4163out:
4164	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4165	if (nd.ni_dvp == vp)
4166		vrele(nd.ni_dvp);
4167	else
4168		vput(nd.ni_dvp);
4169	vput(vp);
4170	pathbuf_destroy(pb);
4171	return (error);
4172}
4173
4174/*
4175 * Read a block of directory entries in a file system independent format.
4176 */
4177int
4178sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4179{
4180	/* {
4181		syscallarg(int) fd;
4182		syscallarg(char *) buf;
4183		syscallarg(size_t) count;
4184	} */
4185	file_t *fp;
4186	int error, done;
4187
4188	/* fd_getvnode() will use the descriptor for us */
4189	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4190		return (error);
4191	if ((fp->f_flag & FREAD) == 0) {
4192		error = EBADF;
4193		goto out;
4194	}
4195	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4196			SCARG(uap, count), &done, l, 0, 0);
4197	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4198	*retval = done;
4199 out:
4200	fd_putfile(SCARG(uap, fd));
4201	return (error);
4202}
4203
4204/*
4205 * Set the mode mask for creation of filesystem nodes.
4206 */
4207int
4208sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4209{
4210	/* {
4211		syscallarg(mode_t) newmask;
4212	} */
4213	struct proc *p = l->l_proc;
4214	struct cwdinfo *cwdi;
4215
4216	/*
4217	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
4218	 * important is that we serialize changes to the mask.  The
4219	 * rw_exit() will issue a write memory barrier on our behalf,
4220	 * and force the changes out to other CPUs (as it must use an
4221	 * atomic operation, draining the local CPU's store buffers).
4222	 */
4223	cwdi = p->p_cwdi;
4224	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
4225	*retval = cwdi->cwdi_cmask;
4226	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
4227	rw_exit(&cwdi->cwdi_lock);
4228
4229	return (0);
4230}
4231
4232int
4233dorevoke(struct vnode *vp, kauth_cred_t cred)
4234{
4235	struct vattr vattr;
4236	int error;
4237
4238	vn_lock(vp, LK_SHARED | LK_RETRY);
4239	error = VOP_GETATTR(vp, &vattr, cred);
4240	VOP_UNLOCK(vp);
4241	if (error != 0)
4242		return error;
4243	if (kauth_cred_geteuid(cred) == vattr.va_uid ||
4244	    (error = kauth_authorize_generic(cred,
4245	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
4246		VOP_REVOKE(vp, REVOKEALL);
4247	return (error);
4248}
4249
4250/*
4251 * Void all references to file by ripping underlying filesystem
4252 * away from vnode.
4253 */
4254/* ARGSUSED */
4255int
4256sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4257{
4258	/* {
4259		syscallarg(const char *) path;
4260	} */
4261	struct vnode *vp;
4262	int error;
4263
4264	error = namei_simple_user(SCARG(uap, path),
4265				NSM_FOLLOW_TRYEMULROOT, &vp);
4266	if (error != 0)
4267		return (error);
4268	error = dorevoke(vp, l->l_cred);
4269	vrele(vp);
4270	return (error);
4271}
4272