vfs_syscalls.c revision 230124
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/9/sys/kern/vfs_syscalls.c 230124 2012-01-15 00:46:29Z kib $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capability.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/sdt.h>
65#include <sys/stat.h>
66#include <sys/sx.h>
67#include <sys/unistd.h>
68#include <sys/vnode.h>
69#include <sys/priv.h>
70#include <sys/proc.h>
71#include <sys/dirent.h>
72#include <sys/jail.h>
73#include <sys/syscallsubr.h>
74#include <sys/sysctl.h>
75#ifdef KTRACE
76#include <sys/ktrace.h>
77#endif
78
79#include <machine/stdarg.h>
80
81#include <security/audit/audit.h>
82#include <security/mac/mac_framework.h>
83
84#include <vm/vm.h>
85#include <vm/vm_object.h>
86#include <vm/vm_page.h>
87#include <vm/uma.h>
88
89#include <ufs/ufs/quota.h>
90
91static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
92
93SDT_PROVIDER_DEFINE(vfs);
94SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
95SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
96SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
97SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
98SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
99SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
100
101static int chroot_refuse_vdir_fds(struct filedesc *fdp);
102static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
103static int setfflags(struct thread *td, struct vnode *, int);
104static int setutimes(struct thread *td, struct vnode *,
105    const struct timespec *, int, int);
106static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
107    struct thread *td);
108
109/*
110 * The module initialization routine for POSIX asynchronous I/O will
111 * set this to the version of AIO that it implements.  (Zero means
112 * that it is not implemented.)  This value is used here by pathconf()
113 * and in kern_descrip.c by fpathconf().
114 */
115int async_io_version;
116
117#ifdef DEBUG
118static int syncprt = 0;
119SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
120#endif
121
122/*
123 * Sync each mounted filesystem.
124 */
125#ifndef _SYS_SYSPROTO_H_
126struct sync_args {
127	int     dummy;
128};
129#endif
130/* ARGSUSED */
131int
132sys_sync(td, uap)
133	struct thread *td;
134	struct sync_args *uap;
135{
136	struct mount *mp, *nmp;
137	int vfslocked;
138
139	mtx_lock(&mountlist_mtx);
140	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
141		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
142			nmp = TAILQ_NEXT(mp, mnt_list);
143			continue;
144		}
145		vfslocked = VFS_LOCK_GIANT(mp);
146		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148			MNT_ILOCK(mp);
149			mp->mnt_noasync++;
150			mp->mnt_kern_flag &= ~MNTK_ASYNC;
151			MNT_IUNLOCK(mp);
152			vfs_msync(mp, MNT_NOWAIT);
153			VFS_SYNC(mp, MNT_NOWAIT);
154			MNT_ILOCK(mp);
155			mp->mnt_noasync--;
156			if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
157			    mp->mnt_noasync == 0)
158				mp->mnt_kern_flag |= MNTK_ASYNC;
159			MNT_IUNLOCK(mp);
160			vn_finished_write(mp);
161		}
162		VFS_UNLOCK_GIANT(vfslocked);
163		mtx_lock(&mountlist_mtx);
164		nmp = TAILQ_NEXT(mp, mnt_list);
165		vfs_unbusy(mp);
166	}
167	mtx_unlock(&mountlist_mtx);
168	return (0);
169}
170
171/*
172 * Change filesystem quotas.
173 */
174#ifndef _SYS_SYSPROTO_H_
175struct quotactl_args {
176	char *path;
177	int cmd;
178	int uid;
179	caddr_t arg;
180};
181#endif
182int
183sys_quotactl(td, uap)
184	struct thread *td;
185	register struct quotactl_args /* {
186		char *path;
187		int cmd;
188		int uid;
189		caddr_t arg;
190	} */ *uap;
191{
192	struct mount *mp;
193	int vfslocked;
194	int error;
195	struct nameidata nd;
196
197	AUDIT_ARG_CMD(uap->cmd);
198	AUDIT_ARG_UID(uap->uid);
199	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
200		return (EPERM);
201	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
202	   UIO_USERSPACE, uap->path, td);
203	if ((error = namei(&nd)) != 0)
204		return (error);
205	vfslocked = NDHASGIANT(&nd);
206	NDFREE(&nd, NDF_ONLY_PNBUF);
207	mp = nd.ni_vp->v_mount;
208	vfs_ref(mp);
209	vput(nd.ni_vp);
210	error = vfs_busy(mp, 0);
211	vfs_rel(mp);
212	if (error) {
213		VFS_UNLOCK_GIANT(vfslocked);
214		return (error);
215	}
216	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
217
218	/*
219	 * Since quota on operation typically needs to open quota
220	 * file, the Q_QUOTAON handler needs to unbusy the mount point
221	 * before calling into namei.  Otherwise, unmount might be
222	 * started between two vfs_busy() invocations (first is our,
223	 * second is from mount point cross-walk code in lookup()),
224	 * causing deadlock.
225	 *
226	 * Require that Q_QUOTAON handles the vfs_busy() reference on
227	 * its own, always returning with ubusied mount point.
228	 */
229	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
230		vfs_unbusy(mp);
231	VFS_UNLOCK_GIANT(vfslocked);
232	return (error);
233}
234
235/*
236 * Used by statfs conversion routines to scale the block size up if
237 * necessary so that all of the block counts are <= 'max_size'.  Note
238 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
239 * value of 'n'.
240 */
241void
242statfs_scale_blocks(struct statfs *sf, long max_size)
243{
244	uint64_t count;
245	int shift;
246
247	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
248
249	/*
250	 * Attempt to scale the block counts to give a more accurate
251	 * overview to userland of the ratio of free space to used
252	 * space.  To do this, find the largest block count and compute
253	 * a divisor that lets it fit into a signed integer <= max_size.
254	 */
255	if (sf->f_bavail < 0)
256		count = -sf->f_bavail;
257	else
258		count = sf->f_bavail;
259	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
260	if (count <= max_size)
261		return;
262
263	count >>= flsl(max_size);
264	shift = 0;
265	while (count > 0) {
266		shift++;
267		count >>=1;
268	}
269
270	sf->f_bsize <<= shift;
271	sf->f_blocks >>= shift;
272	sf->f_bfree >>= shift;
273	sf->f_bavail >>= shift;
274}
275
276/*
277 * Get filesystem statistics.
278 */
279#ifndef _SYS_SYSPROTO_H_
280struct statfs_args {
281	char *path;
282	struct statfs *buf;
283};
284#endif
285int
286sys_statfs(td, uap)
287	struct thread *td;
288	register struct statfs_args /* {
289		char *path;
290		struct statfs *buf;
291	} */ *uap;
292{
293	struct statfs sf;
294	int error;
295
296	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
297	if (error == 0)
298		error = copyout(&sf, uap->buf, sizeof(sf));
299	return (error);
300}
301
302int
303kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
304    struct statfs *buf)
305{
306	struct mount *mp;
307	struct statfs *sp, sb;
308	int vfslocked;
309	int error;
310	struct nameidata nd;
311
312	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
313	    AUDITVNODE1, pathseg, path, td);
314	error = namei(&nd);
315	if (error)
316		return (error);
317	vfslocked = NDHASGIANT(&nd);
318	mp = nd.ni_vp->v_mount;
319	vfs_ref(mp);
320	NDFREE(&nd, NDF_ONLY_PNBUF);
321	vput(nd.ni_vp);
322	error = vfs_busy(mp, 0);
323	vfs_rel(mp);
324	if (error) {
325		VFS_UNLOCK_GIANT(vfslocked);
326		return (error);
327	}
328#ifdef MAC
329	error = mac_mount_check_stat(td->td_ucred, mp);
330	if (error)
331		goto out;
332#endif
333	/*
334	 * Set these in case the underlying filesystem fails to do so.
335	 */
336	sp = &mp->mnt_stat;
337	sp->f_version = STATFS_VERSION;
338	sp->f_namemax = NAME_MAX;
339	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
340	error = VFS_STATFS(mp, sp);
341	if (error)
342		goto out;
343	if (priv_check(td, PRIV_VFS_GENERATION)) {
344		bcopy(sp, &sb, sizeof(sb));
345		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
346		prison_enforce_statfs(td->td_ucred, mp, &sb);
347		sp = &sb;
348	}
349	*buf = *sp;
350out:
351	vfs_unbusy(mp);
352	VFS_UNLOCK_GIANT(vfslocked);
353	return (error);
354}
355
356/*
357 * Get filesystem statistics.
358 */
359#ifndef _SYS_SYSPROTO_H_
360struct fstatfs_args {
361	int fd;
362	struct statfs *buf;
363};
364#endif
365int
366sys_fstatfs(td, uap)
367	struct thread *td;
368	register struct fstatfs_args /* {
369		int fd;
370		struct statfs *buf;
371	} */ *uap;
372{
373	struct statfs sf;
374	int error;
375
376	error = kern_fstatfs(td, uap->fd, &sf);
377	if (error == 0)
378		error = copyout(&sf, uap->buf, sizeof(sf));
379	return (error);
380}
381
382int
383kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
384{
385	struct file *fp;
386	struct mount *mp;
387	struct statfs *sp, sb;
388	int vfslocked;
389	struct vnode *vp;
390	int error;
391
392	AUDIT_ARG_FD(fd);
393	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
394	if (error)
395		return (error);
396	vp = fp->f_vnode;
397	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
398	vn_lock(vp, LK_SHARED | LK_RETRY);
399#ifdef AUDIT
400	AUDIT_ARG_VNODE1(vp);
401#endif
402	mp = vp->v_mount;
403	if (mp)
404		vfs_ref(mp);
405	VOP_UNLOCK(vp, 0);
406	fdrop(fp, td);
407	if (mp == NULL) {
408		error = EBADF;
409		goto out;
410	}
411	error = vfs_busy(mp, 0);
412	vfs_rel(mp);
413	if (error) {
414		VFS_UNLOCK_GIANT(vfslocked);
415		return (error);
416	}
417#ifdef MAC
418	error = mac_mount_check_stat(td->td_ucred, mp);
419	if (error)
420		goto out;
421#endif
422	/*
423	 * Set these in case the underlying filesystem fails to do so.
424	 */
425	sp = &mp->mnt_stat;
426	sp->f_version = STATFS_VERSION;
427	sp->f_namemax = NAME_MAX;
428	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
429	error = VFS_STATFS(mp, sp);
430	if (error)
431		goto out;
432	if (priv_check(td, PRIV_VFS_GENERATION)) {
433		bcopy(sp, &sb, sizeof(sb));
434		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
435		prison_enforce_statfs(td->td_ucred, mp, &sb);
436		sp = &sb;
437	}
438	*buf = *sp;
439out:
440	if (mp)
441		vfs_unbusy(mp);
442	VFS_UNLOCK_GIANT(vfslocked);
443	return (error);
444}
445
446/*
447 * Get statistics on all filesystems.
448 */
449#ifndef _SYS_SYSPROTO_H_
450struct getfsstat_args {
451	struct statfs *buf;
452	long bufsize;
453	int flags;
454};
455#endif
456int
457sys_getfsstat(td, uap)
458	struct thread *td;
459	register struct getfsstat_args /* {
460		struct statfs *buf;
461		long bufsize;
462		int flags;
463	} */ *uap;
464{
465
466	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
467	    uap->flags));
468}
469
470/*
471 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
472 * 	The caller is responsible for freeing memory which will be allocated
473 *	in '*buf'.
474 */
475int
476kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
477    enum uio_seg bufseg, int flags)
478{
479	struct mount *mp, *nmp;
480	struct statfs *sfsp, *sp, sb;
481	size_t count, maxcount;
482	int vfslocked;
483	int error;
484
485	maxcount = bufsize / sizeof(struct statfs);
486	if (bufsize == 0)
487		sfsp = NULL;
488	else if (bufseg == UIO_USERSPACE)
489		sfsp = *buf;
490	else /* if (bufseg == UIO_SYSSPACE) */ {
491		count = 0;
492		mtx_lock(&mountlist_mtx);
493		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
494			count++;
495		}
496		mtx_unlock(&mountlist_mtx);
497		if (maxcount > count)
498			maxcount = count;
499		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
500		    M_WAITOK);
501	}
502	count = 0;
503	mtx_lock(&mountlist_mtx);
504	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
505		if (prison_canseemount(td->td_ucred, mp) != 0) {
506			nmp = TAILQ_NEXT(mp, mnt_list);
507			continue;
508		}
509#ifdef MAC
510		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
511			nmp = TAILQ_NEXT(mp, mnt_list);
512			continue;
513		}
514#endif
515		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
516			nmp = TAILQ_NEXT(mp, mnt_list);
517			continue;
518		}
519		vfslocked = VFS_LOCK_GIANT(mp);
520		if (sfsp && count < maxcount) {
521			sp = &mp->mnt_stat;
522			/*
523			 * Set these in case the underlying filesystem
524			 * fails to do so.
525			 */
526			sp->f_version = STATFS_VERSION;
527			sp->f_namemax = NAME_MAX;
528			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
529			/*
530			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
531			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
532			 * overrides MNT_WAIT.
533			 */
534			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
535			    (flags & MNT_WAIT)) &&
536			    (error = VFS_STATFS(mp, sp))) {
537				VFS_UNLOCK_GIANT(vfslocked);
538				mtx_lock(&mountlist_mtx);
539				nmp = TAILQ_NEXT(mp, mnt_list);
540				vfs_unbusy(mp);
541				continue;
542			}
543			if (priv_check(td, PRIV_VFS_GENERATION)) {
544				bcopy(sp, &sb, sizeof(sb));
545				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
546				prison_enforce_statfs(td->td_ucred, mp, &sb);
547				sp = &sb;
548			}
549			if (bufseg == UIO_SYSSPACE)
550				bcopy(sp, sfsp, sizeof(*sp));
551			else /* if (bufseg == UIO_USERSPACE) */ {
552				error = copyout(sp, sfsp, sizeof(*sp));
553				if (error) {
554					vfs_unbusy(mp);
555					VFS_UNLOCK_GIANT(vfslocked);
556					return (error);
557				}
558			}
559			sfsp++;
560		}
561		VFS_UNLOCK_GIANT(vfslocked);
562		count++;
563		mtx_lock(&mountlist_mtx);
564		nmp = TAILQ_NEXT(mp, mnt_list);
565		vfs_unbusy(mp);
566	}
567	mtx_unlock(&mountlist_mtx);
568	if (sfsp && count > maxcount)
569		td->td_retval[0] = maxcount;
570	else
571		td->td_retval[0] = count;
572	return (0);
573}
574
575#ifdef COMPAT_FREEBSD4
576/*
577 * Get old format filesystem statistics.
578 */
579static void cvtstatfs(struct statfs *, struct ostatfs *);
580
581#ifndef _SYS_SYSPROTO_H_
582struct freebsd4_statfs_args {
583	char *path;
584	struct ostatfs *buf;
585};
586#endif
587int
588freebsd4_statfs(td, uap)
589	struct thread *td;
590	struct freebsd4_statfs_args /* {
591		char *path;
592		struct ostatfs *buf;
593	} */ *uap;
594{
595	struct ostatfs osb;
596	struct statfs sf;
597	int error;
598
599	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
600	if (error)
601		return (error);
602	cvtstatfs(&sf, &osb);
603	return (copyout(&osb, uap->buf, sizeof(osb)));
604}
605
606/*
607 * Get filesystem statistics.
608 */
609#ifndef _SYS_SYSPROTO_H_
610struct freebsd4_fstatfs_args {
611	int fd;
612	struct ostatfs *buf;
613};
614#endif
615int
616freebsd4_fstatfs(td, uap)
617	struct thread *td;
618	struct freebsd4_fstatfs_args /* {
619		int fd;
620		struct ostatfs *buf;
621	} */ *uap;
622{
623	struct ostatfs osb;
624	struct statfs sf;
625	int error;
626
627	error = kern_fstatfs(td, uap->fd, &sf);
628	if (error)
629		return (error);
630	cvtstatfs(&sf, &osb);
631	return (copyout(&osb, uap->buf, sizeof(osb)));
632}
633
634/*
635 * Get statistics on all filesystems.
636 */
637#ifndef _SYS_SYSPROTO_H_
638struct freebsd4_getfsstat_args {
639	struct ostatfs *buf;
640	long bufsize;
641	int flags;
642};
643#endif
644int
645freebsd4_getfsstat(td, uap)
646	struct thread *td;
647	register struct freebsd4_getfsstat_args /* {
648		struct ostatfs *buf;
649		long bufsize;
650		int flags;
651	} */ *uap;
652{
653	struct statfs *buf, *sp;
654	struct ostatfs osb;
655	size_t count, size;
656	int error;
657
658	count = uap->bufsize / sizeof(struct ostatfs);
659	size = count * sizeof(struct statfs);
660	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
661	if (size > 0) {
662		count = td->td_retval[0];
663		sp = buf;
664		while (count > 0 && error == 0) {
665			cvtstatfs(sp, &osb);
666			error = copyout(&osb, uap->buf, sizeof(osb));
667			sp++;
668			uap->buf++;
669			count--;
670		}
671		free(buf, M_TEMP);
672	}
673	return (error);
674}
675
676/*
677 * Implement fstatfs() for (NFS) file handles.
678 */
679#ifndef _SYS_SYSPROTO_H_
680struct freebsd4_fhstatfs_args {
681	struct fhandle *u_fhp;
682	struct ostatfs *buf;
683};
684#endif
685int
686freebsd4_fhstatfs(td, uap)
687	struct thread *td;
688	struct freebsd4_fhstatfs_args /* {
689		struct fhandle *u_fhp;
690		struct ostatfs *buf;
691	} */ *uap;
692{
693	struct ostatfs osb;
694	struct statfs sf;
695	fhandle_t fh;
696	int error;
697
698	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
699	if (error)
700		return (error);
701	error = kern_fhstatfs(td, fh, &sf);
702	if (error)
703		return (error);
704	cvtstatfs(&sf, &osb);
705	return (copyout(&osb, uap->buf, sizeof(osb)));
706}
707
708/*
709 * Convert a new format statfs structure to an old format statfs structure.
710 */
711static void
712cvtstatfs(nsp, osp)
713	struct statfs *nsp;
714	struct ostatfs *osp;
715{
716
717	statfs_scale_blocks(nsp, LONG_MAX);
718	bzero(osp, sizeof(*osp));
719	osp->f_bsize = nsp->f_bsize;
720	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
721	osp->f_blocks = nsp->f_blocks;
722	osp->f_bfree = nsp->f_bfree;
723	osp->f_bavail = nsp->f_bavail;
724	osp->f_files = MIN(nsp->f_files, LONG_MAX);
725	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
726	osp->f_owner = nsp->f_owner;
727	osp->f_type = nsp->f_type;
728	osp->f_flags = nsp->f_flags;
729	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
730	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
731	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
732	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
733	strlcpy(osp->f_fstypename, nsp->f_fstypename,
734	    MIN(MFSNAMELEN, OMFSNAMELEN));
735	strlcpy(osp->f_mntonname, nsp->f_mntonname,
736	    MIN(MNAMELEN, OMNAMELEN));
737	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
738	    MIN(MNAMELEN, OMNAMELEN));
739	osp->f_fsid = nsp->f_fsid;
740}
741#endif /* COMPAT_FREEBSD4 */
742
743/*
744 * Change current working directory to a given file descriptor.
745 */
746#ifndef _SYS_SYSPROTO_H_
747struct fchdir_args {
748	int	fd;
749};
750#endif
751int
752sys_fchdir(td, uap)
753	struct thread *td;
754	struct fchdir_args /* {
755		int fd;
756	} */ *uap;
757{
758	register struct filedesc *fdp = td->td_proc->p_fd;
759	struct vnode *vp, *tdp, *vpold;
760	struct mount *mp;
761	struct file *fp;
762	int vfslocked;
763	int error;
764
765	AUDIT_ARG_FD(uap->fd);
766	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
767		return (error);
768	vp = fp->f_vnode;
769	VREF(vp);
770	fdrop(fp, td);
771	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
772	vn_lock(vp, LK_SHARED | LK_RETRY);
773	AUDIT_ARG_VNODE1(vp);
774	error = change_dir(vp, td);
775	while (!error && (mp = vp->v_mountedhere) != NULL) {
776		int tvfslocked;
777		if (vfs_busy(mp, 0))
778			continue;
779		tvfslocked = VFS_LOCK_GIANT(mp);
780		error = VFS_ROOT(mp, LK_SHARED, &tdp);
781		vfs_unbusy(mp);
782		if (error) {
783			VFS_UNLOCK_GIANT(tvfslocked);
784			break;
785		}
786		vput(vp);
787		VFS_UNLOCK_GIANT(vfslocked);
788		vp = tdp;
789		vfslocked = tvfslocked;
790	}
791	if (error) {
792		vput(vp);
793		VFS_UNLOCK_GIANT(vfslocked);
794		return (error);
795	}
796	VOP_UNLOCK(vp, 0);
797	VFS_UNLOCK_GIANT(vfslocked);
798	FILEDESC_XLOCK(fdp);
799	vpold = fdp->fd_cdir;
800	fdp->fd_cdir = vp;
801	FILEDESC_XUNLOCK(fdp);
802	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
803	vrele(vpold);
804	VFS_UNLOCK_GIANT(vfslocked);
805	return (0);
806}
807
808/*
809 * Change current working directory (``.'').
810 */
811#ifndef _SYS_SYSPROTO_H_
812struct chdir_args {
813	char	*path;
814};
815#endif
816int
817sys_chdir(td, uap)
818	struct thread *td;
819	struct chdir_args /* {
820		char *path;
821	} */ *uap;
822{
823
824	return (kern_chdir(td, uap->path, UIO_USERSPACE));
825}
826
827int
828kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
829{
830	register struct filedesc *fdp = td->td_proc->p_fd;
831	int error;
832	struct nameidata nd;
833	struct vnode *vp;
834	int vfslocked;
835
836	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
837	    MPSAFE, pathseg, path, td);
838	if ((error = namei(&nd)) != 0)
839		return (error);
840	vfslocked = NDHASGIANT(&nd);
841	if ((error = change_dir(nd.ni_vp, td)) != 0) {
842		vput(nd.ni_vp);
843		VFS_UNLOCK_GIANT(vfslocked);
844		NDFREE(&nd, NDF_ONLY_PNBUF);
845		return (error);
846	}
847	VOP_UNLOCK(nd.ni_vp, 0);
848	VFS_UNLOCK_GIANT(vfslocked);
849	NDFREE(&nd, NDF_ONLY_PNBUF);
850	FILEDESC_XLOCK(fdp);
851	vp = fdp->fd_cdir;
852	fdp->fd_cdir = nd.ni_vp;
853	FILEDESC_XUNLOCK(fdp);
854	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
855	vrele(vp);
856	VFS_UNLOCK_GIANT(vfslocked);
857	return (0);
858}
859
860/*
861 * Helper function for raised chroot(2) security function:  Refuse if
862 * any filedescriptors are open directories.
863 */
864static int
865chroot_refuse_vdir_fds(fdp)
866	struct filedesc *fdp;
867{
868	struct vnode *vp;
869	struct file *fp;
870	int fd;
871
872	FILEDESC_LOCK_ASSERT(fdp);
873
874	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
875		fp = fget_locked(fdp, fd);
876		if (fp == NULL)
877			continue;
878		if (fp->f_type == DTYPE_VNODE) {
879			vp = fp->f_vnode;
880			if (vp->v_type == VDIR)
881				return (EPERM);
882		}
883	}
884	return (0);
885}
886
887/*
888 * This sysctl determines if we will allow a process to chroot(2) if it
889 * has a directory open:
890 *	0: disallowed for all processes.
891 *	1: allowed for processes that were not already chroot(2)'ed.
892 *	2: allowed for all processes.
893 */
894
895static int chroot_allow_open_directories = 1;
896
897SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
898     &chroot_allow_open_directories, 0, "");
899
900/*
901 * Change notion of root (``/'') directory.
902 */
903#ifndef _SYS_SYSPROTO_H_
904struct chroot_args {
905	char	*path;
906};
907#endif
908int
909sys_chroot(td, uap)
910	struct thread *td;
911	struct chroot_args /* {
912		char *path;
913	} */ *uap;
914{
915	int error;
916	struct nameidata nd;
917	int vfslocked;
918
919	error = priv_check(td, PRIV_VFS_CHROOT);
920	if (error)
921		return (error);
922	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
923	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
924	error = namei(&nd);
925	if (error)
926		goto error;
927	vfslocked = NDHASGIANT(&nd);
928	if ((error = change_dir(nd.ni_vp, td)) != 0)
929		goto e_vunlock;
930#ifdef MAC
931	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
932		goto e_vunlock;
933#endif
934	VOP_UNLOCK(nd.ni_vp, 0);
935	error = change_root(nd.ni_vp, td);
936	vrele(nd.ni_vp);
937	VFS_UNLOCK_GIANT(vfslocked);
938	NDFREE(&nd, NDF_ONLY_PNBUF);
939	return (error);
940e_vunlock:
941	vput(nd.ni_vp);
942	VFS_UNLOCK_GIANT(vfslocked);
943error:
944	NDFREE(&nd, NDF_ONLY_PNBUF);
945	return (error);
946}
947
948/*
949 * Common routine for chroot and chdir.  Callers must provide a locked vnode
950 * instance.
951 */
952int
953change_dir(vp, td)
954	struct vnode *vp;
955	struct thread *td;
956{
957	int error;
958
959	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
960	if (vp->v_type != VDIR)
961		return (ENOTDIR);
962#ifdef MAC
963	error = mac_vnode_check_chdir(td->td_ucred, vp);
964	if (error)
965		return (error);
966#endif
967	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
968	return (error);
969}
970
971/*
972 * Common routine for kern_chroot() and jail_attach().  The caller is
973 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
974 * authorize this operation.
975 */
976int
977change_root(vp, td)
978	struct vnode *vp;
979	struct thread *td;
980{
981	struct filedesc *fdp;
982	struct vnode *oldvp;
983	int vfslocked;
984	int error;
985
986	VFS_ASSERT_GIANT(vp->v_mount);
987	fdp = td->td_proc->p_fd;
988	FILEDESC_XLOCK(fdp);
989	if (chroot_allow_open_directories == 0 ||
990	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
991		error = chroot_refuse_vdir_fds(fdp);
992		if (error) {
993			FILEDESC_XUNLOCK(fdp);
994			return (error);
995		}
996	}
997	oldvp = fdp->fd_rdir;
998	fdp->fd_rdir = vp;
999	VREF(fdp->fd_rdir);
1000	if (!fdp->fd_jdir) {
1001		fdp->fd_jdir = vp;
1002		VREF(fdp->fd_jdir);
1003	}
1004	FILEDESC_XUNLOCK(fdp);
1005	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
1006	vrele(oldvp);
1007	VFS_UNLOCK_GIANT(vfslocked);
1008	return (0);
1009}
1010
1011static __inline cap_rights_t
1012flags_to_rights(int flags)
1013{
1014	cap_rights_t rights = 0;
1015
1016	switch ((flags & O_ACCMODE)) {
1017	case O_RDONLY:
1018		rights |= CAP_READ;
1019		break;
1020
1021	case O_RDWR:
1022		rights |= CAP_READ;
1023		/* fall through */
1024
1025	case O_WRONLY:
1026		rights |= CAP_WRITE;
1027		break;
1028
1029	case O_EXEC:
1030		rights |= CAP_FEXECVE;
1031		break;
1032	}
1033
1034	if (flags & O_CREAT)
1035		rights |= CAP_CREATE;
1036
1037	if (flags & O_TRUNC)
1038		rights |= CAP_FTRUNCATE;
1039
1040	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
1041		rights |= CAP_FLOCK;
1042
1043	return (rights);
1044}
1045
1046/*
1047 * Check permissions, allocate an open file structure, and call the device
1048 * open routine if any.
1049 */
1050#ifndef _SYS_SYSPROTO_H_
1051struct open_args {
1052	char	*path;
1053	int	flags;
1054	int	mode;
1055};
1056#endif
1057int
1058sys_open(td, uap)
1059	struct thread *td;
1060	register struct open_args /* {
1061		char *path;
1062		int flags;
1063		int mode;
1064	} */ *uap;
1065{
1066
1067	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1068}
1069
1070#ifndef _SYS_SYSPROTO_H_
1071struct openat_args {
1072	int	fd;
1073	char	*path;
1074	int	flag;
1075	int	mode;
1076};
1077#endif
1078int
1079sys_openat(struct thread *td, struct openat_args *uap)
1080{
1081
1082	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1083	    uap->mode));
1084}
1085
1086int
1087kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1088    int mode)
1089{
1090
1091	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1092}
1093
1094int
1095kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1096    int flags, int mode)
1097{
1098	struct proc *p = td->td_proc;
1099	struct filedesc *fdp = p->p_fd;
1100	struct file *fp;
1101	struct vnode *vp;
1102	int cmode;
1103	struct file *nfp;
1104	int type, indx = -1, error, error_open;
1105	struct flock lf;
1106	struct nameidata nd;
1107	int vfslocked;
1108	cap_rights_t rights_needed = CAP_LOOKUP;
1109
1110	AUDIT_ARG_FFLAGS(flags);
1111	AUDIT_ARG_MODE(mode);
1112	/* XXX: audit dirfd */
1113	rights_needed |= flags_to_rights(flags);
1114	/*
1115	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1116	 * may be specified.
1117	 */
1118	if (flags & O_EXEC) {
1119		if (flags & O_ACCMODE)
1120			return (EINVAL);
1121	} else if ((flags & O_ACCMODE) == O_ACCMODE)
1122		return (EINVAL);
1123	else
1124		flags = FFLAGS(flags);
1125
1126	/*
1127	 * allocate the file descriptor, but don't install a descriptor yet
1128	 */
1129	error = falloc_noinstall(td, &nfp);
1130	if (error)
1131		return (error);
1132	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
1133	fp = nfp;
1134	/* Set the flags early so the finit in devfs can pick them up. */
1135	fp->f_flag = flags & FMASK;
1136	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1137	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
1138	    path, fd, rights_needed, td);
1139	td->td_dupfd = -1;		/* XXX check for fdopen */
1140	error = vn_open(&nd, &flags, cmode, fp);
1141	if (error) {
1142		/*
1143		 * If the vn_open replaced the method vector, something
1144		 * wonderous happened deep below and we just pass it up
1145		 * pretending we know what we do.
1146		 */
1147		if (error == ENXIO && fp->f_ops != &badfileops)
1148			goto success;
1149
1150		/*
1151		 * handle special fdopen() case.  bleh.  dupfdopen() is
1152		 * responsible for dropping the old contents of ofiles[indx]
1153		 * if it succeeds.
1154		 *
1155		 * Don't do this for relative (capability) lookups; we don't
1156		 * understand exactly what would happen, and we don't think
1157		 * that it ever should.
1158		 */
1159		if ((nd.ni_strictrelative == 0) &&
1160		    (error == ENODEV || error == ENXIO) &&
1161		    (td->td_dupfd >= 0)) {
1162			/* XXX from fdopen */
1163			error_open = error;
1164			if ((error = finstall(td, fp, &indx, flags)) != 0)
1165				goto bad_unlocked;
1166			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
1167			    flags, error_open)) == 0)
1168				goto success;
1169		}
1170		/*
1171		 * Clean up the descriptor, but only if another thread hadn't
1172		 * replaced or closed it.
1173		 */
1174		if (indx != -1)
1175			fdclose(fdp, fp, indx, td);
1176		fdrop(fp, td);
1177
1178		if (error == ERESTART)
1179			error = EINTR;
1180		return (error);
1181	}
1182	td->td_dupfd = 0;
1183	vfslocked = NDHASGIANT(&nd);
1184	NDFREE(&nd, NDF_ONLY_PNBUF);
1185	vp = nd.ni_vp;
1186
1187	/*
1188	 * Store the vnode, for any f_type. Typically, the vnode use
1189	 * count is decremented by direct call to vn_closefile() for
1190	 * files that switched type in the cdevsw fdopen() method.
1191	 */
1192	fp->f_vnode = vp;
1193	/*
1194	 * If the file wasn't claimed by devfs bind it to the normal
1195	 * vnode operations here.
1196	 */
1197	if (fp->f_ops == &badfileops) {
1198		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1199		fp->f_seqcount = 1;
1200		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
1201	}
1202
1203	VOP_UNLOCK(vp, 0);
1204	if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
1205		lf.l_whence = SEEK_SET;
1206		lf.l_start = 0;
1207		lf.l_len = 0;
1208		if (flags & O_EXLOCK)
1209			lf.l_type = F_WRLCK;
1210		else
1211			lf.l_type = F_RDLCK;
1212		type = F_FLOCK;
1213		if ((flags & FNONBLOCK) == 0)
1214			type |= F_WAIT;
1215		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
1216			    type)) != 0)
1217			goto bad;
1218		atomic_set_int(&fp->f_flag, FHASLOCK);
1219	}
1220	if (flags & O_TRUNC) {
1221		error = fo_truncate(fp, 0, td->td_ucred, td);
1222		if (error)
1223			goto bad;
1224	}
1225	VFS_UNLOCK_GIANT(vfslocked);
1226success:
1227	/*
1228	 * If we haven't already installed the FD (for dupfdopen), do so now.
1229	 */
1230	if (indx == -1) {
1231#ifdef CAPABILITIES
1232		if (nd.ni_strictrelative == 1) {
1233			/*
1234			 * We are doing a strict relative lookup; wrap the
1235			 * result in a capability.
1236			 */
1237			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
1238			    &indx)) != 0)
1239				goto bad_unlocked;
1240		} else
1241#endif
1242			if ((error = finstall(td, fp, &indx, flags)) != 0)
1243				goto bad_unlocked;
1244
1245	}
1246
1247	/*
1248	 * Release our private reference, leaving the one associated with
1249	 * the descriptor table intact.
1250	 */
1251	fdrop(fp, td);
1252	td->td_retval[0] = indx;
1253	return (0);
1254bad:
1255	VFS_UNLOCK_GIANT(vfslocked);
1256bad_unlocked:
1257	if (indx != -1)
1258		fdclose(fdp, fp, indx, td);
1259	fdrop(fp, td);
1260	td->td_retval[0] = -1;
1261	return (error);
1262}
1263
1264#ifdef COMPAT_43
1265/*
1266 * Create a file.
1267 */
1268#ifndef _SYS_SYSPROTO_H_
1269struct ocreat_args {
1270	char	*path;
1271	int	mode;
1272};
1273#endif
1274int
1275ocreat(td, uap)
1276	struct thread *td;
1277	register struct ocreat_args /* {
1278		char *path;
1279		int mode;
1280	} */ *uap;
1281{
1282
1283	return (kern_open(td, uap->path, UIO_USERSPACE,
1284	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1285}
1286#endif /* COMPAT_43 */
1287
1288/*
1289 * Create a special file.
1290 */
1291#ifndef _SYS_SYSPROTO_H_
1292struct mknod_args {
1293	char	*path;
1294	int	mode;
1295	int	dev;
1296};
1297#endif
1298int
1299sys_mknod(td, uap)
1300	struct thread *td;
1301	register struct mknod_args /* {
1302		char *path;
1303		int mode;
1304		int dev;
1305	} */ *uap;
1306{
1307
1308	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1309}
1310
1311#ifndef _SYS_SYSPROTO_H_
1312struct mknodat_args {
1313	int	fd;
1314	char	*path;
1315	mode_t	mode;
1316	dev_t	dev;
1317};
1318#endif
1319int
1320sys_mknodat(struct thread *td, struct mknodat_args *uap)
1321{
1322
1323	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1324	    uap->dev));
1325}
1326
1327int
1328kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1329    int dev)
1330{
1331
1332	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1333}
1334
1335int
1336kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1337    int mode, int dev)
1338{
1339	struct vnode *vp;
1340	struct mount *mp;
1341	struct vattr vattr;
1342	int error;
1343	int whiteout = 0;
1344	struct nameidata nd;
1345	int vfslocked;
1346
1347	AUDIT_ARG_MODE(mode);
1348	AUDIT_ARG_DEV(dev);
1349	switch (mode & S_IFMT) {
1350	case S_IFCHR:
1351	case S_IFBLK:
1352		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1353		break;
1354	case S_IFMT:
1355		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1356		break;
1357	case S_IFWHT:
1358		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1359		break;
1360	case S_IFIFO:
1361		if (dev == 0)
1362			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1363		/* FALLTHROUGH */
1364	default:
1365		error = EINVAL;
1366		break;
1367	}
1368	if (error)
1369		return (error);
1370restart:
1371	bwillwrite();
1372	NDINIT_ATRIGHTS(&nd, CREATE,
1373	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
1374	    CAP_MKFIFO, td);
1375	if ((error = namei(&nd)) != 0)
1376		return (error);
1377	vfslocked = NDHASGIANT(&nd);
1378	vp = nd.ni_vp;
1379	if (vp != NULL) {
1380		NDFREE(&nd, NDF_ONLY_PNBUF);
1381		if (vp == nd.ni_dvp)
1382			vrele(nd.ni_dvp);
1383		else
1384			vput(nd.ni_dvp);
1385		vrele(vp);
1386		VFS_UNLOCK_GIANT(vfslocked);
1387		return (EEXIST);
1388	} else {
1389		VATTR_NULL(&vattr);
1390		vattr.va_mode = (mode & ALLPERMS) &
1391		    ~td->td_proc->p_fd->fd_cmask;
1392		vattr.va_rdev = dev;
1393		whiteout = 0;
1394
1395		switch (mode & S_IFMT) {
1396		case S_IFMT:	/* used by badsect to flag bad sectors */
1397			vattr.va_type = VBAD;
1398			break;
1399		case S_IFCHR:
1400			vattr.va_type = VCHR;
1401			break;
1402		case S_IFBLK:
1403			vattr.va_type = VBLK;
1404			break;
1405		case S_IFWHT:
1406			whiteout = 1;
1407			break;
1408		default:
1409			panic("kern_mknod: invalid mode");
1410		}
1411	}
1412	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1413		NDFREE(&nd, NDF_ONLY_PNBUF);
1414		vput(nd.ni_dvp);
1415		VFS_UNLOCK_GIANT(vfslocked);
1416		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1417			return (error);
1418		goto restart;
1419	}
1420#ifdef MAC
1421	if (error == 0 && !whiteout)
1422		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1423		    &nd.ni_cnd, &vattr);
1424#endif
1425	if (!error) {
1426		if (whiteout)
1427			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1428		else {
1429			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1430						&nd.ni_cnd, &vattr);
1431			if (error == 0)
1432				vput(nd.ni_vp);
1433		}
1434	}
1435	NDFREE(&nd, NDF_ONLY_PNBUF);
1436	vput(nd.ni_dvp);
1437	vn_finished_write(mp);
1438	VFS_UNLOCK_GIANT(vfslocked);
1439	return (error);
1440}
1441
1442/*
1443 * Create a named pipe.
1444 */
1445#ifndef _SYS_SYSPROTO_H_
1446struct mkfifo_args {
1447	char	*path;
1448	int	mode;
1449};
1450#endif
1451int
1452sys_mkfifo(td, uap)
1453	struct thread *td;
1454	register struct mkfifo_args /* {
1455		char *path;
1456		int mode;
1457	} */ *uap;
1458{
1459
1460	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1461}
1462
1463#ifndef _SYS_SYSPROTO_H_
1464struct mkfifoat_args {
1465	int	fd;
1466	char	*path;
1467	mode_t	mode;
1468};
1469#endif
1470int
1471sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1472{
1473
1474	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1475	    uap->mode));
1476}
1477
1478int
1479kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1480{
1481
1482	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1483}
1484
1485int
1486kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1487    int mode)
1488{
1489	struct mount *mp;
1490	struct vattr vattr;
1491	int error;
1492	struct nameidata nd;
1493	int vfslocked;
1494
1495	AUDIT_ARG_MODE(mode);
1496restart:
1497	bwillwrite();
1498	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1499	    pathseg, path, fd, td);
1500	if ((error = namei(&nd)) != 0)
1501		return (error);
1502	vfslocked = NDHASGIANT(&nd);
1503	if (nd.ni_vp != NULL) {
1504		NDFREE(&nd, NDF_ONLY_PNBUF);
1505		if (nd.ni_vp == nd.ni_dvp)
1506			vrele(nd.ni_dvp);
1507		else
1508			vput(nd.ni_dvp);
1509		vrele(nd.ni_vp);
1510		VFS_UNLOCK_GIANT(vfslocked);
1511		return (EEXIST);
1512	}
1513	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1514		NDFREE(&nd, NDF_ONLY_PNBUF);
1515		vput(nd.ni_dvp);
1516		VFS_UNLOCK_GIANT(vfslocked);
1517		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1518			return (error);
1519		goto restart;
1520	}
1521	VATTR_NULL(&vattr);
1522	vattr.va_type = VFIFO;
1523	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1524#ifdef MAC
1525	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1526	    &vattr);
1527	if (error)
1528		goto out;
1529#endif
1530	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1531	if (error == 0)
1532		vput(nd.ni_vp);
1533#ifdef MAC
1534out:
1535#endif
1536	vput(nd.ni_dvp);
1537	vn_finished_write(mp);
1538	VFS_UNLOCK_GIANT(vfslocked);
1539	NDFREE(&nd, NDF_ONLY_PNBUF);
1540	return (error);
1541}
1542
1543/*
1544 * Make a hard file link.
1545 */
1546#ifndef _SYS_SYSPROTO_H_
1547struct link_args {
1548	char	*path;
1549	char	*link;
1550};
1551#endif
1552int
1553sys_link(td, uap)
1554	struct thread *td;
1555	register struct link_args /* {
1556		char *path;
1557		char *link;
1558	} */ *uap;
1559{
1560
1561	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1562}
1563
1564#ifndef _SYS_SYSPROTO_H_
1565struct linkat_args {
1566	int	fd1;
1567	char	*path1;
1568	int	fd2;
1569	char	*path2;
1570	int	flag;
1571};
1572#endif
1573int
1574sys_linkat(struct thread *td, struct linkat_args *uap)
1575{
1576	int flag;
1577
1578	flag = uap->flag;
1579	if (flag & ~AT_SYMLINK_FOLLOW)
1580		return (EINVAL);
1581
1582	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1583	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1584}
1585
1586int hardlink_check_uid = 0;
1587SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1588    &hardlink_check_uid, 0,
1589    "Unprivileged processes cannot create hard links to files owned by other "
1590    "users");
1591static int hardlink_check_gid = 0;
1592SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1593    &hardlink_check_gid, 0,
1594    "Unprivileged processes cannot create hard links to files owned by other "
1595    "groups");
1596
1597static int
1598can_hardlink(struct vnode *vp, struct ucred *cred)
1599{
1600	struct vattr va;
1601	int error;
1602
1603	if (!hardlink_check_uid && !hardlink_check_gid)
1604		return (0);
1605
1606	error = VOP_GETATTR(vp, &va, cred);
1607	if (error != 0)
1608		return (error);
1609
1610	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1611		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1612		if (error)
1613			return (error);
1614	}
1615
1616	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1617		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1618		if (error)
1619			return (error);
1620	}
1621
1622	return (0);
1623}
1624
1625int
1626kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1627{
1628
1629	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1630}
1631
1632int
1633kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1634    enum uio_seg segflg, int follow)
1635{
1636	struct vnode *vp;
1637	struct mount *mp;
1638	struct nameidata nd;
1639	int vfslocked;
1640	int lvfslocked;
1641	int error;
1642
1643	bwillwrite();
1644	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
1645	    fd1, td);
1646
1647	if ((error = namei(&nd)) != 0)
1648		return (error);
1649	vfslocked = NDHASGIANT(&nd);
1650	NDFREE(&nd, NDF_ONLY_PNBUF);
1651	vp = nd.ni_vp;
1652	if (vp->v_type == VDIR) {
1653		vrele(vp);
1654		VFS_UNLOCK_GIANT(vfslocked);
1655		return (EPERM);		/* POSIX */
1656	}
1657	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1658		vrele(vp);
1659		VFS_UNLOCK_GIANT(vfslocked);
1660		return (error);
1661	}
1662	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
1663	    segflg, path2, fd2, td);
1664	if ((error = namei(&nd)) == 0) {
1665		lvfslocked = NDHASGIANT(&nd);
1666		if (nd.ni_vp != NULL) {
1667			if (nd.ni_dvp == nd.ni_vp)
1668				vrele(nd.ni_dvp);
1669			else
1670				vput(nd.ni_dvp);
1671			vrele(nd.ni_vp);
1672			error = EEXIST;
1673		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
1674		    == 0) {
1675			error = can_hardlink(vp, td->td_ucred);
1676			if (error == 0)
1677#ifdef MAC
1678				error = mac_vnode_check_link(td->td_ucred,
1679				    nd.ni_dvp, vp, &nd.ni_cnd);
1680			if (error == 0)
1681#endif
1682				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1683			VOP_UNLOCK(vp, 0);
1684			vput(nd.ni_dvp);
1685		}
1686		NDFREE(&nd, NDF_ONLY_PNBUF);
1687		VFS_UNLOCK_GIANT(lvfslocked);
1688	}
1689	vrele(vp);
1690	vn_finished_write(mp);
1691	VFS_UNLOCK_GIANT(vfslocked);
1692	return (error);
1693}
1694
1695/*
1696 * Make a symbolic link.
1697 */
1698#ifndef _SYS_SYSPROTO_H_
1699struct symlink_args {
1700	char	*path;
1701	char	*link;
1702};
1703#endif
1704int
1705sys_symlink(td, uap)
1706	struct thread *td;
1707	register struct symlink_args /* {
1708		char *path;
1709		char *link;
1710	} */ *uap;
1711{
1712
1713	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1714}
1715
1716#ifndef _SYS_SYSPROTO_H_
1717struct symlinkat_args {
1718	char	*path;
1719	int	fd;
1720	char	*path2;
1721};
1722#endif
1723int
1724sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1725{
1726
1727	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1728	    UIO_USERSPACE));
1729}
1730
1731int
1732kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1733{
1734
1735	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1736}
1737
1738int
1739kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1740    enum uio_seg segflg)
1741{
1742	struct mount *mp;
1743	struct vattr vattr;
1744	char *syspath;
1745	int error;
1746	struct nameidata nd;
1747	int vfslocked;
1748
1749	if (segflg == UIO_SYSSPACE) {
1750		syspath = path1;
1751	} else {
1752		syspath = uma_zalloc(namei_zone, M_WAITOK);
1753		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1754			goto out;
1755	}
1756	AUDIT_ARG_TEXT(syspath);
1757restart:
1758	bwillwrite();
1759	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
1760	    segflg, path2, fd, td);
1761	if ((error = namei(&nd)) != 0)
1762		goto out;
1763	vfslocked = NDHASGIANT(&nd);
1764	if (nd.ni_vp) {
1765		NDFREE(&nd, NDF_ONLY_PNBUF);
1766		if (nd.ni_vp == nd.ni_dvp)
1767			vrele(nd.ni_dvp);
1768		else
1769			vput(nd.ni_dvp);
1770		vrele(nd.ni_vp);
1771		VFS_UNLOCK_GIANT(vfslocked);
1772		error = EEXIST;
1773		goto out;
1774	}
1775	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1776		NDFREE(&nd, NDF_ONLY_PNBUF);
1777		vput(nd.ni_dvp);
1778		VFS_UNLOCK_GIANT(vfslocked);
1779		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1780			goto out;
1781		goto restart;
1782	}
1783	VATTR_NULL(&vattr);
1784	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1785#ifdef MAC
1786	vattr.va_type = VLNK;
1787	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1788	    &vattr);
1789	if (error)
1790		goto out2;
1791#endif
1792	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1793	if (error == 0)
1794		vput(nd.ni_vp);
1795#ifdef MAC
1796out2:
1797#endif
1798	NDFREE(&nd, NDF_ONLY_PNBUF);
1799	vput(nd.ni_dvp);
1800	vn_finished_write(mp);
1801	VFS_UNLOCK_GIANT(vfslocked);
1802out:
1803	if (segflg != UIO_SYSSPACE)
1804		uma_zfree(namei_zone, syspath);
1805	return (error);
1806}
1807
1808/*
1809 * Delete a whiteout from the filesystem.
1810 */
1811int
1812sys_undelete(td, uap)
1813	struct thread *td;
1814	register struct undelete_args /* {
1815		char *path;
1816	} */ *uap;
1817{
1818	int error;
1819	struct mount *mp;
1820	struct nameidata nd;
1821	int vfslocked;
1822
1823restart:
1824	bwillwrite();
1825	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
1826	    UIO_USERSPACE, uap->path, td);
1827	error = namei(&nd);
1828	if (error)
1829		return (error);
1830	vfslocked = NDHASGIANT(&nd);
1831
1832	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1833		NDFREE(&nd, NDF_ONLY_PNBUF);
1834		if (nd.ni_vp == nd.ni_dvp)
1835			vrele(nd.ni_dvp);
1836		else
1837			vput(nd.ni_dvp);
1838		if (nd.ni_vp)
1839			vrele(nd.ni_vp);
1840		VFS_UNLOCK_GIANT(vfslocked);
1841		return (EEXIST);
1842	}
1843	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1844		NDFREE(&nd, NDF_ONLY_PNBUF);
1845		vput(nd.ni_dvp);
1846		VFS_UNLOCK_GIANT(vfslocked);
1847		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1848			return (error);
1849		goto restart;
1850	}
1851	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1852	NDFREE(&nd, NDF_ONLY_PNBUF);
1853	vput(nd.ni_dvp);
1854	vn_finished_write(mp);
1855	VFS_UNLOCK_GIANT(vfslocked);
1856	return (error);
1857}
1858
1859/*
1860 * Delete a name from the filesystem.
1861 */
1862#ifndef _SYS_SYSPROTO_H_
1863struct unlink_args {
1864	char	*path;
1865};
1866#endif
1867int
1868sys_unlink(td, uap)
1869	struct thread *td;
1870	struct unlink_args /* {
1871		char *path;
1872	} */ *uap;
1873{
1874
1875	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1876}
1877
1878#ifndef _SYS_SYSPROTO_H_
1879struct unlinkat_args {
1880	int	fd;
1881	char	*path;
1882	int	flag;
1883};
1884#endif
1885int
1886sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1887{
1888	int flag = uap->flag;
1889	int fd = uap->fd;
1890	char *path = uap->path;
1891
1892	if (flag & ~AT_REMOVEDIR)
1893		return (EINVAL);
1894
1895	if (flag & AT_REMOVEDIR)
1896		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1897	else
1898		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1899}
1900
1901int
1902kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1903{
1904
1905	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1906}
1907
1908int
1909kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1910    ino_t oldinum)
1911{
1912	struct mount *mp;
1913	struct vnode *vp;
1914	int error;
1915	struct nameidata nd;
1916	struct stat sb;
1917	int vfslocked;
1918
1919restart:
1920	bwillwrite();
1921	NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
1922	    pathseg, path, fd, td);
1923	if ((error = namei(&nd)) != 0)
1924		return (error == EINVAL ? EPERM : error);
1925	vfslocked = NDHASGIANT(&nd);
1926	vp = nd.ni_vp;
1927	if (vp->v_type == VDIR && oldinum == 0) {
1928		error = EPERM;		/* POSIX */
1929	} else if (oldinum != 0 &&
1930		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1931		  sb.st_ino != oldinum) {
1932			error = EIDRM;	/* Identifier removed */
1933	} else {
1934		/*
1935		 * The root of a mounted filesystem cannot be deleted.
1936		 *
1937		 * XXX: can this only be a VDIR case?
1938		 */
1939		if (vp->v_vflag & VV_ROOT)
1940			error = EBUSY;
1941	}
1942	if (error == 0) {
1943		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1944			NDFREE(&nd, NDF_ONLY_PNBUF);
1945			vput(nd.ni_dvp);
1946			if (vp == nd.ni_dvp)
1947				vrele(vp);
1948			else
1949				vput(vp);
1950			VFS_UNLOCK_GIANT(vfslocked);
1951			if ((error = vn_start_write(NULL, &mp,
1952			    V_XSLEEP | PCATCH)) != 0)
1953				return (error);
1954			goto restart;
1955		}
1956#ifdef MAC
1957		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1958		    &nd.ni_cnd);
1959		if (error)
1960			goto out;
1961#endif
1962		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1963#ifdef MAC
1964out:
1965#endif
1966		vn_finished_write(mp);
1967	}
1968	NDFREE(&nd, NDF_ONLY_PNBUF);
1969	vput(nd.ni_dvp);
1970	if (vp == nd.ni_dvp)
1971		vrele(vp);
1972	else
1973		vput(vp);
1974	VFS_UNLOCK_GIANT(vfslocked);
1975	return (error);
1976}
1977
1978/*
1979 * Reposition read/write file offset.
1980 */
1981#ifndef _SYS_SYSPROTO_H_
1982struct lseek_args {
1983	int	fd;
1984	int	pad;
1985	off_t	offset;
1986	int	whence;
1987};
1988#endif
1989int
1990sys_lseek(td, uap)
1991	struct thread *td;
1992	register struct lseek_args /* {
1993		int fd;
1994		int pad;
1995		off_t offset;
1996		int whence;
1997	} */ *uap;
1998{
1999	struct ucred *cred = td->td_ucred;
2000	struct file *fp;
2001	struct vnode *vp;
2002	struct vattr vattr;
2003	off_t offset, size;
2004	int error, noneg;
2005	int vfslocked;
2006
2007	AUDIT_ARG_FD(uap->fd);
2008	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
2009		return (error);
2010	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
2011		fdrop(fp, td);
2012		return (ESPIPE);
2013	}
2014	vp = fp->f_vnode;
2015	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2016	noneg = (vp->v_type != VCHR);
2017	offset = uap->offset;
2018	switch (uap->whence) {
2019	case L_INCR:
2020		if (noneg &&
2021		    (fp->f_offset < 0 ||
2022		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
2023			error = EOVERFLOW;
2024			break;
2025		}
2026		offset += fp->f_offset;
2027		break;
2028	case L_XTND:
2029		vn_lock(vp, LK_SHARED | LK_RETRY);
2030		error = VOP_GETATTR(vp, &vattr, cred);
2031		VOP_UNLOCK(vp, 0);
2032		if (error)
2033			break;
2034
2035		/*
2036		 * If the file references a disk device, then fetch
2037		 * the media size and use that to determine the ending
2038		 * offset.
2039		 */
2040		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2041		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2042			vattr.va_size = size;
2043		if (noneg &&
2044		    (vattr.va_size > OFF_MAX ||
2045		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2046			error = EOVERFLOW;
2047			break;
2048		}
2049		offset += vattr.va_size;
2050		break;
2051	case L_SET:
2052		break;
2053	case SEEK_DATA:
2054		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2055		break;
2056	case SEEK_HOLE:
2057		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2058		break;
2059	default:
2060		error = EINVAL;
2061	}
2062	if (error == 0 && noneg && offset < 0)
2063		error = EINVAL;
2064	if (error != 0)
2065		goto drop;
2066	fp->f_offset = offset;
2067	VFS_KNOTE_UNLOCKED(vp, 0);
2068	*(off_t *)(td->td_retval) = fp->f_offset;
2069drop:
2070	fdrop(fp, td);
2071	VFS_UNLOCK_GIANT(vfslocked);
2072	return (error);
2073}
2074
2075#if defined(COMPAT_43)
2076/*
2077 * Reposition read/write file offset.
2078 */
2079#ifndef _SYS_SYSPROTO_H_
2080struct olseek_args {
2081	int	fd;
2082	long	offset;
2083	int	whence;
2084};
2085#endif
2086int
2087olseek(td, uap)
2088	struct thread *td;
2089	register struct olseek_args /* {
2090		int fd;
2091		long offset;
2092		int whence;
2093	} */ *uap;
2094{
2095	struct lseek_args /* {
2096		int fd;
2097		int pad;
2098		off_t offset;
2099		int whence;
2100	} */ nuap;
2101
2102	nuap.fd = uap->fd;
2103	nuap.offset = uap->offset;
2104	nuap.whence = uap->whence;
2105	return (sys_lseek(td, &nuap));
2106}
2107#endif /* COMPAT_43 */
2108
2109/* Version with the 'pad' argument */
2110int
2111freebsd6_lseek(td, uap)
2112	struct thread *td;
2113	register struct freebsd6_lseek_args *uap;
2114{
2115	struct lseek_args ouap;
2116
2117	ouap.fd = uap->fd;
2118	ouap.offset = uap->offset;
2119	ouap.whence = uap->whence;
2120	return (sys_lseek(td, &ouap));
2121}
2122
2123/*
2124 * Check access permissions using passed credentials.
2125 */
2126static int
2127vn_access(vp, user_flags, cred, td)
2128	struct vnode	*vp;
2129	int		user_flags;
2130	struct ucred	*cred;
2131	struct thread	*td;
2132{
2133	int error;
2134	accmode_t accmode;
2135
2136	/* Flags == 0 means only check for existence. */
2137	error = 0;
2138	if (user_flags) {
2139		accmode = 0;
2140		if (user_flags & R_OK)
2141			accmode |= VREAD;
2142		if (user_flags & W_OK)
2143			accmode |= VWRITE;
2144		if (user_flags & X_OK)
2145			accmode |= VEXEC;
2146#ifdef MAC
2147		error = mac_vnode_check_access(cred, vp, accmode);
2148		if (error)
2149			return (error);
2150#endif
2151		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2152			error = VOP_ACCESS(vp, accmode, cred, td);
2153	}
2154	return (error);
2155}
2156
2157/*
2158 * Check access permissions using "real" credentials.
2159 */
2160#ifndef _SYS_SYSPROTO_H_
2161struct access_args {
2162	char	*path;
2163	int	flags;
2164};
2165#endif
2166int
2167sys_access(td, uap)
2168	struct thread *td;
2169	register struct access_args /* {
2170		char *path;
2171		int flags;
2172	} */ *uap;
2173{
2174
2175	return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags));
2176}
2177
2178#ifndef _SYS_SYSPROTO_H_
2179struct faccessat_args {
2180	int	dirfd;
2181	char	*path;
2182	int	mode;
2183	int	flag;
2184}
2185#endif
2186int
2187sys_faccessat(struct thread *td, struct faccessat_args *uap)
2188{
2189
2190	if (uap->flag & ~AT_EACCESS)
2191		return (EINVAL);
2192	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2193	    uap->mode));
2194}
2195
2196int
2197kern_access(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2198{
2199
2200	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, mode));
2201}
2202
2203int
2204kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2205    int flags, int mode)
2206{
2207	struct ucred *cred, *tmpcred;
2208	struct vnode *vp;
2209	struct nameidata nd;
2210	int vfslocked;
2211	int error;
2212
2213	/*
2214	 * Create and modify a temporary credential instead of one that
2215	 * is potentially shared.
2216	 */
2217	if (!(flags & AT_EACCESS)) {
2218		cred = td->td_ucred;
2219		tmpcred = crdup(cred);
2220		tmpcred->cr_uid = cred->cr_ruid;
2221		tmpcred->cr_groups[0] = cred->cr_rgid;
2222		td->td_ucred = tmpcred;
2223	} else
2224		cred = tmpcred = td->td_ucred;
2225	AUDIT_ARG_VALUE(mode);
2226	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2227	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
2228	if ((error = namei(&nd)) != 0)
2229		goto out1;
2230	vfslocked = NDHASGIANT(&nd);
2231	vp = nd.ni_vp;
2232
2233	error = vn_access(vp, mode, tmpcred, td);
2234	NDFREE(&nd, NDF_ONLY_PNBUF);
2235	vput(vp);
2236	VFS_UNLOCK_GIANT(vfslocked);
2237out1:
2238	if (!(flags & AT_EACCESS)) {
2239		td->td_ucred = cred;
2240		crfree(tmpcred);
2241	}
2242	return (error);
2243}
2244
2245/*
2246 * Check access permissions using "effective" credentials.
2247 */
2248#ifndef _SYS_SYSPROTO_H_
2249struct eaccess_args {
2250	char	*path;
2251	int	flags;
2252};
2253#endif
2254int
2255sys_eaccess(td, uap)
2256	struct thread *td;
2257	register struct eaccess_args /* {
2258		char *path;
2259		int flags;
2260	} */ *uap;
2261{
2262
2263	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
2264}
2265
2266int
2267kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
2268{
2269
2270	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, flags));
2271}
2272
2273#if defined(COMPAT_43)
2274/*
2275 * Get file status; this version follows links.
2276 */
2277#ifndef _SYS_SYSPROTO_H_
2278struct ostat_args {
2279	char	*path;
2280	struct ostat *ub;
2281};
2282#endif
2283int
2284ostat(td, uap)
2285	struct thread *td;
2286	register struct ostat_args /* {
2287		char *path;
2288		struct ostat *ub;
2289	} */ *uap;
2290{
2291	struct stat sb;
2292	struct ostat osb;
2293	int error;
2294
2295	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2296	if (error)
2297		return (error);
2298	cvtstat(&sb, &osb);
2299	error = copyout(&osb, uap->ub, sizeof (osb));
2300	return (error);
2301}
2302
2303/*
2304 * Get file status; this version does not follow links.
2305 */
2306#ifndef _SYS_SYSPROTO_H_
2307struct olstat_args {
2308	char	*path;
2309	struct ostat *ub;
2310};
2311#endif
2312int
2313olstat(td, uap)
2314	struct thread *td;
2315	register struct olstat_args /* {
2316		char *path;
2317		struct ostat *ub;
2318	} */ *uap;
2319{
2320	struct stat sb;
2321	struct ostat osb;
2322	int error;
2323
2324	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2325	if (error)
2326		return (error);
2327	cvtstat(&sb, &osb);
2328	error = copyout(&osb, uap->ub, sizeof (osb));
2329	return (error);
2330}
2331
2332/*
2333 * Convert from an old to a new stat structure.
2334 */
2335void
2336cvtstat(st, ost)
2337	struct stat *st;
2338	struct ostat *ost;
2339{
2340
2341	ost->st_dev = st->st_dev;
2342	ost->st_ino = st->st_ino;
2343	ost->st_mode = st->st_mode;
2344	ost->st_nlink = st->st_nlink;
2345	ost->st_uid = st->st_uid;
2346	ost->st_gid = st->st_gid;
2347	ost->st_rdev = st->st_rdev;
2348	if (st->st_size < (quad_t)1 << 32)
2349		ost->st_size = st->st_size;
2350	else
2351		ost->st_size = -2;
2352	ost->st_atim = st->st_atim;
2353	ost->st_mtim = st->st_mtim;
2354	ost->st_ctim = st->st_ctim;
2355	ost->st_blksize = st->st_blksize;
2356	ost->st_blocks = st->st_blocks;
2357	ost->st_flags = st->st_flags;
2358	ost->st_gen = st->st_gen;
2359}
2360#endif /* COMPAT_43 */
2361
2362/*
2363 * Get file status; this version follows links.
2364 */
2365#ifndef _SYS_SYSPROTO_H_
2366struct stat_args {
2367	char	*path;
2368	struct stat *ub;
2369};
2370#endif
2371int
2372sys_stat(td, uap)
2373	struct thread *td;
2374	register struct stat_args /* {
2375		char *path;
2376		struct stat *ub;
2377	} */ *uap;
2378{
2379	struct stat sb;
2380	int error;
2381
2382	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2383	if (error == 0)
2384		error = copyout(&sb, uap->ub, sizeof (sb));
2385	return (error);
2386}
2387
2388#ifndef _SYS_SYSPROTO_H_
2389struct fstatat_args {
2390	int	fd;
2391	char	*path;
2392	struct stat	*buf;
2393	int	flag;
2394}
2395#endif
2396int
2397sys_fstatat(struct thread *td, struct fstatat_args *uap)
2398{
2399	struct stat sb;
2400	int error;
2401
2402	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2403	    UIO_USERSPACE, &sb);
2404	if (error == 0)
2405		error = copyout(&sb, uap->buf, sizeof (sb));
2406	return (error);
2407}
2408
2409int
2410kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2411{
2412
2413	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2414}
2415
2416int
2417kern_statat(struct thread *td, int flag, int fd, char *path,
2418    enum uio_seg pathseg, struct stat *sbp)
2419{
2420
2421	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2422}
2423
2424int
2425kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2426    enum uio_seg pathseg, struct stat *sbp,
2427    void (*hook)(struct vnode *vp, struct stat *sbp))
2428{
2429	struct nameidata nd;
2430	struct stat sb;
2431	int error, vfslocked;
2432
2433	if (flag & ~AT_SYMLINK_NOFOLLOW)
2434		return (EINVAL);
2435
2436	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2437	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
2438	    path, fd, CAP_FSTAT, td);
2439
2440	if ((error = namei(&nd)) != 0)
2441		return (error);
2442	vfslocked = NDHASGIANT(&nd);
2443	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2444	if (!error) {
2445		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2446		if (S_ISREG(sb.st_mode))
2447			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2448		if (__predict_false(hook != NULL))
2449			hook(nd.ni_vp, &sb);
2450	}
2451	NDFREE(&nd, NDF_ONLY_PNBUF);
2452	vput(nd.ni_vp);
2453	VFS_UNLOCK_GIANT(vfslocked);
2454	if (error)
2455		return (error);
2456	*sbp = sb;
2457#ifdef KTRACE
2458	if (KTRPOINT(td, KTR_STRUCT))
2459		ktrstat(&sb);
2460#endif
2461	return (0);
2462}
2463
2464/*
2465 * Get file status; this version does not follow links.
2466 */
2467#ifndef _SYS_SYSPROTO_H_
2468struct lstat_args {
2469	char	*path;
2470	struct stat *ub;
2471};
2472#endif
2473int
2474sys_lstat(td, uap)
2475	struct thread *td;
2476	register struct lstat_args /* {
2477		char *path;
2478		struct stat *ub;
2479	} */ *uap;
2480{
2481	struct stat sb;
2482	int error;
2483
2484	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2485	if (error == 0)
2486		error = copyout(&sb, uap->ub, sizeof (sb));
2487	return (error);
2488}
2489
2490int
2491kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2492{
2493
2494	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2495	    sbp));
2496}
2497
2498/*
2499 * Implementation of the NetBSD [l]stat() functions.
2500 */
2501void
2502cvtnstat(sb, nsb)
2503	struct stat *sb;
2504	struct nstat *nsb;
2505{
2506	bzero(nsb, sizeof *nsb);
2507	nsb->st_dev = sb->st_dev;
2508	nsb->st_ino = sb->st_ino;
2509	nsb->st_mode = sb->st_mode;
2510	nsb->st_nlink = sb->st_nlink;
2511	nsb->st_uid = sb->st_uid;
2512	nsb->st_gid = sb->st_gid;
2513	nsb->st_rdev = sb->st_rdev;
2514	nsb->st_atim = sb->st_atim;
2515	nsb->st_mtim = sb->st_mtim;
2516	nsb->st_ctim = sb->st_ctim;
2517	nsb->st_size = sb->st_size;
2518	nsb->st_blocks = sb->st_blocks;
2519	nsb->st_blksize = sb->st_blksize;
2520	nsb->st_flags = sb->st_flags;
2521	nsb->st_gen = sb->st_gen;
2522	nsb->st_birthtim = sb->st_birthtim;
2523}
2524
2525#ifndef _SYS_SYSPROTO_H_
2526struct nstat_args {
2527	char	*path;
2528	struct nstat *ub;
2529};
2530#endif
2531int
2532sys_nstat(td, uap)
2533	struct thread *td;
2534	register struct nstat_args /* {
2535		char *path;
2536		struct nstat *ub;
2537	} */ *uap;
2538{
2539	struct stat sb;
2540	struct nstat nsb;
2541	int error;
2542
2543	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2544	if (error)
2545		return (error);
2546	cvtnstat(&sb, &nsb);
2547	error = copyout(&nsb, uap->ub, sizeof (nsb));
2548	return (error);
2549}
2550
2551/*
2552 * NetBSD lstat.  Get file status; this version does not follow links.
2553 */
2554#ifndef _SYS_SYSPROTO_H_
2555struct lstat_args {
2556	char	*path;
2557	struct stat *ub;
2558};
2559#endif
2560int
2561sys_nlstat(td, uap)
2562	struct thread *td;
2563	register struct nlstat_args /* {
2564		char *path;
2565		struct nstat *ub;
2566	} */ *uap;
2567{
2568	struct stat sb;
2569	struct nstat nsb;
2570	int error;
2571
2572	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2573	if (error)
2574		return (error);
2575	cvtnstat(&sb, &nsb);
2576	error = copyout(&nsb, uap->ub, sizeof (nsb));
2577	return (error);
2578}
2579
2580/*
2581 * Get configurable pathname variables.
2582 */
2583#ifndef _SYS_SYSPROTO_H_
2584struct pathconf_args {
2585	char	*path;
2586	int	name;
2587};
2588#endif
2589int
2590sys_pathconf(td, uap)
2591	struct thread *td;
2592	register struct pathconf_args /* {
2593		char *path;
2594		int name;
2595	} */ *uap;
2596{
2597
2598	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2599}
2600
2601#ifndef _SYS_SYSPROTO_H_
2602struct lpathconf_args {
2603	char	*path;
2604	int	name;
2605};
2606#endif
2607int
2608sys_lpathconf(td, uap)
2609	struct thread *td;
2610	register struct lpathconf_args /* {
2611		char *path;
2612		int name;
2613	} */ *uap;
2614{
2615
2616	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
2617}
2618
2619int
2620kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2621    u_long flags)
2622{
2623	struct nameidata nd;
2624	int error, vfslocked;
2625
2626	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
2627	    flags, pathseg, path, td);
2628	if ((error = namei(&nd)) != 0)
2629		return (error);
2630	vfslocked = NDHASGIANT(&nd);
2631	NDFREE(&nd, NDF_ONLY_PNBUF);
2632
2633	/* If asynchronous I/O is available, it works for all files. */
2634	if (name == _PC_ASYNC_IO)
2635		td->td_retval[0] = async_io_version;
2636	else
2637		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2638	vput(nd.ni_vp);
2639	VFS_UNLOCK_GIANT(vfslocked);
2640	return (error);
2641}
2642
2643/*
2644 * Return target name of a symbolic link.
2645 */
2646#ifndef _SYS_SYSPROTO_H_
2647struct readlink_args {
2648	char	*path;
2649	char	*buf;
2650	size_t	count;
2651};
2652#endif
2653int
2654sys_readlink(td, uap)
2655	struct thread *td;
2656	register struct readlink_args /* {
2657		char *path;
2658		char *buf;
2659		size_t count;
2660	} */ *uap;
2661{
2662
2663	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2664	    UIO_USERSPACE, uap->count));
2665}
2666#ifndef _SYS_SYSPROTO_H_
2667struct readlinkat_args {
2668	int	fd;
2669	char	*path;
2670	char	*buf;
2671	size_t	bufsize;
2672};
2673#endif
2674int
2675sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2676{
2677
2678	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2679	    uap->buf, UIO_USERSPACE, uap->bufsize));
2680}
2681
2682int
2683kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2684    enum uio_seg bufseg, size_t count)
2685{
2686
2687	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2688	    count));
2689}
2690
2691int
2692kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2693    char *buf, enum uio_seg bufseg, size_t count)
2694{
2695	struct vnode *vp;
2696	struct iovec aiov;
2697	struct uio auio;
2698	int error;
2699	struct nameidata nd;
2700	int vfslocked;
2701
2702	if (count > INT_MAX)
2703		return (EINVAL);
2704
2705	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
2706	    AUDITVNODE1, pathseg, path, fd, td);
2707
2708	if ((error = namei(&nd)) != 0)
2709		return (error);
2710	NDFREE(&nd, NDF_ONLY_PNBUF);
2711	vfslocked = NDHASGIANT(&nd);
2712	vp = nd.ni_vp;
2713#ifdef MAC
2714	error = mac_vnode_check_readlink(td->td_ucred, vp);
2715	if (error) {
2716		vput(vp);
2717		VFS_UNLOCK_GIANT(vfslocked);
2718		return (error);
2719	}
2720#endif
2721	if (vp->v_type != VLNK)
2722		error = EINVAL;
2723	else {
2724		aiov.iov_base = buf;
2725		aiov.iov_len = count;
2726		auio.uio_iov = &aiov;
2727		auio.uio_iovcnt = 1;
2728		auio.uio_offset = 0;
2729		auio.uio_rw = UIO_READ;
2730		auio.uio_segflg = bufseg;
2731		auio.uio_td = td;
2732		auio.uio_resid = count;
2733		error = VOP_READLINK(vp, &auio, td->td_ucred);
2734	}
2735	vput(vp);
2736	VFS_UNLOCK_GIANT(vfslocked);
2737	td->td_retval[0] = count - auio.uio_resid;
2738	return (error);
2739}
2740
2741/*
2742 * Common implementation code for chflags() and fchflags().
2743 */
2744static int
2745setfflags(td, vp, flags)
2746	struct thread *td;
2747	struct vnode *vp;
2748	int flags;
2749{
2750	int error;
2751	struct mount *mp;
2752	struct vattr vattr;
2753
2754	/*
2755	 * Prevent non-root users from setting flags on devices.  When
2756	 * a device is reused, users can retain ownership of the device
2757	 * if they are allowed to set flags and programs assume that
2758	 * chown can't fail when done as root.
2759	 */
2760	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2761		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2762		if (error)
2763			return (error);
2764	}
2765
2766	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2767		return (error);
2768	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2769	VATTR_NULL(&vattr);
2770	vattr.va_flags = flags;
2771#ifdef MAC
2772	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2773	if (error == 0)
2774#endif
2775		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2776	VOP_UNLOCK(vp, 0);
2777	vn_finished_write(mp);
2778	return (error);
2779}
2780
2781/*
2782 * Change flags of a file given a path name.
2783 */
2784#ifndef _SYS_SYSPROTO_H_
2785struct chflags_args {
2786	char	*path;
2787	int	flags;
2788};
2789#endif
2790int
2791sys_chflags(td, uap)
2792	struct thread *td;
2793	register struct chflags_args /* {
2794		char *path;
2795		int flags;
2796	} */ *uap;
2797{
2798	int error;
2799	struct nameidata nd;
2800	int vfslocked;
2801
2802	AUDIT_ARG_FFLAGS(uap->flags);
2803	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2804	    uap->path, td);
2805	if ((error = namei(&nd)) != 0)
2806		return (error);
2807	NDFREE(&nd, NDF_ONLY_PNBUF);
2808	vfslocked = NDHASGIANT(&nd);
2809	error = setfflags(td, nd.ni_vp, uap->flags);
2810	vrele(nd.ni_vp);
2811	VFS_UNLOCK_GIANT(vfslocked);
2812	return (error);
2813}
2814
2815/*
2816 * Same as chflags() but doesn't follow symlinks.
2817 */
2818int
2819sys_lchflags(td, uap)
2820	struct thread *td;
2821	register struct lchflags_args /* {
2822		char *path;
2823		int flags;
2824	} */ *uap;
2825{
2826	int error;
2827	struct nameidata nd;
2828	int vfslocked;
2829
2830	AUDIT_ARG_FFLAGS(uap->flags);
2831	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
2832	    uap->path, td);
2833	if ((error = namei(&nd)) != 0)
2834		return (error);
2835	vfslocked = NDHASGIANT(&nd);
2836	NDFREE(&nd, NDF_ONLY_PNBUF);
2837	error = setfflags(td, nd.ni_vp, uap->flags);
2838	vrele(nd.ni_vp);
2839	VFS_UNLOCK_GIANT(vfslocked);
2840	return (error);
2841}
2842
2843/*
2844 * Change flags of a file given a file descriptor.
2845 */
2846#ifndef _SYS_SYSPROTO_H_
2847struct fchflags_args {
2848	int	fd;
2849	int	flags;
2850};
2851#endif
2852int
2853sys_fchflags(td, uap)
2854	struct thread *td;
2855	register struct fchflags_args /* {
2856		int fd;
2857		int flags;
2858	} */ *uap;
2859{
2860	struct file *fp;
2861	int vfslocked;
2862	int error;
2863
2864	AUDIT_ARG_FD(uap->fd);
2865	AUDIT_ARG_FFLAGS(uap->flags);
2866	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
2867	    &fp)) != 0)
2868		return (error);
2869	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
2870#ifdef AUDIT
2871	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2872	AUDIT_ARG_VNODE1(fp->f_vnode);
2873	VOP_UNLOCK(fp->f_vnode, 0);
2874#endif
2875	error = setfflags(td, fp->f_vnode, uap->flags);
2876	VFS_UNLOCK_GIANT(vfslocked);
2877	fdrop(fp, td);
2878	return (error);
2879}
2880
2881/*
2882 * Common implementation code for chmod(), lchmod() and fchmod().
2883 */
2884int
2885setfmode(td, cred, vp, mode)
2886	struct thread *td;
2887	struct ucred *cred;
2888	struct vnode *vp;
2889	int mode;
2890{
2891	int error;
2892	struct mount *mp;
2893	struct vattr vattr;
2894
2895	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2896		return (error);
2897	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2898	VATTR_NULL(&vattr);
2899	vattr.va_mode = mode & ALLPERMS;
2900#ifdef MAC
2901	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2902	if (error == 0)
2903#endif
2904		error = VOP_SETATTR(vp, &vattr, cred);
2905	VOP_UNLOCK(vp, 0);
2906	vn_finished_write(mp);
2907	return (error);
2908}
2909
2910/*
2911 * Change mode of a file given path name.
2912 */
2913#ifndef _SYS_SYSPROTO_H_
2914struct chmod_args {
2915	char	*path;
2916	int	mode;
2917};
2918#endif
2919int
2920sys_chmod(td, uap)
2921	struct thread *td;
2922	register struct chmod_args /* {
2923		char *path;
2924		int mode;
2925	} */ *uap;
2926{
2927
2928	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2929}
2930
2931#ifndef _SYS_SYSPROTO_H_
2932struct fchmodat_args {
2933	int	dirfd;
2934	char	*path;
2935	mode_t	mode;
2936	int	flag;
2937}
2938#endif
2939int
2940sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2941{
2942	int flag = uap->flag;
2943	int fd = uap->fd;
2944	char *path = uap->path;
2945	mode_t mode = uap->mode;
2946
2947	if (flag & ~AT_SYMLINK_NOFOLLOW)
2948		return (EINVAL);
2949
2950	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2951}
2952
2953int
2954kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2955{
2956
2957	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2958}
2959
2960/*
2961 * Change mode of a file given path name (don't follow links.)
2962 */
2963#ifndef _SYS_SYSPROTO_H_
2964struct lchmod_args {
2965	char	*path;
2966	int	mode;
2967};
2968#endif
2969int
2970sys_lchmod(td, uap)
2971	struct thread *td;
2972	register struct lchmod_args /* {
2973		char *path;
2974		int mode;
2975	} */ *uap;
2976{
2977
2978	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2979	    uap->mode, AT_SYMLINK_NOFOLLOW));
2980}
2981
2982
2983int
2984kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2985    mode_t mode, int flag)
2986{
2987	int error;
2988	struct nameidata nd;
2989	int vfslocked;
2990	int follow;
2991
2992	AUDIT_ARG_MODE(mode);
2993	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2994	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
2995	    path, fd, CAP_FCHMOD, td);
2996	if ((error = namei(&nd)) != 0)
2997		return (error);
2998	vfslocked = NDHASGIANT(&nd);
2999	NDFREE(&nd, NDF_ONLY_PNBUF);
3000	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
3001	vrele(nd.ni_vp);
3002	VFS_UNLOCK_GIANT(vfslocked);
3003	return (error);
3004}
3005
3006/*
3007 * Change mode of a file given a file descriptor.
3008 */
3009#ifndef _SYS_SYSPROTO_H_
3010struct fchmod_args {
3011	int	fd;
3012	int	mode;
3013};
3014#endif
3015int
3016sys_fchmod(struct thread *td, struct fchmod_args *uap)
3017{
3018	struct file *fp;
3019	int error;
3020
3021	AUDIT_ARG_FD(uap->fd);
3022	AUDIT_ARG_MODE(uap->mode);
3023
3024	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
3025	if (error != 0)
3026		return (error);
3027	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
3028	fdrop(fp, td);
3029	return (error);
3030}
3031
3032/*
3033 * Common implementation for chown(), lchown(), and fchown()
3034 */
3035int
3036setfown(td, cred, vp, uid, gid)
3037	struct thread *td;
3038	struct ucred *cred;
3039	struct vnode *vp;
3040	uid_t uid;
3041	gid_t gid;
3042{
3043	int error;
3044	struct mount *mp;
3045	struct vattr vattr;
3046
3047	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3048		return (error);
3049	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3050	VATTR_NULL(&vattr);
3051	vattr.va_uid = uid;
3052	vattr.va_gid = gid;
3053#ifdef MAC
3054	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
3055	    vattr.va_gid);
3056	if (error == 0)
3057#endif
3058		error = VOP_SETATTR(vp, &vattr, cred);
3059	VOP_UNLOCK(vp, 0);
3060	vn_finished_write(mp);
3061	return (error);
3062}
3063
3064/*
3065 * Set ownership given a path name.
3066 */
3067#ifndef _SYS_SYSPROTO_H_
3068struct chown_args {
3069	char	*path;
3070	int	uid;
3071	int	gid;
3072};
3073#endif
3074int
3075sys_chown(td, uap)
3076	struct thread *td;
3077	register struct chown_args /* {
3078		char *path;
3079		int uid;
3080		int gid;
3081	} */ *uap;
3082{
3083
3084	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3085}
3086
3087#ifndef _SYS_SYSPROTO_H_
3088struct fchownat_args {
3089	int fd;
3090	const char * path;
3091	uid_t uid;
3092	gid_t gid;
3093	int flag;
3094};
3095#endif
3096int
3097sys_fchownat(struct thread *td, struct fchownat_args *uap)
3098{
3099	int flag;
3100
3101	flag = uap->flag;
3102	if (flag & ~AT_SYMLINK_NOFOLLOW)
3103		return (EINVAL);
3104
3105	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3106	    uap->gid, uap->flag));
3107}
3108
3109int
3110kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3111    int gid)
3112{
3113
3114	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
3115}
3116
3117int
3118kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3119    int uid, int gid, int flag)
3120{
3121	struct nameidata nd;
3122	int error, vfslocked, follow;
3123
3124	AUDIT_ARG_OWNER(uid, gid);
3125	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3126	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
3127	    path, fd, CAP_FCHOWN, td);
3128
3129	if ((error = namei(&nd)) != 0)
3130		return (error);
3131	vfslocked = NDHASGIANT(&nd);
3132	NDFREE(&nd, NDF_ONLY_PNBUF);
3133	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3134	vrele(nd.ni_vp);
3135	VFS_UNLOCK_GIANT(vfslocked);
3136	return (error);
3137}
3138
3139/*
3140 * Set ownership given a path name, do not cross symlinks.
3141 */
3142#ifndef _SYS_SYSPROTO_H_
3143struct lchown_args {
3144	char	*path;
3145	int	uid;
3146	int	gid;
3147};
3148#endif
3149int
3150sys_lchown(td, uap)
3151	struct thread *td;
3152	register struct lchown_args /* {
3153		char *path;
3154		int uid;
3155		int gid;
3156	} */ *uap;
3157{
3158
3159	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3160}
3161
3162int
3163kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3164    int gid)
3165{
3166
3167	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3168	    AT_SYMLINK_NOFOLLOW));
3169}
3170
3171/*
3172 * Set ownership given a file descriptor.
3173 */
3174#ifndef _SYS_SYSPROTO_H_
3175struct fchown_args {
3176	int	fd;
3177	int	uid;
3178	int	gid;
3179};
3180#endif
3181int
3182sys_fchown(td, uap)
3183	struct thread *td;
3184	register struct fchown_args /* {
3185		int fd;
3186		int uid;
3187		int gid;
3188	} */ *uap;
3189{
3190	struct file *fp;
3191	int error;
3192
3193	AUDIT_ARG_FD(uap->fd);
3194	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3195	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
3196	if (error != 0)
3197		return (error);
3198	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3199	fdrop(fp, td);
3200	return (error);
3201}
3202
3203/*
3204 * Common implementation code for utimes(), lutimes(), and futimes().
3205 */
3206static int
3207getutimes(usrtvp, tvpseg, tsp)
3208	const struct timeval *usrtvp;
3209	enum uio_seg tvpseg;
3210	struct timespec *tsp;
3211{
3212	struct timeval tv[2];
3213	const struct timeval *tvp;
3214	int error;
3215
3216	if (usrtvp == NULL) {
3217		vfs_timestamp(&tsp[0]);
3218		tsp[1] = tsp[0];
3219	} else {
3220		if (tvpseg == UIO_SYSSPACE) {
3221			tvp = usrtvp;
3222		} else {
3223			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3224				return (error);
3225			tvp = tv;
3226		}
3227
3228		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3229		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3230			return (EINVAL);
3231		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3232		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3233	}
3234	return (0);
3235}
3236
/*
 * Common implementation code for utimes(), lutimes(), and futimes().
 *
 * Applies the timestamps in 'ts' to 'vp' through VOP_SETATTR(): ts[0]
 * becomes the access time and ts[1] the modification time.  'numtimes' is
 * the number of entries in 'ts'; when it is greater than 2, ts[2] is used
 * as the birth time.  A non-zero 'nullflag' sets VA_UTIMES_NULL in the
 * attribute flags.  The vnode is locked exclusively around the update and
 * unlocked before returning.
 */
static int
setutimes(td, vp, ts, numtimes, nullflag)
	struct thread *td;
	struct vnode *vp;
	const struct timespec *ts;
	int numtimes;
	int nullflag;
{
	int error, setbirthtime;
	struct mount *mp;
	struct vattr vattr;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	setbirthtime = 0;
	/*
	 * With no explicit birth time supplied, pull the birth time back to
	 * the new modification time if the latter is earlier than the
	 * current birth time.  A VOP_GETATTR() failure just skips this.
	 */
	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
		setbirthtime = 1;
	/* vattr is reused: reset it before filling in the fields to set. */
	VATTR_NULL(&vattr);
	vattr.va_atime = ts[0];
	vattr.va_mtime = ts[1];
	if (setbirthtime)
		vattr.va_birthtime = ts[1];
	if (numtimes > 2)
		vattr.va_birthtime = ts[2];
	if (nullflag)
		vattr.va_vaflags |= VA_UTIMES_NULL;
#ifdef MAC
	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
	    vattr.va_mtime);
#endif
	/*
	 * Without MAC, error is still 0 here from the successful
	 * vn_start_write() above.
	 */
	if (error == 0)
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	return (error);
}
3278
3279/*
3280 * Set the access and modification times of a file.
3281 */
3282#ifndef _SYS_SYSPROTO_H_
3283struct utimes_args {
3284	char	*path;
3285	struct	timeval *tptr;
3286};
3287#endif
3288int
3289sys_utimes(td, uap)
3290	struct thread *td;
3291	register struct utimes_args /* {
3292		char *path;
3293		struct timeval *tptr;
3294	} */ *uap;
3295{
3296
3297	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3298	    UIO_USERSPACE));
3299}
3300
3301#ifndef _SYS_SYSPROTO_H_
3302struct futimesat_args {
3303	int fd;
3304	const char * path;
3305	const struct timeval * times;
3306};
3307#endif
3308int
3309sys_futimesat(struct thread *td, struct futimesat_args *uap)
3310{
3311
3312	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3313	    uap->times, UIO_USERSPACE));
3314}
3315
3316int
3317kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3318    struct timeval *tptr, enum uio_seg tptrseg)
3319{
3320
3321	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3322}
3323
3324int
3325kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3326    struct timeval *tptr, enum uio_seg tptrseg)
3327{
3328	struct nameidata nd;
3329	struct timespec ts[2];
3330	int error, vfslocked;
3331
3332	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3333		return (error);
3334	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
3335	    path, fd, CAP_FUTIMES, td);
3336
3337	if ((error = namei(&nd)) != 0)
3338		return (error);
3339	vfslocked = NDHASGIANT(&nd);
3340	NDFREE(&nd, NDF_ONLY_PNBUF);
3341	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3342	vrele(nd.ni_vp);
3343	VFS_UNLOCK_GIANT(vfslocked);
3344	return (error);
3345}
3346
3347/*
3348 * Set the access and modification times of a file.
3349 */
3350#ifndef _SYS_SYSPROTO_H_
3351struct lutimes_args {
3352	char	*path;
3353	struct	timeval *tptr;
3354};
3355#endif
3356int
3357sys_lutimes(td, uap)
3358	struct thread *td;
3359	register struct lutimes_args /* {
3360		char *path;
3361		struct timeval *tptr;
3362	} */ *uap;
3363{
3364
3365	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3366	    UIO_USERSPACE));
3367}
3368
3369int
3370kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3371    struct timeval *tptr, enum uio_seg tptrseg)
3372{
3373	struct timespec ts[2];
3374	int error;
3375	struct nameidata nd;
3376	int vfslocked;
3377
3378	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3379		return (error);
3380	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
3381	if ((error = namei(&nd)) != 0)
3382		return (error);
3383	vfslocked = NDHASGIANT(&nd);
3384	NDFREE(&nd, NDF_ONLY_PNBUF);
3385	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3386	vrele(nd.ni_vp);
3387	VFS_UNLOCK_GIANT(vfslocked);
3388	return (error);
3389}
3390
3391/*
3392 * Set the access and modification times of a file.
3393 */
3394#ifndef _SYS_SYSPROTO_H_
3395struct futimes_args {
3396	int	fd;
3397	struct	timeval *tptr;
3398};
3399#endif
3400int
3401sys_futimes(td, uap)
3402	struct thread *td;
3403	register struct futimes_args /* {
3404		int  fd;
3405		struct timeval *tptr;
3406	} */ *uap;
3407{
3408
3409	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3410}
3411
3412int
3413kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3414    enum uio_seg tptrseg)
3415{
3416	struct timespec ts[2];
3417	struct file *fp;
3418	int vfslocked;
3419	int error;
3420
3421	AUDIT_ARG_FD(fd);
3422	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3423		return (error);
3424	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
3425	    != 0)
3426		return (error);
3427	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
3428#ifdef AUDIT
3429	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3430	AUDIT_ARG_VNODE1(fp->f_vnode);
3431	VOP_UNLOCK(fp->f_vnode, 0);
3432#endif
3433	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3434	VFS_UNLOCK_GIANT(vfslocked);
3435	fdrop(fp, td);
3436	return (error);
3437}
3438
3439/*
3440 * Truncate a file given its path name.
3441 */
3442#ifndef _SYS_SYSPROTO_H_
3443struct truncate_args {
3444	char	*path;
3445	int	pad;
3446	off_t	length;
3447};
3448#endif
3449int
3450sys_truncate(td, uap)
3451	struct thread *td;
3452	register struct truncate_args /* {
3453		char *path;
3454		int pad;
3455		off_t length;
3456	} */ *uap;
3457{
3458
3459	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3460}
3461
/*
 * Set the size of the file named by 'path' to 'length'.
 *
 * Returns EINVAL for a negative length and EISDIR when the path names a
 * directory.  Otherwise the size is changed through VOP_SETATTR() once the
 * MAC write check (when compiled in), vn_writechk() and the VWRITE access
 * check have all succeeded; the first failing check's error is returned.
 */
int
kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	int vfslocked;

	if (length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	/* The vnode is still unlocked here, so the failure path uses vrele(). */
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
#ifdef MAC
	/*
	 * Empty body on purpose: a MAC failure only records the error and
	 * stops the else-if chain from reaching the truncate below.
	 */
	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
	}
#endif
	else if ((error = vn_writechk(vp)) == 0 &&
	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
		/* Only va_size is set; all other attributes stay VNOVAL. */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	}
	/* vput() both unlocks the vnode and drops the namei() reference. */
	vput(vp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
3503
#if defined(COMPAT_43)
/*
 * Truncate a file given its path name.
 *
 * COMPAT_43 variant taking a 'long' length: the arguments are repacked
 * into a truncate_args structure and passed to sys_truncate().  The 'pad'
 * member of that structure is not filled in.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args args;

	args.path = uap->path;
	args.length = uap->length;
	return (sys_truncate(td, &args));
}
#endif /* COMPAT_43 */
3533
3534/* Versions with the pad argument */
3535int
3536freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3537{
3538	struct truncate_args ouap;
3539
3540	ouap.path = uap->path;
3541	ouap.length = uap->length;
3542	return (sys_truncate(td, &ouap));
3543}
3544
3545int
3546freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3547{
3548	struct ftruncate_args ouap;
3549
3550	ouap.fd = uap->fd;
3551	ouap.length = uap->length;
3552	return (sys_ftruncate(td, &ouap));
3553}
3554
3555/*
3556 * Sync an open file.
3557 */
3558#ifndef _SYS_SYSPROTO_H_
3559struct fsync_args {
3560	int	fd;
3561};
3562#endif
3563int
3564sys_fsync(td, uap)
3565	struct thread *td;
3566	struct fsync_args /* {
3567		int fd;
3568	} */ *uap;
3569{
3570	struct vnode *vp;
3571	struct mount *mp;
3572	struct file *fp;
3573	int vfslocked;
3574	int error, lock_flags;
3575
3576	AUDIT_ARG_FD(uap->fd);
3577	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
3578	    &fp)) != 0)
3579		return (error);
3580	vp = fp->f_vnode;
3581	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3582	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3583		goto drop;
3584	if (MNT_SHARED_WRITES(mp) ||
3585	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3586		lock_flags = LK_SHARED;
3587	} else {
3588		lock_flags = LK_EXCLUSIVE;
3589	}
3590	vn_lock(vp, lock_flags | LK_RETRY);
3591	AUDIT_ARG_VNODE1(vp);
3592	if (vp->v_object != NULL) {
3593		VM_OBJECT_LOCK(vp->v_object);
3594		vm_object_page_clean(vp->v_object, 0, 0, 0);
3595		VM_OBJECT_UNLOCK(vp->v_object);
3596	}
3597	error = VOP_FSYNC(vp, MNT_WAIT, td);
3598
3599	VOP_UNLOCK(vp, 0);
3600	vn_finished_write(mp);
3601drop:
3602	VFS_UNLOCK_GIANT(vfslocked);
3603	fdrop(fp, td);
3604	return (error);
3605}
3606
3607/*
3608 * Rename files.  Source and destination must either both be directories, or
3609 * both not be directories.  If target is a directory, it must be empty.
3610 */
3611#ifndef _SYS_SYSPROTO_H_
3612struct rename_args {
3613	char	*from;
3614	char	*to;
3615};
3616#endif
3617int
3618sys_rename(td, uap)
3619	struct thread *td;
3620	register struct rename_args /* {
3621		char *from;
3622		char *to;
3623	} */ *uap;
3624{
3625
3626	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3627}
3628
3629#ifndef _SYS_SYSPROTO_H_
3630struct renameat_args {
3631	int	oldfd;
3632	char	*old;
3633	int	newfd;
3634	char	*new;
3635};
3636#endif
3637int
3638sys_renameat(struct thread *td, struct renameat_args *uap)
3639{
3640
3641	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3642	    UIO_USERSPACE));
3643}
3644
3645int
3646kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3647{
3648
3649	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3650}
3651
/*
 * Rename 'old' (looked up relative to 'oldfd') to 'new' (looked up
 * relative to 'newfd').
 *
 * Both names are resolved with namei(); the rename itself is performed by
 * VOP_RENAME().  Errors produced directly here: ENOTDIR when the source is
 * a directory and the existing target is not, EISDIR for the opposite
 * mismatch, and EINVAL when the source vnode is the target's parent
 * directory or when the target lookup reports EISDIR for a directory
 * source.  If source and target resolve to the same vnode, nothing is
 * renamed and 0 is returned.
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	int tvfslocked;
	int fvfslocked;
	int error;

	bwillwrite();
	/*
	 * With MAC the source lookup returns parent and leaf locked so the
	 * MAC check below can inspect them; they are unlocked right after.
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
	fvfslocked = NDHASGIANT(&fromnd);
	tvfslocked = 0;
#ifdef MAC
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	/* error is 0 here unless the MAC check above failed. */
	if (error == 0)
		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
	if (error != 0) {
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
	    td);
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vn_finished_write(mp);
		goto out1;
	}
	tvfslocked = NDHASGIANT(&tond);
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	/* An existing target must be of the same kind as the source. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}
	/* The source may not be the directory the target lives in. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (!error) {
		/*
		 * No vnode references are released on this path.
		 * NOTE(review): presumably VOP_RENAME() consumes them --
		 * confirm against VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/*
		 * Failure (or the "same vnode" -1 case): release everything
		 * the two lookups returned.  The target side was returned
		 * locked, hence vput(); the source side is unlocked here,
		 * hence vrele().
		 */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	VFS_UNLOCK_GIANT(fvfslocked);
	VFS_UNLOCK_GIANT(tvfslocked);
	/* -1 is the internal "nothing to do" marker, reported as success. */
	if (error == -1)
		return (0);
	return (error);
}
3763
3764/*
3765 * Make a directory file.
3766 */
3767#ifndef _SYS_SYSPROTO_H_
3768struct mkdir_args {
3769	char	*path;
3770	int	mode;
3771};
3772#endif
3773int
3774sys_mkdir(td, uap)
3775	struct thread *td;
3776	register struct mkdir_args /* {
3777		char *path;
3778		int mode;
3779	} */ *uap;
3780{
3781
3782	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3783}
3784
3785#ifndef _SYS_SYSPROTO_H_
3786struct mkdirat_args {
3787	int	fd;
3788	char	*path;
3789	mode_t	mode;
3790};
3791#endif
3792int
3793sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3794{
3795
3796	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3797}
3798
3799int
3800kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3801{
3802
3803	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3804}
3805
/*
 * Create the directory 'path' (looked up relative to 'fd') with
 * permissions 'mode' masked by ACCESSPERMS and the process file-creation
 * mask.
 *
 * Returns EEXIST when the name already exists.  When vn_start_write()
 * cannot proceed immediately, everything is released, the thread sleeps
 * in a second vn_start_write() call and the whole lookup is restarted.
 */
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	int vfslocked;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&nd)) != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	vp = nd.ni_vp;
	/* A non-NULL leaf means the name is already taken. */
	if (vp != NULL) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/*
		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
		 * the strange behaviour of leaving the vnode unlocked
		 * if the target is the same vnode as the parent.
		 */
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EEXIST);
	}
	/*
	 * Try to start the write without sleeping while the parent is
	 * locked; on failure drop everything, wait, and redo the lookup.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		VFS_UNLOCK_GIANT(vfslocked);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	/* On success nd.ni_vp is the new directory; release it here. */
	if (!error)
		vput(nd.ni_vp);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
3871
3872/*
3873 * Remove a directory file.
3874 */
3875#ifndef _SYS_SYSPROTO_H_
3876struct rmdir_args {
3877	char	*path;
3878};
3879#endif
3880int
3881sys_rmdir(td, uap)
3882	struct thread *td;
3883	struct rmdir_args /* {
3884		char *path;
3885	} */ *uap;
3886{
3887
3888	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3889}
3890
3891int
3892kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3893{
3894
3895	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3896}
3897
3898int
3899kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3900{
3901	struct mount *mp;
3902	struct vnode *vp;
3903	int error;
3904	struct nameidata nd;
3905	int vfslocked;
3906
3907restart:
3908	bwillwrite();
3909	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
3910	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
3911	if ((error = namei(&nd)) != 0)
3912		return (error);
3913	vfslocked = NDHASGIANT(&nd);
3914	vp = nd.ni_vp;
3915	if (vp->v_type != VDIR) {
3916		error = ENOTDIR;
3917		goto out;
3918	}
3919	/*
3920	 * No rmdir "." please.
3921	 */
3922	if (nd.ni_dvp == vp) {
3923		error = EINVAL;
3924		goto out;
3925	}
3926	/*
3927	 * The root of a mounted filesystem cannot be deleted.
3928	 */
3929	if (vp->v_vflag & VV_ROOT) {
3930		error = EBUSY;
3931		goto out;
3932	}
3933#ifdef MAC
3934	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3935	    &nd.ni_cnd);
3936	if (error)
3937		goto out;
3938#endif
3939	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3940		NDFREE(&nd, NDF_ONLY_PNBUF);
3941		vput(vp);
3942		if (nd.ni_dvp == vp)
3943			vrele(nd.ni_dvp);
3944		else
3945			vput(nd.ni_dvp);
3946		VFS_UNLOCK_GIANT(vfslocked);
3947		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3948			return (error);
3949		goto restart;
3950	}
3951	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3952	vn_finished_write(mp);
3953out:
3954	NDFREE(&nd, NDF_ONLY_PNBUF);
3955	vput(vp);
3956	if (nd.ni_dvp == vp)
3957		vrele(nd.ni_dvp);
3958	else
3959		vput(nd.ni_dvp);
3960	VFS_UNLOCK_GIANT(vfslocked);
3961	return (error);
3962}
3963
3964#ifdef COMPAT_43
3965/*
3966 * Read a block of directory entries in a filesystem independent format.
3967 */
3968#ifndef _SYS_SYSPROTO_H_
3969struct ogetdirentries_args {
3970	int	fd;
3971	char	*buf;
3972	u_int	count;
3973	long	*basep;
3974};
3975#endif
3976int
3977ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3978{
3979	long loff;
3980	int error;
3981
3982	error = kern_ogetdirentries(td, uap, &loff);
3983	if (error == 0)
3984		error = copyout(&loff, uap->basep, sizeof(long));
3985	return (error);
3986}
3987
3988int
3989kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3990    long *ploff)
3991{
3992	struct vnode *vp;
3993	struct file *fp;
3994	struct uio auio, kuio;
3995	struct iovec aiov, kiov;
3996	struct dirent *dp, *edp;
3997	caddr_t dirbuf;
3998	int error, eofflag, readcnt, vfslocked;
3999	long loff;
4000
4001	/* XXX arbitrary sanity limit on `count'. */
4002	if (uap->count > 64 * 1024)
4003		return (EINVAL);
4004	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
4005	    &fp)) != 0)
4006		return (error);
4007	if ((fp->f_flag & FREAD) == 0) {
4008		fdrop(fp, td);
4009		return (EBADF);
4010	}
4011	vp = fp->f_vnode;
4012unionread:
4013	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4014	if (vp->v_type != VDIR) {
4015		VFS_UNLOCK_GIANT(vfslocked);
4016		fdrop(fp, td);
4017		return (EINVAL);
4018	}
4019	aiov.iov_base = uap->buf;
4020	aiov.iov_len = uap->count;
4021	auio.uio_iov = &aiov;
4022	auio.uio_iovcnt = 1;
4023	auio.uio_rw = UIO_READ;
4024	auio.uio_segflg = UIO_USERSPACE;
4025	auio.uio_td = td;
4026	auio.uio_resid = uap->count;
4027	vn_lock(vp, LK_SHARED | LK_RETRY);
4028	loff = auio.uio_offset = fp->f_offset;
4029#ifdef MAC
4030	error = mac_vnode_check_readdir(td->td_ucred, vp);
4031	if (error) {
4032		VOP_UNLOCK(vp, 0);
4033		VFS_UNLOCK_GIANT(vfslocked);
4034		fdrop(fp, td);
4035		return (error);
4036	}
4037#endif
4038#	if (BYTE_ORDER != LITTLE_ENDIAN)
4039		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4040			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4041			    NULL, NULL);
4042			fp->f_offset = auio.uio_offset;
4043		} else
4044#	endif
4045	{
4046		kuio = auio;
4047		kuio.uio_iov = &kiov;
4048		kuio.uio_segflg = UIO_SYSSPACE;
4049		kiov.iov_len = uap->count;
4050		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4051		kiov.iov_base = dirbuf;
4052		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4053			    NULL, NULL);
4054		fp->f_offset = kuio.uio_offset;
4055		if (error == 0) {
4056			readcnt = uap->count - kuio.uio_resid;
4057			edp = (struct dirent *)&dirbuf[readcnt];
4058			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4059#				if (BYTE_ORDER == LITTLE_ENDIAN)
4060					/*
4061					 * The expected low byte of
4062					 * dp->d_namlen is our dp->d_type.
4063					 * The high MBZ byte of dp->d_namlen
4064					 * is our dp->d_namlen.
4065					 */
4066					dp->d_type = dp->d_namlen;
4067					dp->d_namlen = 0;
4068#				else
4069					/*
4070					 * The dp->d_type is the high byte
4071					 * of the expected dp->d_namlen,
4072					 * so must be zero'ed.
4073					 */
4074					dp->d_type = 0;
4075#				endif
4076				if (dp->d_reclen > 0) {
4077					dp = (struct dirent *)
4078					    ((char *)dp + dp->d_reclen);
4079				} else {
4080					error = EIO;
4081					break;
4082				}
4083			}
4084			if (dp >= edp)
4085				error = uiomove(dirbuf, readcnt, &auio);
4086		}
4087		free(dirbuf, M_TEMP);
4088	}
4089	if (error) {
4090		VOP_UNLOCK(vp, 0);
4091		VFS_UNLOCK_GIANT(vfslocked);
4092		fdrop(fp, td);
4093		return (error);
4094	}
4095	if (uap->count == auio.uio_resid &&
4096	    (vp->v_vflag & VV_ROOT) &&
4097	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4098		struct vnode *tvp = vp;
4099		vp = vp->v_mount->mnt_vnodecovered;
4100		VREF(vp);
4101		fp->f_vnode = vp;
4102		fp->f_data = vp;
4103		fp->f_offset = 0;
4104		vput(tvp);
4105		VFS_UNLOCK_GIANT(vfslocked);
4106		goto unionread;
4107	}
4108	VOP_UNLOCK(vp, 0);
4109	VFS_UNLOCK_GIANT(vfslocked);
4110	fdrop(fp, td);
4111	td->td_retval[0] = uap->count - auio.uio_resid;
4112	if (error == 0)
4113		*ploff = loff;
4114	return (error);
4115}
4116#endif /* COMPAT_43 */
4117
4118/*
4119 * Read a block of directory entries in a filesystem independent format.
4120 */
4121#ifndef _SYS_SYSPROTO_H_
4122struct getdirentries_args {
4123	int	fd;
4124	char	*buf;
4125	u_int	count;
4126	long	*basep;
4127};
4128#endif
4129int
4130sys_getdirentries(td, uap)
4131	struct thread *td;
4132	register struct getdirentries_args /* {
4133		int fd;
4134		char *buf;
4135		u_int count;
4136		long *basep;
4137	} */ *uap;
4138{
4139	long base;
4140	int error;
4141
4142	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
4143	if (error)
4144		return (error);
4145	if (uap->basep != NULL)
4146		error = copyout(&base, uap->basep, sizeof(long));
4147	return (error);
4148}
4149
4150int
4151kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4152    long *basep)
4153{
4154	struct vnode *vp;
4155	struct file *fp;
4156	struct uio auio;
4157	struct iovec aiov;
4158	int vfslocked;
4159	long loff;
4160	int error, eofflag;
4161
4162	AUDIT_ARG_FD(fd);
4163	if (count > INT_MAX)
4164		return (EINVAL);
4165	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
4166	    &fp)) != 0)
4167		return (error);
4168	if ((fp->f_flag & FREAD) == 0) {
4169		fdrop(fp, td);
4170		return (EBADF);
4171	}
4172	vp = fp->f_vnode;
4173unionread:
4174	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4175	if (vp->v_type != VDIR) {
4176		VFS_UNLOCK_GIANT(vfslocked);
4177		error = EINVAL;
4178		goto fail;
4179	}
4180	aiov.iov_base = buf;
4181	aiov.iov_len = count;
4182	auio.uio_iov = &aiov;
4183	auio.uio_iovcnt = 1;
4184	auio.uio_rw = UIO_READ;
4185	auio.uio_segflg = UIO_USERSPACE;
4186	auio.uio_td = td;
4187	auio.uio_resid = count;
4188	vn_lock(vp, LK_SHARED | LK_RETRY);
4189	AUDIT_ARG_VNODE1(vp);
4190	loff = auio.uio_offset = fp->f_offset;
4191#ifdef MAC
4192	error = mac_vnode_check_readdir(td->td_ucred, vp);
4193	if (error == 0)
4194#endif
4195		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4196		    NULL);
4197	fp->f_offset = auio.uio_offset;
4198	if (error) {
4199		VOP_UNLOCK(vp, 0);
4200		VFS_UNLOCK_GIANT(vfslocked);
4201		goto fail;
4202	}
4203	if (count == auio.uio_resid &&
4204	    (vp->v_vflag & VV_ROOT) &&
4205	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4206		struct vnode *tvp = vp;
4207		vp = vp->v_mount->mnt_vnodecovered;
4208		VREF(vp);
4209		fp->f_vnode = vp;
4210		fp->f_data = vp;
4211		fp->f_offset = 0;
4212		vput(tvp);
4213		VFS_UNLOCK_GIANT(vfslocked);
4214		goto unionread;
4215	}
4216	VOP_UNLOCK(vp, 0);
4217	VFS_UNLOCK_GIANT(vfslocked);
4218	*basep = loff;
4219	td->td_retval[0] = count - auio.uio_resid;
4220fail:
4221	fdrop(fp, td);
4222	return (error);
4223}
4224
4225#ifndef _SYS_SYSPROTO_H_
4226struct getdents_args {
4227	int fd;
4228	char *buf;
4229	size_t count;
4230};
4231#endif
4232int
4233sys_getdents(td, uap)
4234	struct thread *td;
4235	register struct getdents_args /* {
4236		int fd;
4237		char *buf;
4238		u_int count;
4239	} */ *uap;
4240{
4241	struct getdirentries_args ap;
4242	ap.fd = uap->fd;
4243	ap.buf = uap->buf;
4244	ap.count = uap->count;
4245	ap.basep = NULL;
4246	return (sys_getdirentries(td, &ap));
4247}
4248
4249/*
4250 * Set the mode mask for creation of filesystem nodes.
4251 */
4252#ifndef _SYS_SYSPROTO_H_
4253struct umask_args {
4254	int	newmask;
4255};
4256#endif
4257int
4258sys_umask(td, uap)
4259	struct thread *td;
4260	struct umask_args /* {
4261		int newmask;
4262	} */ *uap;
4263{
4264	register struct filedesc *fdp;
4265
4266	FILEDESC_XLOCK(td->td_proc->p_fd);
4267	fdp = td->td_proc->p_fd;
4268	td->td_retval[0] = fdp->fd_cmask;
4269	fdp->fd_cmask = uap->newmask & ALLPERMS;
4270	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4271	return (0);
4272}
4273
4274/*
4275 * Void all references to file by ripping underlying filesystem away from
4276 * vnode.
4277 */
4278#ifndef _SYS_SYSPROTO_H_
4279struct revoke_args {
4280	char	*path;
4281};
4282#endif
4283int
4284sys_revoke(td, uap)
4285	struct thread *td;
4286	register struct revoke_args /* {
4287		char *path;
4288	} */ *uap;
4289{
4290	struct vnode *vp;
4291	struct vattr vattr;
4292	int error;
4293	struct nameidata nd;
4294	int vfslocked;
4295
4296	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4297	    UIO_USERSPACE, uap->path, td);
4298	if ((error = namei(&nd)) != 0)
4299		return (error);
4300	vfslocked = NDHASGIANT(&nd);
4301	vp = nd.ni_vp;
4302	NDFREE(&nd, NDF_ONLY_PNBUF);
4303	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4304		error = EINVAL;
4305		goto out;
4306	}
4307#ifdef MAC
4308	error = mac_vnode_check_revoke(td->td_ucred, vp);
4309	if (error)
4310		goto out;
4311#endif
4312	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4313	if (error)
4314		goto out;
4315	if (td->td_ucred->cr_uid != vattr.va_uid) {
4316		error = priv_check(td, PRIV_VFS_ADMIN);
4317		if (error)
4318			goto out;
4319	}
4320	if (vcount(vp) > 1)
4321		VOP_REVOKE(vp, REVOKEALL);
4322out:
4323	vput(vp);
4324	VFS_UNLOCK_GIANT(vfslocked);
4325	return (error);
4326}
4327
4328/*
4329 * Convert a user file descriptor to a kernel file entry and check that, if it
4330 * is a capability, the correct rights are present. A reference on the file
4331 * entry is held upon returning.
4332 */
4333int
4334getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
4335    struct file **fpp)
4336{
4337	struct file *fp;
4338#ifdef CAPABILITIES
4339	struct file *fp_fromcap;
4340#endif
4341	int error;
4342
4343	error = 0;
4344	fp = NULL;
4345	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
4346		return (EBADF);
4347#ifdef CAPABILITIES
4348	/*
4349	 * If the file descriptor is for a capability, test rights and use the
4350	 * file descriptor referenced by the capability.
4351	 */
4352	error = cap_funwrap(fp, rights, &fp_fromcap);
4353	if (error) {
4354		fdrop(fp, curthread);
4355		return (error);
4356	}
4357	if (fp != fp_fromcap) {
4358		fhold(fp_fromcap);
4359		fdrop(fp, curthread);
4360		fp = fp_fromcap;
4361	}
4362#endif /* CAPABILITIES */
4363
4364	/*
4365	 * The file could be not of the vnode type, or it may be not
4366	 * yet fully initialized, in which case the f_vnode pointer
4367	 * may be set, but f_ops is still badfileops.  E.g.,
4368	 * devfs_open() transiently create such situation to
4369	 * facilitate csw d_fdopen().
4370	 *
4371	 * Dupfdopen() handling in kern_openat() installs the
4372	 * half-baked file into the process descriptor table, allowing
4373	 * other thread to dereference it. Guard against the race by
4374	 * checking f_ops.
4375	 */
4376	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4377		fdrop(fp, curthread);
4378		return (EINVAL);
4379	}
4380	*fpp = fp;
4381	return (0);
4382}
4383
4384
4385/*
4386 * Get an (NFS) file handle.
4387 */
4388#ifndef _SYS_SYSPROTO_H_
4389struct lgetfh_args {
4390	char	*fname;
4391	fhandle_t *fhp;
4392};
4393#endif
4394int
4395sys_lgetfh(td, uap)
4396	struct thread *td;
4397	register struct lgetfh_args *uap;
4398{
4399	struct nameidata nd;
4400	fhandle_t fh;
4401	register struct vnode *vp;
4402	int vfslocked;
4403	int error;
4404
4405	error = priv_check(td, PRIV_VFS_GETFH);
4406	if (error)
4407		return (error);
4408	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4409	    UIO_USERSPACE, uap->fname, td);
4410	error = namei(&nd);
4411	if (error)
4412		return (error);
4413	vfslocked = NDHASGIANT(&nd);
4414	NDFREE(&nd, NDF_ONLY_PNBUF);
4415	vp = nd.ni_vp;
4416	bzero(&fh, sizeof(fh));
4417	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4418	error = VOP_VPTOFH(vp, &fh.fh_fid);
4419	vput(vp);
4420	VFS_UNLOCK_GIANT(vfslocked);
4421	if (error)
4422		return (error);
4423	error = copyout(&fh, uap->fhp, sizeof (fh));
4424	return (error);
4425}
4426
4427#ifndef _SYS_SYSPROTO_H_
4428struct getfh_args {
4429	char	*fname;
4430	fhandle_t *fhp;
4431};
4432#endif
4433int
4434sys_getfh(td, uap)
4435	struct thread *td;
4436	register struct getfh_args *uap;
4437{
4438	struct nameidata nd;
4439	fhandle_t fh;
4440	register struct vnode *vp;
4441	int vfslocked;
4442	int error;
4443
4444	error = priv_check(td, PRIV_VFS_GETFH);
4445	if (error)
4446		return (error);
4447	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
4448	    UIO_USERSPACE, uap->fname, td);
4449	error = namei(&nd);
4450	if (error)
4451		return (error);
4452	vfslocked = NDHASGIANT(&nd);
4453	NDFREE(&nd, NDF_ONLY_PNBUF);
4454	vp = nd.ni_vp;
4455	bzero(&fh, sizeof(fh));
4456	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4457	error = VOP_VPTOFH(vp, &fh.fh_fid);
4458	vput(vp);
4459	VFS_UNLOCK_GIANT(vfslocked);
4460	if (error)
4461		return (error);
4462	error = copyout(&fh, uap->fhp, sizeof (fh));
4463	return (error);
4464}
4465
4466/*
4467 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4468 * open descriptor.
4469 *
4470 * warning: do not remove the priv_check() call or this becomes one giant
4471 * security hole.
4472 */
4473#ifndef _SYS_SYSPROTO_H_
4474struct fhopen_args {
4475	const struct fhandle *u_fhp;
4476	int flags;
4477};
4478#endif
4479int
4480sys_fhopen(td, uap)
4481	struct thread *td;
4482	struct fhopen_args /* {
4483		const struct fhandle *u_fhp;
4484		int flags;
4485	} */ *uap;
4486{
4487	struct proc *p = td->td_proc;
4488	struct mount *mp;
4489	struct vnode *vp;
4490	struct fhandle fhp;
4491	struct vattr vat;
4492	struct vattr *vap = &vat;
4493	struct flock lf;
4494	struct file *fp;
4495	register struct filedesc *fdp = p->p_fd;
4496	int fmode, error, type;
4497	accmode_t accmode;
4498	struct file *nfp;
4499	int vfslocked;
4500	int indx;
4501
4502	error = priv_check(td, PRIV_VFS_FHOPEN);
4503	if (error)
4504		return (error);
4505	fmode = FFLAGS(uap->flags);
4506	/* why not allow a non-read/write open for our lockd? */
4507	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4508		return (EINVAL);
4509	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4510	if (error)
4511		return(error);
4512	/* find the mount point */
4513	mp = vfs_busyfs(&fhp.fh_fsid);
4514	if (mp == NULL)
4515		return (ESTALE);
4516	vfslocked = VFS_LOCK_GIANT(mp);
4517	/* now give me my vnode, it gets returned to me locked */
4518	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4519	vfs_unbusy(mp);
4520	if (error)
4521		goto out;
4522	/*
4523	 * from now on we have to make sure not
4524	 * to forget about the vnode
4525	 * any error that causes an abort must vput(vp)
4526	 * just set error = err and 'goto bad;'.
4527	 */
4528
4529	/*
4530	 * from vn_open
4531	 */
4532	if (vp->v_type == VLNK) {
4533		error = EMLINK;
4534		goto bad;
4535	}
4536	if (vp->v_type == VSOCK) {
4537		error = EOPNOTSUPP;
4538		goto bad;
4539	}
4540	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
4541		error = ENOTDIR;
4542		goto bad;
4543	}
4544	accmode = 0;
4545	if (fmode & (FWRITE | O_TRUNC)) {
4546		if (vp->v_type == VDIR) {
4547			error = EISDIR;
4548			goto bad;
4549		}
4550		error = vn_writechk(vp);
4551		if (error)
4552			goto bad;
4553		accmode |= VWRITE;
4554	}
4555	if (fmode & FREAD)
4556		accmode |= VREAD;
4557	if ((fmode & O_APPEND) && (fmode & FWRITE))
4558		accmode |= VAPPEND;
4559#ifdef MAC
4560	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
4561	if (error)
4562		goto bad;
4563#endif
4564	if (accmode) {
4565		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
4566		if (error)
4567			goto bad;
4568	}
4569	if (fmode & O_TRUNC) {
4570		vfs_ref(mp);
4571		VOP_UNLOCK(vp, 0);				/* XXX */
4572		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
4573			vrele(vp);
4574			vfs_rel(mp);
4575			goto out;
4576		}
4577		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4578		vfs_rel(mp);
4579#ifdef MAC
4580		/*
4581		 * We don't yet have fp->f_cred, so use td->td_ucred, which
4582		 * should be right.
4583		 */
4584		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
4585		if (error == 0) {
4586#endif
4587			VATTR_NULL(vap);
4588			vap->va_size = 0;
4589			error = VOP_SETATTR(vp, vap, td->td_ucred);
4590#ifdef MAC
4591		}
4592#endif
4593		vn_finished_write(mp);
4594		if (error)
4595			goto bad;
4596	}
4597	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
4598	if (error)
4599		goto bad;
4600
4601	if (fmode & FWRITE)
4602		vp->v_writecount++;
4603
4604	/*
4605	 * end of vn_open code
4606	 */
4607
4608	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
4609		if (fmode & FWRITE)
4610			vp->v_writecount--;
4611		goto bad;
4612	}
4613	/* An extra reference on `nfp' has been held for us by falloc(). */
4614	fp = nfp;
4615	nfp->f_vnode = vp;
4616	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
4617	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4618		lf.l_whence = SEEK_SET;
4619		lf.l_start = 0;
4620		lf.l_len = 0;
4621		if (fmode & O_EXLOCK)
4622			lf.l_type = F_WRLCK;
4623		else
4624			lf.l_type = F_RDLCK;
4625		type = F_FLOCK;
4626		if ((fmode & FNONBLOCK) == 0)
4627			type |= F_WAIT;
4628		VOP_UNLOCK(vp, 0);
4629		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
4630			    type)) != 0) {
4631			/*
4632			 * The lock request failed.  Normally close the
4633			 * descriptor but handle the case where someone might
4634			 * have dup()d or close()d it when we weren't looking.
4635			 */
4636			fdclose(fdp, fp, indx, td);
4637
4638			/*
4639			 * release our private reference
4640			 */
4641			fdrop(fp, td);
4642			goto out;
4643		}
4644		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4645		atomic_set_int(&fp->f_flag, FHASLOCK);
4646	}
4647
4648	VOP_UNLOCK(vp, 0);
4649	fdrop(fp, td);
4650	VFS_UNLOCK_GIANT(vfslocked);
4651	td->td_retval[0] = indx;
4652	return (0);
4653
4654bad:
4655	vput(vp);
4656out:
4657	VFS_UNLOCK_GIANT(vfslocked);
4658	return (error);
4659}
4660
4661/*
4662 * Stat an (NFS) file handle.
4663 */
4664#ifndef _SYS_SYSPROTO_H_
4665struct fhstat_args {
4666	struct fhandle *u_fhp;
4667	struct stat *sb;
4668};
4669#endif
4670int
4671sys_fhstat(td, uap)
4672	struct thread *td;
4673	register struct fhstat_args /* {
4674		struct fhandle *u_fhp;
4675		struct stat *sb;
4676	} */ *uap;
4677{
4678	struct stat sb;
4679	fhandle_t fh;
4680	struct mount *mp;
4681	struct vnode *vp;
4682	int vfslocked;
4683	int error;
4684
4685	error = priv_check(td, PRIV_VFS_FHSTAT);
4686	if (error)
4687		return (error);
4688	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4689	if (error)
4690		return (error);
4691	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4692		return (ESTALE);
4693	vfslocked = VFS_LOCK_GIANT(mp);
4694	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4695	vfs_unbusy(mp);
4696	if (error) {
4697		VFS_UNLOCK_GIANT(vfslocked);
4698		return (error);
4699	}
4700	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
4701	vput(vp);
4702	VFS_UNLOCK_GIANT(vfslocked);
4703	if (error)
4704		return (error);
4705	error = copyout(&sb, uap->sb, sizeof(sb));
4706	return (error);
4707}
4708
4709/*
4710 * Implement fstatfs() for (NFS) file handles.
4711 */
4712#ifndef _SYS_SYSPROTO_H_
4713struct fhstatfs_args {
4714	struct fhandle *u_fhp;
4715	struct statfs *buf;
4716};
4717#endif
4718int
4719sys_fhstatfs(td, uap)
4720	struct thread *td;
4721	struct fhstatfs_args /* {
4722		struct fhandle *u_fhp;
4723		struct statfs *buf;
4724	} */ *uap;
4725{
4726	struct statfs sf;
4727	fhandle_t fh;
4728	int error;
4729
4730	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4731	if (error)
4732		return (error);
4733	error = kern_fhstatfs(td, fh, &sf);
4734	if (error)
4735		return (error);
4736	return (copyout(&sf, uap->buf, sizeof(sf)));
4737}
4738
4739int
4740kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4741{
4742	struct statfs *sp;
4743	struct mount *mp;
4744	struct vnode *vp;
4745	int vfslocked;
4746	int error;
4747
4748	error = priv_check(td, PRIV_VFS_FHSTATFS);
4749	if (error)
4750		return (error);
4751	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4752		return (ESTALE);
4753	vfslocked = VFS_LOCK_GIANT(mp);
4754	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4755	if (error) {
4756		vfs_unbusy(mp);
4757		VFS_UNLOCK_GIANT(vfslocked);
4758		return (error);
4759	}
4760	vput(vp);
4761	error = prison_canseemount(td->td_ucred, mp);
4762	if (error)
4763		goto out;
4764#ifdef MAC
4765	error = mac_mount_check_stat(td->td_ucred, mp);
4766	if (error)
4767		goto out;
4768#endif
4769	/*
4770	 * Set these in case the underlying filesystem fails to do so.
4771	 */
4772	sp = &mp->mnt_stat;
4773	sp->f_version = STATFS_VERSION;
4774	sp->f_namemax = NAME_MAX;
4775	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4776	error = VFS_STATFS(mp, sp);
4777	if (error == 0)
4778		*buf = *sp;
4779out:
4780	vfs_unbusy(mp);
4781	VFS_UNLOCK_GIANT(vfslocked);
4782	return (error);
4783}
4784
4785int
4786kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4787{
4788	struct file *fp;
4789	struct mount *mp;
4790	struct vnode *vp;
4791	off_t olen, ooffset;
4792	int error, vfslocked;
4793
4794	fp = NULL;
4795	vfslocked = 0;
4796	error = fget(td, fd, CAP_WRITE, &fp);
4797	if (error != 0)
4798		goto out;
4799
4800	switch (fp->f_type) {
4801	case DTYPE_VNODE:
4802		break;
4803	case DTYPE_PIPE:
4804	case DTYPE_FIFO:
4805		error = ESPIPE;
4806		goto out;
4807	default:
4808		error = ENODEV;
4809		goto out;
4810	}
4811	if ((fp->f_flag & FWRITE) == 0) {
4812		error = EBADF;
4813		goto out;
4814	}
4815	vp = fp->f_vnode;
4816	if (vp->v_type != VREG) {
4817		error = ENODEV;
4818		goto out;
4819	}
4820	if (offset < 0 || len <= 0) {
4821		error = EINVAL;
4822		goto out;
4823	}
4824	/* Check for wrap. */
4825	if (offset > OFF_MAX - len) {
4826		error = EFBIG;
4827		goto out;
4828	}
4829
4830	/* Allocating blocks may take a long time, so iterate. */
4831	for (;;) {
4832		olen = len;
4833		ooffset = offset;
4834
4835		bwillwrite();
4836		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
4837		mp = NULL;
4838		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4839		if (error != 0) {
4840			VFS_UNLOCK_GIANT(vfslocked);
4841			break;
4842		}
4843		error = vn_lock(vp, LK_EXCLUSIVE);
4844		if (error != 0) {
4845			vn_finished_write(mp);
4846			VFS_UNLOCK_GIANT(vfslocked);
4847			break;
4848		}
4849#ifdef MAC
4850		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4851		if (error == 0)
4852#endif
4853			error = VOP_ALLOCATE(vp, &offset, &len);
4854		VOP_UNLOCK(vp, 0);
4855		vn_finished_write(mp);
4856		VFS_UNLOCK_GIANT(vfslocked);
4857
4858		if (olen + ooffset != offset + len) {
4859			panic("offset + len changed from %jx/%jx to %jx/%jx",
4860			    ooffset, olen, offset, len);
4861		}
4862		if (error != 0 || len == 0)
4863			break;
4864		KASSERT(olen > len, ("Iteration did not make progress?"));
4865		maybe_yield();
4866	}
4867 out:
4868	if (fp != NULL)
4869		fdrop(fp, td);
4870	return (error);
4871}
4872
4873int
4874sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4875{
4876
4877	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
4878}
4879
4880/*
4881 * Unlike madvise(2), we do not make a best effort to remember every
4882 * possible caching hint.  Instead, we remember the last setting with
4883 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4884 * region of any current setting.
4885 */
4886int
4887kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4888    int advice)
4889{
4890	struct fadvise_info *fa, *new;
4891	struct file *fp;
4892	struct vnode *vp;
4893	off_t end;
4894	int error;
4895
4896	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4897		return (EINVAL);
4898	switch (advice) {
4899	case POSIX_FADV_SEQUENTIAL:
4900	case POSIX_FADV_RANDOM:
4901	case POSIX_FADV_NOREUSE:
4902		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4903		break;
4904	case POSIX_FADV_NORMAL:
4905	case POSIX_FADV_WILLNEED:
4906	case POSIX_FADV_DONTNEED:
4907		new = NULL;
4908		break;
4909	default:
4910		return (EINVAL);
4911	}
4912	/* XXX: CAP_POSIX_FADVISE? */
4913	error = fget(td, fd, 0, &fp);
4914	if (error != 0)
4915		goto out;
4916
4917	switch (fp->f_type) {
4918	case DTYPE_VNODE:
4919		break;
4920	case DTYPE_PIPE:
4921	case DTYPE_FIFO:
4922		error = ESPIPE;
4923		goto out;
4924	default:
4925		error = ENODEV;
4926		goto out;
4927	}
4928	vp = fp->f_vnode;
4929	if (vp->v_type != VREG) {
4930		error = ENODEV;
4931		goto out;
4932	}
4933	if (len == 0)
4934		end = OFF_MAX;
4935	else
4936		end = offset + len - 1;
4937	switch (advice) {
4938	case POSIX_FADV_SEQUENTIAL:
4939	case POSIX_FADV_RANDOM:
4940	case POSIX_FADV_NOREUSE:
4941		/*
4942		 * Try to merge any existing non-standard region with
4943		 * this new region if possible, otherwise create a new
4944		 * non-standard region for this request.
4945		 */
4946		mtx_pool_lock(mtxpool_sleep, fp);
4947		fa = fp->f_advice;
4948		if (fa != NULL && fa->fa_advice == advice &&
4949		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4950		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4951		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4952			if (offset < fa->fa_start)
4953				fa->fa_start = offset;
4954			if (end > fa->fa_end)
4955				fa->fa_end = end;
4956		} else {
4957			new->fa_advice = advice;
4958			new->fa_start = offset;
4959			new->fa_end = end;
4960			fp->f_advice = new;
4961			new = fa;
4962		}
4963		mtx_pool_unlock(mtxpool_sleep, fp);
4964		break;
4965	case POSIX_FADV_NORMAL:
4966		/*
4967		 * If a the "normal" region overlaps with an existing
4968		 * non-standard region, trim or remove the
4969		 * non-standard region.
4970		 */
4971		mtx_pool_lock(mtxpool_sleep, fp);
4972		fa = fp->f_advice;
4973		if (fa != NULL) {
4974			if (offset <= fa->fa_start && end >= fa->fa_end) {
4975				new = fa;
4976				fp->f_advice = NULL;
4977			} else if (offset <= fa->fa_start &&
4978 			    end >= fa->fa_start)
4979				fa->fa_start = end + 1;
4980			else if (offset <= fa->fa_end && end >= fa->fa_end)
4981				fa->fa_end = offset - 1;
4982			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4983				/*
4984				 * If the "normal" region is a middle
4985				 * portion of the existing
4986				 * non-standard region, just remove
4987				 * the whole thing rather than picking
4988				 * one side or the other to
4989				 * preserve.
4990				 */
4991				new = fa;
4992				fp->f_advice = NULL;
4993			}
4994		}
4995		mtx_pool_unlock(mtxpool_sleep, fp);
4996		break;
4997	case POSIX_FADV_WILLNEED:
4998	case POSIX_FADV_DONTNEED:
4999		error = VOP_ADVISE(vp, offset, end, advice);
5000		break;
5001	}
5002out:
5003	if (fp != NULL)
5004		fdrop(fp, td);
5005	free(new, M_FADVISE);
5006	return (error);
5007}
5008
5009int
5010sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
5011{
5012
5013	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
5014	    uap->advice));
5015}
5016