1/*
2 * Copyright (c) 1995-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1989, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *	This product includes software developed by the University of
48 *	California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections.  This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/namei.h>
77#include <sys/filedesc.h>
78#include <sys/kernel.h>
79#include <sys/file_internal.h>
80#include <sys/stat.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/proc_internal.h>
84#include <sys/kauth.h>
85#include <sys/uio_internal.h>
86#include <sys/malloc.h>
87#include <sys/mman.h>
88#include <sys/dirent.h>
89#include <sys/attr.h>
90#include <sys/sysctl.h>
91#include <sys/ubc.h>
92#include <sys/quota.h>
93#include <sys/kdebug.h>
94#include <sys/fsevents.h>
95#include <sys/imgsrc.h>
96#include <sys/sysproto.h>
97#include <sys/xattr.h>
98#include <sys/fcntl.h>
99#include <sys/fsctl.h>
100#include <sys/ubc_internal.h>
101#include <sys/disk.h>
102#include <machine/cons.h>
103#include <machine/limits.h>
104#include <miscfs/specfs/specdev.h>
105
106#include <security/audit/audit.h>
107#include <bsm/audit_kevents.h>
108
109#include <mach/mach_types.h>
110#include <kern/kern_types.h>
111#include <kern/kalloc.h>
112#include <kern/task.h>
113
114#include <vm/vm_pageout.h>
115
116#include <libkern/OSAtomic.h>
117#include <pexpert/pexpert.h>
118
119#if CONFIG_MACF
120#include <security/mac.h>
121#include <security/mac_framework.h>
122#endif
123
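/*
 * GET_PATH/RELEASE_PATH manage MAXPATHLEN path buffers: when fsevents
 * support is built in they use get_pathbuff()/release_pathbuff(),
 * otherwise they allocate from and free to the M_NAMEI zone.
 */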
124#if CONFIG_FSE
125#define GET_PATH(x) \
126	(x) = get_pathbuff();
127#define RELEASE_PATH(x) \
128	release_pathbuff(x);
129#else
130#define GET_PATH(x)	\
131	MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
132#define RELEASE_PATH(x) \
133	FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
134#endif /* CONFIG_FSE */
135
136/* struct for checkdirs iteration */
137struct cdirargs {
138	vnode_t olddp;
139	vnode_t newdp;
140};
141/* callback  for checkdirs iteration */
142static int checkdirs_callback(proc_t p, void * arg);
143
144static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
145static int checkdirs(vnode_t olddp, vfs_context_t ctx);
146void enablequotas(struct mount *mp, vfs_context_t ctx);
147static int getfsstat_callback(mount_t mp, void * arg);
148static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
149static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
150static int sync_callback(mount_t, void *);
151static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
152			user_addr_t bufp, int *sizep, boolean_t is_64_bit,
153						boolean_t partial_copy);
154static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
155			user_addr_t bufp);
156static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
157static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
158                        struct componentname *cnp, user_addr_t fsmountargs,
159                        int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
160                        vfs_context_t ctx);
161void vfs_notify_mount(vnode_t pdvp);
162
163int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
164
165#ifdef CONFIG_IMGSRC_ACCESS
166static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
167static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
168static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
169static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
170static void mount_end_update(mount_t mp);
171static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
172#endif /* CONFIG_IMGSRC_ACCESS */
173
174int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
175
176__private_extern__
177int sync_internal(void);
178
179__private_extern__
180int unlink1(vfs_context_t, struct nameidata *, int);
181
/*
 * Incremented each time a mount or unmount operation occurs; used to
 * invalidate the cached value of the rootvp in the mount structure
 * utilized by cache_lookup_path.
 */
187uint32_t mount_generation = 0;
188
189/* counts number of mount and unmount operations */
190unsigned int vfs_nummntops=0;
191
192extern const struct fileops vnops;
193#if CONFIG_APPLEDOUBLE
194extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
195#endif /* CONFIG_APPLEDOUBLE */
196
197/*
198 * Virtual File System System Calls
199 */
200
201#if NFSCLIENT
202/*
203 * Private in-kernel mounting spi (NFS only, not exported)
204 */
205 __private_extern__
206boolean_t
207vfs_iskernelmount(mount_t mp)
208{
209	return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
210}
211
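/*
 * kernel_mount: perform a mount on behalf of the kernel (NFS only).
 * If no covered vnode is supplied, the given path is looked up here to
 * find the covered vnode and its parent before calling mount_common().
 */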
212 __private_extern__
213int
214kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
215             void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
216{
217	struct nameidata nd;
218	boolean_t did_namei;
219	int error;
220
221	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
222	       UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
223
224	/*
225	 * Get the vnode to be covered if it's not supplied
226	 */
227	if (vp == NULLVP) {
228		error = namei(&nd);
229		if (error)
230			return (error);
231		vp = nd.ni_vp;
232		pvp = nd.ni_dvp;
233		did_namei = TRUE;
234	} else {
235		char *pnbuf = CAST_DOWN(char *, path);
236
237		nd.ni_cnd.cn_pnbuf = pnbuf;
238		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
239		did_namei = FALSE;
240	}
241
242	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
243	                     syscall_flags, kern_flags, NULL, TRUE, ctx);
244
245	if (did_namei) {
246		vnode_put(vp);
247		vnode_put(pvp);
248		nameidone(&nd);
249	}
250
251	return (error);
252}
253#endif /* NFSCLIENT */
254
255/*
256 * Mount a file system.
257 */
258/* ARGSUSED */
259int
260mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
261{
262	struct __mac_mount_args muap;
263
264	muap.type = uap->type;
265	muap.path = uap->path;
266	muap.flags = uap->flags;
267	muap.data = uap->data;
268	muap.mac_p = USER_ADDR_NULL;
269	return (__mac_mount(p, &muap, retval));
270}
271
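/*
 * Notify interested parties that a file system has been mounted:
 * post a VQ_MOUNT event and a NOTE_WRITE on the covered vnode's parent.
 */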
272void
273vfs_notify_mount(vnode_t pdvp)
274{
275	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
276	lock_vnode_and_post(pdvp, NOTE_WRITE);
277}
278
279/*
280 * __mac_mount:
281 *	Mount a file system taking into account MAC label behavior.
282 *	See mount(2) man page for more information
283 *
284 * Parameters:    p                        Process requesting the mount
285 *                uap                      User argument descriptor (see below)
286 *                retval                   (ignored)
287 *
288 * Indirect:      uap->type                Filesystem type
289 *                uap->path                Path to mount
290 *                uap->data                Mount arguments
291 *                uap->mac_p               MAC info
292 *                uap->flags               Mount flags
293 *
294 *
 * Returns:        0                       Success
 *                !0                       An errno value on failure
297 */
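
/* Set when an attempt is made to mount the root file system read-write */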
298boolean_t root_fs_upgrade_try = FALSE;
299
300int
301__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
302{
303	vnode_t pvp = NULL;
	vnode_t vp = NULL;
305	int need_nameidone = 0;
306	vfs_context_t ctx = vfs_context_current();
307	char fstypename[MFSNAMELEN];
308	struct nameidata nd;
	size_t dummy = 0;
310	char *labelstr = NULL;
311	int flags = uap->flags;
312	int error;
313#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
314	boolean_t is_64bit = IS_64BIT_PROCESS(p);
315#else
316#pragma unused(p)
317#endif
318	/*
319	 * Get the fs type name from user space
320	 */
321	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
322	if (error)
323		return (error);
324
325	/*
326	 * Get the vnode to be covered
327	 */
328	NDINIT(&nd, LOOKUP, OP_MOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1 | WANTPARENT,
329	       UIO_USERSPACE, uap->path, ctx);
330	error = namei(&nd);
331	if (error) {
332		goto out;
333	}
334	need_nameidone = 1;
335	vp = nd.ni_vp;
336	pvp = nd.ni_dvp;
337
338#ifdef CONFIG_IMGSRC_ACCESS
339	/* Mounting image source cannot be batched with other operations */
340	if (flags == MNT_IMGSRC_BY_INDEX) {
341		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
342		                                  ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
343		goto out;
344	}
345#endif /* CONFIG_IMGSRC_ACCESS */
346
347#if CONFIG_MACF
348	/*
349	 * Get the label string (if any) from user space
350	 */
351	if (uap->mac_p != USER_ADDR_NULL) {
352		struct user_mac mac;
353		size_t ulen = 0;
354
355		if (is_64bit) {
356			struct user64_mac mac64;
357			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
358			mac.m_buflen = mac64.m_buflen;
359			mac.m_string = mac64.m_string;
360		} else {
361			struct user32_mac mac32;
362			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
363			mac.m_buflen = mac32.m_buflen;
364			mac.m_string = mac32.m_string;
365		}
366		if (error)
367			goto out;
368		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
369		    (mac.m_buflen < 2)) {
370			error = EINVAL;
371			goto out;
372		}
373		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
374		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
375		if (error) {
376			goto out;
377		}
378		AUDIT_ARG(mac_string, labelstr);
379	}
380#endif /* CONFIG_MACF */
381
382	AUDIT_ARG(fflags, flags);
383
384	if ((vp->v_flag & VROOT) &&
385			(vp->v_mount->mnt_flag & MNT_ROOTFS)) {
386		if (!(flags & MNT_UNION)) {
387			flags |= MNT_UPDATE;
388		}
389		else {
			/*
			 * For a union mount on '/', treat it as a fresh
			 * mount instead of an update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system, since mnt_vnodecovered was found to
			 * be NULL for '/', and it is required by unionlookup
			 * after it gets ENOENT on a union mount.
			 */
398			flags = (flags & ~(MNT_UPDATE));
399		}
400
401#if 0
402//#ifdef SECURE_KERNEL
403		if ((flags & MNT_RDONLY) == 0) {
404			/* Release kernels are not allowed to mount "/" as rw */
405			error = EPERM;
406			goto out;
407		}
408//#endif
409#endif
410		/*
411		 * See 7392553 for more details on why this check exists.
412		 * Suffice to say: If this check is ON and something tries
413		 * to mount the rootFS RW, we'll turn off the codesign
414		 * bitmap optimization.
415		 */
416#if CHECK_CS_VALIDATION_BITMAP
417		if ((flags & MNT_RDONLY) == 0 ) {
418			root_fs_upgrade_try = TRUE;
419		}
420#endif
421	}
422
423	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
424	                     labelstr, FALSE, ctx);
425
426out:
427
428#if CONFIG_MACF
429	if (labelstr)
430		FREE(labelstr, M_MACTEMP);
431#endif /* CONFIG_MACF */
432
433	if (vp) {
434		vnode_put(vp);
435	}
436	if (pvp) {
437		vnode_put(pvp);
438	}
439	if (need_nameidone) {
440		nameidone(&nd);
441	}
442
443	return (error);
444}
445
/*
 * Common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (i.e., its VFS name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (i.e., path) of covered vnode
 *  flags	generic mount flags
 *  internal_flags	KERNEL_MOUNT_* flags for kernel-initiated mounts
 *  fsmountargs	file system specific data
 *  labelstr	optional MAC label
 *  kernelmount	TRUE for mounts initiated from inside the kernel
 *  ctx		caller's context
 */
460static int
461mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
462             struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
463             char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
464{
465#if !CONFIG_MACF
466#pragma unused(labelstr)
467#endif
468	struct vnode *devvp = NULLVP;
469	struct vnode *device_vnode = NULLVP;
470#if CONFIG_MACF
471	struct vnode *rvp;
472#endif
473	struct mount *mp;
474	struct vfstable *vfsp = (struct vfstable *)0;
475	struct proc *p = vfs_context_proc(ctx);
476	int error, flag = 0;
477	user_addr_t devpath = USER_ADDR_NULL;
478	int ronly = 0;
479	int mntalloc = 0;
480	boolean_t vfsp_ref = FALSE;
481	boolean_t is_rwlock_locked = FALSE;
482	boolean_t did_rele = FALSE;
483	boolean_t have_usecount = FALSE;
484
485	/*
486	 * Process an update for an existing mount
487	 */
488	if (flags & MNT_UPDATE) {
489		if ((vp->v_flag & VROOT) == 0) {
490			error = EINVAL;
491			goto out1;
492		}
493		mp = vp->v_mount;
494
		/* if an unmount is in progress, return an error */
496		mount_lock_spin(mp);
497		if (mp->mnt_lflag & MNT_LUNMOUNT) {
498			mount_unlock(mp);
499			error = EBUSY;
500			goto out1;
501		}
502		mount_unlock(mp);
503		lck_rw_lock_exclusive(&mp->mnt_rwlock);
504		is_rwlock_locked = TRUE;
505		/*
506		 * We only allow the filesystem to be reloaded if it
507		 * is currently mounted read-only.
508		 */
509		if ((flags & MNT_RELOAD) &&
510		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
511			error = ENOTSUP;
512			goto out1;
513		}
514
515		/*
516		 * If content protection is enabled, update mounts are not
517		 * allowed to turn it off.
518		 */
519		if ((mp->mnt_flag & MNT_CPROTECT) &&
520			   ((flags & MNT_CPROTECT) == 0)) {
521			error = EINVAL;
522			goto out1;
523		}
524
525#ifdef CONFIG_IMGSRC_ACCESS
526		/* Can't downgrade the backer of the root FS */
527		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
528			(!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
529			error = ENOTSUP;
530			goto out1;
531		}
532#endif /* CONFIG_IMGSRC_ACCESS */
533
534		/*
535		 * Only root, or the user that did the original mount is
536		 * permitted to update it.
537		 */
538		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
539		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
540			goto out1;
541		}
542#if CONFIG_MACF
543		error = mac_mount_check_remount(ctx, mp);
544		if (error != 0) {
545			goto out1;
546		}
547#endif
548		/*
549		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
550		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
551		 */
552		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
553			flags |= MNT_NOSUID | MNT_NODEV;
554			if (mp->mnt_flag & MNT_NOEXEC)
555				flags |= MNT_NOEXEC;
556		}
		flag = mp->mnt_flag;

		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
562
563		vfsp = mp->mnt_vtable;
564		goto update;
565	}
566	/*
567	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
568	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
569	 */
570	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
571		flags |= MNT_NOSUID | MNT_NODEV;
572		if (vp->v_mount->mnt_flag & MNT_NOEXEC)
573			flags |= MNT_NOEXEC;
574	}
575
576	/* XXXAUDIT: Should we capture the type on the error path as well? */
577	AUDIT_ARG(text, fstypename);
578	mount_list_lock();
579	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
580		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
581			vfsp->vfc_refcount++;
582			vfsp_ref = TRUE;
583			break;
584		}
585	mount_list_unlock();
586	if (vfsp == NULL) {
587		error = ENODEV;
588		goto out1;
589	}
590
591	/*
592	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
593	 */
594	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
595		error = EINVAL;  /* unsupported request */
596		goto out1;
597	}
598
599	error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
600	if (error != 0) {
601		goto out1;
602	}
603
604	/*
605	 * Allocate and initialize the filesystem (mount_t)
606	 */
607	MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
608		M_MOUNT, M_WAITOK);
609	bzero((char *)mp, (u_int32_t)sizeof(struct mount));
610	mntalloc = 1;
611
612	/* Initialize the default IO constraints */
613	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
614	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
615	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
616	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
617	mp->mnt_devblocksize = DEV_BSIZE;
618	mp->mnt_alignmentmask = PAGE_MASK;
619	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
620	mp->mnt_ioscale = 1;
621	mp->mnt_ioflags = 0;
622	mp->mnt_realrootvp = NULLVP;
623	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
624
625	TAILQ_INIT(&mp->mnt_vnodelist);
626	TAILQ_INIT(&mp->mnt_workerqueue);
627	TAILQ_INIT(&mp->mnt_newvnodes);
628	mount_lock_init(mp);
629	lck_rw_lock_exclusive(&mp->mnt_rwlock);
630	is_rwlock_locked = TRUE;
631	mp->mnt_op = vfsp->vfc_vfsops;
632	mp->mnt_vtable = vfsp;
633	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
634	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
635	strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
636	strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
637	mp->mnt_vnodecovered = vp;
638	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
639	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
640	mp->mnt_devbsdunit = 0;
641
642	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
643	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
644
645#if NFSCLIENT
646	if (kernelmount)
647		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
648	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
649		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
650#endif /* NFSCLIENT */
651
652update:
653	/*
654	 * Set the mount level flags.
655	 */
656	if (flags & MNT_RDONLY)
657		mp->mnt_flag |= MNT_RDONLY;
658	else if (mp->mnt_flag & MNT_RDONLY) {
659		// disallow read/write upgrades of file systems that
660		// had the TYPENAME_OVERRIDE feature set.
661		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
662			error = EPERM;
663			goto out1;
664		}
665		mp->mnt_kern_flag |= MNTK_WANTRDWR;
666	}
667	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
668			  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
669			  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
670			  MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
671			  MNT_QUARANTINE | MNT_CPROTECT);
672	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
673				 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
674				 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
675				 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
676				 MNT_QUARANTINE | MNT_CPROTECT);
677
678#if CONFIG_MACF
679	if (flags & MNT_MULTILABEL) {
680		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
681			error = EINVAL;
682			goto out1;
683		}
684		mp->mnt_flag |= MNT_MULTILABEL;
685	}
686#endif
687	/*
688	 * Process device path for local file systems if requested
689	 */
690	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
691		if (vfs_context_is64bit(ctx)) {
692			if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
693				goto out1;
694			fsmountargs += sizeof(devpath);
695		} else {
696			user32_addr_t tmp;
697			if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
698				goto out1;
699			/* munge into LP64 addr */
700			devpath = CAST_USER_ADDR_T(tmp);
701			fsmountargs += sizeof(tmp);
702		}
703
704		/* Lookup device and authorize access to it */
705		if ((devpath)) {
706			struct nameidata nd;
707
708			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
709			if ( (error = namei(&nd)) )
710				goto out1;
711
712			strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
713			devvp = nd.ni_vp;
714
715			nameidone(&nd);
716
717			if (devvp->v_type != VBLK) {
718				error = ENOTBLK;
719				goto out2;
720			}
721			if (major(devvp->v_rdev) >= nblkdev) {
722				error = ENXIO;
723				goto out2;
724			}
			/*
			 * If mounting by non-root, then verify that the user has
			 * the necessary permissions on the device.
			 */
729			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
730				mode_t accessmode = KAUTH_VNODE_READ_DATA;
731
732				if ((mp->mnt_flag & MNT_RDONLY) == 0)
733					accessmode |= KAUTH_VNODE_WRITE_DATA;
734				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
735					goto out2;
736			}
737		}
738		/* On first mount, preflight and open device */
739		if (devpath && ((flags & MNT_UPDATE) == 0)) {
740			if ( (error = vnode_ref(devvp)) )
741				goto out2;
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
748			if ( (error = vfs_mountedon(devvp)) )
749				goto out3;
750
751			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
752				error = EBUSY;
753				goto out3;
754			}
755			if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
756				error = ENOTBLK;
757				goto out3;
758			}
759			if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
760				goto out3;
761
762			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
763#if CONFIG_MACF
764			error = mac_vnode_check_open(ctx,
765			    devvp,
766			    ronly ? FREAD : FREAD|FWRITE);
767			if (error)
768				goto out3;
769#endif /* MAC */
770			if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
771				goto out3;
772
773			mp->mnt_devvp = devvp;
774			device_vnode = devvp;
775
776		} else if ((mp->mnt_flag & MNT_RDONLY) &&
777		           (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
778		           (device_vnode = mp->mnt_devvp)) {
779			dev_t dev;
780			int maj;
781			/*
782			 * If upgrade to read-write by non-root, then verify
783			 * that user has necessary permissions on the device.
784			 */
785			vnode_getalways(device_vnode);
786
787			if (suser(vfs_context_ucred(ctx), NULL) &&
788			    (error = vnode_authorize(device_vnode, NULL,
789			     KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
790			     ctx)) != 0) {
791				vnode_put(device_vnode);
792				goto out2;
793			}
794
795			/* Tell the device that we're upgrading */
796			dev = (dev_t)device_vnode->v_rdev;
797			maj = major(dev);
798
799			if ((u_int)maj >= (u_int)nblkdev)
800				panic("Volume mounted on a device with invalid major number.");
801
802			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
803			vnode_put(device_vnode);
804			device_vnode = NULLVP;
805			if (error != 0) {
806				goto out2;
807			}
808		}
809	}
810#if CONFIG_MACF
811	if ((flags & MNT_UPDATE) == 0) {
812		mac_mount_label_init(mp);
813		mac_mount_label_associate(ctx, mp);
814	}
815	if (labelstr) {
816		if ((flags & MNT_UPDATE) != 0) {
817			error = mac_mount_check_label_update(ctx, mp);
818			if (error != 0)
819				goto out3;
820		}
821	}
822#endif
823	/*
824	 * Mount the filesystem.
825	 */
826	error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
827
828	if (flags & MNT_UPDATE) {
829		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
830			mp->mnt_flag &= ~MNT_RDONLY;
831		mp->mnt_flag &=~
832		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
833		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
834		if (error)
835			mp->mnt_flag = flag;  /* restore flag value */
836		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
837		lck_rw_done(&mp->mnt_rwlock);
838		is_rwlock_locked = FALSE;
839		if (!error)
840			enablequotas(mp, ctx);
841		goto exit;
842	}
843
844	/*
845	 * Put the new filesystem on the mount list after root.
846	 */
847	if (error == 0) {
848		struct vfs_attr	vfsattr;
849#if CONFIG_MACF
850		if (vfs_flags(mp) & MNT_MULTILABEL) {
851			error = VFS_ROOT(mp, &rvp, ctx);
852			if (error) {
853				printf("%s() VFS_ROOT returned %d\n", __func__, error);
854				goto out3;
855			}
856			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
860			vnode_put(rvp);
861
862			if (error)
863				goto out3;
864		}
865#endif	/* MAC */
866
867		vnode_lock_spin(vp);
868		CLR(vp->v_flag, VMOUNT);
869		vp->v_mountedhere = mp;
870		vnode_unlock(vp);
871
		/*
		 * Taking the name_cache_lock exclusively will
		 * ensure that everyone who might be trying to use a now
		 * stale copy of vp->v_mountedhere->mnt_realrootvp is out
		 * of the fast path.
		 * Bumping mount_generation causes the cached values
		 * to be invalidated.
		 */
880		name_cache_lock();
881		mount_generation++;
882		name_cache_unlock();
883
884		error = vnode_ref(vp);
885		if (error != 0) {
886			goto out4;
887		}
888
889		have_usecount = TRUE;
890
891		error = checkdirs(vp, ctx);
892		if (error != 0)  {
893			/* Unmount the filesystem as cdir/rdirs cannot be updated */
894			goto out4;
895		}
		/*
		 * There is no cleanup code here, so the call is cast to
		 * void; we need to revisit this.
		 */
900		(void)VFS_START(mp, 0, ctx);
901
902		if (mount_list_add(mp) != 0) {
903			/*
904			 * The system is shutting down trying to umount
905			 * everything, so fail with a plausible errno.
906			 */
907			error = EBUSY;
908			goto out4;
909		}
910		lck_rw_done(&mp->mnt_rwlock);
911		is_rwlock_locked = FALSE;
912
913		/* Check if this mounted file system supports EAs or named streams. */
914		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
915		VFSATTR_INIT(&vfsattr);
916		VFSATTR_WANTED(&vfsattr, f_capabilities);
917		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
918		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
919		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
920			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
921			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
922				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
923			}
924#if NAMEDSTREAMS
925			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
926			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
927				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
928			}
929#endif
930			/* Check if this file system supports path from id lookups. */
931			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
932			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
933				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
934			} else if (mp->mnt_flag & MNT_DOVOLFS) {
935				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
936				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
937			}
938		}
939		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
940			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
941		}
942		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
943			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
944		}
945		/* increment the operations count */
946		OSAddAtomic(1, &vfs_nummntops);
947		enablequotas(mp, ctx);
948
949		if (device_vnode) {
950			device_vnode->v_specflags |= SI_MOUNTEDON;
951
952			/*
953			 *   cache the IO attributes for the underlying physical media...
954			 *   an error return indicates the underlying driver doesn't
955			 *   support all the queries necessary... however, reasonable
956			 *   defaults will have been set, so no reason to bail or care
957			 */
958			vfs_init_io_attributes(device_vnode, mp);
959		}
960
961		/* Now that mount is setup, notify the listeners */
962		vfs_notify_mount(pvp);
963	} else {
964		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
965		if (mp->mnt_vnodelist.tqh_first != NULL) {
966			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
967					mp->mnt_vtable->vfc_name, error);
968		}
969
970		vnode_lock_spin(vp);
971		CLR(vp->v_flag, VMOUNT);
972		vnode_unlock(vp);
973		mount_list_lock();
974		mp->mnt_vtable->vfc_refcount--;
975		mount_list_unlock();
976
977		if (device_vnode ) {
978			vnode_rele(device_vnode);
979			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
980		}
981		lck_rw_done(&mp->mnt_rwlock);
982		is_rwlock_locked = FALSE;
983
984		/*
985		 * if we get here, we have a mount structure that needs to be freed,
986		 * but since the coveredvp hasn't yet been updated to point at it,
987		 * no need to worry about other threads holding a crossref on this mp
988		 * so it's ok to just free it
989		 */
990		mount_lock_destroy(mp);
991#if CONFIG_MACF
992		mac_mount_label_destroy(mp);
993#endif
994		FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
995	}
996exit:
997	/*
998	 * drop I/O count on the device vp if there was one
999	 */
1000	if (devpath && devvp)
1001	        vnode_put(devvp);
1002
1003	return(error);
1004
1005/* Error condition exits */
1006out4:
1007	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1008
1009	/*
1010	 * If the mount has been placed on the covered vp,
1011	 * it may have been discovered by now, so we have
1012	 * to treat this just like an unmount
1013	 */
1014	mount_lock_spin(mp);
1015	mp->mnt_lflag |= MNT_LDEAD;
1016	mount_unlock(mp);
1017
1018	if (device_vnode != NULLVP) {
1019		vnode_rele(device_vnode);
1020		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1021                       ctx);
1022		did_rele = TRUE;
1023	}
1024
1025	vnode_lock_spin(vp);
1026
1027	mp->mnt_crossref++;
1028	vp->v_mountedhere = (mount_t) 0;
1029
1030	vnode_unlock(vp);
1031
1032	if (have_usecount) {
1033		vnode_rele(vp);
1034	}
1035out3:
1036	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1037		vnode_rele(devvp);
1038out2:
1039	if (devpath && devvp)
1040	        vnode_put(devvp);
1041out1:
1042	/* Release mnt_rwlock only when it was taken */
1043	if (is_rwlock_locked == TRUE) {
1044		lck_rw_done(&mp->mnt_rwlock);
1045	}
1046
1047	if (mntalloc) {
1048		if (mp->mnt_crossref)
1049			mount_dropcrossref(mp, vp, 0);
1050		else {
1051			mount_lock_destroy(mp);
1052#if CONFIG_MACF
1053			mac_mount_label_destroy(mp);
1054#endif
1055			FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1056		}
1057	}
1058	if (vfsp_ref) {
1059		mount_list_lock();
1060		vfsp->vfc_refcount--;
1061		mount_list_unlock();
1062	}
1063
1064	return(error);
1065}
1066
1067/*
1068 * Flush in-core data, check for competing mount attempts,
1069 * and set VMOUNT
1070 */
1071int
1072prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1073{
1074#if !CONFIG_MACF
1075#pragma unused(cnp,fsname)
1076#endif
1077	struct vnode_attr va;
1078	int error;
1079
1080	if (!skip_auth) {
1081		/*
1082		 * If the user is not root, ensure that they own the directory
1083		 * onto which we are attempting to mount.
1084		 */
1085		VATTR_INIT(&va);
1086		VATTR_WANTED(&va, va_uid);
1087		if ((error = vnode_getattr(vp, &va, ctx)) ||
1088				(va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1089				 (!vfs_context_issuser(ctx)))) {
1090			error = EPERM;
1091			goto out;
1092		}
1093	}
1094
1095	if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1096		goto out;
1097
1098	if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1099		goto out;
1100
1101	if (vp->v_type != VDIR) {
1102		error = ENOTDIR;
1103		goto out;
1104	}
1105
1106	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1107		error = EBUSY;
1108		goto out;
1109	}
1110
1111#if CONFIG_MACF
1112	error = mac_mount_check_mount(ctx, vp,
1113	    cnp, fsname);
1114	if (error != 0)
1115		goto out;
1116#endif
1117
1118	vnode_lock_spin(vp);
1119	SET(vp->v_flag, VMOUNT);
1120	vnode_unlock(vp);
1121
1122out:
1123	return error;
1124}
1125
1126#if CONFIG_IMGSRC_ACCESS
1127
1128#if DEBUG
1129#define IMGSRC_DEBUG(args...) printf(args)
1130#else
1131#define IMGSRC_DEBUG(args...) do { } while(0)
1132#endif
1133
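/*
 * Look up the user-supplied device path, verify that it refers to the
 * same block device that already backs the mount, update f_mntfromname,
 * and check access for non-root callers.  On success the device vnode
 * is returned in *devvpp with an iocount held.
 */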
1134static int
1135authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1136{
1137	struct nameidata nd;
1138	vnode_t vp, realdevvp;
1139	mode_t accessmode;
1140	int error;
1141
1142	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1143	if ( (error = namei(&nd)) ) {
1144		IMGSRC_DEBUG("namei() failed with %d\n", error);
1145		return error;
1146	}
1147
1148	vp = nd.ni_vp;
1149
1150	if (!vnode_isblk(vp)) {
1151		IMGSRC_DEBUG("Not block device.\n");
1152		error = ENOTBLK;
1153		goto out;
1154	}
1155
1156	realdevvp = mp->mnt_devvp;
1157	if (realdevvp == NULLVP) {
1158		IMGSRC_DEBUG("No device backs the mount.\n");
1159		error = ENXIO;
1160		goto out;
1161	}
1162
1163	error = vnode_getwithref(realdevvp);
1164	if (error != 0) {
		IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1166		goto out;
1167	}
1168
1169	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1170		IMGSRC_DEBUG("Wrong dev_t.\n");
1171		error = ENXIO;
1172		goto out1;
1173	}
1174
1175	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1176
1177	/*
1178	 * If mount by non-root, then verify that user has necessary
1179	 * permissions on the device.
1180	 */
1181	if (!vfs_context_issuser(ctx)) {
1182		accessmode = KAUTH_VNODE_READ_DATA;
1183		if ((mp->mnt_flag & MNT_RDONLY) == 0)
1184			accessmode |= KAUTH_VNODE_WRITE_DATA;
1185		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1186			IMGSRC_DEBUG("Access denied.\n");
1187			goto out1;
1188		}
1189	}
1190
1191	*devvpp = vp;
1192
1193out1:
1194	vnode_put(realdevvp);
1195out:
1196	nameidone(&nd);
1197	if (error) {
1198		vnode_put(vp);
1199	}
1200
1201	return error;
1202}
1203
1204/*
1205 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1206 * and call checkdirs()
1207 */
1208static int
1209place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1210{
1211	int error;
1212
1213	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1214
1215	vnode_lock_spin(vp);
1216	CLR(vp->v_flag, VMOUNT);
1217	vp->v_mountedhere = mp;
1218	vnode_unlock(vp);
1219
	/*
	 * Taking the name_cache_lock exclusively will
	 * ensure that everyone who might be trying to use a now
	 * stale copy of vp->v_mountedhere->mnt_realrootvp is out
	 * of the fast path.
	 * Bumping mount_generation causes the cached values
	 * to be invalidated.
	 */
1228	name_cache_lock();
1229	mount_generation++;
1230	name_cache_unlock();
1231
1232	error = vnode_ref(vp);
1233	if (error != 0) {
1234		goto out;
1235	}
1236
1237	error = checkdirs(vp, ctx);
1238	if (error != 0)  {
1239		/* Unmount the filesystem as cdir/rdirs cannot be updated */
1240		vnode_rele(vp);
1241		goto out;
1242	}
1243
1244out:
1245	if (error != 0) {
1246		mp->mnt_vnodecovered = NULLVP;
1247	}
1248	return error;
1249}
1250
1251static void
1252undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1253{
1254	vnode_rele(vp);
1255	vnode_lock_spin(vp);
1256	vp->v_mountedhere = (mount_t)NULL;
1257	vnode_unlock(vp);
1258
1259	mp->mnt_vnodecovered = NULLVP;
1260}
1261
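/*
 * Take the mount rwlock exclusively and authorize an update of the
 * mount: fail if an unmount is in progress, if MNT_RELOAD is requested
 * on a read-write mount, or if the caller is neither root nor the
 * original mounter.  On failure the rwlock is not held on return.
 */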
1262static int
1263mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1264{
1265	int error;
1266
	/* if an unmount is in progress, return an error */
1268	mount_lock_spin(mp);
1269	if (mp->mnt_lflag & MNT_LUNMOUNT) {
1270		mount_unlock(mp);
1271		return EBUSY;
1272	}
1273	mount_unlock(mp);
1274	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1275
1276	/*
1277	 * We only allow the filesystem to be reloaded if it
1278	 * is currently mounted read-only.
1279	 */
1280	if ((flags & MNT_RELOAD) &&
1281			((mp->mnt_flag & MNT_RDONLY) == 0)) {
1282		error = ENOTSUP;
1283		goto out;
1284	}
1285
1286	/*
1287	 * Only root, or the user that did the original mount is
1288	 * permitted to update it.
1289	 */
1290	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1291			(!vfs_context_issuser(ctx))) {
1292		error = EPERM;
1293		goto out;
1294	}
1295#if CONFIG_MACF
1296	error = mac_mount_check_remount(ctx, mp);
1297	if (error != 0) {
1298		goto out;
1299	}
1300#endif
1301
1302out:
1303	if (error) {
1304		lck_rw_done(&mp->mnt_rwlock);
1305	}
1306
1307	return error;
1308}
1309
1310static void
1311mount_end_update(mount_t mp)
1312{
1313	lck_rw_done(&mp->mnt_rwlock);
1314}
1315
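/*
 * Return the saved imageboot root vnode for the given nesting height,
 * with an iocount held; fail if the height is invalid or the vnode is
 * unavailable.
 */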
1316static int
1317get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1318{
1319	vnode_t vp;
1320
1321	if (height >= MAX_IMAGEBOOT_NESTING) {
1322		return EINVAL;
1323	}
1324
1325	vp = imgsrc_rootvnodes[height];
1326	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1327		*rvpp = vp;
1328		return 0;
1329	} else {
1330		return ENOENT;
1331	}
1332}
1333
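/*
 * Move an imageboot source file system onto the covered vnode supplied
 * by the caller: validate the caller and the backing device, place the
 * existing mount on top of the new covered vnode, update the mount-on
 * name, and add the mount to the mount list.  A mount may only be
 * moved once (MNTK_HAS_MOVED).
 */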
1334static int
1335relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1336		const char *fsname, vfs_context_t ctx,
1337		boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1338{
1339	int error;
1340	mount_t mp;
1341	boolean_t placed = FALSE;
1342	vnode_t devvp = NULLVP;
1343	struct vfstable *vfsp;
1344	user_addr_t devpath;
1345	char *old_mntonname;
1346	vnode_t rvp;
1347	uint32_t height;
1348	uint32_t flags;
1349
1350	/* If we didn't imageboot, nothing to move */
1351	if (imgsrc_rootvnodes[0] == NULLVP) {
1352		return EINVAL;
1353	}
1354
1355	/* Only root can do this */
1356	if (!vfs_context_issuser(ctx)) {
1357		return EPERM;
1358	}
1359
1360	IMGSRC_DEBUG("looking for root vnode.\n");
1361
1362	/*
1363	 * Get root vnode of filesystem we're moving.
1364	 */
1365	if (by_index) {
1366		if (is64bit) {
1367			struct user64_mnt_imgsrc_args mia64;
1368			error = copyin(fsmountargs, &mia64, sizeof(mia64));
1369			if (error != 0) {
1370				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1371				return error;
1372			}
1373
1374			height = mia64.mi_height;
1375			flags = mia64.mi_flags;
1376			devpath = mia64.mi_devpath;
1377		} else {
1378			struct user32_mnt_imgsrc_args mia32;
1379			error = copyin(fsmountargs, &mia32, sizeof(mia32));
1380			if (error != 0) {
1381				IMGSRC_DEBUG("Failed to copy in arguments.\n");
1382				return error;
1383			}
1384
1385			height = mia32.mi_height;
1386			flags = mia32.mi_flags;
1387			devpath = mia32.mi_devpath;
1388		}
1389	} else {
1390		/*
1391		 * For binary compatibility--assumes one level of nesting.
1392		 */
1393		if (is64bit) {
1394			if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1395				return error;
1396		} else {
1397			user32_addr_t tmp;
1398			if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1399				return error;
1400
1401			/* munge into LP64 addr */
1402			devpath = CAST_USER_ADDR_T(tmp);
1403		}
1404
1405		height = 0;
1406		flags = 0;
1407	}
1408
1409	if (flags != 0) {
1410		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1411		return EINVAL;
1412	}
1413
1414	error = get_imgsrc_rootvnode(height, &rvp);
1415	if (error != 0) {
1416		IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1417		return error;
1418	}
1419
1420	IMGSRC_DEBUG("got root vnode.\n");
1421
1422	MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1423
1424	/* Can only move once */
1425	mp = vnode_mount(rvp);
1426	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1427		IMGSRC_DEBUG("Already moved.\n");
1428		error = EBUSY;
1429		goto out0;
1430	}
1431
	IMGSRC_DEBUG("Starting update.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting update failed with %d\n", error);
1438		goto out0;
1439	}
1440
1441	/*
1442	 * It can only be moved once.  Flag is set under the rwlock,
1443	 * so we're now safe to proceed.
1444	 */
1445	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1446		IMGSRC_DEBUG("Already moved [2]\n");
1447		goto out1;
1448	}
1449
1450
1451	IMGSRC_DEBUG("Preparing coveredvp.\n");
1452
1453	/* Mark covered vnode as mount in progress, authorize placing mount on top */
1454	error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1455	if (error != 0) {
1456		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1457		goto out1;
1458	}
1459
1460	IMGSRC_DEBUG("Covered vp OK.\n");
1461
	/* Sanity-check the name the caller has provided */
1463	vfsp = mp->mnt_vtable;
1464	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1465		IMGSRC_DEBUG("Wrong fs name.\n");
1466		error = EINVAL;
1467		goto out2;
1468	}
1469
1470	/* Check the device vnode and update mount-from name, for local filesystems */
1471	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1472		IMGSRC_DEBUG("Local, doing device validation.\n");
1473
1474		if (devpath != USER_ADDR_NULL) {
1475			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1476			if (error) {
1477				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1478				goto out2;
1479			}
1480
1481			vnode_put(devvp);
1482		}
1483	}
1484
1485	/*
1486	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1487	 * and increment the name cache's mount generation
1488	 */
1489
1490	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1491	error = place_mount_and_checkdirs(mp, vp, ctx);
1492	if (error != 0) {
1493		goto out2;
1494	}
1495
1496	placed = TRUE;
1497
1498	strncpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1499	strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1500
1501	/* Forbid future moves */
1502	mount_lock(mp);
1503	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1504	mount_unlock(mp);
1505
1506	/* Finally, add to mount list, completely ready to go */
1507	if (mount_list_add(mp) != 0) {
1508		/*
1509		 * The system is shutting down trying to umount
1510		 * everything, so fail with a plausible errno.
1511		 */
1512		error = EBUSY;
1513		goto out3;
1514	}
1515
1516	mount_end_update(mp);
1517	vnode_put(rvp);
1518	FREE(old_mntonname, M_TEMP);
1519
1520	vfs_notify_mount(pvp);
1521
1522	return 0;
1523out3:
1524	strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1525
1526	mount_lock(mp);
1527	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1528	mount_unlock(mp);
1529
1530out2:
1531	/*
1532	 * Placing the mp on the vnode clears VMOUNT,
1533	 * so cleanup is different after that point
1534	 */
1535	if (placed) {
1536		/* Rele the vp, clear VMOUNT and v_mountedhere */
1537		undo_place_on_covered_vp(mp, vp);
1538	} else {
1539		vnode_lock_spin(vp);
1540		CLR(vp->v_flag, VMOUNT);
1541		vnode_unlock(vp);
1542	}
1543out1:
1544	mount_end_update(mp);
1545
1546out0:
1547	vnode_put(rvp);
1548	FREE(old_mntonname, M_TEMP);
1549	return error;
1550}
1551
1552#endif /* CONFIG_IMGSRC_ACCESS */
1553
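/*
 * Enable file system disk quotas on an HFS mount if the per-type quota
 * option files are present under the mount point.  Errors are ignored,
 * as they should not interfere with the mount itself.
 */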
1554void
1555enablequotas(struct mount *mp, vfs_context_t ctx)
1556{
1557	struct nameidata qnd;
1558	int type;
1559	char qfpath[MAXPATHLEN];
1560	const char *qfname = QUOTAFILENAME;
1561	const char *qfopsname = QUOTAOPSNAME;
1562	const char *qfextension[] = INITQFNAMES;
1563
	/* XXX Should be an MNTK_ flag, instead of strncmp() calls */
1565	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1566		return;
1567	}
1568	/*
1569	 * Enable filesystem disk quotas if necessary.
1570	 * We ignore errors as this should not interfere with final mount
1571	 */
1572	for (type=0; type < MAXQUOTAS; type++) {
1573		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1574		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1575		       CAST_USER_ADDR_T(qfpath), ctx);
1576		if (namei(&qnd) != 0)
1577			continue; 	    /* option file to trigger quotas is not present */
1578		vnode_put(qnd.ni_vp);
1579		nameidone(&qnd);
1580		snprintf(qfpath, sizeof(qfpath),  "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1581
1582		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1583	}
1584	return;
1585}
1586
1587
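/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the newly covered vnode, replace it with the root
 * of the file system just mounted on top of it.
 */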
1588static int
1589checkdirs_callback(proc_t p, void * arg)
1590{
1591	struct cdirargs * cdrp = (struct cdirargs * )arg;
1592	vnode_t olddp = cdrp->olddp;
1593	vnode_t newdp = cdrp->newdp;
1594	struct filedesc *fdp;
1595	vnode_t tvp;
1596	vnode_t fdp_cvp;
1597	vnode_t fdp_rvp;
1598	int cdir_changed = 0;
1599	int rdir_changed = 0;
1600
1601	/*
1602	 * XXX Also needs to iterate each thread in the process to see if it
1603	 * XXX is using a per-thread current working directory, and, if so,
1604	 * XXX update that as well.
1605	 */
1606
1607	proc_fdlock(p);
1608	fdp = p->p_fd;
1609	if (fdp == (struct filedesc *)0) {
1610		proc_fdunlock(p);
1611		return(PROC_RETURNED);
1612	}
1613	fdp_cvp = fdp->fd_cdir;
1614	fdp_rvp = fdp->fd_rdir;
1615	proc_fdunlock(p);
1616
1617	if (fdp_cvp == olddp) {
1618		vnode_ref(newdp);
1619		tvp = fdp->fd_cdir;
1620		fdp_cvp = newdp;
1621		cdir_changed = 1;
1622		vnode_rele(tvp);
1623	}
1624	if (fdp_rvp == olddp) {
1625		vnode_ref(newdp);
1626		tvp = fdp->fd_rdir;
1627		fdp_rvp = newdp;
1628		rdir_changed = 1;
1629		vnode_rele(tvp);
1630	}
1631	if (cdir_changed || rdir_changed) {
1632		proc_fdlock(p);
1633		fdp->fd_cdir = fdp_cvp;
1634		fdp->fd_rdir = fdp_rvp;
1635		proc_fdunlock(p);
1636	}
1637	return(PROC_RETURNED);
1638}
1639
1640
1641
1642/*
1643 * Scan all active processes to see if any of them have a current
1644 * or root directory onto which the new filesystem has just been
1645 * mounted. If so, replace them with the new mount point.
1646 */
1647static int
1648checkdirs(vnode_t olddp, vfs_context_t ctx)
1649{
1650	vnode_t newdp;
1651	vnode_t tvp;
1652	int err;
1653	struct cdirargs cdr;
1654	struct uthread * uth = get_bsdthread_info(current_thread());
1655
1656	if (olddp->v_usecount == 1)
1657		return(0);
1658	if (uth != (struct uthread *)0)
1659		uth->uu_notrigger = 1;
1660	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1661	if (uth != (struct uthread *)0)
1662		uth->uu_notrigger = 0;
1663
1664	if (err != 0) {
1665#if DIAGNOSTIC
1666		panic("mount: lost mount: error %d", err);
1667#endif
1668		return(err);
1669	}
1670
1671	cdr.olddp = olddp;
1672	cdr.newdp = newdp;
1673	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1674	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1675
1676	if (rootvnode == olddp) {
1677		vnode_ref(newdp);
1678		tvp = rootvnode;
1679		rootvnode = newdp;
1680		vnode_rele(tvp);
1681	}
1682
1683	vnode_put(newdp);
1684	return(0);
1685}
1686
1687/*
1688 * Unmount a file system.
1689 *
 * Note: unmount takes a path to the vnode mounted on as its argument,
 * not the special file (as it did historically).
1692 */
1693/* ARGSUSED */
1694int
1695unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1696{
1697	vnode_t vp;
1698	struct mount *mp;
1699	int error;
1700	struct nameidata nd;
1701	vfs_context_t ctx = vfs_context_current();
1702
1703	NDINIT(&nd, LOOKUP, OP_UNMOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1,
1704		UIO_USERSPACE, uap->path, ctx);
1705	error = namei(&nd);
1706	if (error)
1707		return (error);
1708	vp = nd.ni_vp;
1709	mp = vp->v_mount;
1710	nameidone(&nd);
1711
1712#if CONFIG_MACF
1713	error = mac_mount_check_umount(ctx, mp);
1714	if (error != 0) {
1715		vnode_put(vp);
1716		return (error);
1717	}
1718#endif
1719	/*
1720	 * Must be the root of the filesystem
1721	 */
1722	if ((vp->v_flag & VROOT) == 0) {
1723		vnode_put(vp);
1724		return (EINVAL);
1725	}
1726	mount_ref(mp, 0);
1727	vnode_put(vp);
1728	/* safedounmount consumes the mount ref */
1729	return (safedounmount(mp, uap->flags, ctx));
1730}
1731
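/*
 * Unmount the file system identified by fsid.  Takes a mount ref,
 * which safedounmount() consumes.
 */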
1732int
1733vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx)
1734{
1735	mount_t mp;
1736
1737	mp = mount_list_lookupby_fsid(fsid, 0, 1);
1738	if (mp == (mount_t)0) {
1739		return(ENOENT);
1740	}
1741	mount_ref(mp, 0);
1742	mount_iterdrop(mp);
1743	/* safedounmount consumes the mount ref */
1744	return(safedounmount(mp, flags, ctx));
1745}
1746
1747
/*
 * Do the actual file system unmount and prevent some common foot-shooting.
 * The mount struct comes with a mount ref, which will be consumed.
 */
1752int
1753safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1754{
1755	int error;
1756	proc_t p = vfs_context_proc(ctx);
1757
	/*
	 * If the file system is not responding, MNT_NOBLOCK is set, and
	 * this is not a forced unmount, then return EBUSY.
	 */
1762	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1763		(flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1764		error = EBUSY;
1765		goto out;
1766	}
1767
1768	/*
1769	 * Skip authorization if the mount is tagged as permissive and
1770	 * this is not a forced-unmount attempt.
1771	 */
1772	if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1773		/*
1774		 * Only root, or the user that did the original mount is
1775		 * permitted to unmount this filesystem.
1776		 */
1777		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1778				(error = suser(kauth_cred_get(), &p->p_acflag)))
1779			goto out;
1780	}
1781	/*
1782	 * Don't allow unmounting the root file system.
1783	 */
1784	if (mp->mnt_flag & MNT_ROOTFS) {
1785		error = EBUSY; /* the root is always busy */
1786		goto out;
1787	}
1788
1789#ifdef CONFIG_IMGSRC_ACCESS
1790	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1791		error = EBUSY;
1792		goto out;
1793	}
1794#endif /* CONFIG_IMGSRC_ACCESS */
1795
1796	return (dounmount(mp, flags, 1, ctx));
1797
1798out:
1799	mount_drop(mp, 0);
1800	return(error);
1801}
1802
1803/*
1804 * Do the actual file system unmount.
1805 */
1806int
1807dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1808{
1809	vnode_t coveredvp = (vnode_t)0;
1810	int error;
1811	int needwakeup = 0;
1812	int forcedunmount = 0;
1813	int lflags = 0;
1814	struct vnode *devvp = NULLVP;
1815#if CONFIG_TRIGGERS
1816	proc_t p = vfs_context_proc(ctx);
1817	int did_vflush = 0;
1818	int pflags_save = 0;
1819#endif /* CONFIG_TRIGGERS */
1820
1821	if (flags & MNT_FORCE)
1822		forcedunmount = 1;
1823
1824	mount_lock(mp);
1825	/* XXX post jaguar fix LK_DRAIN - then clean this up */
1826	if ((flags & MNT_FORCE)) {
1827		mp->mnt_kern_flag |= MNTK_FRCUNMOUNT;
1828		mp->mnt_lflag |= MNT_LFORCE;
1829	}
1830	if (mp->mnt_lflag & MNT_LUNMOUNT) {
1831		mp->mnt_lflag |= MNT_LWAIT;
1832		if(withref != 0)
1833			mount_drop(mp, 1);
1834		msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "dounmount", NULL);
1835		/*
1836		 * The prior unmount attempt has probably succeeded.
1837		 * Do not dereference mp here - returning EBUSY is safest.
1838		 */
1839		return (EBUSY);
1840	}
1841
1842#if CONFIG_TRIGGERS
1843	if (flags & MNT_NOBLOCK && p != kernproc)
1844		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1845#endif
1846
1847	mp->mnt_kern_flag |= MNTK_UNMOUNT;
1848	mp->mnt_lflag |= MNT_LUNMOUNT;
1849	mp->mnt_flag &=~ MNT_ASYNC;
1850	/*
1851	 * anyone currently in the fast path that
1852	 * trips over the cached rootvp will be
1853	 * dumped out and forced into the slow path
1854	 * to regenerate a new cached value
1855	 */
1856	mp->mnt_realrootvp = NULLVP;
1857	mount_unlock(mp);
1858
	/*
	 * Taking the name_cache_lock exclusively will
	 * ensure that everyone who might be trying to use a now
	 * stale copy of vp->v_mountedhere->mnt_realrootvp is out
	 * of the fast path.
	 * Bumping mount_generation causes the cached values
	 * to be invalidated.
	 */
1867	name_cache_lock();
1868	mount_generation++;
1869	name_cache_unlock();
1870
1871
1872	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1873	if (withref != 0)
1874		mount_drop(mp, 0);
1875#if CONFIG_FSE
1876	fsevent_unmount(mp);  /* has to come first! */
1877#endif
1878	error = 0;
1879	if (forcedunmount == 0) {
1880		ubc_umount(mp);	/* release cached vnodes */
1881		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1882			error = VFS_SYNC(mp, MNT_WAIT, ctx);
1883			if (error) {
1884				mount_lock(mp);
1885				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1886				mp->mnt_lflag &= ~MNT_LUNMOUNT;
1887				mp->mnt_lflag &= ~MNT_LFORCE;
1888				goto out;
1889			}
1890		}
1891	}
1892
1893#if CONFIG_TRIGGERS
1894	vfs_nested_trigger_unmounts(mp, flags, ctx);
1895	did_vflush = 1;
1896#endif
1897	if (forcedunmount)
1898		lflags |= FORCECLOSE;
1899	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
1900	if ((forcedunmount == 0) && error) {
1901		mount_lock(mp);
1902		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1903		mp->mnt_lflag &= ~MNT_LUNMOUNT;
1904		mp->mnt_lflag &= ~MNT_LFORCE;
1905		goto out;
1906	}
1907
	/* make sure no one is in the mount iterations or lookups */
1909	mount_iterdrain(mp);
1910
1911	error = VFS_UNMOUNT(mp, flags, ctx);
1912	if (error) {
1913		mount_iterreset(mp);
1914		mount_lock(mp);
1915		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1916		mp->mnt_lflag &= ~MNT_LUNMOUNT;
1917		mp->mnt_lflag &= ~MNT_LFORCE;
1918		goto out;
1919	}
1920
1921	/* increment the operations count */
1922	if (!error)
1923		OSAddAtomic(1, &vfs_nummntops);
1924
1925	if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1926		/* hold an io reference and drop the usecount before close */
1927		devvp = mp->mnt_devvp;
1928		vnode_getalways(devvp);
1929		vnode_rele(devvp);
1930		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1931                       ctx);
1932		vnode_clearmountedon(devvp);
1933		vnode_put(devvp);
1934	}
1935	lck_rw_done(&mp->mnt_rwlock);
1936	mount_list_remove(mp);
1937	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1938
1939	/* clear the mounted-here hook in the covered vp, but do not drop its ref yet */
1940	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
1941		vnode_getwithref(coveredvp);
1942		vnode_lock_spin(coveredvp);
1943
1944		mp->mnt_crossref++;
1945		coveredvp->v_mountedhere = (struct mount *)0;
1946
1947		vnode_unlock(coveredvp);
1948		vnode_put(coveredvp);
1949	}
1950
1951	mount_list_lock();
1952	mp->mnt_vtable->vfc_refcount--;
1953	mount_list_unlock();
1954
1955	cache_purgevfs(mp);	/* remove cache entries for this file sys */
1956	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
1957	mount_lock(mp);
1958	mp->mnt_lflag |= MNT_LDEAD;
1959
1960	if (mp->mnt_lflag & MNT_LWAIT) {
1961	        /*
1962		 * do the wakeup here
1963		 * in case we block in mount_refdrain
1964		 * which will drop the mount lock
1965		 * and allow anyone blocked in vfs_busy
1966		 * to wake up and see the LDEAD state
1967		 */
1968		mp->mnt_lflag &= ~MNT_LWAIT;
1969		wakeup((caddr_t)mp);
1970	}
1971	mount_refdrain(mp);
1972out:
1973	if (mp->mnt_lflag & MNT_LWAIT) {
1974		mp->mnt_lflag &= ~MNT_LWAIT;
1975		needwakeup = 1;
1976	}
1977
1978#if CONFIG_TRIGGERS
1979	if (flags & MNT_NOBLOCK && p != kernproc) {
1980	 	// Restore P_NOREMOTEHANG bit to its previous value
1981		if ((pflags_save & P_NOREMOTEHANG) == 0)
1982			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
1983	}
1984
1985	/*
1986	 * Callback and context are set together under the mount lock, and
1987	 * never cleared, so we're safe to examine them here, drop the lock,
1988	 * and call out.
1989	 */
1990	if (mp->mnt_triggercallback != NULL) {
1991		mount_unlock(mp);
1992		if (error == 0) {
1993			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
1994		} else if (did_vflush) {
1995			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
1996		}
1997	} else {
1998		mount_unlock(mp);
1999	}
2000#else
2001	mount_unlock(mp);
2002#endif /* CONFIG_TRIGGERS */
2003
2004	lck_rw_done(&mp->mnt_rwlock);
2005
2006	if (needwakeup)
2007		wakeup((caddr_t)mp);
2008
2009	if (!error) {
2010		if ((coveredvp != NULLVP)) {
2011			vnode_t pvp;
2012
2013			vnode_getwithref(coveredvp);
2014			pvp = vnode_getparent(coveredvp);
2015			vnode_rele(coveredvp);
2016
2017			mount_dropcrossref(mp, coveredvp, 0);
2018#if CONFIG_TRIGGERS
2019			if (coveredvp->v_resolve)
2020				vnode_trigger_rearm(coveredvp, ctx);
2021#endif
2022			vnode_put(coveredvp);
2023
2024			if (pvp) {
2025				lock_vnode_and_post(pvp, NOTE_WRITE);
2026				vnode_put(pvp);
2027			}
2028		} else if (mp->mnt_flag & MNT_ROOTFS) {
2029				mount_lock_destroy(mp);
2030#if CONFIG_MACF
2031				mac_mount_label_destroy(mp);
2032#endif
2033				FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2034		} else
2035			panic("dounmount: no coveredvp");
2036	}
2037	return (error);
2038}
2039
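/*
 * Drop one cross reference on a mount that is being unmounted.  Whoever
 * drops the last reference, once the covered vnode no longer points at this
 * mount, is responsible for tearing down the mount structure itself.
 */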
2040void
2041mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2042{
2043	vnode_lock(dp);
2044	mp->mnt_crossref--;
2045
2046	if (mp->mnt_crossref < 0)
2047		panic("mount cross refs -ve");
2048
2049	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2050
2051		if (need_put)
2052			vnode_put_locked(dp);
2053		vnode_unlock(dp);
2054
2055		mount_lock_destroy(mp);
2056#if CONFIG_MACF
2057		mac_mount_label_destroy(mp);
2058#endif
2059		FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2060		return;
2061	}
2062	if (need_put)
2063		vnode_put_locked(dp);
2064	vnode_unlock(dp);
2065}
2066
2067
2068/*
2069 * Sync each mounted filesystem.
2070 */
2071#if DIAGNOSTIC
2072int syncprt = 0;
2073struct ctldebug debug0 = { "syncprt", &syncprt };
2074#endif
2075
2076int print_vmpage_stat=0;
2077
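/*
 * vfs_iterate() callback for sync(2): for each writable mount, temporarily
 * clear MNT_ASYNC so VFS_SYNC is issued with the requested wait mode, then
 * restore the flag.
 */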
2078static int
2079sync_callback(mount_t mp, void * arg)
2080{
2081	int asyncflag;
2082
2083	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2084			asyncflag = mp->mnt_flag & MNT_ASYNC;
2085			mp->mnt_flag &= ~MNT_ASYNC;
2086			VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_current());
2087			if (asyncflag)
2088				mp->mnt_flag |= MNT_ASYNC;
2089	}
2090	return(VFS_RETURNED);
2091}
2092
2093
2094/* ARGSUSED */
2095int
2096sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2097{
2098	vfs_iterate(LK_NOWAIT, sync_callback, (void *)0);
2099
2100	if (print_vmpage_stat) {
2101		vm_countdirtypages();
2102	}
2103
2104#if DIAGNOSTIC
2105	if (syncprt)
2106		vfs_bufstats();
2107#endif /* DIAGNOSTIC */
2108	return (0);
2109}
2110
2111/*
2112 * Change filesystem quotas.
2113 */
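/*
 * Illustrative user-space usage (not part of this file; the call shape is
 * assumed from the quotactl(2) convention, with cmd packed via QCMD()):
 *
 *	struct dqblk dqb;
 *	if (quotactl("/Volumes/Data", QCMD(Q_GETQUOTA, USRQUOTA),
 *	    getuid(), (caddr_t)&dqb) == 0) {
 *		// dqb now holds the current usage and limits for this uid
 *	}
 */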
2114#if QUOTA
2115static int quotactl_funneled(proc_t p, struct quotactl_args *uap, int32_t *retval);
2116
2117int
2118quotactl(proc_t p, struct quotactl_args *uap, int32_t *retval)
2119{
2120	boolean_t funnel_state;
2121	int error;
2122
2123	funnel_state = thread_funnel_set(kernel_flock, TRUE);
2124	error = quotactl_funneled(p, uap, retval);
2125	thread_funnel_set(kernel_flock, funnel_state);
2126	return(error);
2127}
2128
2129static int
2130quotactl_funneled(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2131{
2132	struct mount *mp;
2133	int error, quota_cmd, quota_status;
2134	caddr_t datap;
2135	size_t fnamelen;
2136	struct nameidata nd;
2137	vfs_context_t ctx = vfs_context_current();
2138	struct dqblk my_dqblk;
2139
2140	AUDIT_ARG(uid, uap->uid);
2141	AUDIT_ARG(cmd, uap->cmd);
2142	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2143	       uap->path, ctx);
2144	error = namei(&nd);
2145	if (error)
2146		return (error);
2147	mp = nd.ni_vp->v_mount;
2148	vnode_put(nd.ni_vp);
2149	nameidone(&nd);
2150
2151	/* copyin any data we will need for downstream code */
2152	quota_cmd = uap->cmd >> SUBCMDSHIFT;
2153
2154	switch (quota_cmd) {
2155	case Q_QUOTAON:
2156		/* uap->arg specifies a file from which to take the quotas */
2157		fnamelen = MAXPATHLEN;
2158		datap = kalloc(MAXPATHLEN);
2159		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2160		break;
2161	case Q_GETQUOTA:
2162		/* uap->arg is a pointer to a dqblk structure. */
2163		datap = (caddr_t) &my_dqblk;
2164		break;
2165	case Q_SETQUOTA:
2166	case Q_SETUSE:
2167		/* uap->arg is a pointer to a dqblk structure. */
2168		datap = (caddr_t) &my_dqblk;
2169		if (proc_is64bit(p)) {
2170			struct user_dqblk	my_dqblk64;
2171			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2172			if (error == 0) {
2173				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2174			}
2175		}
2176		else {
2177			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2178		}
2179		break;
2180	case Q_QUOTASTAT:
2181		/* uap->arg is a pointer to an integer */
2182		datap = (caddr_t) &quota_status;
2183		break;
2184	default:
2185		datap = NULL;
2186		break;
2187	} /* switch */
2188
2189	if (error == 0) {
2190		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2191	}
2192
2193	switch (quota_cmd) {
2194	case Q_QUOTAON:
2195		if (datap != NULL)
2196			kfree(datap, MAXPATHLEN);
2197		break;
2198	case Q_GETQUOTA:
2199		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
2200		if (error == 0) {
2201			if (proc_is64bit(p)) {
2202				struct user_dqblk	my_dqblk64;
2203				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2204				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2205			}
2206			else {
2207				error = copyout(datap, uap->arg, sizeof (struct dqblk));
2208			}
2209		}
2210		break;
2211	case Q_QUOTASTAT:
2212		/* uap->arg is a pointer to an integer */
2213		if (error == 0) {
2214			error = copyout(datap, uap->arg, sizeof(quota_status));
2215		}
2216		break;
2217	default:
2218		break;
2219	} /* switch */
2220
2221	return (error);
2222}
2223#else
2224int
2225quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2226{
2227	return (EOPNOTSUPP);
2228}
2229#endif /* QUOTA */
2230
2231/*
2232 * Get filesystem statistics.
2233 *
2234 * Returns:	0			Success
2235 *	namei:???
2236 *	vfs_update_vfsstat:???
2237 *	munge_statfs:EFAULT
2238 */
2239/* ARGSUSED */
2240int
2241statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2242{
2243	struct mount *mp;
2244	struct vfsstatfs *sp;
2245	int error;
2246	struct nameidata nd;
2247	vfs_context_t ctx = vfs_context_current();
2248	vnode_t vp;
2249
2250	NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1,
2251		UIO_USERSPACE, uap->path, ctx);
2252	error = namei(&nd);
2253	if (error)
2254		return (error);
2255	vp = nd.ni_vp;
2256	mp = vp->v_mount;
2257	sp = &mp->mnt_vfsstat;
2258	nameidone(&nd);
2259
2260	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2261	if (error != 0) {
2262		vnode_put(vp);
2263		return (error);
2264	}
2265
2266	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2267	vnode_put(vp);
2268	return (error);
2269}
2270
2271/*
2272 * Get filesystem statistics.
2273 */
2274/* ARGSUSED */
2275int
2276fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2277{
2278	vnode_t vp;
2279	struct mount *mp;
2280	struct vfsstatfs *sp;
2281	int error;
2282
2283	AUDIT_ARG(fd, uap->fd);
2284
2285	if ( (error = file_vnode(uap->fd, &vp)) )
2286		return (error);
2287
2288	error = vnode_getwithref(vp);
2289	if (error) {
2290		file_drop(uap->fd);
2291		return (error);
2292	}
2293
2294	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2295
2296	mp = vp->v_mount;
2297	if (!mp) {
2298		error = EBADF;
2299		goto out;
2300	}
2301	sp = &mp->mnt_vfsstat;
2302	if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) {
2303		goto out;
2304	}
2305
2306	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2307
2308out:
2309	file_drop(uap->fd);
2310	vnode_put(vp);
2311
2312	return (error);
2313}
2314
2315/*
2316 * Common routine to handle copying of statfs64 data to user space
2317 */
2318static int
2319statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2320{
2321	int error;
2322	struct statfs64 sfs;
2323
2324	bzero(&sfs, sizeof(sfs));
2325
2326	sfs.f_bsize = sfsp->f_bsize;
2327	sfs.f_iosize = (int32_t)sfsp->f_iosize;
2328	sfs.f_blocks = sfsp->f_blocks;
2329	sfs.f_bfree = sfsp->f_bfree;
2330	sfs.f_bavail = sfsp->f_bavail;
2331	sfs.f_files = sfsp->f_files;
2332	sfs.f_ffree = sfsp->f_ffree;
2333	sfs.f_fsid = sfsp->f_fsid;
2334	sfs.f_owner = sfsp->f_owner;
2335	sfs.f_type = mp->mnt_vtable->vfc_typenum;
2336	sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2337	sfs.f_fssubtype = sfsp->f_fssubtype;
2338	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2339		strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2340	} else {
2341		strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2342	}
2343	strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2344	strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2345
2346	error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2347
2348	return(error);
2349}
2350
2351/*
2352 * Get file system statistics in 64-bit mode
2353 */
2354int
2355statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2356{
2357	struct mount *mp;
2358	struct vfsstatfs *sp;
2359	int error;
2360	struct nameidata nd;
2361	vfs_context_t ctxp = vfs_context_current();
2362	vnode_t vp;
2363
2364	NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1,
2365		UIO_USERSPACE, uap->path, ctxp);
2366	error = namei(&nd);
2367	if (error)
2368		return (error);
2369	vp = nd.ni_vp;
2370	mp = vp->v_mount;
2371	sp = &mp->mnt_vfsstat;
2372	nameidone(&nd);
2373
2374	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2375	if (error != 0) {
2376		vnode_put(vp);
2377		return (error);
2378	}
2379
2380	error = statfs64_common(mp, sp, uap->buf);
2381	vnode_put(vp);
2382
2383	return (error);
2384}
2385
2386/*
2387 * Get file system statistics in 64-bit mode
2388 */
2389int
2390fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2391{
2392	struct vnode *vp;
2393	struct mount *mp;
2394	struct vfsstatfs *sp;
2395	int error;
2396
2397	AUDIT_ARG(fd, uap->fd);
2398
2399	if ( (error = file_vnode(uap->fd, &vp)) )
2400		return (error);
2401
2402	error = vnode_getwithref(vp);
2403	if (error) {
2404		file_drop(uap->fd);
2405		return (error);
2406	}
2407
2408	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2409
2410	mp = vp->v_mount;
2411	if (!mp) {
2412		error = EBADF;
2413		goto out;
2414	}
2415	sp = &mp->mnt_vfsstat;
2416	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2417		goto out;
2418	}
2419
2420	error = statfs64_common(mp, sp, uap->buf);
2421
2422out:
2423	file_drop(uap->fd);
2424	vnode_put(vp);
2425
2426	return (error);
2427}
2428
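/*
 * State threaded through the getfsstat()/__mac_getfsstat() iteration
 * callbacks: a cursor into the user statfs buffer, an optional cursor into
 * the user-supplied MAC label array, running and maximum entry counts, the
 * caller's wait flags, and the first error encountered.
 */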
2429struct getfsstat_struct {
2430	user_addr_t	sfsp;
2431	user_addr_t	*mp;
2432	int		count;
2433	int		maxcount;
2434	int		flags;
2435	int		error;
2436};
2437
2438
2439static int
2440getfsstat_callback(mount_t mp, void * arg)
2441{
2442
2443	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2444	struct vfsstatfs *sp;
2445	int error, my_size;
2446	vfs_context_t ctx = vfs_context_current();
2447
2448	if (fstp->sfsp && fstp->count < fstp->maxcount) {
2449		sp = &mp->mnt_vfsstat;
2450		/*
2451		 * If MNT_NOWAIT is specified, do not refresh the
2452		 * fsstat cache. MNT_WAIT/MNT_DWAIT override MNT_NOWAIT.
2453		 */
2454		if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2455			(error = vfs_update_vfsstat(mp, ctx,
2456			    VFS_USER_EVENT))) {
2457			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2458			return(VFS_RETURNED);
2459		}
2460
2461		/*
2462		 * Need to handle LP64 version of struct statfs
2463		 */
2464		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2465		if (error) {
2466			fstp->error = error;
2467			return(VFS_RETURNED_DONE);
2468		}
2469		fstp->sfsp += my_size;
2470
2471		if (fstp->mp) {
2472#if CONFIG_MACF
2473			error = mac_mount_label_get(mp, *fstp->mp);
2474			if (error) {
2475				fstp->error = error;
2476				return(VFS_RETURNED_DONE);
2477			}
2478#endif
2479			fstp->mp++;
2480		}
2481	}
2482	fstp->count++;
2483	return(VFS_RETURNED);
2484}
2485
2486/*
2487 * Get statistics on all filesystems.
2488 */
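/*
 * Illustrative user-space usage (not part of this file): by the getfsstat(2)
 * convention a NULL buffer returns only the number of mounted filesystems,
 * which can then be used to size the real buffer:
 *
 *	int n = getfsstat(NULL, 0, MNT_NOWAIT);
 *	struct statfs *sf = calloc(n, sizeof(*sf));
 *	n = getfsstat(sf, n * sizeof(*sf), MNT_NOWAIT);
 */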
2489int
2490getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2491{
2492	struct __mac_getfsstat_args muap;
2493
2494	muap.buf = uap->buf;
2495	muap.bufsize = uap->bufsize;
2496	muap.mac = USER_ADDR_NULL;
2497	muap.macsize = 0;
2498	muap.flags = uap->flags;
2499
2500	return (__mac_getfsstat(p, &muap, retval));
2501}
2502
2503/*
2504 * __mac_getfsstat: Get MAC-related file system statistics
2505 *
2506 * Parameters:    p                        (ignored)
2507 *                uap                      User argument descriptor (see below)
2508 *                retval                   Count of file system statistics (N stats)
2509 *
2510 * Indirect:      uap->bufsize             Buffer size
2511 *                uap->macsize             MAC info size
2512 *                uap->buf                 Buffer where information will be returned
2513 *                uap->mac                 MAC info
2514 *                uap->flags               File system flags
2515 *
2516 *
2517 * Returns:        0                       Success
2518 *                !0                       An errno value on failure
2519 *
2520 */
2521int
2522__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2523{
2524	user_addr_t sfsp;
2525	user_addr_t *mp;
2526	size_t count, maxcount, bufsize, macsize;
2527	struct getfsstat_struct fst;
2528
2529	bufsize = (size_t) uap->bufsize;
2530	macsize = (size_t) uap->macsize;
2531
2532	if (IS_64BIT_PROCESS(p)) {
2533		maxcount = bufsize / sizeof(struct user64_statfs);
2534	}
2535	else {
2536		maxcount = bufsize / sizeof(struct user32_statfs);
2537	}
2538	sfsp = uap->buf;
2539	count = 0;
2540
2541	mp = NULL;
2542
2543#if CONFIG_MACF
2544	if (uap->mac != USER_ADDR_NULL) {
2545		u_int32_t *mp0;
2546		int error;
2547		unsigned int i;
2548
2549		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2550		if (count != maxcount)
2551			return (EINVAL);
2552
2553		/* Copy in the array */
2554		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2555		if (mp0 == NULL) {
2556			return (ENOMEM);
2557		}
2558
2559		error = copyin(uap->mac, mp0, macsize);
2560		if (error) {
2561			FREE(mp0, M_MACTEMP);
2562			return (error);
2563		}
2564
2565		/* Normalize to an array of user_addr_t */
2566		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2567		if (mp == NULL) {
2568			FREE(mp0, M_MACTEMP);
2569			return (ENOMEM);
2570		}
2571
2572		for (i = 0; i < count; i++) {
2573			if (IS_64BIT_PROCESS(p))
2574				mp[i] = ((user_addr_t *)mp0)[i];
2575			else
2576				mp[i] = (user_addr_t)mp0[i];
2577		}
2578		FREE(mp0, M_MACTEMP);
2579	}
2580#endif
2581
2582
2583	fst.sfsp = sfsp;
2584	fst.mp = mp;
2585	fst.flags = uap->flags;
2586	fst.count = 0;
2587	fst.error = 0;
2588	fst.maxcount = maxcount;
2589
2590
2591	vfs_iterate(0, getfsstat_callback, &fst);
2592
2593	if (mp)
2594		FREE(mp, M_MACTEMP);
2595
2596	if (fst.error ) {
2597		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2598		return(fst.error);
2599	}
2600
2601	if (fst.sfsp && fst.count > fst.maxcount)
2602		*retval = fst.maxcount;
2603	else
2604		*retval = fst.count;
2605	return (0);
2606}
2607
2608static int
2609getfsstat64_callback(mount_t mp, void * arg)
2610{
2611	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2612	struct vfsstatfs *sp;
2613	int error;
2614
2615	if (fstp->sfsp && fstp->count < fstp->maxcount) {
2616		sp = &mp->mnt_vfsstat;
2617		/*
2618		 * If MNT_NOWAIT is specified, do not refresh the fsstat
2619		 * cache. MNT_WAIT overrides MNT_NOWAIT.
2620		 *
2621		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
2622		 * getfsstat, since the constants are out of the same
2623		 * namespace.
2624		 */
2625		if (((fstp->flags & MNT_NOWAIT) == 0 ||
2626		     (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2627		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2628			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2629			return(VFS_RETURNED);
2630		}
2631
2632		error = statfs64_common(mp, sp, fstp->sfsp);
2633		if (error) {
2634			fstp->error = error;
2635			return(VFS_RETURNED_DONE);
2636		}
2637		fstp->sfsp += sizeof(struct statfs64);
2638	}
2639	fstp->count++;
2640	return(VFS_RETURNED);
2641}
2642
2643/*
2644 * Get statistics on all file systems in 64 bit mode.
2645 */
2646int
2647getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2648{
2649	user_addr_t sfsp;
2650	int count, maxcount;
2651	struct getfsstat_struct fst;
2652
2653	maxcount = uap->bufsize / sizeof(struct statfs64);
2654
2655	sfsp = uap->buf;
2656	count = 0;
2657
2658	fst.sfsp = sfsp;
2659	fst.flags = uap->flags;
2660	fst.count = 0;
2661	fst.error = 0;
2662	fst.maxcount = maxcount;
2663
2664	vfs_iterate(0, getfsstat64_callback, &fst);
2665
2666	if (fst.error ) {
2667		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2668		return(fst.error);
2669	}
2670
2671	if (fst.sfsp && fst.count > fst.maxcount)
2672		*retval = fst.maxcount;
2673	else
2674		*retval = fst.count;
2675
2676	return (0);
2677}
2678
2679/*
2680 * Change current working directory to a given file descriptor.
2681 */
2682/* ARGSUSED */
2683static int
2684common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
2685{
2686	struct filedesc *fdp = p->p_fd;
2687	vnode_t vp;
2688	vnode_t tdp;
2689	vnode_t tvp;
2690	struct mount *mp;
2691	int error;
2692	vfs_context_t ctx = vfs_context_current();
2693
2694	AUDIT_ARG(fd, uap->fd);
2695	if (per_thread && uap->fd == -1) {
2696		/*
2697		 * Switching back from per-thread to per-process CWD; verify that
2698		 * we in fact have one before proceeding.  The only success case
2699		 * for this code path is to return 0 preemptively after zapping
2700		 * the thread structure contents.
2701		 */
2702		thread_t th = vfs_context_thread(ctx);
2703		if (th) {
2704			uthread_t uth = get_bsdthread_info(th);
2705			tvp = uth->uu_cdir;
2706			uth->uu_cdir = NULLVP;
2707			if (tvp != NULLVP) {
2708				vnode_rele(tvp);
2709				return (0);
2710			}
2711		}
2712		return (EBADF);
2713	}
2714
2715	if ( (error = file_vnode(uap->fd, &vp)) )
2716		return(error);
2717	if ( (error = vnode_getwithref(vp)) ) {
2718	        file_drop(uap->fd);
2719		return(error);
2720	}
2721
2722	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2723
2724	if (vp->v_type != VDIR) {
2725		error = ENOTDIR;
2726		goto out;
2727	}
2728
2729#if CONFIG_MACF
2730	error = mac_vnode_check_chdir(ctx, vp);
2731	if (error)
2732		goto out;
2733#endif
2734	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
2735	if (error)
2736		goto out;
2737
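	/*
	 * If the new directory is itself a mount point, walk down through
	 * any stacked mounts so the cwd ends up at the root of the topmost
	 * mounted filesystem rather than at the covered vnode.
	 */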
2738	while (!error && (mp = vp->v_mountedhere) != NULL) {
2739		if (vfs_busy(mp, LK_NOWAIT)) {
2740			error = EACCES;
2741			goto out;
2742		}
2743		error = VFS_ROOT(mp, &tdp, ctx);
2744		vfs_unbusy(mp);
2745		if (error)
2746			break;
2747		vnode_put(vp);
2748		vp = tdp;
2749	}
2750	if (error)
2751		goto out;
2752	if ( (error = vnode_ref(vp)) )
2753	        goto out;
2754	vnode_put(vp);
2755
2756	if (per_thread) {
2757		thread_t th = vfs_context_thread(ctx);
2758		if (th) {
2759			uthread_t uth = get_bsdthread_info(th);
2760			tvp = uth->uu_cdir;
2761			uth->uu_cdir = vp;
2762			OSBitOrAtomic(P_THCWD, &p->p_flag);
2763		} else {
2764			vnode_rele(vp);
2765			return (ENOENT);
2766		}
2767	} else {
2768		proc_fdlock(p);
2769		tvp = fdp->fd_cdir;
2770		fdp->fd_cdir = vp;
2771		proc_fdunlock(p);
2772	}
2773
2774	if (tvp)
2775	        vnode_rele(tvp);
2776	file_drop(uap->fd);
2777
2778	return (0);
2779out:
2780	vnode_put(vp);
2781	file_drop(uap->fd);
2782
2783	return(error);
2784}
2785
2786int
2787fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
2788{
2789	return common_fchdir(p, uap, 0);
2790}
2791
2792int
2793__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
2794{
2795	return common_fchdir(p, (void *)uap, 1);
2796}
2797
2798/*
2799 * Change current working directory (".").
2800 *
2801 * Returns:	0			Success
2802 *	change_dir:ENOTDIR
2803 *	change_dir:???
2804 *	vnode_ref:ENOENT		No such file or directory
2805 */
2806/* ARGSUSED */
2807static int
2808common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
2809{
2810	struct filedesc *fdp = p->p_fd;
2811	int error;
2812	struct nameidata nd;
2813	vnode_t tvp;
2814	vfs_context_t ctx = vfs_context_current();
2815
2816	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
2817		UIO_USERSPACE, uap->path, ctx);
2818	error = change_dir(&nd, ctx);
2819	if (error)
2820		return (error);
2821	if ( (error = vnode_ref(nd.ni_vp)) ) {
2822	        vnode_put(nd.ni_vp);
2823		return (error);
2824	}
2825	/*
2826	 * drop the iocount we picked up in change_dir
2827	 */
2828	vnode_put(nd.ni_vp);
2829
2830	if (per_thread) {
2831		thread_t th = vfs_context_thread(ctx);
2832		if (th) {
2833			uthread_t uth = get_bsdthread_info(th);
2834			tvp = uth->uu_cdir;
2835			uth->uu_cdir = nd.ni_vp;
2836			OSBitOrAtomic(P_THCWD, &p->p_flag);
2837		} else {
2838			vnode_rele(nd.ni_vp);
2839			return (ENOENT);
2840		}
2841	} else {
2842		proc_fdlock(p);
2843		tvp = fdp->fd_cdir;
2844		fdp->fd_cdir = nd.ni_vp;
2845		proc_fdunlock(p);
2846	}
2847
2848	if (tvp)
2849	        vnode_rele(tvp);
2850
2851	return (0);
2852}
2853
2854
2855/*
2856 * chdir
2857 *
2858 * Change current working directory (".") for the entire process
2859 *
2860 * Parameters:  p       Process requesting the call
2861 * 		uap     User argument descriptor (see below)
2862 * 		retval  (ignored)
2863 *
2864 * Indirect parameters:	uap->path	Directory path
2865 *
2866 * Returns:	0			Success
2867 * 		common_chdir: ENOTDIR
2868 * 		common_chdir: ENOENT	No such file or directory
2869 * 		common_chdir: ???
2870 *
2871 */
2872int
2873chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
2874{
2875	return common_chdir(p, (void *)uap, 0);
2876}
2877
2878/*
2879 * __pthread_chdir
2880 *
2881 * Change current working directory (".") for a single thread
2882 *
2883 * Parameters:  p       Process requesting the call
2884 * 		uap     User argument descriptor (see below)
2885 * 		retval  (ignored)
2886 *
2887 * Indirect parameters:	uap->path	Directory path
2888 *
2889 * Returns:	0			Success
2890 * 		common_chdir: ENOTDIR
2891 *		common_chdir: ENOENT	No such file or directory
2892 *		common_chdir: ???
2893 *
2894 */
2895int
2896__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
2897{
2898	return common_chdir(p, (void *)uap, 1);
2899}
2900
2901
2902/*
2903 * Change notion of root (``/'') directory.
2904 */
2905/* ARGSUSED */
2906int
2907chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
2908{
2909	struct filedesc *fdp = p->p_fd;
2910	int error;
2911	struct nameidata nd;
2912	vnode_t tvp;
2913	vfs_context_t ctx = vfs_context_current();
2914
2915	if ((error = suser(kauth_cred_get(), &p->p_acflag)))
2916		return (error);
2917
2918	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
2919		UIO_USERSPACE, uap->path, ctx);
2920	error = change_dir(&nd, ctx);
2921	if (error)
2922		return (error);
2923
2924#if CONFIG_MACF
2925	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
2926	    &nd.ni_cnd);
2927	if (error) {
2928		vnode_put(nd.ni_vp);
2929		return (error);
2930	}
2931#endif
2932
2933	if ( (error = vnode_ref(nd.ni_vp)) ) {
2934	        vnode_put(nd.ni_vp);
2935		return (error);
2936	}
2937	vnode_put(nd.ni_vp);
2938
2939	proc_fdlock(p);
2940	tvp = fdp->fd_rdir;
2941	fdp->fd_rdir = nd.ni_vp;
2942	fdp->fd_flags |= FD_CHROOT;
2943	proc_fdunlock(p);
2944
2945	if (tvp != NULL)
2946		vnode_rele(tvp);
2947
2948	return (0);
2949}
2950
2951/*
2952 * Common routine for chroot and chdir.
2953 *
2954 * Returns:	0			Success
2955 *		ENOTDIR			Not a directory
2956 *		namei:???		[anything namei can return]
2957 *		vnode_authorize:???	[anything vnode_authorize can return]
2958 */
2959static int
2960change_dir(struct nameidata *ndp, vfs_context_t ctx)
2961{
2962	vnode_t vp;
2963	int error;
2964
2965	if ((error = namei(ndp)))
2966		return (error);
2967	nameidone(ndp);
2968	vp = ndp->ni_vp;
2969
2970	if (vp->v_type != VDIR) {
2971		vnode_put(vp);
2972		return (ENOTDIR);
2973	}
2974
2975#if CONFIG_MACF
2976	error = mac_vnode_check_chdir(ctx, vp);
2977	if (error) {
2978		vnode_put(vp);
2979		return (error);
2980	}
2981#endif
2982
2983	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
2984	if (error) {
2985		vnode_put(vp);
2986		return (error);
2987	}
2988
2989	return (error);
2990}
2991
2992/*
2993 * Check permissions, allocate an open file structure,
2994 * and call the device open routine if any.
2995 *
2996 * Returns:	0			Success
2997 *		EINVAL
2998 *		EINTR
2999 *	falloc:ENFILE
3000 *	falloc:EMFILE
3001 *	falloc:ENOMEM
3002 *	vn_open_auth:???
3003 *	dupfdopen:???
3004 *	VNOP_ADVLOCK:???
3005 *	vnode_setsize:???
3006 *
3007 * XXX Need to implement uid, gid
3008 */
3009int
3010open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3011    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3012    int32_t *retval)
3013{
3014	proc_t p = vfs_context_proc(ctx);
3015	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3016	struct fileproc *fp;
3017	vnode_t vp;
3018	int flags, oflags;
3019	int type, indx, error;
3020	struct flock lf;
3021	int no_controlling_tty = 0;
3022	int deny_controlling_tty = 0;
3023	struct session *sessp = SESSION_NULL;
3024
3025	oflags = uflags;
3026
3027	if ((oflags & O_ACCMODE) == O_ACCMODE)
3028		return(EINVAL);
3029	flags = FFLAGS(uflags);
3030
3031	AUDIT_ARG(fflags, oflags);
3032	AUDIT_ARG(mode, vap->va_mode);
3033
3034	if ((error = falloc_withalloc(p,
3035	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3036		return (error);
3037	}
3038	uu->uu_dupfd = -indx - 1;
3039
3040	if (!(p->p_flag & P_CONTROLT)) {
3041		sessp = proc_session(p);
3042		no_controlling_tty = 1;
3043		/*
3044		 * If conditions would warrant getting a controlling tty if
3045		 * the device being opened is a tty (see ttyopen in tty.c),
3046		 * but the open flags deny it, set a flag in the session to
3047		 * prevent it.
3048		 */
3049		if (SESS_LEADER(p, sessp) &&
3050		    sessp->s_ttyvp == NULL &&
3051		    (flags & O_NOCTTY)) {
3052			session_lock(sessp);
3053		    	sessp->s_flags |= S_NOCTTY;
3054			session_unlock(sessp);
3055			deny_controlling_tty = 1;
3056		}
3057	}
3058
3059	if ((error = vn_open_auth(ndp, &flags, vap))) {
3060		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){	/* XXX from fdopen */
3061			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3062				fp_drop(p, indx, NULL, 0);
3063			        *retval = indx;
3064				if (deny_controlling_tty) {
3065					session_lock(sessp);
3066					sessp->s_flags &= ~S_NOCTTY;
3067					session_unlock(sessp);
3068				}
3069				if (sessp != SESSION_NULL)
3070					session_rele(sessp);
3071				return (0);
3072			}
3073		}
3074		if (error == ERESTART)
3075		        error = EINTR;
3076		fp_free(p, indx, fp);
3077
3078		if (deny_controlling_tty) {
3079			session_lock(sessp);
3080			sessp->s_flags &= ~S_NOCTTY;
3081			session_unlock(sessp);
3082		}
3083		if (sessp != SESSION_NULL)
3084			session_rele(sessp);
3085		return (error);
3086	}
3087	uu->uu_dupfd = 0;
3088	vp = ndp->ni_vp;
3089
3090	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY);
3091	fp->f_fglob->fg_ops = &vnops;
3092	fp->f_fglob->fg_data = (caddr_t)vp;
3093
3094#if CONFIG_PROTECT
3095	if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) {
3096		if (vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) {
3097			fp->f_fglob->fg_flag |= FENCRYPTED;
3098		}
3099	}
3100#endif
3101
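	/*
	 * O_EXLOCK / O_SHLOCK: take an advisory flock-style lock on the
	 * vnode as part of the open; FNONBLOCK determines whether we are
	 * willing to wait for it.
	 */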
3102	if (flags & (O_EXLOCK | O_SHLOCK)) {
3103		lf.l_whence = SEEK_SET;
3104		lf.l_start = 0;
3105		lf.l_len = 0;
3106		if (flags & O_EXLOCK)
3107			lf.l_type = F_WRLCK;
3108		else
3109			lf.l_type = F_RDLCK;
3110		type = F_FLOCK;
3111		if ((flags & FNONBLOCK) == 0)
3112			type |= F_WAIT;
3113#if CONFIG_MACF
3114		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3115		    F_SETLK, &lf);
3116		if (error)
3117			goto bad;
3118#endif
3119		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3120			goto bad;
3121		fp->f_fglob->fg_flag |= FHASLOCK;
3122	}
3123
3124	/* try to truncate by setting the size attribute */
3125	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3126		goto bad;
3127
3128	/*
3129	 * If the open flags denied the acquisition of a controlling tty,
3130	 * clear the flag in the session structure that prevented the lower
3131	 * level code from assigning one.
3132	 */
3133	if (deny_controlling_tty) {
3134		session_lock(sessp);
3135		sessp->s_flags &= ~S_NOCTTY;
3136		session_unlock(sessp);
3137	}
3138
3139	/*
3140	 * If a controlling tty was set by the tty line discipline, then we
3141	 * want to set the vp of the tty into the session structure.  We have
3142	 * a race here because we can't get to the vp for the tp in ttyopen,
3143	 * because it's not passed as a parameter in the open path.
3144	 */
3145	if (no_controlling_tty && (p->p_flag & P_CONTROLT)) {
3146		vnode_t ttyvp;
3147
3148		/*
3149		 * We already have a ref from vn_open_auth(), so we can demand another reference.
3150		 */
3151		error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE);
3152		if (error != 0) {
3153			panic("vnode_ref_ext() with VNODE_REF_FORCE failed?!");
3154		}
3155
3156		session_lock(sessp);
3157		ttyvp = sessp->s_ttyvp;
3158		sessp->s_ttyvp = vp;
3159		sessp->s_ttyvid = vnode_vid(vp);
3160		session_unlock(sessp);
3161		if (ttyvp != NULLVP)
3162			vnode_rele(ttyvp);
3163	}
3164
3165	vnode_put(vp);
3166
3167	proc_fdlock(p);
3168	if (flags & O_CLOEXEC)
3169		*fdflags(p, indx) |= UF_EXCLOSE;
3170	if (flags & O_CLOFORK)
3171		*fdflags(p, indx) |= UF_FORKCLOSE;
3172	procfdtbl_releasefd(p, indx, NULL);
3173	fp_drop(p, indx, fp, 1);
3174	proc_fdunlock(p);
3175
3176	*retval = indx;
3177
3178	if (sessp != SESSION_NULL)
3179		session_rele(sessp);
3180	return (0);
3181bad:
3182	if (deny_controlling_tty) {
3183		session_lock(sessp);
3184		sessp->s_flags &= ~S_NOCTTY;
3185		session_unlock(sessp);
3186	}
3187	if (sessp != SESSION_NULL)
3188		session_rele(sessp);
3189
3190	struct vfs_context context = *vfs_context_current();
3191	context.vc_ucred = fp->f_fglob->fg_cred;
3192
3193	vn_close(vp, fp->f_fglob->fg_flag, &context);
3194	vnode_put(vp);
3195	fp_free(p, indx, fp);
3196
3197	return (error);
3198}
3199
3200/*
3201 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3202 *
3203 * Parameters:	p			Process requesting the open
3204 *		uap			User argument descriptor (see below)
3205 *		retval			Pointer to an area to receive the
3206 *					return value from the system call
3207 *
3208 * Indirect:	uap->path		Path to open (same as 'open')
3209 *		uap->flags		Flags to open (same as 'open')
3210 *		uap->uid		UID to set, if creating
3211 *		uap->gid		GID to set, if creating
3212 *		uap->mode		File mode, if creating (same as 'open')
3213 *		uap->xsecurity		ACL to set, if creating
3214 *
3215 * Returns:	0			Success
3216 *		!0			errno value
3217 *
3218 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
3219 *
3220 * XXX:		We should enumerate the possible errno values here, and where
3221 *		in the code they originated.
3222 */
3223int
3224open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3225{
3226	struct filedesc *fdp = p->p_fd;
3227	int ciferror;
3228	kauth_filesec_t xsecdst;
3229	struct vnode_attr va;
3230	struct nameidata nd;
3231	int cmode;
3232
3233	AUDIT_ARG(owner, uap->uid, uap->gid);
3234
3235	xsecdst = NULL;
3236	if ((uap->xsecurity != USER_ADDR_NULL) &&
3237	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3238		return ciferror;
3239
3240	VATTR_INIT(&va);
3241	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3242	VATTR_SET(&va, va_mode, cmode);
3243	if (uap->uid != KAUTH_UID_NONE)
3244		VATTR_SET(&va, va_uid, uap->uid);
3245	if (uap->gid != KAUTH_GID_NONE)
3246		VATTR_SET(&va, va_gid, uap->gid);
3247	if (xsecdst != NULL)
3248		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3249
3250	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3251	       uap->path, vfs_context_current());
3252
3253	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3254			 fileproc_alloc_init, NULL, retval);
3255	if (xsecdst != NULL)
3256		kauth_filesec_free(xsecdst);
3257
3258	return ciferror;
3259}
3260
3261/*
3262 * Perform a data-protected open(2): the protection class and flags are applied atomically
3263 *
3264 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3265 */
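/*
 * Illustrative user-space usage (not part of this file; the argument order
 * follows the prototype noted above, and the protection class value is only
 * an example):
 *
 *	int fd = open_dprotected_np("/path/to/newfile",
 *	    O_CREAT | O_RDWR, protection_class, 0, 0600);
 */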
3266int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3267	int flags = uap->flags;
3268	int class = uap->class;
3269	int dpflags = uap->dpflags;
3270
3271	/*
3272	 * Follow the same path as normal open(2)
3273	 * Look up the item if it exists, and acquire the vnode.
3274	 */
3275	struct filedesc *fdp = p->p_fd;
3276	struct vnode_attr va;
3277	struct nameidata nd;
3278	int cmode;
3279	int error;
3280
3281	VATTR_INIT(&va);
3282	/* Mask off all but regular access permissions */
3283	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3284	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3285
3286	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3287	       uap->path, vfs_context_current());
3288
3289	/*
3290	 * Initialize the extra fields in vnode_attr to pass down our
3291	 * extra fields.
3292	 * 1. target cprotect class.
3293	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3294	 */
3295	if (flags & O_CREAT) {
3296		VATTR_SET(&va, va_dataprotect_class, class);
3297	}
3298
3299	if (dpflags & O_DP_GETRAWENCRYPTED) {
3300		if ( flags & (O_RDWR | O_WRONLY)) {
3301			/* Not allowed to write raw encrypted bytes */
3302			return EINVAL;
3303		}
3304		VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3305	}
3306
3307	error = open1(vfs_context_current(), &nd, uap->flags, &va,
3308		      fileproc_alloc_init, NULL, retval);
3309
3310	return error;
3311}
3312
3313
3314int
3315open(proc_t p, struct open_args *uap, int32_t *retval)
3316{
3317	__pthread_testcancel(1);
3318	return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3319}
3320
3321int
3322open_nocancel(proc_t p, struct open_nocancel_args *uap, int32_t *retval)
3323{
3324	struct filedesc *fdp = p->p_fd;
3325	struct vnode_attr va;
3326	struct nameidata nd;
3327	int cmode;
3328
3329	VATTR_INIT(&va);
3330	/* Mask off all but regular access permissions */
3331	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3332	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3333
3334	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3335	       uap->path, vfs_context_current());
3336
3337	return (open1(vfs_context_current(), &nd, uap->flags, &va,
3338		      fileproc_alloc_init, NULL, retval));
3339}
3340
3341
3342/*
3343 * Create a special file.
3344 */
3345static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3346
3347int
3348mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3349{
3350	struct vnode_attr va;
3351	vfs_context_t ctx = vfs_context_current();
3352	int error;
3353	struct nameidata nd;
3354	vnode_t	vp, dvp;
3355
3356 	VATTR_INIT(&va);
3357 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3358 	VATTR_SET(&va, va_rdev, uap->dev);
3359
3360	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
3361	if ((uap->mode & S_IFMT) == S_IFIFO)
3362 		return(mkfifo1(ctx, uap->path, &va));
3363
3364	AUDIT_ARG(mode, uap->mode);
3365	AUDIT_ARG(value32, uap->dev);
3366
3367	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3368		return (error);
3369	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3370		UIO_USERSPACE, uap->path, ctx);
3371	error = namei(&nd);
3372	if (error)
3373		return (error);
3374	dvp = nd.ni_dvp;
3375	vp = nd.ni_vp;
3376
3377	if (vp != NULL) {
3378		error = EEXIST;
3379		goto out;
3380	}
3381
3382	switch (uap->mode & S_IFMT) {
3383	case S_IFMT:	/* used by badsect to flag bad sectors */
3384		VATTR_SET(&va, va_type, VBAD);
3385		break;
3386	case S_IFCHR:
3387		VATTR_SET(&va, va_type, VCHR);
3388		break;
3389	case S_IFBLK:
3390		VATTR_SET(&va, va_type, VBLK);
3391		break;
3392	default:
3393		error = EINVAL;
3394		goto out;
3395	}
3396
3397#if CONFIG_MACF
3398	error = mac_vnode_check_create(ctx,
3399	    nd.ni_dvp, &nd.ni_cnd, &va);
3400	if (error)
3401		goto out;
3402#endif
3403
3404 	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3405 		goto out;
3406
3407	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3408		goto out;
3409
3410	if (vp) {
3411		int	update_flags = 0;
3412
3413	        // Make sure the name & parent pointers are hooked up
3414	        if (vp->v_name == NULL)
3415			update_flags |= VNODE_UPDATE_NAME;
3416		if (vp->v_parent == NULLVP)
3417		        update_flags |= VNODE_UPDATE_PARENT;
3418
3419		if (update_flags)
3420		        vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3421
3422#if CONFIG_FSE
3423		add_fsevent(FSE_CREATE_FILE, ctx,
3424		    FSE_ARG_VNODE, vp,
3425		    FSE_ARG_DONE);
3426#endif
3427	}
3428
3429out:
3430	/*
3431	 * nameidone has to happen before we vnode_put(dvp)
3432	 * since it may need to release the fs_nodelock on the dvp
3433	 */
3434	nameidone(&nd);
3435
3436	if (vp)
3437	        vnode_put(vp);
3438	vnode_put(dvp);
3439
3440	return (error);
3441}
3442
3443/*
3444 * Create a named pipe.
3445 *
3446 * Returns:	0			Success
3447 *		EEXIST
3448 *	namei:???
3449 *	vnode_authorize:???
3450 *	vn_create:???
3451 */
3452static int
3453mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
3454{
3455	vnode_t	vp, dvp;
3456	int error;
3457	struct nameidata nd;
3458
3459	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
3460		UIO_USERSPACE, upath, ctx);
3461	error = namei(&nd);
3462	if (error)
3463		return (error);
3464	dvp = nd.ni_dvp;
3465	vp = nd.ni_vp;
3466
3467   	/* check that this is a new file and authorize addition */
3468   	if (vp != NULL) {
3469   		error = EEXIST;
3470   		goto out;
3471   	}
3472   	VATTR_SET(vap, va_type, VFIFO);
3473
3474	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
3475		goto out;
3476
3477  	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
3478out:
3479	/*
3480	 * nameidone has to happen before we vnode_put(dvp)
3481	 * since it may need to release the fs_nodelock on the dvp
3482	 */
3483	nameidone(&nd);
3484
3485	if (vp)
3486	        vnode_put(vp);
3487	vnode_put(dvp);
3488
3489	return error;
3490}
3491
3492
3493/*
3494 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
3495 *
3496 * Parameters:	p			Process requesting the open
3497 *		uap			User argument descriptor (see below)
3498 *		retval			(Ignored)
3499 *
3500 * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
3501 *		uap->uid		UID to set
3502 *		uap->gid		GID to set
3503 *		uap->mode		File mode to set (same as 'mkfifo')
3504 *		uap->xsecurity		ACL to set, if creating
3505 *
3506 * Returns:	0			Success
3507 *		!0			errno value
3508 *
3509 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
3510 *
3511 * XXX:		We should enumerate the possible errno values here, and where
3512 *		in the code they originated.
3513 */
3514int
3515mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
3516{
3517	int ciferror;
3518	kauth_filesec_t xsecdst;
3519	struct vnode_attr va;
3520
3521	AUDIT_ARG(owner, uap->uid, uap->gid);
3522
3523	xsecdst = KAUTH_FILESEC_NONE;
3524	if (uap->xsecurity != USER_ADDR_NULL) {
3525		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
3526			return ciferror;
3527	}
3528
3529	VATTR_INIT(&va);
3530   	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3531	if (uap->uid != KAUTH_UID_NONE)
3532		VATTR_SET(&va, va_uid, uap->uid);
3533	if (uap->gid != KAUTH_GID_NONE)
3534		VATTR_SET(&va, va_gid, uap->gid);
3535	if (xsecdst != KAUTH_FILESEC_NONE)
3536		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3537
3538	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
3539
3540	if (xsecdst != KAUTH_FILESEC_NONE)
3541		kauth_filesec_free(xsecdst);
3542	return ciferror;
3543}
3544
3545/* ARGSUSED */
3546int
3547mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
3548{
3549	struct vnode_attr va;
3550
3551   	VATTR_INIT(&va);
3552   	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3553
3554	return(mkfifo1(vfs_context_current(), uap->path, &va));
3555}
3556
3557
3558static char *
3559my_strrchr(char *p, int ch)
3560{
3561	char *save;
3562
3563	for (save = NULL;; ++p) {
3564		if (*p == ch)
3565			save = p;
3566		if (!*p)
3567			return(save);
3568	}
3569	/* NOTREACHED */
3570}
3571
3572extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
3573
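/*
 * Best-effort path construction for fsevents and kauth fileop listeners:
 * build the path to dvp (with leafname appended when supplied), falling back
 * to an ancestor directory or the mount point if vn_getpath() fails, and set
 * *truncated_path whenever the full path could not be produced.  Returns the
 * length of the string left in 'path', including the terminating NUL.
 */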
3574int
3575safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
3576{
3577	int ret, len = _len;
3578
3579	*truncated_path = 0;
3580	ret = vn_getpath(dvp, path, &len);
3581	if (ret == 0 && len < (MAXPATHLEN - 1)) {
3582		if (leafname) {
3583			path[len-1] = '/';
3584			len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
3585			if (len > MAXPATHLEN) {
3586				char *ptr;
3587
3588				// the string got truncated!
3589				*truncated_path = 1;
3590				ptr = my_strrchr(path, '/');
3591				if (ptr) {
3592					*ptr = '\0';   // chop off the string at the last directory component
3593				}
3594				len = strlen(path) + 1;
3595			}
3596		}
3597	} else if (ret == 0) {
3598		*truncated_path = 1;
3599	} else if (ret != 0) {
3600		struct vnode *mydvp=dvp;
3601
3602		if (ret != ENOSPC) {
3603			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
3604			       dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
3605		}
3606		*truncated_path = 1;
3607
3608		do {
3609			if (mydvp->v_parent != NULL) {
3610				mydvp = mydvp->v_parent;
3611			} else if (mydvp->v_mount) {
3612				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
3613				break;
3614			} else {
3615				// no parent and no mount point?  only thing is to punt and say "/" changed
3616				strlcpy(path, "/", _len);
3617				len = 2;
3618				mydvp = NULL;
3619			}
3620
3621			if (mydvp == NULL) {
3622				break;
3623			}
3624
3625			len = _len;
3626			ret = vn_getpath(mydvp, path, &len);
3627		} while (ret == ENOSPC);
3628	}
3629
3630	return len;
3631}
3632
3633
3634/*
3635 * Make a hard file link.
3636 *
3637 * Returns:	0			Success
3638 *		EPERM
3639 *		EEXIST
3640 *		EXDEV
3641 *	namei:???
3642 *	vnode_authorize:???
3643 *	VNOP_LINK:???
3644 */
3645/* ARGSUSED */
3646int
3647link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
3648{
3649	vnode_t	vp, dvp, lvp;
3650	struct nameidata nd;
3651	vfs_context_t ctx = vfs_context_current();
3652	int error;
3653#if CONFIG_FSE
3654	fse_info finfo;
3655#endif
3656	int need_event, has_listeners;
3657	char *target_path = NULL;
3658	int truncated=0;
3659
3660	vp = dvp = lvp = NULLVP;
3661
3662	/* look up the object we are linking to */
3663	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1,
3664		UIO_USERSPACE, uap->path, ctx);
3665	error = namei(&nd);
3666	if (error)
3667		return (error);
3668	vp = nd.ni_vp;
3669
3670	nameidone(&nd);
3671
3672	/*
3673	 * Normally, linking to directories is not supported.
3674	 * However, some file systems may have limited support.
3675	 */
3676	if (vp->v_type == VDIR) {
3677		if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
3678			error = EPERM;   /* POSIX */
3679			goto out;
3680		}
3681		/* Linking to a directory requires ownership. */
3682		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
3683			struct vnode_attr dva;
3684
3685			VATTR_INIT(&dva);
3686			VATTR_WANTED(&dva, va_uid);
3687			if (vnode_getattr(vp, &dva, ctx) != 0 ||
3688			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
3689			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
3690				error = EACCES;
3691				goto out;
3692			}
3693		}
3694	}
3695
3696	/* lookup the target node */
3697#if CONFIG_TRIGGERS
3698	nd.ni_op = OP_LINK;
3699#endif
3700	nd.ni_cnd.cn_nameiop = CREATE;
3701	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
3702	nd.ni_dirp = uap->link;
3703	error = namei(&nd);
3704	if (error != 0)
3705		goto out;
3706	dvp = nd.ni_dvp;
3707	lvp = nd.ni_vp;
3708
3709#if CONFIG_MACF
3710	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
3711		goto out2;
3712#endif
3713
3714  	/* the target must not be anything that kauth forbids us to link to (eg. immutable items) */
3715  	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
3716 		goto out2;
3717
3718	/* target node must not exist */
3719	if (lvp != NULLVP) {
3720		error = EEXIST;
3721		goto out2;
3722	}
3723  	/* cannot link across mountpoints */
3724  	if (vnode_mount(vp) != vnode_mount(dvp)) {
3725  		error = EXDEV;
3726  		goto out2;
3727  	}
3728
3729  	/* authorize creation of the target node */
3730  	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3731  		goto out2;
3732
3733	/* and finally make the link */
3734	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
3735	if (error)
3736		goto out2;
3737
3738#if CONFIG_MACF
3739	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
3740#endif
3741
3742#if CONFIG_FSE
3743	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
3744#else
3745	need_event = 0;
3746#endif
3747	has_listeners = kauth_authorize_fileop_has_listeners();
3748
3749	if (need_event || has_listeners) {
3750		char *link_to_path = NULL;
3751		int len, link_name_len;
3752
3753		/* build the path to the new link file */
3754		GET_PATH(target_path);
3755		if (target_path == NULL) {
3756			error = ENOMEM;
3757			goto out2;
3758		}
3759
3760		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
3761
3762		if (has_listeners) {
3763		        /* build the path to file we are linking to */
3764			GET_PATH(link_to_path);
3765			if (link_to_path == NULL) {
3766				error = ENOMEM;
3767				goto out2;
3768			}
3769
3770			link_name_len = MAXPATHLEN;
3771			vn_getpath(vp, link_to_path, &link_name_len);
3772
3773			/*
3774			 * Call out to allow 3rd party notification of the link.
3775			 * Ignore result of kauth_authorize_fileop call.
3776			 */
3777			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
3778					       (uintptr_t)link_to_path, (uintptr_t)target_path);
3779			if (link_to_path != NULL) {
3780				RELEASE_PATH(link_to_path);
3781			}
3782		}
3783#if CONFIG_FSE
3784		if (need_event) {
3785		        /* construct fsevent */
3786		        if (get_fse_info(vp, &finfo, ctx) == 0) {
3787				if (truncated) {
3788					finfo.mode |= FSE_TRUNCATED_PATH;
3789				}
3790
3791			        // build the path to the destination of the link
3792			        add_fsevent(FSE_CREATE_FILE, ctx,
3793					    FSE_ARG_STRING, len, target_path,
3794					    FSE_ARG_FINFO, &finfo,
3795					    FSE_ARG_DONE);
3796			}
3797			if (vp->v_parent) {
3798			    add_fsevent(FSE_STAT_CHANGED, ctx,
3799				FSE_ARG_VNODE, vp->v_parent,
3800				FSE_ARG_DONE);
3801			}
3802		}
3803#endif
3804	}
3805out2:
3806	/*
3807	 * nameidone has to happen before we vnode_put(dvp)
3808	 * since it may need to release the fs_nodelock on the dvp
3809	 */
3810	nameidone(&nd);
3811	if (target_path != NULL) {
3812		RELEASE_PATH(target_path);
3813	}
3814out:
3815	if (lvp)
3816		vnode_put(lvp);
3817	if (dvp)
3818		vnode_put(dvp);
3819	vnode_put(vp);
3820	return (error);
3821}
3822
3823/*
3824 * Make a symbolic link.
3825 *
3826 * We could add support for ACLs here too...
3827 */
3828/* ARGSUSED */
3829int
3830symlink(proc_t p, struct symlink_args *uap, __unused int32_t *retval)
3831{
3832	struct vnode_attr va;
3833	char *path;
3834	int error;
3835	struct nameidata nd;
3836	vfs_context_t ctx = vfs_context_current();
3837	vnode_t	vp, dvp;
3838	size_t dummy=0;
3839
3840	MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
3841	error = copyinstr(uap->path, path, MAXPATHLEN, &dummy);
3842	if (error)
3843		goto out;
3844	AUDIT_ARG(text, path);	/* This is the link string */
3845
3846	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
3847		UIO_USERSPACE, uap->link, ctx);
3848	error = namei(&nd);
3849	if (error)
3850		goto out;
3851	dvp = nd.ni_dvp;
3852	vp = nd.ni_vp;
3853
3854	VATTR_INIT(&va);
3855	VATTR_SET(&va, va_type, VLNK);
3856	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
3857#if CONFIG_MACF
3858	error = mac_vnode_check_create(ctx,
3859			dvp, &nd.ni_cnd, &va);
3860#endif
3861	if (error != 0) {
3862	    goto skipit;
3863	}
3864
3865	if (vp != NULL) {
3866	    error = EEXIST;
3867	    goto skipit;
3868	}
3869
3870	/* authorize */
3871	if (error == 0)
3872		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
3873	/* get default ownership, etc. */
3874	if (error == 0)
3875		error = vnode_authattr_new(dvp, &va, 0, ctx);
3876	if (error == 0)
3877		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
3878
3879#if CONFIG_MACF
3880	if (error == 0)
3881		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
3882#endif
3883
3884	/* do fallback attribute handling */
3885	if (error == 0)
3886		error = vnode_setattr_fallback(vp, &va, ctx);
3887
3888	if (error == 0) {
3889		int	update_flags = 0;
3890
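		/*
		 * Not every filesystem returns the new vnode from
		 * VNOP_SYMLINK; if we did not get one back, look the entry
		 * up again so its identity can be updated and the create
		 * event posted.
		 */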
3891		if (vp == NULL) {
3892			nd.ni_cnd.cn_nameiop = LOOKUP;
3893#if CONFIG_TRIGGERS
3894			nd.ni_op = OP_LOOKUP;
3895#endif
3896			nd.ni_cnd.cn_flags = 0;
3897			error = namei(&nd);
3898			vp = nd.ni_vp;
3899
3900			if (vp == NULL)
3901				goto skipit;
3902		}
3903
3904#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
3905		/* call out to allow 3rd party notification of the symlink creation.
3906		 * Ignore result of kauth_authorize_fileop call.
3907		 */
3908		if (kauth_authorize_fileop_has_listeners() &&
3909		    namei(&nd) == 0) {
3910			char *new_link_path = NULL;
3911			int		len;
3912
3913			/* build the path to the new link file */
3914			new_link_path = get_pathbuff();
3915			len = MAXPATHLEN;
3916			vn_getpath(dvp, new_link_path, &len);
3917			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
3918				new_link_path[len - 1] = '/';
3919				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
3920			}
3921
3922			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
3923					   (uintptr_t)path, (uintptr_t)new_link_path);
3924			if (new_link_path != NULL)
3925				release_pathbuff(new_link_path);
3926		}
3927#endif
3928		// Make sure the name & parent pointers are hooked up
3929		if (vp->v_name == NULL)
3930			update_flags |= VNODE_UPDATE_NAME;
3931		if (vp->v_parent == NULLVP)
3932			update_flags |= VNODE_UPDATE_PARENT;
3933
3934		if (update_flags)
3935			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3936
3937#if CONFIG_FSE
3938		add_fsevent(FSE_CREATE_FILE, ctx,
3939			    FSE_ARG_VNODE, vp,
3940			    FSE_ARG_DONE);
3941#endif
3942	}
3943
3944skipit:
3945	/*
3946	 * nameidone has to happen before we vnode_put(dvp)
3947	 * since it may need to release the fs_nodelock on the dvp
3948	 */
3949	nameidone(&nd);
3950
3951	if (vp)
3952	        vnode_put(vp);
3953	vnode_put(dvp);
3954out:
3955	FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
3956
3957	return (error);
3958}
3959
3960/*
3961 * Delete a whiteout from the filesystem.
3962 * XXX authorization not implemented for whiteouts
3963 */
3964int
3965undelete(__unused proc_t p, struct undelete_args *uap, __unused int32_t *retval)
3966{
3967	int error;
3968	struct nameidata nd;
3969	vfs_context_t ctx = vfs_context_current();
3970	vnode_t	vp, dvp;
3971
3972	NDINIT(&nd, DELETE, OP_UNLINK, LOCKPARENT | DOWHITEOUT | AUDITVNPATH1,
3973		UIO_USERSPACE, uap->path, ctx);
3974	error = namei(&nd);
3975	if (error)
3976		return (error);
3977	dvp = nd.ni_dvp;
3978	vp = nd.ni_vp;
3979
3980	if (vp == NULLVP && (nd.ni_cnd.cn_flags & ISWHITEOUT)) {
3981		error = VNOP_WHITEOUT(dvp, &nd.ni_cnd, DELETE, ctx);
3982	} else
3983	        error = EEXIST;
3984
3985	/*
3986	 * nameidone has to happen before we vnode_put(dvp)
3987	 * since it may need to release the fs_nodelock on the dvp
3988	 */
3989	nameidone(&nd);
3990
3991	if (vp)
3992	        vnode_put(vp);
3993	vnode_put(dvp);
3994
3995	return (error);
3996}
3997
3998
3999/*
4000 * Delete a name from the filesystem.
4001 */
4002/* ARGSUSED */
4003int
4004unlink1(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags)
4005{
4006	vnode_t	vp, dvp;
4007	int error;
4008	struct componentname *cnp;
4009	char  *path = NULL;
4010	int  len=0;
4011#if CONFIG_FSE
4012	fse_info  finfo;
4013	struct vnode_attr va;
4014#endif
4015	int flags = 0;
4016	int need_event = 0;
4017	int has_listeners = 0;
4018	int truncated_path=0;
4019	int batched;
4020	struct vnode_attr *vap = NULL;
4021
4022#if NAMEDRSRCFORK
4023	/* unlink or delete is allowed on rsrc forks and named streams */
4024	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
4025#endif
4026
4027	ndp->ni_cnd.cn_flags |= LOCKPARENT;
4028	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
4029	cnp = &ndp->ni_cnd;
4030
4031lookup_continue:
4032	error = namei(ndp);
4033	if (error)
4034		return (error);
4035
4036	dvp = ndp->ni_dvp;
4037	vp = ndp->ni_vp;
4038
4039
4040	/* With Carbon delete semantics, busy files cannot be deleted */
4041	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4042		flags |= VNODE_REMOVE_NODELETEBUSY;
4043	}
4044
4045	/* Skip any potential upcalls if told to. */
4046	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4047		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4048	}
4049
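	/*
	 * If the filesystem supports a compound remove VNOP, the lookup may
	 * not have returned a vnode at all; in that batched case the
	 * authorization and the actual removal are deferred to vn_remove(),
	 * which may return EKEEPLOOKING to request another lookup pass.
	 */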
4050	if (vp) {
4051		batched = vnode_compound_remove_available(vp);
4052		/*
4053		 * The root of a mounted filesystem cannot be deleted.
4054		 */
4055		if (vp->v_flag & VROOT) {
4056			error = EBUSY;
4057		}
4058
4059		if (!batched) {
4060			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4061			if (error) {
4062				goto out;
4063			}
4064		}
4065	} else {
4066		batched = 1;
4067
4068		if (!vnode_compound_remove_available(dvp)) {
4069			panic("No vp, but no compound remove?");
4070		}
4071	}
4072
4073#if CONFIG_FSE
4074	need_event = need_fsevent(FSE_DELETE, dvp);
4075	if (need_event) {
4076		if (!batched) {
4077			if ((vp->v_flag & VISHARDLINK) == 0) {
4078				/* XXX need to get these data in batched VNOP */
4079				get_fse_info(vp, &finfo, ctx);
4080			}
4081		} else {
4082			error = vfs_get_notify_attributes(&va);
4083			if (error) {
4084				goto out;
4085			}
4086
4087			vap = &va;
4088		}
4089	}
4090#endif
4091	has_listeners = kauth_authorize_fileop_has_listeners();
4092	if (need_event || has_listeners) {
4093		if (path == NULL) {
4094			GET_PATH(path);
4095			if (path == NULL) {
4096				error = ENOMEM;
4097				goto out;
4098			}
4099		}
4100		len = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4101	}
4102
4103#if NAMEDRSRCFORK
4104	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4105		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4106	else
4107#endif
4108	{
4109		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
4110		vp = ndp->ni_vp;
4111		if (error == EKEEPLOOKING) {
4112			if (!batched) {
4113				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4114			}
4115
4116			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
4117				panic("EKEEPLOOKING, but continue flag not set?");
4118			}
4119
4120			if (vnode_isdir(vp)) {
4121				error = EISDIR;
4122				goto out;
4123			}
4124			goto lookup_continue;
4125		}
4126	}
4127
4128	/*
4129	 * Call out to allow 3rd party notification of delete.
4130	 * Ignore result of kauth_authorize_fileop call.
4131	 */
4132	if (!error) {
4133		if (has_listeners) {
4134			kauth_authorize_fileop(vfs_context_ucred(ctx),
4135				KAUTH_FILEOP_DELETE,
4136				(uintptr_t)vp,
4137				(uintptr_t)path);
4138		}
4139
4140		if (vp->v_flag & VISHARDLINK) {
4141		    //
4142		    // if a hardlink gets deleted we want to blow away the
4143		    // v_parent link because the path that got us to this
4144		    // instance of the link is no longer valid.  this will
4145		    // force the next call to get the path to ask the file
4146		    // system instead of just following the v_parent link.
4147		    //
4148		    vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4149		}
4150
4151#if CONFIG_FSE
4152		if (need_event) {
4153			if (vp->v_flag & VISHARDLINK) {
4154				get_fse_info(vp, &finfo, ctx);
4155			} else if (vap) {
4156				vnode_get_fse_info_from_vap(vp, &finfo, vap);
4157			}
4158			if (truncated_path) {
4159				finfo.mode |= FSE_TRUNCATED_PATH;
4160			}
4161			add_fsevent(FSE_DELETE, ctx,
4162						FSE_ARG_STRING, len, path,
4163						FSE_ARG_FINFO, &finfo,
4164						FSE_ARG_DONE);
4165		}
4166#endif
4167	}
4168
4169out:
4170	if (path != NULL)
4171		RELEASE_PATH(path);
4172
4173#if NAMEDRSRCFORK
4174	/* recycle the deleted rsrc fork vnode to force a reclaim, which
4175	 * will cause its shadow file to go away if necessary.
4176	 */
4177	 if (vp && (vnode_isnamedstream(vp)) &&
4178		(vp->v_parent != NULLVP) &&
4179		vnode_isshadow(vp)) {
4180   			vnode_recycle(vp);
4181	 }
4182#endif
4183	/*
4184	 * nameidone has to happen before we vnode_put(dvp)
4185	 * since it may need to release the fs_nodelock on the dvp
4186	 */
4187	nameidone(ndp);
4188	vnode_put(dvp);
4189	if (vp) {
4190		vnode_put(vp);
4191	}
4192	return (error);
4193}
4194
4195/*
4196 * Delete a name from the filesystem using POSIX semantics.
4197 */
4198int
4199unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4200{
4201	struct nameidata nd;
4202	vfs_context_t ctx = vfs_context_current();
4203
4204	NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE,
4205	       uap->path, ctx);
4206	return unlink1(ctx, &nd, 0);
4207}
4208
4209/*
4210 * Delete a name from the filesystem using Carbon semantics.
4211 */
4212int
4213delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4214{
4215	struct nameidata nd;
4216	vfs_context_t ctx = vfs_context_current();
4217
4218	NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE,
4219	       uap->path, ctx);
4220	return unlink1(ctx, &nd, VNODE_REMOVE_NODELETEBUSY);
4221}
4222
4223/*
4224 * Reposition read/write file offset.
4225 */
4226int
4227lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4228{
4229	struct fileproc *fp;
4230	vnode_t vp;
4231	struct vfs_context *ctx;
4232	off_t offset = uap->offset, file_size;
4233	int error;
4234
4235	if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4236	        if (error == ENOTSUP)
4237		        return (ESPIPE);
4238		return (error);
4239	}
4240	if (vnode_isfifo(vp)) {
4241		file_drop(uap->fd);
4242		return(ESPIPE);
4243	}
4244
4245
4246	ctx = vfs_context_current();
4247#if CONFIG_MACF
4248	if (uap->whence == L_INCR && uap->offset == 0)
4249		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4250		    fp->f_fglob);
4251	else
4252		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4253		    fp->f_fglob);
4254	if (error) {
4255		file_drop(uap->fd);
4256		return (error);
4257	}
4258#endif
4259	if ( (error = vnode_getwithref(vp)) ) {
4260		file_drop(uap->fd);
4261		return(error);
4262	}
4263
4264	switch (uap->whence) {
4265	case L_INCR:
4266		offset += fp->f_fglob->fg_offset;
4267		break;
4268	case L_XTND:
4269		if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4270			break;
4271		offset += file_size;
4272		break;
4273	case L_SET:
4274		break;
4275	default:
4276		error = EINVAL;
4277	}
4278	if (error == 0) {
4279		if (uap->offset > 0 && offset < 0) {
4280			/* Incremented/relative move past max size */
4281			error = EOVERFLOW;
4282		} else {
4283			/*
4284			 * Allow negative offsets on character devices, per
4285			 * POSIX 1003.1-2001.  Most likely for writing disk
4286			 * labels.
4287			 */
4288			if (offset < 0 && vp->v_type != VCHR) {
4289				/* Decremented/relative move before start */
4290				error = EINVAL;
4291			} else {
4292				/* Success */
4293				fp->f_fglob->fg_offset = offset;
4294				*retval = fp->f_fglob->fg_offset;
4295			}
4296		}
4297	}
4298
4299	/*
4300	 * An lseek can affect whether data is "available to read."  Use
4301	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
4302	 */
4303	post_event_if_success(vp, error, NOTE_NONE);
4304	(void)vnode_put(vp);
4305	file_drop(uap->fd);
4306	return (error);
4307}
4308
4309
4310/*
4311 * Check access permissions.
4312 *
4313 * Returns:	0			Success
4314 *		vnode_authorize:???
4315 */
4316static int
4317access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4318{
4319 	kauth_action_t action;
4320	int error;
4321
4322 	/*
4323 	 * If just the regular access bits, convert them to something
4324	 * that vnode_authorize will understand.
4325 	 */
4326 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4327 		action = 0;
4328  		if (uflags & R_OK)
4329			action |= KAUTH_VNODE_READ_DATA;	/* aka KAUTH_VNODE_LIST_DIRECTORY */
4330  		if (uflags & W_OK) {
4331			if (vnode_isdir(vp)) {
4332				action |= KAUTH_VNODE_ADD_FILE |
4333				    KAUTH_VNODE_ADD_SUBDIRECTORY;
4334				/* might want delete rights here too */
4335			} else {
4336				action |= KAUTH_VNODE_WRITE_DATA;
4337			}
4338		}
4339  		if (uflags & X_OK) {
4340			if (vnode_isdir(vp)) {
4341				action |= KAUTH_VNODE_SEARCH;
4342			} else {
4343				action |= KAUTH_VNODE_EXECUTE;
4344			}
4345		}
4346  	} else {
4347		/* take advantage of definition of uflags */
4348		action = uflags >> 8;
4349	}
4350
4351#if CONFIG_MACF
4352	error = mac_vnode_check_access(ctx, vp, uflags);
4353	if (error)
4354		return (error);
4355#endif /* MAC */
4356
4357 	/* action == 0 means only check for existence */
4358 	if (action != 0) {
4359 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4360	} else {
4361		error = 0;
4362	}
4363
4364	return(error);
4365}
4366
4367
4368
4369/*
4370 * access_extended: Check access permissions in bulk.
4371 *
4372 * Description:	uap->entries		Pointer to an array of accessx
4373 * 					descriptor structs, plus one or
4374 * 					more NULL terminated strings (see
4375 * 					"Notes" section below).
4376 *		uap->size		Size of the area pointed to by
4377 *					uap->entries.
4378 *		uap->results		Pointer to the results array.
4379 *
4380 * Returns:	0			Success
4381 *		ENOMEM			Insufficient memory
4382 *		EINVAL			Invalid arguments
4383 *		namei:EFAULT		Bad address
4384 *		namei:ENAMETOOLONG	Filename too long
4385 *		namei:ENOENT		No such file or directory
4386 *		namei:ELOOP		Too many levels of symbolic links
4387 *		namei:EBADF		Bad file descriptor
4388 *		namei:ENOTDIR		Not a directory
4389 *		namei:???
4390 *		access1:
4391 *
4392 * Implicit returns:
4393 *		uap->results		Array contents modified
4394 *
4395 * Notes:	The uap->entries are structured as an arbitrary length array
4396 *		of accessx descriptors, followed by one or more NULL terminated
4397 *		strings
4398 *
4399 *			struct accessx_descriptor[0]
4400 *			...
4401 *			struct accessx_descriptor[n]
4402 *			char name_data[0];
4403 *
4404 *		We determine the entry count by walking the buffer containing
4405 *		the uap->entries argument descriptor.  For each descriptor we
4406 *		see, the valid values for the offset ad_name_offset will be
4407 *		in the byte range:
4408 *
4409 *			[ uap->entries + sizeof(struct accessx_descriptor) ]
4410 *						to
4411 *				[ uap->entries + uap->size - 2 ]
4412 *
4413 *		since we must have at least one string, and the string must
4414 *		be at least one character plus the NULL terminator in length.
4415 *
4416 * XXX:		Need to support the check-as uid argument
4417 */
4418int
4419access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
4420{
4421	struct accessx_descriptor *input = NULL;
4422	errno_t *result = NULL;
4423	errno_t error = 0;
4424	int wantdelete = 0;
4425	unsigned int desc_max, desc_actual, i, j;
4426	struct vfs_context context;
4427	struct nameidata nd;
4428 	int niopts;
4429	vnode_t vp = NULL;
4430	vnode_t dvp = NULL;
4431#define ACCESSX_MAX_DESCR_ON_STACK 10
4432	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
4433
4434	context.vc_ucred = NULL;
4435
4436	/*
4437	 * Validate parameters; if valid, copy the descriptor array and string
4438	 * arguments into local memory.  Before proceeding, the following
4439	 * conditions must have been met:
4440	 *
4441	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
4442	 * o	There must be sufficient room in the request for at least one
4443	 *	descriptor and a one yte NUL terminated string.
4444	 * o	The allocation of local storage must not fail.
4445	 */
4446	if (uap->size > ACCESSX_MAX_TABLESIZE)
4447		return(ENOMEM);
4448	if (uap->size < (sizeof(struct accessx_descriptor) + 2))
4449		return(EINVAL);
4450	if (uap->size <= sizeof (stack_input)) {
4451		input = stack_input;
	} else {
		MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
4459	error = copyin(uap->entries, input, uap->size);
4460	if (error)
4461		goto out;
4462
4463	AUDIT_ARG(opaque, input, uap->size);
4464
4465	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
4467	 * off the end.  If the caller passes us bogus data, they may get a
4468	 * bogus result.
4469	 */
4470	((char *)input)[uap->size - 1] = 0;
4471
4472	/*
4473	 * Access is defined as checking against the process' real identity,
4474 	 * even if operations are checking the effective identity.  This
4475	 * requires that we use a local vfs context.
4476 	 */
4477	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
4478	context.vc_thread = current_thread();
4479
4480	/*
4481	 * Find out how many entries we have, so we can allocate the result
4482	 * array by walking the list and adjusting the count downward by the
4483	 * earliest string offset we see.
4484	 */
4485	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
4486	desc_actual = desc_max;
4487	for (i = 0; i < desc_actual; i++) {
4488		/*
4489		 * Take the offset to the name string for this entry and
4490		 * convert to an input array index, which would be one off
4491		 * the end of the array if this entry was the lowest-addressed
4492		 * name string.
4493		 */
4494		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
4495
4496		/*
4497		 * An offset greater than the max allowable offset is an error.
4498		 * It is also an error for any valid entry to point
4499		 * to a location prior to the end of the current entry, if
4500		 * it's not a reference to the string of the previous entry.
4501		 */
4502		if (j > desc_max || (j != 0 && j <= i)) {
4503			error = EINVAL;
4504			goto out;
4505		}
4506
4507		/*
4508		 * An offset of 0 means use the previous descriptor's offset;
4509		 * this is used to chain multiple requests for the same file
4510		 * to avoid multiple lookups.
4511		 */
4512		if (j == 0) {
4513			/* This is not valid for the first entry */
4514			if (i == 0) {
4515				error = EINVAL;
4516				goto out;
4517			}
4518			continue;
4519		}
4520
4521		/*
4522		 * If the offset of the string for this descriptor is before
4523		 * what we believe is the current actual last descriptor,
4524		 * then we need to adjust our estimate downward; this permits
4525		 * the string table following the last descriptor to be out
4526		 * of order relative to the descriptor list.
4527		 */
4528		if (j < desc_actual)
4529			desc_actual = j;
4530	}
4531
4532	/*
4533	 * We limit the actual number of descriptors we are willing to process
4534	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested exceeds this limit, the request fails with ENOMEM.
4536	 */
4537	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
4538		error = ENOMEM;
4539		goto out;
4540	}
4541	MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
4542	if (result == NULL) {
4543		error = ENOMEM;
4544		goto out;
4545	}
4546
4547	/*
4548	 * Do the work by iterating over the descriptor entries we know to
4549	 * at least appear to contain valid data.
4550	 */
4551	error = 0;
4552	for (i = 0; i < desc_actual; i++) {
4553		/*
4554		 * If the ad_name_offset is 0, then we use the previous
4555		 * results to make the check; otherwise, we are looking up
4556		 * a new file name.
4557		 */
4558		if (input[i].ad_name_offset != 0) {
4559			/* discard old vnodes */
4560			if (vp) {
4561				vnode_put(vp);
4562				vp = NULL;
4563			}
4564			if (dvp) {
4565				vnode_put(dvp);
4566				dvp = NULL;
4567			}
4568
4569			/*
4570			 * Scan forward in the descriptor list to see if we
4571			 * need the parent vnode.  We will need it if we are
4572			 * deleting, since we must have rights  to remove
4573			 * entries in the parent directory, as well as the
4574			 * rights to delete the object itself.
4575			 */
4576			wantdelete = input[i].ad_flags & _DELETE_OK;
4577			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
4578				if (input[j].ad_flags & _DELETE_OK)
4579					wantdelete = 1;
4580
4581			niopts = FOLLOW | AUDITVNPATH1;
4582
4583			/* need parent for vnode_authorize for deletion test */
4584			if (wantdelete)
4585				niopts |= WANTPARENT;
4586
4587			/* do the lookup */
4588			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
4589			       CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
4590			       &context);
4591			error = namei(&nd);
4592			if (!error) {
4593				vp = nd.ni_vp;
4594				if (wantdelete)
4595					dvp = nd.ni_dvp;
4596			}
4597			nameidone(&nd);
4598		}
4599
4600		/*
4601		 * Handle lookup errors.
4602		 */
4603		switch(error) {
4604		case ENOENT:
4605		case EACCES:
4606		case EPERM:
4607		case ENOTDIR:
4608			result[i] = error;
4609			break;
4610		case 0:
4611			/* run this access check */
4612			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
4613			break;
4614		default:
4615			/* fatal lookup error */
4616
4617			goto out;
4618		}
4619	}
4620
4621	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
4622
4623	/* copy out results */
4624	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
4625
4626out:
4627	if (input && input != stack_input)
4628		FREE(input, M_TEMP);
4629	if (result)
4630		FREE(result, M_TEMP);
4631	if (vp)
4632		vnode_put(vp);
4633	if (dvp)
4634		vnode_put(dvp);
4635	if (IS_VALID_CRED(context.vc_ucred))
4636 		kauth_cred_unref(&context.vc_ucred);
4637	return(error);
4638}
4639
4640
4641/*
4642 * Returns:	0			Success
4643 *		namei:EFAULT		Bad address
4644 *		namei:ENAMETOOLONG	Filename too long
4645 *		namei:ENOENT		No such file or directory
4646 *		namei:ELOOP		Too many levels of symbolic links
4647 *		namei:EBADF		Bad file descriptor
4648 *		namei:ENOTDIR		Not a directory
4649 *		namei:???
4650 *		access1:
4651 */
4652int
4653access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
4654{
4655	int error;
4656	struct nameidata nd;
4657 	int niopts;
4658	struct vfs_context context;
4659#if NAMEDRSRCFORK
4660	int is_namedstream = 0;
4661#endif
4662
4663 	/*
4664 	 * Access is defined as checking against the process'
4665 	 * real identity, even if operations are checking the
4666 	 * effective identity.  So we need to tweak the credential
4667 	 * in the context.
4668 	 */
4669	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
4670	context.vc_thread = current_thread();
4671
4672	niopts = FOLLOW | AUDITVNPATH1;
4673 	/* need parent for vnode_authorize for deletion test */
4674 	if (uap->flags & _DELETE_OK)
4675 		niopts |= WANTPARENT;
4676 	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_USERSPACE,
4677 	       uap->path, &context);
4678
4679#if NAMEDRSRCFORK
4680	/* access(F_OK) calls are allowed for resource forks. */
4681	if (uap->flags == F_OK)
4682		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
4683#endif
4684 	error = namei(&nd);
4685 	if (error)
4686 		goto out;
4687
4688#if NAMEDRSRCFORK
4689	/* Grab reference on the shadow stream file vnode to
4690	 * force an inactive on release which will mark it
4691	 * for recycle.
4692	 */
4693	if (vnode_isnamedstream(nd.ni_vp) &&
4694	    (nd.ni_vp->v_parent != NULLVP) &&
4695	    vnode_isshadow(nd.ni_vp)) {
4696		is_namedstream = 1;
4697		vnode_ref(nd.ni_vp);
4698	}
4699#endif
4700
4701	error = access1(nd.ni_vp, nd.ni_dvp, uap->flags, &context);
4702
4703#if NAMEDRSRCFORK
4704	if (is_namedstream) {
4705		vnode_rele(nd.ni_vp);
4706	}
4707#endif
4708
4709 	vnode_put(nd.ni_vp);
4710 	if (uap->flags & _DELETE_OK)
4711 		vnode_put(nd.ni_dvp);
4712  	nameidone(&nd);
4713
4714out:
4715 	kauth_cred_unref(&context.vc_ucred);
4716 	return(error);
4717}
4718
4719
4720/*
4721 * Returns:	0			Success
4722 *		EFAULT
4723 *	copyout:EFAULT
4724 *	namei:???
4725 *	vn_stat:???
4726 */
4727static int
4728stat2(vfs_context_t ctx, struct nameidata *ndp, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64)
4729{
4730	union {
4731		struct stat sb;
4732		struct stat64 sb64;
4733	} source;
4734	union {
4735		struct user64_stat user64_sb;
4736		struct user32_stat user32_sb;
4737		struct user64_stat64 user64_sb64;
4738		struct user32_stat64 user32_sb64;
4739	} dest;
4740	caddr_t sbp;
4741	int error, my_size;
4742	kauth_filesec_t fsec;
4743	size_t xsecurity_bufsize;
4744	void * statptr;
4745
4746#if NAMEDRSRCFORK
4747	int is_namedstream = 0;
4748	/* stat calls are allowed for resource forks. */
4749	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
4750#endif
4751	error = namei(ndp);
4752	if (error)
4753		return (error);
4754	fsec = KAUTH_FILESEC_NONE;
4755
4756	statptr = (void *)&source;
4757
4758#if NAMEDRSRCFORK
4759	/* Grab reference on the shadow stream file vnode to
4760	 * force an inactive on release which will mark it
4761	 * for recycle.
4762	 */
4763	if (vnode_isnamedstream(ndp->ni_vp) &&
4764	    (ndp->ni_vp->v_parent != NULLVP) &&
4765	    vnode_isshadow(ndp->ni_vp)) {
4766		is_namedstream = 1;
4767		vnode_ref(ndp->ni_vp);
4768	}
4769#endif
4770
4771	error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
4772
4773#if NAMEDRSRCFORK
4774	if (is_namedstream) {
4775		vnode_rele(ndp->ni_vp);
4776	}
4777#endif
4778	vnode_put(ndp->ni_vp);
4779	nameidone(ndp);
4780
4781	if (error)
4782		return (error);
4783	/* Zap spare fields */
	if (isstat64 != 0) {
		source.sb64.st_lspare = 0;
		source.sb64.st_qspare[0] = 0LL;
		source.sb64.st_qspare[1] = 0LL;

		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 */
		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
			source.sb64.st_nlink = 1;
		}
		if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
			my_size = sizeof(dest.user64_sb64);
			sbp = (caddr_t)&dest.user64_sb64;
		} else {
			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
			my_size = sizeof(dest.user32_sb64);
			sbp = (caddr_t)&dest.user32_sb64;
		}
	} else {
		source.sb.st_lspare = 0;
		source.sb.st_qspare[0] = 0LL;
		source.sb.st_qspare[1] = 0LL;

		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 */
		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
			source.sb.st_nlink = 1;
		}
		if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
			munge_user64_stat(&source.sb, &dest.user64_sb);
			my_size = sizeof(dest.user64_sb);
			sbp = (caddr_t)&dest.user64_sb;
		} else {
			munge_user32_stat(&source.sb, &dest.user32_sb);
			my_size = sizeof(dest.user32_sb);
			sbp = (caddr_t)&dest.user32_sb;
		}
	}
4824	if ((error = copyout(sbp, ub, my_size)) != 0)
4825		goto out;
4826
4827	/* caller wants extended security information? */
4828	if (xsecurity != USER_ADDR_NULL) {
4829
4830		/* did we get any? */
4831		if (fsec == KAUTH_FILESEC_NONE) {
4832			if (susize(xsecurity_size, 0) != 0) {
4833				error = EFAULT;
4834				goto out;
4835			}
4836		} else {
4837			/* find the user buffer size */
4838			xsecurity_bufsize = fusize(xsecurity_size);
4839
4840			/* copy out the actual data size */
4841			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
4842				error = EFAULT;
4843				goto out;
4844			}
4845
4846			/* if the caller supplied enough room, copy out to it */
4847			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
4848				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
4849		}
4850	}
4851out:
4852	if (fsec != KAUTH_FILESEC_NONE)
4853		kauth_filesec_free(fsec);
4854	return (error);
4855}
4856
4857/*
4858 * Get file status; this version follows links.
4859 *
4860 * Returns:	0			Success
4861 *	stat2:???			[see stat2() in this file]
4862 */
4863static int
4864stat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64)
4865{
4866	struct nameidata nd;
4867	vfs_context_t ctx = vfs_context_current();
4868
4869	NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | FOLLOW | AUDITVNPATH1,
4870	    UIO_USERSPACE, path, ctx);
4871	return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64));
4872}
4873
4874/*
4875 * stat_extended: Get file status; with extended security (ACL).
4876 *
4877 * Parameters:    p                       (ignored)
4878 *                uap                     User argument descriptor (see below)
4879 *                retval                  (ignored)
4880 *
4881 * Indirect:      uap->path               Path of file to get status from
4882 *                uap->ub                 User buffer (holds file status info)
4883 *                uap->xsecurity          ACL to get (extended security)
4884 *                uap->xsecurity_size     Size of ACL
4885 *
4886 * Returns:        0                      Success
4887 *                !0                      errno value
4888 *
4889 */
4890int
4891stat_extended(__unused proc_t p, struct stat_extended_args *uap, __unused int32_t *retval)
4892{
4893	return (stat1(uap->path, uap->ub, uap->xsecurity, uap->xsecurity_size, 0));
4894}
4895
4896/*
4897 * Returns:	0			Success
4898 *	stat1:???			[see stat1() in this file]
4899 */
4900int
4901stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
4902{
4903	return(stat1(uap->path, uap->ub, 0, 0, 0));
4904}
4905
4906int
4907stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
4908{
4909	return(stat1(uap->path, uap->ub, 0, 0, 1));
4910}
4911
4912/*
4913 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
4914 *
4915 * Parameters:    p                       (ignored)
4916 *                uap                     User argument descriptor (see below)
4917 *                retval                  (ignored)
4918 *
4919 * Indirect:      uap->path               Path of file to get status from
4920 *                uap->ub                 User buffer (holds file status info)
4921 *                uap->xsecurity          ACL to get (extended security)
4922 *                uap->xsecurity_size     Size of ACL
4923 *
4924 * Returns:        0                      Success
4925 *                !0                      errno value
4926 *
4927 */
4928int
4929stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
4930{
4931	return (stat1(uap->path, uap->ub, uap->xsecurity, uap->xsecurity_size, 1));
4932}
4933/*
4934 * Get file status; this version does not follow links.
4935 */
4936static int
4937lstat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64)
4938{
4939	struct nameidata nd;
4940	vfs_context_t ctx = vfs_context_current();
4941
4942	NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | NOFOLLOW | AUDITVNPATH1,
4943	    UIO_USERSPACE, path, ctx);
4944
4945	return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64));
4946}
4947
4948/*
4949 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
4950 *
4951 * Parameters:    p                       (ignored)
4952 *                uap                     User argument descriptor (see below)
4953 *                retval                  (ignored)
4954 *
4955 * Indirect:      uap->path               Path of file to get status from
4956 *                uap->ub                 User buffer (holds file status info)
4957 *                uap->xsecurity          ACL to get (extended security)
4958 *                uap->xsecurity_size     Size of ACL
4959 *
4960 * Returns:        0                      Success
4961 *                !0                      errno value
4962 *
4963 */
4964int
4965lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
4966{
4967	return (lstat1(uap->path, uap->ub, uap->xsecurity, uap->xsecurity_size, 0));
4968}
4969
4970int
4971lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
4972{
4973	return(lstat1(uap->path, uap->ub, 0, 0, 0));
4974}
4975
4976int
4977lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
4978{
4979	return(lstat1(uap->path, uap->ub, 0, 0, 1));
4980}
4981
4982/*
4983 * lstat64_extended: Get file status; can handle large inode numbers; does not
4984 * follow links; with extended security (ACL).
4985 *
4986 * Parameters:    p                       (ignored)
4987 *                uap                     User argument descriptor (see below)
4988 *                retval                  (ignored)
4989 *
4990 * Indirect:      uap->path               Path of file to get status from
4991 *                uap->ub                 User buffer (holds file status info)
4992 *                uap->xsecurity          ACL to get (extended security)
4993 *                uap->xsecurity_size     Size of ACL
4994 *
4995 * Returns:        0                      Success
4996 *                !0                      errno value
4997 *
4998 */
4999int
5000lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5001{
5002	return (lstat1(uap->path, uap->ub, uap->xsecurity, uap->xsecurity_size, 1));
5003}
5004
5005/*
5006 * Get configurable pathname variables.
5007 *
5008 * Returns:	0			Success
5009 *	namei:???
5010 *	vn_pathconf:???
5011 *
 * Notes:	Global implementation constants are intended to be
5013 *		implemented in this function directly; all other constants
5014 *		are per-FS implementation, and therefore must be handled in
5015 *		each respective FS, instead.
5016 *
5017 * XXX We implement some things globally right now that should actually be
5018 * XXX per-FS; we will need to deal with this at some point.
5019 */
5020/* ARGSUSED */
5021int
5022pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5023{
5024	int error;
5025	struct nameidata nd;
5026	vfs_context_t ctx = vfs_context_current();
5027
5028	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5029		UIO_USERSPACE, uap->path, ctx);
5030	error = namei(&nd);
5031	if (error)
5032		return (error);
5033
5034	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5035
5036	vnode_put(nd.ni_vp);
5037	nameidone(&nd);
5038	return (error);
5039}
5040
5041/*
5042 * Return target name of a symbolic link.
5043 */
5044/* ARGSUSED */
5045int
5046readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5047{
5048	vnode_t vp;
5049	uio_t auio;
5050	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5051	int error;
5052	struct nameidata nd;
5053	vfs_context_t ctx = vfs_context_current();
5054	char uio_buf[ UIO_SIZEOF(1) ];
5055
5056	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5057		UIO_USERSPACE, uap->path, ctx);
5058	error = namei(&nd);
5059	if (error)
5060		return (error);
5061	vp = nd.ni_vp;
5062
5063	nameidone(&nd);
5064
5065	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
5066								  &uio_buf[0], sizeof(uio_buf));
5067	uio_addiov(auio, uap->buf, uap->count);
5068	if (vp->v_type != VLNK)
5069		error = EINVAL;
5070	else {
5071#if CONFIG_MACF
5072		error = mac_vnode_check_readlink(ctx,
5073		    vp);
5074#endif
5075		if (error == 0)
5076			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx);
5077		if (error == 0)
5078			error = VNOP_READLINK(vp, auio, ctx);
5079	}
5080	vnode_put(vp);
5081
5082	/* Safe: uio_resid() is bounded above by "count", and "count" is an int  */
5083	*retval = uap->count - (int)uio_resid(auio);
5084	return (error);
5085}
5086
5087/*
5088 * Change file flags.
5089 */
5090static int
5091chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5092{
5093	struct vnode_attr va;
5094 	kauth_action_t action;
5095	int error;
5096
5097	VATTR_INIT(&va);
5098	VATTR_SET(&va, va_flags, flags);
5099
5100#if CONFIG_MACF
5101	error = mac_vnode_check_setflags(ctx, vp, flags);
5102	if (error)
5103		goto out;
5104#endif
5105
5106	/* request authorisation, disregard immutability */
5107 	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5108		goto out;
5109	/*
5110	 * Request that the auth layer disregard those file flags it's allowed to when
5111	 * authorizing this operation; we need to do this in order to be able to
5112	 * clear immutable flags.
5113	 */
5114	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5115		goto out;
5116	error = vnode_setattr(vp, &va, ctx);
5117
5118	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5119		error = ENOTSUP;
5120	}
5121out:
5122	vnode_put(vp);
5123	return(error);
5124}
5125
5126/*
5127 * Change flags of a file given a path name.
5128 */
5129/* ARGSUSED */
5130int
5131chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5132{
5133	vnode_t vp;
5134	vfs_context_t ctx = vfs_context_current();
5135	int error;
5136	struct nameidata nd;
5137
5138	AUDIT_ARG(fflags, uap->flags);
5139	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5140		UIO_USERSPACE, uap->path, ctx);
5141	error = namei(&nd);
5142	if (error)
5143		return (error);
5144	vp = nd.ni_vp;
5145	nameidone(&nd);
5146
5147	error = chflags1(vp, uap->flags, ctx);
5148
5149	return(error);
5150}
5151
5152/*
5153 * Change flags of a file given a file descriptor.
5154 */
5155/* ARGSUSED */
5156int
5157fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5158{
5159	vnode_t vp;
5160	int error;
5161
5162	AUDIT_ARG(fd, uap->fd);
5163	AUDIT_ARG(fflags, uap->flags);
5164	if ( (error = file_vnode(uap->fd, &vp)) )
5165		return (error);
5166
5167	if ((error = vnode_getwithref(vp))) {
5168		file_drop(uap->fd);
5169		return(error);
5170	}
5171
5172	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5173
5174	error = chflags1(vp, uap->flags, vfs_context_current());
5175
5176	file_drop(uap->fd);
5177	return (error);
5178}
5179
5180/*
5181 * Change security information on a filesystem object.
5182 *
5183 * Returns:	0			Success
5184 *		EPERM			Operation not permitted
5185 *		vnode_authattr:???	[anything vnode_authattr can return]
5186 *		vnode_authorize:???	[anything vnode_authorize can return]
5187 *		vnode_setattr:???	[anything vnode_setattr can return]
5188 *
5189 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
5190 *		translated to EPERM before being returned.
5191 */
5192static int
5193chmod2(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5194{
5195	kauth_action_t action;
5196	int error;
5197
5198	AUDIT_ARG(mode, vap->va_mode);
5199	/* XXX audit new args */
5200
5201#if NAMEDSTREAMS
5202	/* chmod calls are not allowed for resource forks. */
5203	if (vp->v_flag & VISNAMEDSTREAM) {
5204		return (EPERM);
5205	}
5206#endif
5207
5208#if CONFIG_MACF
5209	if (VATTR_IS_ACTIVE(vap, va_mode) &&
5210	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5211		return (error);
5212#endif
5213
5214 	/* make sure that the caller is allowed to set this security information */
5215	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5216	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5217		if (error == EACCES)
5218			error = EPERM;
5219		return(error);
5220	}
5221
5222	error = vnode_setattr(vp, vap, ctx);
5223
5224	return (error);
5225}
5226
5227
5228/*
5229 * Change mode of a file given a path name.
5230 *
5231 * Returns:	0			Success
5232 *		namei:???		[anything namei can return]
5233 *		chmod2:???		[anything chmod2 can return]
5234 */
5235static int
5236chmod1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap)
5237{
5238	struct nameidata nd;
5239	int error;
5240
5241	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5242		UIO_USERSPACE, path, ctx);
5243	if ((error = namei(&nd)))
5244		return (error);
5245	error = chmod2(ctx, nd.ni_vp, vap);
5246	vnode_put(nd.ni_vp);
5247	nameidone(&nd);
5248	return(error);
5249}
5250
5251/*
5252 * chmod_extended: Change the mode of a file given a path name; with extended
5253 * argument list (including extended security (ACL)).
5254 *
5255 * Parameters:	p			Process requesting the open
5256 *		uap			User argument descriptor (see below)
5257 *		retval			(ignored)
5258 *
5259 * Indirect:	uap->path		Path to object (same as 'chmod')
5260 *		uap->uid		UID to set
5261 *		uap->gid		GID to set
5262 *		uap->mode		File mode to set (same as 'chmod')
5263 *		uap->xsecurity		ACL to set (or delete)
5264 *
5265 * Returns:	0			Success
5266 *		!0			errno value
5267 *
5268 * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5269 *
 * XXX:		We should enumerate the possible errno values here, and where
5271 *		in the code they originated.
5272 */
5273int
5274chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
5275{
5276	int error;
5277	struct vnode_attr va;
5278	kauth_filesec_t xsecdst;
5279
5280	AUDIT_ARG(owner, uap->uid, uap->gid);
5281
5282	VATTR_INIT(&va);
5283	if (uap->mode != -1)
5284		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5285	if (uap->uid != KAUTH_UID_NONE)
5286		VATTR_SET(&va, va_uid, uap->uid);
5287	if (uap->gid != KAUTH_GID_NONE)
5288		VATTR_SET(&va, va_gid, uap->gid);
5289
5290	xsecdst = NULL;
5291	switch(uap->xsecurity) {
5292		/* explicit remove request */
5293	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
5294		VATTR_SET(&va, va_acl, NULL);
5295		break;
5296		/* not being set */
5297	case USER_ADDR_NULL:
5298		break;
5299	default:
5300		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5301			return(error);
5302		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5303		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
5304	}
5305
5306	error = chmod1(vfs_context_current(), uap->path, &va);
5307
5308	if (xsecdst != NULL)
5309		kauth_filesec_free(xsecdst);
5310	return(error);
5311}
5312
5313/*
5314 * Returns:	0			Success
5315 *		chmod1:???		[anything chmod1 can return]
5316 */
5317int
5318chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
5319{
5320	struct vnode_attr va;
5321
5322	VATTR_INIT(&va);
5323	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5324
5325	return(chmod1(vfs_context_current(), uap->path, &va));
5326}
5327
5328/*
5329 * Change mode of a file given a file descriptor.
5330 */
5331static int
5332fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
5333{
5334	vnode_t vp;
5335	int error;
5336
5337	AUDIT_ARG(fd, fd);
5338
5339	if ((error = file_vnode(fd, &vp)) != 0)
5340		return (error);
5341	if ((error = vnode_getwithref(vp)) != 0) {
5342		file_drop(fd);
5343		return(error);
5344	}
5345	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5346
5347	error = chmod2(vfs_context_current(), vp, vap);
5348	(void)vnode_put(vp);
5349	file_drop(fd);
5350
5351	return (error);
5352}
5353
5354/*
5355 * fchmod_extended: Change mode of a file given a file descriptor; with
5356 * extended argument list (including extended security (ACL)).
5357 *
5358 * Parameters:    p                       Process requesting to change file mode
5359 *                uap                     User argument descriptor (see below)
5360 *                retval                  (ignored)
5361 *
5362 * Indirect:      uap->mode               File mode to set (same as 'chmod')
5363 *                uap->uid                UID to set
5364 *                uap->gid                GID to set
5365 *                uap->xsecurity          ACL to set (or delete)
5366 *                uap->fd                 File descriptor of file to change mode
5367 *
5368 * Returns:        0                      Success
5369 *                !0                      errno value
5370 *
5371 */
5372int
5373fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
5374{
5375	int error;
5376	struct vnode_attr va;
5377	kauth_filesec_t xsecdst;
5378
5379	AUDIT_ARG(owner, uap->uid, uap->gid);
5380
5381	VATTR_INIT(&va);
5382	if (uap->mode != -1)
5383		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5384	if (uap->uid != KAUTH_UID_NONE)
5385		VATTR_SET(&va, va_uid, uap->uid);
5386	if (uap->gid != KAUTH_GID_NONE)
5387		VATTR_SET(&va, va_gid, uap->gid);
5388
5389	xsecdst = NULL;
5390	switch(uap->xsecurity) {
5391	case USER_ADDR_NULL:
5392		VATTR_SET(&va, va_acl, NULL);
5393		break;
5394	case CAST_USER_ADDR_T((void *)1):	/* _FILESEC_REMOVE_ACL */
5395		VATTR_SET(&va, va_acl, NULL);
5396		break;
5397		/* not being set */
5398	case CAST_USER_ADDR_T(-1):
5399		break;
5400	default:
5401		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5402			return(error);
5403		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5404	}
5405
5406	error = fchmod1(p, uap->fd, &va);
5407
5408
5409	switch(uap->xsecurity) {
5410	case USER_ADDR_NULL:
5411	case CAST_USER_ADDR_T(-1):
5412		break;
5413	default:
5414		if (xsecdst != NULL)
5415			kauth_filesec_free(xsecdst);
5416	}
5417	return(error);
5418}
5419
5420int
5421fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
5422{
5423	struct vnode_attr va;
5424
5425	VATTR_INIT(&va);
5426	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5427
5428	return(fchmod1(p, uap->fd, &va));
5429}
5430
5431
5432/*
5433 * Set ownership given a path name.
5434 */
5435/* ARGSUSED */
5436static int
5437chown1(vfs_context_t ctx, struct chown_args *uap, __unused int32_t *retval, int follow)
5438{
5439	vnode_t vp;
5440	struct vnode_attr va;
5441	int error;
5442	struct nameidata nd;
5443	kauth_action_t action;
5444
5445	AUDIT_ARG(owner, uap->uid, uap->gid);
5446
5447	NDINIT(&nd, LOOKUP, OP_SETATTR,
5448		(follow ? FOLLOW : 0) | NOTRIGGER | AUDITVNPATH1,
5449		UIO_USERSPACE, uap->path, ctx);
5450	error = namei(&nd);
5451	if (error)
5452		return (error);
5453	vp = nd.ni_vp;
5454
5455	nameidone(&nd);
5456
5457	VATTR_INIT(&va);
5458	if (uap->uid != VNOVAL)
5459		VATTR_SET(&va, va_uid, uap->uid);
5460	if (uap->gid != VNOVAL)
5461		VATTR_SET(&va, va_gid, uap->gid);
5462
5463#if CONFIG_MACF
5464	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
5465	if (error)
5466		goto out;
5467#endif
5468
5469	/* preflight and authorize attribute changes */
5470	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5471		goto out;
5472	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
5473		goto out;
5474	error = vnode_setattr(vp, &va, ctx);
5475
5476out:
5477	/*
5478	 * EACCES is only allowed from namei(); permissions failure should
5479	 * return EPERM, so we need to translate the error code.
5480	 */
5481	if (error == EACCES)
5482		error = EPERM;
5483
5484	vnode_put(vp);
5485	return (error);
5486}
5487
5488int
5489chown(__unused proc_t p, struct chown_args *uap, int32_t *retval)
5490{
5491	return chown1(vfs_context_current(), uap, retval, 1);
5492}
5493
5494int
5495lchown(__unused proc_t p, struct lchown_args *uap, int32_t *retval)
5496{
5497	/* Argument list identical, but machine generated; cast for chown1() */
5498	return chown1(vfs_context_current(), (struct chown_args *)uap, retval, 0);
5499}
5500
5501/*
5502 * Set ownership given a file descriptor.
5503 */
5504/* ARGSUSED */
5505int
5506fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
5507{
5508	struct vnode_attr va;
5509	vfs_context_t ctx = vfs_context_current();
5510	vnode_t vp;
5511	int error;
5512	kauth_action_t action;
5513
5514	AUDIT_ARG(owner, uap->uid, uap->gid);
5515	AUDIT_ARG(fd, uap->fd);
5516
5517	if ( (error = file_vnode(uap->fd, &vp)) )
5518		return (error);
5519
5520	if ( (error = vnode_getwithref(vp)) ) {
5521		file_drop(uap->fd);
5522		return(error);
5523	}
5524	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5525
5526	VATTR_INIT(&va);
5527	if (uap->uid != VNOVAL)
5528		VATTR_SET(&va, va_uid, uap->uid);
5529	if (uap->gid != VNOVAL)
5530		VATTR_SET(&va, va_gid, uap->gid);
5531
5532#if NAMEDSTREAMS
5533	/* chown calls are not allowed for resource forks. */
5534	if (vp->v_flag & VISNAMEDSTREAM) {
5535		error = EPERM;
5536		goto out;
5537	}
5538#endif
5539
5540#if CONFIG_MACF
5541	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
5542	if (error)
5543		goto out;
5544#endif
5545
5546 	/* preflight and authorize attribute changes */
5547	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5548		goto out;
5549	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5550		if (error == EACCES)
5551			error = EPERM;
5552		goto out;
5553	}
5554	error = vnode_setattr(vp, &va, ctx);
5555
5556out:
5557	(void)vnode_put(vp);
5558	file_drop(uap->fd);
5559	return (error);
5560}
5561
5562static int
5563getutimes(user_addr_t usrtvp, struct timespec *tsp)
5564{
5565	int error;
5566
5567	if (usrtvp == USER_ADDR_NULL) {
5568		struct timeval old_tv;
5569		/* XXX Y2038 bug because of microtime argument */
5570		microtime(&old_tv);
5571		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
5572		tsp[1] = tsp[0];
5573	} else {
5574		if (IS_64BIT_PROCESS(current_proc())) {
5575			struct user64_timeval tv[2];
5576			error = copyin(usrtvp, (void *)tv, sizeof(tv));
5577			if (error)
5578				return (error);
5579			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
5580			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
5581		} else {
5582			struct user32_timeval tv[2];
5583			error = copyin(usrtvp, (void *)tv, sizeof(tv));
5584			if (error)
5585				return (error);
5586			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
5587			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
5588		}
5589	}
5590	return 0;
5591}
5592
5593static int
5594setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
5595	int nullflag)
5596{
5597	int error;
5598	struct vnode_attr va;
5599	kauth_action_t action;
5600
5601	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5602
5603	VATTR_INIT(&va);
5604	VATTR_SET(&va, va_access_time, ts[0]);
5605	VATTR_SET(&va, va_modify_time, ts[1]);
5606	if (nullflag)
5607		va.va_vaflags |= VA_UTIMES_NULL;
5608
5609#if NAMEDSTREAMS
5610	/* utimes calls are not allowed for resource forks. */
5611	if (vp->v_flag & VISNAMEDSTREAM) {
5612		error = EPERM;
5613		goto out;
5614	}
5615#endif
5616
5617#if CONFIG_MACF
5618	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
5619	if (error)
5620		goto out;
5621#endif
5622	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
5623		if (!nullflag && error == EACCES)
5624			error = EPERM;
5625		goto out;
5626	}
5627
5628	/* since we may not need to auth anything, check here */
5629	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5630		if (!nullflag && error == EACCES)
5631			error = EPERM;
5632		goto out;
5633	}
5634	error = vnode_setattr(vp, &va, ctx);
5635
5636out:
5637	return error;
5638}
5639
5640/*
5641 * Set the access and modification times of a file.
5642 */
5643/* ARGSUSED */
5644int
5645utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
5646{
5647	struct timespec ts[2];
5648	user_addr_t usrtvp;
5649	int error;
5650	struct nameidata nd;
5651	vfs_context_t ctx = vfs_context_current();
5652
5653	/*
5654	 * AUDIT: Needed to change the order of operations to do the
5655	 * name lookup first because auditing wants the path.
5656	 */
5657	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5658		UIO_USERSPACE, uap->path, ctx);
5659	error = namei(&nd);
5660	if (error)
5661		return (error);
5662	nameidone(&nd);
5663
5664	/*
5665	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
5666	 * the current time instead.
5667	 */
5668	usrtvp = uap->tptr;
5669	if ((error = getutimes(usrtvp, ts)) != 0)
5670		goto out;
5671
5672	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
5673
5674out:
5675	vnode_put(nd.ni_vp);
5676	return (error);
5677}
5678
5679/*
5680 * Set the access and modification times of a file.
5681 */
5682/* ARGSUSED */
5683int
5684futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
5685{
5686	struct timespec ts[2];
5687	vnode_t vp;
5688	user_addr_t usrtvp;
5689	int error;
5690
5691	AUDIT_ARG(fd, uap->fd);
5692	usrtvp = uap->tptr;
5693	if ((error = getutimes(usrtvp, ts)) != 0)
5694		return (error);
5695	if ((error = file_vnode(uap->fd, &vp)) != 0)
5696		return (error);
5697	if((error = vnode_getwithref(vp))) {
5698		file_drop(uap->fd);
5699		return(error);
5700	}
5701
5702	error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
5703	vnode_put(vp);
5704	file_drop(uap->fd);
5705	return(error);
5706}
5707
5708/*
5709 * Truncate a file given its path name.
5710 */
5711/* ARGSUSED */
5712int
5713truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
5714{
5715	vnode_t vp;
5716	struct vnode_attr va;
5717	vfs_context_t ctx = vfs_context_current();
5718	int error;
5719	struct nameidata nd;
5720	kauth_action_t action;
5721
5722	if (uap->length < 0)
5723		return(EINVAL);
5724	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
5725		UIO_USERSPACE, uap->path, ctx);
5726	if ((error = namei(&nd)))
5727		return (error);
5728	vp = nd.ni_vp;
5729
5730	nameidone(&nd);
5731
5732	VATTR_INIT(&va);
5733	VATTR_SET(&va, va_data_size, uap->length);
5734
5735#if CONFIG_MACF
5736	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
5737	if (error)
5738		goto out;
5739#endif
5740
5741	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5742		goto out;
5743	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
5744		goto out;
5745	error = vnode_setattr(vp, &va, ctx);
5746out:
5747	vnode_put(vp);
5748	return (error);
5749}
5750
5751/*
5752 * Truncate a file given a file descriptor.
5753 */
5754/* ARGSUSED */
5755int
5756ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
5757{
5758	vfs_context_t ctx = vfs_context_current();
5759	struct vnode_attr va;
5760	vnode_t vp;
5761	struct fileproc *fp;
	int error;
5763	int fd = uap->fd;
5764
5765	AUDIT_ARG(fd, uap->fd);
5766	if (uap->length < 0)
5767		return(EINVAL);
5768
5769	if ( (error = fp_lookup(p,fd,&fp,0)) ) {
5770		return(error);
5771	}
5772
5773	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
5774	case DTYPE_PSXSHM:
5775		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
5776		goto out;
5777	case DTYPE_VNODE:
5778		break;
5779	default:
5780		error = EINVAL;
5781		goto out;
5782	}
5783
5784	vp = (vnode_t)fp->f_fglob->fg_data;
5785
5786	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
5787		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
5788		error = EINVAL;
5789		goto out;
5790	}
5791
5792	if ((error = vnode_getwithref(vp)) != 0) {
5793		goto out;
5794	}
5795
5796	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5797
5798#if CONFIG_MACF
5799	error = mac_vnode_check_truncate(ctx,
5800	    fp->f_fglob->fg_cred, vp);
5801	if (error) {
5802		(void)vnode_put(vp);
5803		goto out;
5804	}
5805#endif
5806	VATTR_INIT(&va);
5807	VATTR_SET(&va, va_data_size, uap->length);
5808	error = vnode_setattr(vp, &va, ctx);
5809	(void)vnode_put(vp);
5810out:
5811	file_drop(fd);
5812	return (error);
5813}
5814
5815
5816/*
5817 * Sync an open file with synchronized I/O _file_ integrity completion
5818 */
5819/* ARGSUSED */
5820int
5821fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
5822{
5823	__pthread_testcancel(1);
5824	return(fsync_common(p, uap, MNT_WAIT));
5825}
5826
5827
5828/*
5829 * Sync an open file with synchronized I/O _file_ integrity completion
5830 *
5831 * Notes:	This is a legacy support function that does not test for
5832 *		thread cancellation points.
5833 */
5834/* ARGSUSED */
5835int
5836fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
5837{
5838	return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
5839}
5840
5841
5842/*
5843 * Sync an open file with synchronized I/O _data_ integrity completion
5844 */
5845/* ARGSUSED */
5846int
5847fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
5848{
5849	__pthread_testcancel(1);
5850	return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
5851}
5852
5853
5854/*
5855 * fsync_common
5856 *
5857 * Common fsync code to support both synchronized I/O file integrity completion
5858 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
5859 *
5860 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
5861 * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
 * additionally requires that metadata unnecessary for retrieving the file
 * data contents, such as atime, mtime, ctime, etc., also be committed to
 * stable storage.
5866 *
5867 * Parameters:	p				The process
5868 *		uap->fd				The descriptor to synchronize
5869 *		flags				The data integrity flags
5870 *
 * Returns:	0				Success
5872 *	fp_getfvp:EBADF				Bad file descriptor
5873 *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
5874 *	VNOP_FSYNC:???				unspecified
5875 *
5876 * Notes:	We use struct fsync_args because it is a short name, and all
5877 *		caller argument structures are otherwise identical.
5878 */
5879static int
5880fsync_common(proc_t p, struct fsync_args *uap, int flags)
5881{
5882	vnode_t vp;
5883	struct fileproc *fp;
5884	vfs_context_t ctx = vfs_context_current();
5885	int error;
5886
5887	AUDIT_ARG(fd, uap->fd);
5888
5889	if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
5890		return (error);
5891	if ( (error = vnode_getwithref(vp)) ) {
5892		file_drop(uap->fd);
5893		return(error);
5894	}
5895
5896	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5897
5898	error = VNOP_FSYNC(vp, flags, ctx);
5899
5900#if NAMEDRSRCFORK
5901	/* Sync resource fork shadow file if necessary. */
5902	if ((error == 0) &&
5903	    (vp->v_flag & VISNAMEDSTREAM) &&
5904	    (vp->v_parent != NULLVP) &&
5905	    vnode_isshadow(vp) &&
5906	    (fp->f_flags & FP_WRITTEN)) {
5907		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
5908	}
5909#endif
5910
5911	(void)vnode_put(vp);
5912	file_drop(uap->fd);
5913	return (error);
5914}
5915
5916/*
5917 * Duplicate files.  Source must be a file, target must be a file or
5918 * must not exist.
5919 *
5920 * XXX Copyfile authorisation checking is woefully inadequate, and will not
5921 *     perform inheritance correctly.
5922 */
5923/* ARGSUSED */
5924int
5925copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
5926{
5927	vnode_t tvp, fvp, tdvp, sdvp;
5928	struct nameidata fromnd, tond;
5929	int error;
5930	vfs_context_t ctx = vfs_context_current();
5931
5932	/* Check that the flags are valid. */
5933
5934	if (uap->flags & ~CPF_MASK) {
5935		return(EINVAL);
5936	}
5937
5938	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1,
5939		UIO_USERSPACE, uap->from, ctx);
5940	if ((error = namei(&fromnd)))
5941		return (error);
5942	fvp = fromnd.ni_vp;
5943
5944	NDINIT(&tond, CREATE, OP_LINK,
5945	       LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
5946	       UIO_USERSPACE, uap->to, ctx);
5947	if ((error = namei(&tond))) {
5948		goto out1;
5949	}
5950	tdvp = tond.ni_dvp;
5951	tvp = tond.ni_vp;
5952
5953	if (tvp != NULL) {
5954		if (!(uap->flags & CPF_OVERWRITE)) {
5955			error = EEXIST;
5956			goto out;
5957		}
5958	}
5959	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
5960		error = EISDIR;
5961		goto out;
5962	}
5963
5964	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
5965		goto out;
5966
5967	if (fvp == tdvp)
5968		error = EINVAL;
5969	/*
5970	 * If source is the same as the destination (that is the
5971	 * same inode number) then there is nothing to do.
5972	 * (fixed to have POSIX semantics - CSM 3/2/98)
5973	 */
5974	if (fvp == tvp)
5975		error = -1;
5976	if (!error)
5977	        error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
5978out:
5979	sdvp = tond.ni_startdir;
5980	/*
5981	 * nameidone has to happen before we vnode_put(tdvp)
5982	 * since it may need to release the fs_nodelock on the tdvp
5983	 */
5984	nameidone(&tond);
5985
5986	if (tvp)
5987		vnode_put(tvp);
5988	vnode_put(tdvp);
5989	vnode_put(sdvp);
5990out1:
5991	vnode_put(fvp);
5992
5993	if (fromnd.ni_startdir)
5994	        vnode_put(fromnd.ni_startdir);
5995	nameidone(&fromnd);
5996
5997	if (error == -1)
5998		return (0);
5999	return (error);
6000}
6001
6002
6003/*
6004 * Rename files.  Source and destination must either both be directories,
6005 * or both not be directories.  If target is a directory, it must be empty.
6006 */
6007/* ARGSUSED */
6008int
6009rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
6010{
6011	vnode_t tvp, tdvp;
6012	vnode_t fvp, fdvp;
6013	struct nameidata *fromnd, *tond;
6014	vfs_context_t ctx = vfs_context_current();
6015	int error;
6016	int do_retry;
6017	int mntrename;
6018	int need_event;
6019	const char *oname = NULL;
6020	char *from_name = NULL, *to_name = NULL;
6021	int from_len=0, to_len=0;
6022	int holding_mntlock;
6023	mount_t locked_mp = NULL;
6024	vnode_t oparent = NULLVP;
6025#if CONFIG_FSE
6026	fse_info from_finfo, to_finfo;
6027#endif
6028	int from_truncated=0, to_truncated;
6029	int batched = 0;
6030	struct vnode_attr *fvap, *tvap;
6031	int continuing = 0;
6032	/* carving out a chunk for structs that are too big to be on stack. */
6033	struct {
6034		struct nameidata from_node, to_node;
6035		struct vnode_attr fv_attr, tv_attr;
6036	} * __rename_data;
6037	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
6038	fromnd = &__rename_data->from_node;
6039	tond = &__rename_data->to_node;
6040
6041	holding_mntlock = 0;
6042	do_retry = 0;
6043retry:
6044	fvp = tvp = NULL;
6045	fdvp = tdvp = NULL;
6046	fvap = tvap = NULL;
6047	mntrename = FALSE;
6048
6049	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
6050	       UIO_USERSPACE, uap->from, ctx);
6051	fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
6052
6053	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6054	       UIO_USERSPACE, uap->to, ctx);
6055	tond->ni_flag = NAMEI_COMPOUNDRENAME;
6056
6057continue_lookup:
6058	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6059		if ( (error = namei(fromnd)) )
6060			goto out1;
6061		fdvp = fromnd->ni_dvp;
6062		fvp  = fromnd->ni_vp;
6063
6064		if (fvp && fvp->v_type == VDIR)
6065			tond->ni_cnd.cn_flags |= WILLBEDIR;
6066	}
6067
6068	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6069		if ( (error = namei(tond)) ) {
6070			/*
6071			 * Translate error code for rename("dir1", "dir2/.").
6072			 */
6073			if (error == EISDIR && fvp->v_type == VDIR)
6074				error = EINVAL;
6075			goto out1;
6076		}
6077		tdvp = tond->ni_dvp;
6078		tvp  = tond->ni_vp;
6079	}
6080
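	/*
	 * If the source filesystem supports the compound rename VNOP, the
	 * remaining lookup and authorization work is left to the filesystem
	 * itself (the "batched" path); otherwise we authorize here in VFS.
	 */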
6081	batched = vnode_compound_rename_available(fdvp);
6082	if (!fvp) {
6083		/*
6084		 * Claim: this check will never reject a valid rename.
6085		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
6086		 * Suppose fdvp and tdvp are not on the same mount.
6087		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
6088		 * 	then you can't move it to within another dir on the same mountpoint.
6089		 * If fvp sits atop a vnode on the same mount as tdvp, then that covered vnode lives in fdvp and is therefore on the same mount as fdvp, putting fdvp and tdvp on the same mount, which is a contradiction.
6090		 *
6091		 * If this check passes, then we are safe to pass these vnodes to the same FS.
6092		 */
6093		if (fdvp->v_mount != tdvp->v_mount) {
6094			error = EXDEV;
6095			goto out1;
6096		}
6097		goto skipped_lookup;
6098	}
6099
6100	if (!batched) {
6101		error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
6102		if (error) {
6103			if (error == ENOENT) {
6104				/*
6105				 * We encountered a race where after doing the namei, tvp stops
6106				 * being valid. If so, simply re-drive the rename call from the
6107				 * top.
6108				 */
6109				do_retry = 1;
6110			}
6111			goto out1;
6112		}
6113	}
6114
6115	/*
6116	 * If the source and destination are the same (i.e. they're
6117	 * links to the same vnode) and the target file system is
6118	 * case sensitive, then there is nothing to do.
6119	 *
6120	 * XXX Come back to this.
6121	 */
6122	if (fvp == tvp) {
6123		int pathconf_val;
6124
6125		/*
6126		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
6127		 * then assume that this file system is case sensitive.
6128		 */
6129		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
6130		    pathconf_val != 0) {
6131			goto out1;
6132		}
6133	}
6134
6135	/*
6136	 * Allow the renaming of mount points.
6137	 * - target must not exist
6138	 * - target must reside in the same directory as source
6139	 * - union mounts cannot be renamed
6140	 * - "/" cannot be renamed
6141	 *
6142	 * XXX Handle this in VFS after a continued lookup (if we missed
6143	 * in the cache to start off)
6144	 */
6145	if ((fvp->v_flag & VROOT) &&
6146	    (fvp->v_type == VDIR) &&
6147	    (tvp == NULL)  &&
6148	    (fvp->v_mountedhere == NULL)  &&
6149	    (fdvp == tdvp)  &&
6150	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
6151	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
6152		vnode_t coveredvp;
6153
6154		/* switch fvp to the covered vnode */
6155		coveredvp = fvp->v_mount->mnt_vnodecovered;
6156		if ( (vnode_getwithref(coveredvp)) ) {
6157		        error = ENOENT;
6158			goto out1;
6159		}
6160		vnode_put(fvp);
6161
6162		fvp = coveredvp;
6163		mntrename = TRUE;
6164	}
6165	/*
6166	 * Check for cross-device rename.
6167	 */
6168	if ((fvp->v_mount != tdvp->v_mount) ||
6169	    (tvp && (fvp->v_mount != tvp->v_mount))) {
6170		error = EXDEV;
6171		goto out1;
6172	}
6173
6174	/*
6175	 * If the source is the same as the destination (that is, the
6176	 * same inode number) then there is nothing to do...
6177	 * EXCEPT if the underlying file system supports case
6178	 * insensitivity and is case preserving.  In this case
6179	 * the file system needs to handle the special case of
6180	 * getting the same vnode as target (fvp) and source (tvp).
6181	 *
6182	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
6183	 * and _PC_CASE_PRESERVING can have this exception, and they need to
6184	 * handle the special case of getting the same vnode as target and
6185	 * source.  NOTE: the target is then unlocked going into vnop_rename,
6186	 * so as not to cause locking problems. There is a single reference on tvp.
6187	 *
6188	 * NOTE: fvp == tvp also occurs if the two paths are hard links to the
6189	 * same file; the correct behaviour then is just to return success
6190	 * without doing anything.
6191	 *
6192	 * XXX filesystem should take care of this itself, perhaps...
6193	 */
6194	if (fvp == tvp && fdvp == tdvp) {
6195		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
6196	       	    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
6197			  fromnd->ni_cnd.cn_namelen)) {
6198			goto out1;
6199		}
6200	}
6201
6202	if (holding_mntlock && fvp->v_mount != locked_mp) {
6203	        /*
6204		 * we're holding a reference and lock
6205		 * on locked_mp, but it no longer matches
6206		 * what we want to do... so drop our hold
6207		 */
6208		mount_unlock_renames(locked_mp);
6209		mount_drop(locked_mp, 0);
6210	        holding_mntlock = 0;
6211	}
6212	if (tdvp != fdvp && fvp->v_type == VDIR) {
6213	        /*
6214		 * serialize renames that re-shape
6215		 * the tree... if holding_mntlock is
6216		 * set, then we're ready to go...
6217		 * otherwise we
6218		 * first need to drop the iocounts
6219		 * we picked up, second take the
6220		 * lock to serialize the access,
6221		 * then finally start the lookup
6222		 * process over with the lock held
6223		 */
6224	        if (!holding_mntlock) {
6225		        /*
6226			 * need to grab a reference on
6227			 * the mount point before we
6228			 * drop all the iocounts... once
6229			 * the iocounts are gone, the mount
6230			 * could follow
6231			 */
6232			locked_mp = fvp->v_mount;
6233			mount_ref(locked_mp, 0);
6234
6235			/*
6236			 * nameidone has to happen before we vnode_put(tvp)
6237			 * since it may need to release the fs_nodelock on the tvp
6238			 */
6239			nameidone(tond);
6240
6241			if (tvp)
6242			        vnode_put(tvp);
6243			vnode_put(tdvp);
6244
6245			/*
6246			 * nameidone has to happen before we vnode_put(fdvp)
6247			 * since it may need to release the fs_nodelock on the fvp
6248			 */
6249			nameidone(fromnd);
6250
6251			vnode_put(fvp);
6252			vnode_put(fdvp);
6253
6254			mount_lock_renames(locked_mp);
6255			holding_mntlock = 1;
6256
6257			goto retry;
6258		}
6259	} else {
6260	        /*
6261		 * when we dropped the iocounts to take
6262		 * the lock, we allowed the identity of
6263		 * the various vnodes to change... if they did,
6264		 * we may no longer be dealing with a rename
6265		 * that reshapes the tree... once we're holding
6266		 * the iocounts, the vnodes can't change type
6267		 * so we're free to drop the lock at this point
6268		 * and continue on
6269		 */
6270	        if (holding_mntlock) {
6271			mount_unlock_renames(locked_mp);
6272			mount_drop(locked_mp, 0);
6273		        holding_mntlock = 0;
6274		}
6275	}
6276
6277	// save these off so we can later verify that fvp is the same
6278	oname   = fvp->v_name;
6279	oparent = fvp->v_parent;
6280
6281skipped_lookup:
6282#if CONFIG_FSE
6283	need_event = need_fsevent(FSE_RENAME, fdvp);
6284	if (need_event) {
6285		if (fvp) {
6286			get_fse_info(fvp, &from_finfo, ctx);
6287		} else {
6288			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
6289			if (error) {
6290				goto out1;
6291			}
6292
6293			fvap = &__rename_data->fv_attr;
6294		}
6295
6296		if (tvp) {
6297		        get_fse_info(tvp, &to_finfo, ctx);
6298		} else if (batched) {
6299			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
6300			if (error) {
6301				goto out1;
6302			}
6303
6304			tvap = &__rename_data->tv_attr;
6305		}
6306	}
6307#else
6308	need_event = 0;
6309#endif /* CONFIG_FSE */
6310
6311	if (need_event || kauth_authorize_fileop_has_listeners()) {
6312		if (from_name == NULL) {
6313			GET_PATH(from_name);
6314			if (from_name == NULL) {
6315				error = ENOMEM;
6316				goto out1;
6317			}
6318		}
6319
6320		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
6321
6322		if (to_name == NULL) {
6323			GET_PATH(to_name);
6324			if (to_name == NULL) {
6325				error = ENOMEM;
6326				goto out1;
6327			}
6328		}
6329
6330		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
6331	}
6332
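	/*
	 * Issue the rename.  Filesystems that implement the compound VNOP may
	 * return EKEEPLOOKING, asking us to finish the lookup and call back
	 * in; that case is handled just below.
	 */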
6333	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
6334			    tdvp, &tvp, &tond->ni_cnd, tvap,
6335			    0, ctx);
6336
6337	if (holding_mntlock) {
6338		/*
6339		 * we can drop our serialization
6340		 * lock now
6341		 */
6342		mount_unlock_renames(locked_mp);
6343		mount_drop(locked_mp, 0);
6344		holding_mntlock = 0;
6345	}
6346	if (error) {
6347		if (error == EKEEPLOOKING) {
6348			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6349				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6350					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
6351				}
6352			}
6353
6354			fromnd->ni_vp = fvp;
6355			tond->ni_vp = tvp;
6356
6357			goto continue_lookup;
6358		}
6359
6360		/*
6361		 * We may encounter a race in the VNOP where the destination didn't
6362		 * exist when we did the namei, but it does by the time we go and
6363		 * try to create the entry. In this case, we should re-drive this rename
6364		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
6365		 * but other filesystems susceptible to this race could return it, too.
6366		 */
6367		if (error == ERECYCLE) {
6368			do_retry = 1;
6369		}
6370
6371		goto out1;
6372	}
6373
6374	/* call out to allow 3rd party notification of rename.
6375	 * Ignore result of kauth_authorize_fileop call.
6376	 */
6377	kauth_authorize_fileop(vfs_context_ucred(ctx),
6378			KAUTH_FILEOP_RENAME,
6379			(uintptr_t)from_name, (uintptr_t)to_name);
6380
6381#if CONFIG_FSE
6382	if (from_name != NULL && to_name != NULL) {
6383		if (from_truncated || to_truncated) {
6384			// set it here since only the from_finfo gets reported up to user space
6385			from_finfo.mode |= FSE_TRUNCATED_PATH;
6386		}
6387
6388		if (tvap && tvp) {
6389			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
6390		}
6391		if (fvap) {
6392			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
6393		}
6394
6395	        if (tvp) {
6396		        add_fsevent(FSE_RENAME, ctx,
6397				    FSE_ARG_STRING, from_len, from_name,
6398				    FSE_ARG_FINFO, &from_finfo,
6399				    FSE_ARG_STRING, to_len, to_name,
6400				    FSE_ARG_FINFO, &to_finfo,
6401				    FSE_ARG_DONE);
6402		} else {
6403		        add_fsevent(FSE_RENAME, ctx,
6404				    FSE_ARG_STRING, from_len, from_name,
6405				    FSE_ARG_FINFO, &from_finfo,
6406				    FSE_ARG_STRING, to_len, to_name,
6407				    FSE_ARG_DONE);
6408		}
6409	}
6410#endif /* CONFIG_FSE */
6411
6412	/*
6413	 * update filesystem's mount point data
6414	 */
6415	if (mntrename) {
6416	        char *cp, *pathend, *mpname;
6417		char * tobuf;
6418		struct mount *mp;
6419		int maxlen;
6420		size_t len = 0;
6421
6422		mp = fvp->v_mountedhere;
6423
6424		if (vfs_busy(mp, LK_NOWAIT)) {
6425		        error = EBUSY;
6426			goto out1;
6427		}
6428		MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
6429
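		/*
		 * Rewrite the mount's recorded f_mntonname: keep everything up
		 * to and including the last '/' of the old path and append the
		 * last component of the user-supplied destination path.
		 */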
6430		error = copyinstr(uap->to, tobuf, MAXPATHLEN, &len);
6431		if (!error) {
6432		        /* find current mount point prefix */
6433		        pathend = &mp->mnt_vfsstat.f_mntonname[0];
6434			for (cp = pathend; *cp != '\0'; ++cp) {
6435			        if (*cp == '/')
6436				        pathend = cp + 1;
6437			}
6438			/* find last component of target name */
6439			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
6440			        if (*cp == '/')
6441				        mpname = cp + 1;
6442			}
6443			/* append name to prefix */
6444			maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
6445			bzero(pathend, maxlen);
6446			strlcpy(pathend, mpname, maxlen);
6447		}
6448		FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
6449
6450		vfs_unbusy(mp);
6451	}
6452	/*
6453	 * fix up name & parent pointers.  note that we first
6454	 * check that fvp has the same name/parent pointers it
6455	 * had before the rename call... this is a 'weak' check
6456	 * at best...
6457	 *
6458	 * XXX oparent and oname may not be set in the compound vnop case
6459	 */
6460	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
6461	        int update_flags;
6462
6463	        update_flags = VNODE_UPDATE_NAME;
6464
6465		if (fdvp != tdvp)
6466		        update_flags |= VNODE_UPDATE_PARENT;
6467
6468	        vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
6469	}
6470out1:
6471	if (to_name != NULL) {
6472		RELEASE_PATH(to_name);
6473		to_name = NULL;
6474	}
6475	if (from_name != NULL) {
6476		RELEASE_PATH(from_name);
6477		from_name = NULL;
6478	}
6479	if (holding_mntlock) {
6480	        mount_unlock_renames(locked_mp);
6481		mount_drop(locked_mp, 0);
6482		holding_mntlock = 0;
6483	}
6484	if (tdvp) {
6485		/*
6486		 * nameidone has to happen before we vnode_put(tdvp)
6487		 * since it may need to release the fs_nodelock on the tdvp
6488		 */
6489		nameidone(tond);
6490
6491		if (tvp)
6492		        vnode_put(tvp);
6493	        vnode_put(tdvp);
6494	}
6495	if (fdvp) {
6496		/*
6497		 * nameidone has to happen before we vnode_put(fdvp)
6498		 * since it may need to release the fs_nodelock on the fdvp
6499		 */
6500		nameidone(fromnd);
6501
6502		if (fvp)
6503		        vnode_put(fvp);
6504	        vnode_put(fdvp);
6505	}
6506
6507
6508	/*
6509	 * If things changed after we did the namei, then we will re-drive
6510	 * this rename call from the top.
6511	 */
6512	if (do_retry) {
6513		do_retry = 0;
6514		goto retry;
6515	}
6516
6517	FREE(__rename_data, M_TEMP);
6518	return (error);
6519}
6520
6521/*
6522 * Make a directory file.
6523 *
6524 * Returns:	0			Success
6525 *		EEXIST
6526 *	namei:???
6527 *	vnode_authorize:???
6528 *	vn_create:???
6529 */
6530/* ARGSUSED */
6531static int
6532mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap)
6533{
6534	vnode_t	vp, dvp;
6535	int error;
6536	int update_flags = 0;
6537	int batched;
6538	struct nameidata nd;
6539
6540	AUDIT_ARG(mode, vap->va_mode);
6541	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE,
6542	       path, ctx);
6543	nd.ni_cnd.cn_flags |= WILLBEDIR;
6544	nd.ni_flag = NAMEI_COMPOUNDMKDIR;
6545
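	/*
	 * If the filesystem supports the compound mkdir VNOP, vn_create()
	 * below may return EKEEPLOOKING, in which case we come back here to
	 * finish the lookup before retrying the create.
	 */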
6546continue_lookup:
6547	error = namei(&nd);
6548	if (error)
6549		return (error);
6550	dvp = nd.ni_dvp;
6551	vp = nd.ni_vp;
6552
6553  	if (vp != NULL) {
6554  		error = EEXIST;
6555  		goto out;
6556  	}
6557
6558	batched = vnode_compound_mkdir_available(dvp);
6559
6560	VATTR_SET(vap, va_type, VDIR);
6561
6562	/*
6563	 * XXX
6564	 * Don't authorize in VFS for the compound VNOP... mkdir -p today assumes that it will
6565	 * only get EEXIST or EISDIR for existing path components, and not that it could see
6566	 * EACCES/EPERM; so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
6567	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
6568	 */
6569 	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
6570		if (error == EACCES || error == EPERM) {
6571			int error2;
6572
6573			nameidone(&nd);
6574			vnode_put(dvp);
6575			dvp = NULLVP;
6576
6577			/*
6578			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
6579			 * rather than EACCES if the target exists.
6580			 */
6581			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, UIO_USERSPACE,
6582					path, ctx);
6583			error2 = namei(&nd);
6584			if (error2) {
6585				goto out;
6586			} else {
6587				vp = nd.ni_vp;
6588				error = EEXIST;
6589				goto out;
6590			}
6591		}
6592
6593		goto out;
6594	}
6595
6596	/*
6597	 * make the directory
6598	 */
6599  	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
6600		if (error == EKEEPLOOKING) {
6601			nd.ni_vp = vp;
6602			goto continue_lookup;
6603		}
6604
6605  		goto out;
6606	}
6607
6608	// Make sure the name & parent pointers are hooked up
6609	if (vp->v_name == NULL)
6610	        update_flags |= VNODE_UPDATE_NAME;
6611	if (vp->v_parent == NULLVP)
6612	        update_flags |= VNODE_UPDATE_PARENT;
6613
6614	if (update_flags)
6615	        vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
6616
6617#if CONFIG_FSE
6618	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
6619#endif
6620
6621out:
6622	/*
6623	 * nameidone has to happen before we vnode_put(dvp)
6624	 * since it may need to release the fs_nodelock on the dvp
6625	 */
6626	nameidone(&nd);
6627
6628	if (vp)
6629		vnode_put(vp);
6630	if (dvp)
6631		vnode_put(dvp);
6632
6633	return (error);
6634}
6635
6636/*
6637 * mkdir_extended: Create a directory; with extended security (ACL).
6638 *
6639 * Parameters:    p                       Process requesting to create the directory
6640 *                uap                     User argument descriptor (see below)
6641 *                retval                  (ignored)
6642 *
6643 * Indirect:      uap->path               Path of directory to create
6644 *                uap->mode               Access permissions to set
6645 *                uap->xsecurity          ACL to set
6646 *
6647 * Returns:        0                      Success
6648 *                !0                      Not success
6649 *
6650 */
6651int
6652mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
6653{
6654	int ciferror;
6655	kauth_filesec_t xsecdst;
6656	struct vnode_attr va;
6657
6658	AUDIT_ARG(owner, uap->uid, uap->gid);
6659
6660	xsecdst = NULL;
6661	if ((uap->xsecurity != USER_ADDR_NULL) &&
6662	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
6663		return ciferror;
6664
6665	VATTR_INIT(&va);
6666  	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
6667	if (xsecdst != NULL)
6668		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6669
6670	ciferror = mkdir1(vfs_context_current(), uap->path, &va);
6671	if (xsecdst != NULL)
6672		kauth_filesec_free(xsecdst);
6673	return ciferror;
6674}
6675
6676int
6677mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
6678{
6679	struct vnode_attr va;
6680
6681	VATTR_INIT(&va);
6682  	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
6683
6684	return(mkdir1(vfs_context_current(), uap->path, &va));
6685}
6686
6687/*
6688 * Remove a directory file.
6689 */
6690/* ARGSUSED */
6691int
6692rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
6693{
6694	vnode_t vp, dvp;
6695	int error;
6696	struct nameidata nd;
6697	char     *path = NULL;
6698	int       len=0;
6699	int has_listeners = 0;
6700	int need_event = 0;
6701	int truncated = 0;
6702	vfs_context_t ctx = vfs_context_current();
6703#if CONFIG_FSE
6704	struct vnode_attr va;
6705#endif /* CONFIG_FSE */
6706	struct vnode_attr *vap = NULL;
6707	int batched;
6708
6709	int restart_flag;
6710
6711	/*
6712	 * This loop exists to restart rmdir in the unlikely case that two
6713	 * processes are simultaneously trying to remove the same directory
6714	 * containing orphaned appleDouble files.
6715	 */
6716	do {
6717		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
6718		       UIO_USERSPACE, uap->path, ctx);
6719		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
6720continue_lookup:
6721		restart_flag = 0;
6722		vap = NULL;
6723
6724		error = namei(&nd);
6725		if (error)
6726			return (error);
6727
6728		dvp = nd.ni_dvp;
6729		vp = nd.ni_vp;
6730
6731		if (vp) {
6732			batched = vnode_compound_rmdir_available(vp);
6733
6734			if (vp->v_flag & VROOT) {
6735				/*
6736				 * The root of a mounted filesystem cannot be deleted.
6737				 */
6738				error = EBUSY;
6739				goto out;
6740			}
6741
6742			/*
6743			 * Removed a check here; we used to abort if vp's vid
6744			 * was not the same as what we'd seen the last time around.
6745			 * I do not think that check was valid, because if we retry
6746			 * and all dirents are gone, the directory could legitimately
6747			 * be recycled but still be present in a situation where we would
6748			 * have had permission to delete.  Therefore, we won't make
6749			 * an effort to preserve that check now that we may not have a
6750			 * vp here.
6751			 */
6752
6753			if (!batched) {
6754				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
6755				if (error) {
6756					goto out;
6757				}
6758			}
6759		} else {
6760			batched = 1;
6761
6762			if (!vnode_compound_rmdir_available(dvp)) {
6763				panic("No error, but no compound rmdir?");
6764			}
6765		}
6766
6767#if CONFIG_FSE
6768		fse_info  finfo;
6769
6770		need_event = need_fsevent(FSE_DELETE, dvp);
6771		if (need_event) {
6772			if (!batched) {
6773				get_fse_info(vp, &finfo, ctx);
6774			} else {
6775				error = vfs_get_notify_attributes(&va);
6776				if (error) {
6777					goto out;
6778				}
6779
6780				vap = &va;
6781			}
6782		}
6783#endif
6784		has_listeners = kauth_authorize_fileop_has_listeners();
6785		if (need_event || has_listeners) {
6786			if (path == NULL) {
6787				GET_PATH(path);
6788				if (path == NULL) {
6789					error = ENOMEM;
6790					goto out;
6791				}
6792			}
6793
6794			len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
6795#if CONFIG_FSE
6796			if (truncated) {
6797				finfo.mode |= FSE_TRUNCATED_PATH;
6798			}
6799#endif
6800		}
6801
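		/*
		 * Attempt the removal.  Filesystems that support the compound
		 * rmdir VNOP may return EKEEPLOOKING, in which case we go back
		 * and finish the lookup before retrying.
		 */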
6802		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
6803		nd.ni_vp = vp;
6804		if (vp == NULLVP) {
6805			/* Couldn't find a vnode */
6806			goto out;
6807		}
6808
6809		if (error == EKEEPLOOKING) {
6810			goto continue_lookup;
6811		}
6812#if CONFIG_APPLEDOUBLE
6813		/*
6814		 * Special case to remove orphaned AppleDouble
6815		 * files. I don't like putting this in the kernel,
6816		 * but carbon does not like putting this in carbon either,
6817		 * so here we are.
6818		 */
6819		if (error == ENOTEMPTY) {
6820			error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
6821			if (error == EBUSY) {
6822				goto out;
6823			}
6824
6825
6826			/*
6827			 * Assuming everything went well, we will try the RMDIR again
6828			 */
6829			if (!error)
6830				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
6831		}
6832#endif /* CONFIG_APPLEDOUBLE */
6833		/*
6834		 * Call out to allow 3rd party notification of delete.
6835		 * Ignore result of kauth_authorize_fileop call.
6836		 */
6837		if (!error) {
6838			if (has_listeners) {
6839				kauth_authorize_fileop(vfs_context_ucred(ctx),
6840						KAUTH_FILEOP_DELETE,
6841						(uintptr_t)vp,
6842						(uintptr_t)path);
6843			}
6844
6845			if (vp->v_flag & VISHARDLINK) {
6846				// see the comment in unlink1() about why we update
6847				// the parent of a hard link when it is removed
6848				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6849			}
6850
6851#if CONFIG_FSE
6852			if (need_event) {
6853				if (vap) {
6854					vnode_get_fse_info_from_vap(vp, &finfo, vap);
6855				}
6856				add_fsevent(FSE_DELETE, ctx,
6857						FSE_ARG_STRING, len, path,
6858						FSE_ARG_FINFO, &finfo,
6859						FSE_ARG_DONE);
6860			}
6861#endif
6862		}
6863
6864out:
6865		if (path != NULL) {
6866			RELEASE_PATH(path);
6867			path = NULL;
6868		}
6869		/*
6870		 * nameidone has to happen before we vnode_put(dvp)
6871		 * since it may need to release the fs_nodelock on the dvp
6872		 */
6873		nameidone(&nd);
6874		vnode_put(dvp);
6875
6876		if (vp)
6877			vnode_put(vp);
6878
6879		if (restart_flag == 0) {
6880			wakeup_one((caddr_t)vp);
6881			return (error);
6882		}
6883		tsleep(vp, PVFS, "rm AD", 1);
6884
6885	} while (restart_flag != 0);
6886
6887	return (error);
6888
6889}
6890
6891/* Get direntry length padded to 8 byte alignment */
6892#define DIRENT64_LEN(namlen) \
6893	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
6894
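/*
 * Read directory entries in the extended (struct direntry) format.  If the
 * filesystem supports VNODE_READDIR_EXTENDED natively, pass the request
 * straight through; otherwise read legacy struct dirent records into a
 * kernel buffer and repack them as struct direntry before copying out.
 */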
6895static errno_t
6896vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
6897                int *numdirent, vfs_context_t ctxp)
6898{
6899	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
6900	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
6901		   ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))	{
6902		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
6903	} else {
6904		size_t bufsize;
6905		void * bufptr;
6906		uio_t auio;
6907		struct direntry *entry64;
6908		struct dirent *dep;
6909		int bytesread;
6910		int error;
6911
6912		/*
6913		 * Our kernel buffer needs to be smaller since re-packing
6914		 * will expand each dirent.  The worst case (when the name
6915		 * length is 3) corresponds to a struct direntry size of 32
6916		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
6917		 * (4-byte aligned).  So having a buffer that is 3/8 the size
6918		 * will prevent us from reading more than we can pack.
6919                 *
6920		 * Since this buffer is wired memory, we will limit the
6921		 * buffer size to a maximum of 32K. We would really like to
6922		 * use 32K in the MIN(), but we use magic number 87371 to
6923		 * prevent uio_resid() * 3 / 8 from overflowing.
6924		 */
6925		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
6926		MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
6927		if (bufptr == NULL) {
6928			return ENOMEM;
6929		}
6930
6931		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
6932		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
6933		auio->uio_offset = uio->uio_offset;
6934
6935		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
6936
6937		dep = (struct dirent *)bufptr;
6938		bytesread = bufsize - uio_resid(auio);
6939
6940		MALLOC(entry64, struct direntry *, sizeof(struct direntry),
6941		       M_TEMP, M_WAITOK);
6942		/*
6943		 * Convert all the entries and copy them out to user's buffer.
6944		 */
6945		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
6946			size_t	enbufsize = DIRENT64_LEN(dep->d_namlen);
6947
6948			bzero(entry64, enbufsize);
6949			/* Convert a dirent to a dirent64. */
6950			entry64->d_ino = dep->d_ino;
6951			entry64->d_seekoff = 0;
6952			entry64->d_reclen = enbufsize;
6953			entry64->d_namlen = dep->d_namlen;
6954			entry64->d_type = dep->d_type;
6955			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
6956
6957			/* Move to next entry. */
6958			dep = (struct dirent *)((char *)dep + dep->d_reclen);
6959
6960			/* Copy entry64 to user's buffer. */
6961			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
6962		}
6963
6964		/* Update the real offset using the offset we got from VNOP_READDIR. */
6965		if (error == 0) {
6966			uio->uio_offset = auio->uio_offset;
6967		}
6968		uio_free(auio);
6969		FREE(bufptr, M_TEMP);
6970		FREE(entry64, M_TEMP);
6971		return (error);
6972	}
6973}
6974
6975#define GETDIRENTRIES_MAXBUFSIZE	(128 * 1024 * 1024U)
6976
6977/*
6978 * Read a block of directory entries in a file system independent format.
6979 */
6980static int
6981getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
6982                     off_t *offset, int flags)
6983{
6984	vnode_t vp;
6985	struct vfs_context context = *vfs_context_current();	/* local copy */
6986	struct fileproc *fp;
6987	uio_t auio;
6988	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6989	off_t loff;
6990	int error, eofflag, numdirent;
6991	char uio_buf[ UIO_SIZEOF(1) ];
6992
6993	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
6994	if (error) {
6995		return (error);
6996	}
6997	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
6998		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6999		error = EBADF;
7000		goto out;
7001	}
7002
7003	if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
7004		bufsize = GETDIRENTRIES_MAXBUFSIZE;
7005
7006#if CONFIG_MACF
7007	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
7008	if (error)
7009		goto out;
7010#endif
7011	if ( (error = vnode_getwithref(vp)) ) {
7012		goto out;
7013	}
7014	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7015
7016unionread:
7017	if (vp->v_type != VDIR) {
7018		(void)vnode_put(vp);
7019		error = EINVAL;
7020		goto out;
7021	}
7022
7023#if CONFIG_MACF
7024	error = mac_vnode_check_readdir(&context, vp);
7025	if (error != 0) {
7026		(void)vnode_put(vp);
7027		goto out;
7028	}
7029#endif /* MAC */
7030
7031	loff = fp->f_fglob->fg_offset;
7032	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7033	uio_addiov(auio, bufp, bufsize);
7034
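	/*
	 * Do the read in the requested format: the extended path repacks
	 * legacy dirents via vnode_readdir64(), the legacy path calls
	 * VNOP_READDIR() directly.  Either way, advance the file offset to
	 * wherever the read left off.
	 */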
7035	if (flags & VNODE_READDIR_EXTENDED) {
7036		error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
7037		fp->f_fglob->fg_offset = uio_offset(auio);
7038	} else {
7039		error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
7040		fp->f_fglob->fg_offset = uio_offset(auio);
7041	}
7042	if (error) {
7043		(void)vnode_put(vp);
7044		goto out;
7045	}
7046
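	/*
	 * If nothing was returned (the caller's buffer is untouched), we may
	 * have hit the end of the top layer of a union mount: drop down to
	 * the covered directory, reset the offset, and retry the read there.
	 */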
7047	if ((user_ssize_t)bufsize == uio_resid(auio)) {
7048		if (union_dircheckp) {
7049			error = union_dircheckp(&vp, fp, &context);
7050			if (error == -1)
7051				goto unionread;
7052			if (error)
7053				goto out;
7054		}
7055
7056		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
7057			struct vnode *tvp = vp;
7058			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
7059				vnode_ref(vp);
7060				fp->f_fglob->fg_data = (caddr_t) vp;
7061				fp->f_fglob->fg_offset = 0;
7062				vnode_rele(tvp);
7063				vnode_put(tvp);
7064				goto unionread;
7065			}
7066			vp = tvp;
7067		}
7068	}
7069
7070	vnode_put(vp);
7071	if (offset) {
7072		*offset = loff;
7073	}
7074
7075	*bytesread = bufsize - uio_resid(auio);
7076out:
7077	file_drop(fd);
7078	return (error);
7079}
7080
7081
7082int
7083getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
7084{
7085	off_t offset;
7086	ssize_t bytesread;
7087	int error;
7088
7089	AUDIT_ARG(fd, uap->fd);
7090	error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
7091
7092	if (error == 0) {
7093		if (proc_is64bit(p)) {
7094			user64_long_t base = (user64_long_t)offset;
7095			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
7096		} else {
7097			user32_long_t base = (user32_long_t)offset;
7098			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
7099		}
7100		*retval = bytesread;
7101	}
7102	return (error);
7103}
7104
7105int
7106getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
7107{
7108	off_t offset;
7109	ssize_t bytesread;
7110	int error;
7111
7112	AUDIT_ARG(fd, uap->fd);
7113	error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
7114
7115	if (error == 0) {
7116		*retval = bytesread;
7117		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
7118	}
7119	return (error);
7120}
7121
7122
7123/*
7124 * Set the mode mask for creation of filesystem nodes.
7125 * XXX implement xsecurity
7126 */
7127#define UMASK_NOXSECURITY	 (void *)1	/* leave existing xsecurity alone */
7128static int
7129umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
7130{
7131	struct filedesc *fdp;
7132
7133	AUDIT_ARG(mask, newmask);
7134	proc_fdlock(p);
7135	fdp = p->p_fd;
7136	*retval = fdp->fd_cmask;
7137	fdp->fd_cmask = newmask & ALLPERMS;
7138	proc_fdunlock(p);
7139	return (0);
7140}
7141
7142/*
7143 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
7144 *
7145 * Parameters:    p                       Process requesting to set the umask
7146 *                uap                     User argument descriptor (see below)
7147 *                retval                  umask of the process (parameter p)
7148 *
7149 * Indirect:      uap->newmask            umask to set
7150 *                uap->xsecurity          ACL to set
7151 *
7152 * Returns:        0                      Success
7153 *                !0                      Not success
7154 *
7155 */
7156int
7157umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
7158{
7159	int ciferror;
7160	kauth_filesec_t xsecdst;
7161
7162	xsecdst = KAUTH_FILESEC_NONE;
7163	if (uap->xsecurity != USER_ADDR_NULL) {
7164		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
7165			return ciferror;
7166	} else {
7167		xsecdst = KAUTH_FILESEC_NONE;
7168	}
7169
7170	ciferror = umask1(p, uap->newmask, xsecdst, retval);
7171
7172	if (xsecdst != KAUTH_FILESEC_NONE)
7173		kauth_filesec_free(xsecdst);
7174	return ciferror;
7175}
7176
7177int
7178umask(proc_t p, struct umask_args *uap, int32_t *retval)
7179{
7180	return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
7181}
7182
7183/*
7184 * Void all references to file by ripping underlying filesystem
7185 * away from vnode.
7186 */
7187/* ARGSUSED */
7188int
7189revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
7190{
7191	vnode_t vp;
7192	struct vnode_attr va;
7193	vfs_context_t ctx = vfs_context_current();
7194	int error;
7195	struct nameidata nd;
7196
7197	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
7198	       uap->path, ctx);
7199	error = namei(&nd);
7200	if (error)
7201		return (error);
7202	vp = nd.ni_vp;
7203
7204	nameidone(&nd);
7205
7206	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
7207		error = ENOTSUP;
7208		goto out;
7209	}
7210
7211	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
7212		error = EBUSY;
7213		goto out;
7214	}
7215
7216#if CONFIG_MACF
7217	error = mac_vnode_check_revoke(ctx, vp);
7218	if (error)
7219		goto out;
7220#endif
7221
7222	VATTR_INIT(&va);
7223	VATTR_WANTED(&va, va_uid);
7224	if ((error = vnode_getattr(vp, &va, ctx)))
7225		goto out;
7226	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
7227	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
7228		goto out;
7229	if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
7230		VNOP_REVOKE(vp, REVOKEALL, ctx);
7231out:
7232	vnode_put(vp);
7233	return (error);
7234}
7235
7236
7237/*
7238 *  HFS/HFS PLUS SPECIFIC SYSTEM CALLS
7239 *  The following system calls are designed to support features
7240 *  which are specific to the HFS & HFS Plus volume formats
7241 */
7242
7243
7244/*
7245 * Obtain attribute information on objects in a directory while enumerating
7246 * the directory.
7247 */
7248/* ARGSUSED */
7249int
7250getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
7251{
7252	vnode_t vp;
7253	struct fileproc *fp;
7254	uio_t auio = NULL;
7255	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7256	uint32_t count, savecount;
7257	uint32_t newstate;
7258	int error, eofflag;
7259	uint32_t loff;
7260	struct attrlist attributelist;
7261	vfs_context_t ctx = vfs_context_current();
7262	int fd = uap->fd;
7263	char uio_buf[ UIO_SIZEOF(1) ];
7264	kauth_action_t action;
7265
7266	AUDIT_ARG(fd, fd);
7267
7268	/* Get the attributes into kernel space */
7269	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
7270		return(error);
7271	}
7272	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
7273		return(error);
7274	}
7275	savecount = count;
7276	if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
7277		return (error);
7278	}
7279	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7280		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7281		error = EBADF;
7282		goto out;
7283	}
7284
7285
7286#if CONFIG_MACF
7287	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
7288	    fp->f_fglob);
7289	if (error)
7290		goto out;
7291#endif
7292
7293
7294	if ( (error = vnode_getwithref(vp)) )
7295		goto out;
7296
7297	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7298
7299unionread:
7300	if (vp->v_type != VDIR) {
7301		(void)vnode_put(vp);
7302		error = EINVAL;
7303		goto out;
7304	}
7305
7306#if CONFIG_MACF
7307	error = mac_vnode_check_readdir(ctx, vp);
7308	if (error != 0) {
7309		(void)vnode_put(vp);
7310		goto out;
7311	}
7312#endif /* MAC */
7313
7314	/* set up the uio structure which will contain the users return buffer */
7315	loff = fp->f_fglob->fg_offset;
7316	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7317	uio_addiov(auio, uap->buffer, uap->buffersize);
7318
7319	/*
7320	 * If the only item requested is file names, we can let that past with
7321	 * just LIST_DIRECTORY.  If they want any other attributes, that means
7322	 * they need SEARCH as well.
7323	 */
7324	action = KAUTH_VNODE_LIST_DIRECTORY;
7325	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
7326	    attributelist.fileattr || attributelist.dirattr)
7327		action |= KAUTH_VNODE_SEARCH;
7328
7329	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
7330
7331		/* Believe it or not, uap->options only has 32 bits of valid
7332		 * info, so truncate before extending again */
7333
7334		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
7335				(u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
7336	}
7337
7338	if (error) {
7339		(void) vnode_put(vp);
7340		goto out;
7341	}
7342
7343	/*
7344	 * If we've got the last entry of a directory in a union mount
7345	 * then reset the eofflag and pretend there's still more to come.
7346	 * The next call will again set eofflag and the buffer will be empty,
7347	 * so traverse to the underlying directory and do the directory
7348	 * read there.
7349	 */
7350	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
7351		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
7352			eofflag = 0;
7353		} else {						// Empty buffer
7354			struct vnode *tvp = vp;
7355			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
7356				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
7357				fp->f_fglob->fg_data = (caddr_t) vp;
7358				fp->f_fglob->fg_offset = 0; // reset index for new dir
7359				count = savecount;
7360				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
7361				vnode_put(tvp);
7362				goto unionread;
7363			}
7364			vp = tvp;
7365		}
7366	}
7367
7368	(void)vnode_put(vp);
7369
7370	if (error)
7371		goto out;
7372	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
7373
7374	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
7375		goto out;
7376	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
7377		goto out;
7378	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
7379		goto out;
7380
7381	*retval = eofflag;  /* similar to getdirentries */
7382	error = 0;
7383out:
7384	file_drop(fd);
7385	return (error); /* return error from earlier, a retval of 0 or 1 now */
7386
7387} /* end of getdirentriesattr system call */
7388
7389/*
7390 * Exchange data between two files
7391 */
7392
7393/* ARGSUSED */
7394int
7395exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
7396{
7397
7398	struct nameidata fnd, snd;
7399	vfs_context_t ctx = vfs_context_current();
7400	vnode_t fvp;
7401	vnode_t svp;
7402	int error;
7403	u_int32_t nameiflags;
7404	char *fpath = NULL;
7405	char *spath = NULL;
7406	int   flen=0, slen=0;
7407	int from_truncated=0, to_truncated=0;
7408#if CONFIG_FSE
7409	fse_info f_finfo, s_finfo;
7410#endif
7411
7412	nameiflags = 0;
7413	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
7414
7415	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
7416	       UIO_USERSPACE, uap->path1, ctx);
7417
7418	error = namei(&fnd);
7419	if (error)
7420		goto out2;
7421
7422	nameidone(&fnd);
7423	fvp = fnd.ni_vp;
7424
7425	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
7426               UIO_USERSPACE, uap->path2, ctx);
7427
7428	error = namei(&snd);
7429	if (error) {
7430		vnode_put(fvp);
7431		goto out2;
7432	}
7433	nameidone(&snd);
7434	svp = snd.ni_vp;
7435
7436	/*
7437	 * if the files are the same, return an inval error
7438	 */
7439	if (svp == fvp) {
7440		error = EINVAL;
7441		goto out;
7442	}
7443
7444	/*
7445	 * if the files are on different volumes, return an error
7446	 */
7447	if (svp->v_mount != fvp->v_mount) {
7448	        error = EXDEV;
7449		goto out;
7450	}
7451
7452	/* If they're not files, return an error */
7453	if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
7454		error = EINVAL;
7455		goto out;
7456	}
7457
7458#if CONFIG_MACF
7459	error = mac_vnode_check_exchangedata(ctx,
7460	    fvp, svp);
7461	if (error)
7462		goto out;
7463#endif
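	/* the caller must be able to both read and write each of the two files */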
7464	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
7465	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
7466		goto out;
7467
7468	if (
7469#if CONFIG_FSE
7470	need_fsevent(FSE_EXCHANGE, fvp) ||
7471#endif
7472	kauth_authorize_fileop_has_listeners()) {
7473		GET_PATH(fpath);
7474		GET_PATH(spath);
7475		if (fpath == NULL || spath == NULL) {
7476			error = ENOMEM;
7477			goto out;
7478		}
7479
7480		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
7481		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
7482
7483#if CONFIG_FSE
7484		get_fse_info(fvp, &f_finfo, ctx);
7485		get_fse_info(svp, &s_finfo, ctx);
7486		if (from_truncated || to_truncated) {
7487			// set it here since only the f_finfo gets reported up to user space
7488			f_finfo.mode |= FSE_TRUNCATED_PATH;
7489		}
7490#endif
7491	}
7492	/* Ok, make the call */
7493	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
7494
7495	if (error == 0) {
7496	    const char *tmpname;
7497
7498	    if (fpath != NULL && spath != NULL) {
7499	            /* call out to allow 3rd party notification of exchangedata.
7500		     * Ignore result of kauth_authorize_fileop call.
7501		     */
7502	            kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
7503					   (uintptr_t)fpath, (uintptr_t)spath);
7504	    }
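	    /*
	     * The file bodies have been exchanged, so swap the cached names
	     * and parent pointers as well to keep the name cache consistent.
	     */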
7505	    name_cache_lock();
7506
7507	    tmpname     = fvp->v_name;
7508	    fvp->v_name = svp->v_name;
7509	    svp->v_name = tmpname;
7510
7511	    if (fvp->v_parent != svp->v_parent) {
7512		vnode_t tmp;
7513
7514		tmp           = fvp->v_parent;
7515		fvp->v_parent = svp->v_parent;
7516		svp->v_parent = tmp;
7517	    }
7518	    name_cache_unlock();
7519
7520#if CONFIG_FSE
7521	    if (fpath != NULL && spath != NULL) {
7522	            add_fsevent(FSE_EXCHANGE, ctx,
7523				FSE_ARG_STRING, flen, fpath,
7524				FSE_ARG_FINFO, &f_finfo,
7525				FSE_ARG_STRING, slen, spath,
7526				FSE_ARG_FINFO, &s_finfo,
7527				FSE_ARG_DONE);
7528	    }
7529#endif
7530	}
7531
7532out:
7533	if (fpath != NULL)
7534	        RELEASE_PATH(fpath);
7535	if (spath != NULL)
7536	        RELEASE_PATH(spath);
7537	vnode_put(svp);
7538	vnode_put(fvp);
7539out2:
7540        return (error);
7541}
7542
7543/*
7544 * Return (in MB) the amount of freespace on the given vnode's volume.
7545 */
7546uint32_t freespace_mb(vnode_t vp);
7547
7548uint32_t
7549freespace_mb(vnode_t vp)
7550{
7551	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
7552 	return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
7553 	        vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
7554}
7555
7556#if CONFIG_SEARCHFS
7557
7558/* ARGSUSED */
7559
7560int
7561searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
7562{
7563	vnode_t vp, tvp;
7564	int i, error=0;
7565	int fserror = 0;
7566	struct nameidata nd;
7567	struct user64_fssearchblock searchblock;
7568	struct searchstate *state;
7569	struct attrlist *returnattrs;
7570	struct timeval timelimit;
7571	void *searchparams1,*searchparams2;
7572	uio_t auio = NULL;
7573	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7574	uint32_t nummatches;
7575	int mallocsize;
7576	uint32_t nameiflags;
7577	vfs_context_t ctx = vfs_context_current();
7578	char uio_buf[ UIO_SIZEOF(1) ];
7579
7580	/* Start by copying in fsearchblock parameter list */
7581    if (IS_64BIT_PROCESS(p)) {
7582        error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
7583        timelimit.tv_sec = searchblock.timelimit.tv_sec;
7584        timelimit.tv_usec = searchblock.timelimit.tv_usec;
7585    }
7586    else {
7587        struct user32_fssearchblock tmp_searchblock;
7588
7589        error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
7590        // munge into 64-bit version
7591        searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
7592        searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
7593        searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
7594        searchblock.maxmatches = tmp_searchblock.maxmatches;
7595		/*
7596		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
7597		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
7598		 */
7599        timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
7600        timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
7601        searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
7602        searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
7603        searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
7604        searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
7605        searchblock.searchattrs = tmp_searchblock.searchattrs;
7606    }
7607	if (error)
7608		return(error);
7609
7610	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
7611	 */
7612	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
7613		searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
7614		return(EINVAL);
7615
7616	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
7617	/* It all has to go into local memory and it's not that big, so we might as well put it all together.  */
7618	/* Searchparams1 shall be first, so we might as well use that to hold the base address of the allocated */
7619	/* block.                                                                                                */
7620
7621	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
7622		      sizeof(struct attrlist) + sizeof(struct searchstate);
7623
7624	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
7625
7626	/* Now set up the various pointers to the correct place in our newly allocated memory */
7627
7628	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
7629	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
7630	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
7631
7632	/* Now copy in the stuff given our local variables. */
7633
7634	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
7635		goto freeandexit;
7636
7637	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
7638		goto freeandexit;
7639
7640	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
7641		goto freeandexit;
7642
7643	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
7644		goto freeandexit;
7645
7646	/*
7647	 * When searching a union mount, need to set the
7648	 * start flag at the first call on each layer to
7649	 * reset state for the new volume.
7650	 */
7651	if (uap->options & SRCHFS_START)
7652		state->ss_union_layer = 0;
7653	else
7654		uap->options |= state->ss_union_flags;
7655	state->ss_union_flags = 0;
7656
7657	/*
7658	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
7659	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
7660	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
7661	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
7662	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
7663	 */
7664
7665	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
7666		attrreference_t* string_ref;
7667		u_int32_t* start_length;
7668		user64_size_t param_length;
7669
7670		/* validate searchparams1 */
7671		param_length = searchblock.sizeofsearchparams1;
7672		/* skip the word that specifies length of the buffer */
7673		start_length= (u_int32_t*) searchparams1;
7674		start_length= start_length+1;
7675		string_ref= (attrreference_t*) start_length;
7676
7677		/* ensure no negative offsets or too big offsets */
7678		if (string_ref->attr_dataoffset < 0 ) {
7679			error = EINVAL;
7680			goto freeandexit;
7681		}
7682		if (string_ref->attr_length > MAXPATHLEN) {
7683			error = EINVAL;
7684			goto freeandexit;
7685		}
7686
7687		/* Check for pointer overflow in the string ref */
7688		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
7689			error = EINVAL;
7690			goto freeandexit;
7691		}
7692
7693		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
7694			error = EINVAL;
7695			goto freeandexit;
7696		}
7697		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
7698			error = EINVAL;
7699			goto freeandexit;
7700		}
7701	}
7702
7703	/* set up the uio structure which will contain the users return buffer */
7704	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7705 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
7706
7707	nameiflags = 0;
7708	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
7709	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
7710	       UIO_USERSPACE, uap->path, ctx);
7711
7712	error = namei(&nd);
7713	if (error)
7714		goto freeandexit;
7715	vp = nd.ni_vp;
7716	nameidone(&nd);
7717
7718	/*
7719	 * Switch to the root vnode for the volume
7720	 */
7721	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
7722	if (error)
7723		goto freeandexit;
7724	vnode_put(vp);
7725	vp = tvp;
7726
7727	/*
7728	 * If it's a union mount, the path lookup takes
7729	 * us to the top layer. But we may need to descend
7730	 * to a lower layer. For non-union mounts the layer
7731	 * is always zero.
7732	 */
7733	for (i = 0; i < (int) state->ss_union_layer; i++) {
7734		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
7735			break;
7736		tvp = vp;
7737		vp = vp->v_mount->mnt_vnodecovered;
7738		if (vp == NULL) {
7739			vp = tvp;
7740			error = ENOENT;
7741			goto freeandexit;
7742		}
7743		vnode_getwithref(vp);
7744		vnode_put(tvp);
7745	}
7746
7747#if CONFIG_MACF
7748	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
7749	if (error) {
7750		vnode_put(vp);
7751		goto freeandexit;
7752	}
7753#endif
7754
7755
7756	/*
7757	 * If searchblock.maxmatches == 0, then skip the search. This has happened
7758	 * before, and sometimes the underlying code doesn't deal with it well.
7759	 */
7760	 if (searchblock.maxmatches == 0) {
7761		nummatches = 0;
7762		goto saveandexit;
7763	 }
7764
7765	/*
7766	 * All right, we have everything we need, so let's make the call.
7767	 *
7768	 * We keep special track of the return value from the file system:
7769	 * EAGAIN is an acceptable error condition that shouldn't keep us
7770	 * from copying out any results...
7771	 */
7772
7773	fserror = VNOP_SEARCHFS(vp,
7774		searchparams1,
7775		searchparams2,
7776		&searchblock.searchattrs,
7777		(u_long)searchblock.maxmatches,
7778		&timelimit,
7779		returnattrs,
7780		&nummatches,
7781		(u_long)uap->scriptcode,
7782		(u_long)uap->options,
7783		auio,
7784		(struct searchstate *) &state->ss_fsstate,
7785		ctx);
7786
7787	/*
7788	 * If it's a union mount we need to be called again
7789	 * to search the mounted-on filesystem.
7790	 */
7791	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
7792		state->ss_union_flags = SRCHFS_START;
7793		state->ss_union_layer++;	// search next layer down
7794		fserror = EAGAIN;
7795	}
7796
7797saveandexit:
7798
7799	vnode_put(vp);
7800
7801	/* Now copy out the stuff that needs copying out. That means the number of matches and the
7802	   search state.  Everything else was already put into the return buffer by the VOP call. */
7803
7804	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
7805		goto freeandexit;
7806
7807	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
7808		goto freeandexit;
7809
7810	error = fserror;
7811
7812freeandexit:
7813
7814	FREE(searchparams1,M_TEMP);
7815
7816	return(error);
7817
7818
7819} /* end of searchfs system call */
7820
7821#else /* CONFIG_SEARCHFS */
7822
7823int
7824searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
7825{
7826	return (ENOTSUP);
7827}
7828
7829#endif /* CONFIG_SEARCHFS */
7830
7831
7832lck_grp_attr_t *  nspace_group_attr;
7833lck_attr_t *      nspace_lock_attr;
7834lck_grp_t *       nspace_mutex_group;
7835
7836lck_mtx_t         nspace_handler_lock;
7837lck_mtx_t         nspace_handler_exclusion_lock;
7838
7839time_t snapshot_timestamp=0;
7840int nspace_allow_virtual_devs=0;
7841
7842void nspace_handler_init(void);
7843
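/*
 * Book-keeping for one pending namespace event: the vnode (and vnode id) it
 * refers to, the requested operation, and the token/flags/refcount used to
 * track the item while a user-space handler services it.
 */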
7844typedef struct nspace_item_info {
7845	struct vnode *vp;
7846	void         *arg;
7847	uint64_t      op;
7848	uint32_t      vid;
7849	uint32_t      flags;
7850	uint32_t      token;
7851	uint32_t      refcount;
7852} nspace_item_info;
7853
7854#define MAX_NSPACE_ITEMS   128
7855nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
7856uint32_t      nspace_item_idx=0;              // also used as the sleep/wakeup rendezvous address
7857uint32_t      nspace_token_id=0;
7858uint32_t      nspace_handler_timeout = 15;    // seconds
7859
7860#define NSPACE_ITEM_NEW         0x0001
7861#define NSPACE_ITEM_PROCESSING  0x0002
7862#define NSPACE_ITEM_DEAD        0x0004
7863#define NSPACE_ITEM_CANCELLED   0x0008
7864#define NSPACE_ITEM_DONE        0x0010
7865#define NSPACE_ITEM_RESET_TIMER 0x0020
7866
7867#define NSPACE_ITEM_NSPACE_EVENT   0x0040
7868#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
7869#define NSPACE_ITEM_TRACK_EVENT    0x0100
7870
7871#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT | NSPACE_ITEM_TRACK_EVENT)
7872
7873//#pragma optimization_level 0
7874
7875typedef enum {
7876	NSPACE_HANDLER_NSPACE = 0,
7877	NSPACE_HANDLER_SNAPSHOT = 1,
7878	NSPACE_HANDLER_TRACK = 2,
7879
7880	NSPACE_HANDLER_COUNT,
7881} nspace_type_t;
7882
7883typedef struct {
7884	uint64_t handler_tid;
7885	struct proc *handler_proc;
7886	int handler_busy;
7887} nspace_handler_t;
7888
7889nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
7890
7891/* namespace fsctl functions */
7892static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
7893static int nspace_item_flags_for_type(nspace_type_t nspace_type);
7894static int nspace_open_flags_for_type(nspace_type_t nspace_type);
7895static nspace_type_t nspace_type_for_op(uint64_t op);
7896static int nspace_is_special_process(struct proc *proc);
7897static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
7898static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
7899static int validate_namespace_args (int is64bit, int size);
7900static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
7901
7902
7903static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
7904{
7905	switch(nspace_type) {
7906		case NSPACE_HANDLER_NSPACE:
7907			return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
7908		case NSPACE_HANDLER_SNAPSHOT:
7909			return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
7910		case NSPACE_HANDLER_TRACK:
7911			return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_TRACK_EVENT;
7912		default:
7913			printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
7914			return 0;
7915	}
7916}
7917
7918static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
7919{
7920	switch(nspace_type) {
7921		case NSPACE_HANDLER_NSPACE:
7922			return NSPACE_ITEM_NSPACE_EVENT;
7923		case NSPACE_HANDLER_SNAPSHOT:
7924			return NSPACE_ITEM_SNAPSHOT_EVENT;
7925		case NSPACE_HANDLER_TRACK:
7926			return NSPACE_ITEM_TRACK_EVENT;
7927		default:
7928			printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
7929			return 0;
7930	}
7931}
7932
7933static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
7934{
7935	switch(nspace_type) {
7936		case NSPACE_HANDLER_NSPACE:
7937			return FREAD | FWRITE | O_EVTONLY;
7938		case NSPACE_HANDLER_SNAPSHOT:
7939		case NSPACE_HANDLER_TRACK:
7940			return FREAD | O_EVTONLY;
7941		default:
7942			printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
7943			return 0;
7944	}
7945}
7946
7947static inline nspace_type_t nspace_type_for_op(uint64_t op)
7948{
7949	switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
7950		case NAMESPACE_HANDLER_NSPACE_EVENT:
7951			return NSPACE_HANDLER_NSPACE;
7952		case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
7953			return NSPACE_HANDLER_SNAPSHOT;
7954		case NAMESPACE_HANDLER_TRACK_EVENT:
7955			return NSPACE_HANDLER_TRACK;
7956		default:
7957			printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
7958			return NSPACE_HANDLER_NSPACE;
7959	}
7960}
7961
7962static inline int nspace_is_special_process(struct proc *proc)
7963{
7964	int i;
7965	for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
7966		if (proc == nspace_handlers[i].handler_proc)
7967			return 1;
7968	}
7969	return 0;
7970}
7971
7972void
7973nspace_handler_init(void)
7974{
7975	nspace_lock_attr    = lck_attr_alloc_init();
7976	nspace_group_attr   = lck_grp_attr_alloc_init();
7977	nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
7978	lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
7979	lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
7980	memset(&nspace_items[0], 0, sizeof(nspace_items));
7981}
7982
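/*
 * Called on process exit.  If the exiting process had registered itself
 * as one of the namespace handlers, forget about it and wake up every
 * thread still blocked on an event of the corresponding type so that no
 * one waits forever on a handler that has gone away.
 */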
7983void
7984nspace_proc_exit(struct proc *p)
7985{
7986	int i, event_mask = 0;
7987
7988	for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
7989		if (p == nspace_handlers[i].handler_proc) {
7990			event_mask |= nspace_item_flags_for_type(i);
7991			nspace_handlers[i].handler_tid = 0;
7992			nspace_handlers[i].handler_proc = NULL;
7993		}
7994	}
7995
7996	if (event_mask == 0) {
7997		return;
7998	}
7999
8000	if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
8001		// if this process was the snapshot handler, clear snapshot_timestamp
8002		snapshot_timestamp = 0;
8003	}
8004
8005	//
8006	// unblock anyone that's waiting for the handler that died
8007	//
8008	lck_mtx_lock(&nspace_handler_lock);
8009	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8010		if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
8011
8012			if ( nspace_items[i].flags & event_mask ) {
8013
8014				if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
8015					vnode_lock_spin(nspace_items[i].vp);
8016					nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8017					vnode_unlock(nspace_items[i].vp);
8018				}
8019				nspace_items[i].vp = NULL;
8020				nspace_items[i].vid = 0;
8021				nspace_items[i].flags = NSPACE_ITEM_DONE;
8022				nspace_items[i].token = 0;
8023
8024				wakeup((caddr_t)&(nspace_items[i].vp));
8025			}
8026		}
8027	}
8028
8029	wakeup((caddr_t)&nspace_item_idx);
8030	lck_mtx_unlock(&nspace_handler_lock);
8031}
8032
8033
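/*
 * resolve_nspace_item / resolve_nspace_item_ext
 *
 * Queue a namespace event for the given vnode and block until the
 * user-space handler marks the item done or cancelled, or until the
 * request times out.  Returns 0 for vnodes/ops that need no handling,
 * EDEADLK if called from a handler process itself, and otherwise the
 * result of the wait.
 */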
8034int
8035resolve_nspace_item(struct vnode *vp, uint64_t op)
8036{
8037	return resolve_nspace_item_ext(vp, op, NULL);
8038}
8039
8040int
8041resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
8042{
8043	int i, error, keep_waiting;
8044	struct timespec ts;
8045	nspace_type_t nspace_type = nspace_type_for_op(op);
8046
8047	// only allow namespace events on regular files, directories and symlinks.
8048	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
8049		return 0;
8050	}
8051
8052	//
8053	// if this is a snapshot event and the vnode is on a
8054	// disk image just pretend nothing happened since any
8055	// change to the disk image will cause the disk image
8056	// itself to get backed up and this avoids multi-way
8057	// deadlocks between the snapshot handler and the ever
8058	// popular diskimages-helper process.  the variable
8059	// nspace_allow_virtual_devs allows this behavior to
8060	// be overridden (for use by the Mobile TimeMachine
8061	// testing infrastructure which uses disk images)
8062	//
8063	if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
8064	    && (vp->v_mount != NULL)
8065	    && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
8066	    && !nspace_allow_virtual_devs) {
8067
8068		return 0;
8069	}
8070
8071	// if (thread_tid(current_thread()) == namespace_handler_tid) {
8072	if (nspace_handlers[nspace_type].handler_proc == NULL) {
8073		return 0;
8074	}
8075
8076	if (nspace_is_special_process(current_proc())) {
8077		return EDEADLK;
8078	}
8079
8080	lck_mtx_lock(&nspace_handler_lock);
8081
8082retry:
8083	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8084		if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
8085			break;
8086		}
8087	}
8088
8089	if (i >= MAX_NSPACE_ITEMS) {
8090		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8091			if (nspace_items[i].flags == 0) {
8092				break;
8093			}
8094		}
8095	} else {
8096		nspace_items[i].refcount++;
8097	}
8098
8099	if (i >= MAX_NSPACE_ITEMS) {
8100		ts.tv_sec = nspace_handler_timeout;
8101		ts.tv_nsec = 0;
8102
8103		error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
8104		if (error == 0) {
8105			// an entry got free'd up, go see if we can get a slot
8106			goto retry;
8107		} else {
8108			lck_mtx_unlock(&nspace_handler_lock);
8109			return error;
8110		}
8111	}
8112
8113	//
8114	// if it didn't already exist, add it.  if it did exist
8115	// we'll get woken up when someone does a wakeup() on
8116	// the slot in the nspace_items table.
8117	//
8118	if (vp != nspace_items[i].vp) {
8119		nspace_items[i].vp = vp;
8120		nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
8121		nspace_items[i].op = op;
8122		nspace_items[i].vid = vnode_vid(vp);
8123		nspace_items[i].flags = NSPACE_ITEM_NEW;
8124		nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
8125		if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
8126			if (arg) {
8127				vnode_lock_spin(vp);
8128				vp->v_flag |= VNEEDSSNAPSHOT;
8129				vnode_unlock(vp);
8130			}
8131		}
8132
8133		nspace_items[i].token = 0;
8134		nspace_items[i].refcount = 1;
8135
8136		wakeup((caddr_t)&nspace_item_idx);
8137	}
8138
8139	//
8140	// Now go to sleep until the handler does a wakeup on this
8141	// slot in the nspace_items table (or we timeout).
8142	//
8143	keep_waiting = 1;
8144	while(keep_waiting) {
8145		ts.tv_sec = nspace_handler_timeout;
8146		ts.tv_nsec = 0;
8147		error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
8148
8149		if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
8150			error = 0;
8151		} else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
8152			error = nspace_items[i].token;
8153		} else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
8154			if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
8155				nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
8156				continue;
8157			} else {
8158				error = ETIMEDOUT;
8159			}
8160		} else if (error == 0) {
8161			// hmmm, why did we get woken up?
8162			printf("woken up for token %u but it's not done, cancelled or timed out and error == 0.\n",
8163			       nspace_items[i].token);
8164		}
8165
8166		if (--nspace_items[i].refcount == 0) {
8167			nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
8168			nspace_items[i].arg = NULL;
8169			nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
8170			nspace_items[i].flags = 0;     // this clears it for re-use
8171		}
8172		wakeup(&nspace_token_id);
8173		keep_waiting = 0;
8174	}
8175
8176	lck_mtx_unlock(&nspace_handler_lock);
8177
8178	return error;
8179}
8180
8181
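/*
 * Return the flags of the pending namespace item for a vnode, or ENOENT
 * if no item is currently queued for it.
 */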
8182int
8183get_nspace_item_status(struct vnode *vp, int32_t *status)
8184{
8185	int i;
8186
8187	lck_mtx_lock(&nspace_handler_lock);
8188	for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8189		if (nspace_items[i].vp == vp) {
8190			break;
8191		}
8192	}
8193
8194	if (i >= MAX_NSPACE_ITEMS) {
8195		lck_mtx_unlock(&nspace_handler_lock);
8196		return ENOENT;
8197	}
8198
8199	*status = nspace_items[i].flags;
8200	lck_mtx_unlock(&nspace_handler_lock);
8201	return 0;
8202}
8203
8204
8205#if 0
8206static int
8207build_volfs_path(struct vnode *vp, char *path, int *len)
8208{
8209	struct vnode_attr va;
8210	int ret;
8211
8212	VATTR_INIT(&va);
8213	VATTR_WANTED(&va, va_fsid);
8214	VATTR_WANTED(&va, va_fileid);
8215
8216	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
8217		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
8218		ret = -1;
8219	} else {
8220		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
8221		ret = 0;
8222	}
8223
8224	return ret;
8225}
8226#endif
8227
8228//
8229// Note: this function does NOT check permissions on all of the
8230// parent directories leading to this vnode.  It should only be
8231// called on behalf of a root process.  Otherwise a process may
8232// get access to a file because the file itself is readable even
8233// though its parent directories would prevent access.
8234//
8235static int
8236vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
8237{
8238	int error, action;
8239
8240	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8241		return error;
8242	}
8243
8244#if CONFIG_MACF
8245	error = mac_vnode_check_open(ctx, vp, fmode);
8246	if (error)
8247		return error;
8248#endif
8249
8250	/* compute action to be authorized */
8251	action = 0;
8252	if (fmode & FREAD) {
8253		action |= KAUTH_VNODE_READ_DATA;
8254	}
8255	if (fmode & (FWRITE | O_TRUNC)) {
8256		/*
8257		 * If we are writing, appending, and not truncating,
8258		 * indicate that we are appending so that if the
8259		 * UF_APPEND or SF_APPEND bits are set, we do not deny
8260		 * the open.
8261		 */
8262		if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
8263			action |= KAUTH_VNODE_APPEND_DATA;
8264		} else {
8265			action |= KAUTH_VNODE_WRITE_DATA;
8266		}
8267	}
8268
8269	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
8270		return error;
8271
8272
8273	//
8274	// if the vnode is tagged VOPENEVT and the current process
8275	// has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
8276	// flag to the open mode so that this open won't count against
8277	// the vnode when carbon delete() does a vnode_isinuse() to see
8278	// if a file is currently in use.  this allows spotlight
8279	// importers to not interfere with carbon apps that depend on
8280	// the no-delete-if-busy semantics of carbon delete().
8281	//
8282	if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
8283		fmode |= O_EVTONLY;
8284	}
8285
8286	if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
8287		return error;
8288	}
8289	if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
8290		VNOP_CLOSE(vp, fmode, ctx);
8291		return error;
8292	}
8293
8294	/* Call out to allow 3rd party notification of open.
8295	 * Ignore result of kauth_authorize_fileop call.
8296	 */
8297#if CONFIG_MACF
8298	mac_vnode_notify_open(ctx, vp, fmode);
8299#endif
8300	kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
8301			       (uintptr_t)vp, 0);
8302
8303
8304	return 0;
8305}
8306
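/*
 * Called (via fsctl) by a user-space namespace/snapshot/tracked-file
 * handler.  Blocks until an item of the matching type is queued, then
 * opens the item's vnode, wires it to a new file descriptor, and copies
 * the token, flags and descriptor (plus any optional fields the caller's
 * structure supports) out to the handler.
 */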
8307static int
8308wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
8309{
8310	int i, error=0, unblock=0;
8311	task_t curtask;
8312
8313	lck_mtx_lock(&nspace_handler_exclusion_lock);
8314	if (nspace_handlers[nspace_type].handler_busy) {
8315		lck_mtx_unlock(&nspace_handler_exclusion_lock);
8316		return EBUSY;
8317	}
8318	nspace_handlers[nspace_type].handler_busy = 1;
8319	lck_mtx_unlock(&nspace_handler_exclusion_lock);
8320
8321	/*
8322	 * Any process that gets here will be one of the namespace handlers.
8323	 * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
8324	 * as we can cause deadlocks to occur, because the namespace handler may prevent
8325	 * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
8326	 * process.
8327	 */
8328	curtask = current_task();
8329	bsd_set_dependency_capable (curtask);
8330
8331	lck_mtx_lock(&nspace_handler_lock);
8332	if (nspace_handlers[nspace_type].handler_proc == NULL) {
8333		nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
8334		nspace_handlers[nspace_type].handler_proc = current_proc();
8335	}
8336
8337	while (error == 0) {
8338
8339		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8340			if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
8341				if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
8342					continue;
8343				}
8344				break;
8345			}
8346		}
8347
8348		if (i < MAX_NSPACE_ITEMS) {
8349			nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
8350			nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
8351			nspace_items[i].token  = ++nspace_token_id;
8352
8353			if (nspace_items[i].vp) {
8354				struct fileproc *fp;
8355				int32_t indx, fmode;
8356				struct proc *p = current_proc();
8357				vfs_context_t ctx = vfs_context_current();
8358				struct vnode_attr va;
8359
8360
8361				/*
8362				 * Use vnode pointer to acquire a file descriptor for
8363				 * hand-off to userland
8364				 */
8365				fmode = nspace_open_flags_for_type(nspace_type);
8366				error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
8367				if (error) {
8368					unblock = 1;
8369					break;
8370				}
8371				error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
8372				if (error) {
8373					unblock = 1;
8374					vnode_put(nspace_items[i].vp);
8375					break;
8376				}
8377
8378				if ((error = falloc(p, &fp, &indx, ctx))) {
8379					vn_close(nspace_items[i].vp, fmode, ctx);
8380					vnode_put(nspace_items[i].vp);
8381					unblock = 1;
8382					break;
8383				}
8384
8385				fp->f_fglob->fg_flag = fmode;
8386				fp->f_fglob->fg_ops = &vnops;
8387				fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
8388
8389				proc_fdlock(p);
8390				procfdtbl_releasefd(p, indx, NULL);
8391				fp_drop(p, indx, fp, 1);
8392				proc_fdunlock(p);
8393
8394				/*
8395				 * All variants of the namespace handler struct support these three fields:
8396				 * token, flags, and the FD pointer
8397				 */
8398				error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
8399				error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
8400				error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
8401
8402				/*
8403				 * Handle optional fields:
8404				 * the extended version supports an info ptr (offset, length), and
8405				 * the namedata version supports a unique per-link object ID.
8408				 */
8409				if (nhd->infoptr) {
8410					uio_t uio = (uio_t)nspace_items[i].arg;
8411					uint64_t u_offset, u_length;
8412
8413					if (uio) {
8414						u_offset = uio_offset(uio);
8415						u_length = uio_resid(uio);
8416					} else {
8417						u_offset = 0;
8418						u_length = 0;
8419					}
8420					error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
8421					error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t));
8422				}
8423
8424				if (nhd->objid) {
8425					VATTR_INIT(&va);
8426					VATTR_WANTED(&va, va_linkid);
8427					error = vnode_getattr(nspace_items[i].vp, &va, ctx);
8428					if (error == 0 ) {
8429						uint64_t linkid = 0;
8430						if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
8431							linkid = (uint64_t)va.va_linkid;
8432						}
8433						error = copyout (&linkid, nhd->objid, sizeof(uint64_t));
8434					}
8435				}
8436
8437				if (error) {
8438					vn_close(nspace_items[i].vp, fmode, ctx);
8439					fp_free(p, indx, fp);
8440					unblock = 1;
8441				}
8442
8443				vnode_put(nspace_items[i].vp);
8444
8445				break;
8446			} else {
8447				printf("wait_for_namespace_event: failed (nspace_items[%d].vp is NULL, error %d)\n",
8448				       i, error);
8449			}
8450
8451		} else {
8452			error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
8453			if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
8454				error = EINVAL;
8455				break;
8456			}
8457
8458		}
8459	}
8460
8461	if (unblock) {
8462		if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
8463			vnode_lock_spin(nspace_items[i].vp);
8464			nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8465			vnode_unlock(nspace_items[i].vp);
8466		}
8467		nspace_items[i].vp = NULL;
8468		nspace_items[i].vid = 0;
8469		nspace_items[i].flags = NSPACE_ITEM_DONE;
8470		nspace_items[i].token = 0;
8471
8472		wakeup((caddr_t)&(nspace_items[i].vp));
8473	}
8474
8475	if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
8476		// just go through every snapshot event and unblock it immediately.
8477		if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
8478			for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8479				if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
8480					if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
8481						nspace_items[i].vp = NULL;
8482						nspace_items[i].vid = 0;
8483						nspace_items[i].flags = NSPACE_ITEM_DONE;
8484						nspace_items[i].token = 0;
8485
8486						wakeup((caddr_t)&(nspace_items[i].vp));
8487					}
8488				}
8489			}
8490		}
8491	}
8492
8493	lck_mtx_unlock(&nspace_handler_lock);
8494
8495	lck_mtx_lock(&nspace_handler_exclusion_lock);
8496	nspace_handlers[nspace_type].handler_busy = 0;
8497	lck_mtx_unlock(&nspace_handler_exclusion_lock);
8498
8499	return error;
8500}
8501
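/* The fsctl argument must be exactly one of the known handler struct sizes. */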
8502static inline int validate_namespace_args (int is64bit, int size) {
8503
8504	if (is64bit) {
8505		/* Must be one of these */
8506		if (size == sizeof(user64_namespace_handler_info)) {
8507			goto sizeok;
8508		}
8509		if (size == sizeof(user64_namespace_handler_info_ext)) {
8510			goto sizeok;
8511		}
8512		if (size == sizeof(user64_namespace_handler_data)) {
8513			goto sizeok;
8514		}
8515		return EINVAL;
8516	}
8517	else {
8518		/* 32 bit -- must be one of these */
8519		if (size == sizeof(user32_namespace_handler_info)) {
8520			goto sizeok;
8521		}
8522		if (size == sizeof(user32_namespace_handler_info_ext)) {
8523			goto sizeok;
8524		}
8525		if (size == sizeof(user32_namespace_handler_data)) {
8526			goto sizeok;
8527		}
8528		return EINVAL;
8529	}
8530
8531sizeok:
8532
8533	return 0;
8534
8535}
8536
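/*
 * Common entry point for the namespace handler fsctls: check the caller
 * and the argument size, unpack the user-space pointers into a
 * namespace_handler_data, and then wait for the next matching event.
 */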
8537static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
8538{
8539	int error = 0;
8540	namespace_handler_data nhd;
8541
8542	bzero (&nhd, sizeof(namespace_handler_data));
8543
8544	if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
8545			(snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
8546		return EINVAL;
8547	}
8548
8549	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8550		return error;
8551	}
8552
8553	error = validate_namespace_args (is64bit, size);
8554	if (error) {
8555		return error;
8556	}
8557
8558	/* Copy in the userland pointers into our kernel-only struct */
8559
8560	if (is64bit) {
8561		/* 64 bit userland structures */
8562		nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
8563		nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
8564		nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
8565
8566		/* If the size is greater than the standard info struct, add in extra fields */
8567		if (size > (sizeof(user64_namespace_handler_info))) {
8568			if (size >= (sizeof(user64_namespace_handler_info_ext))) {
8569				nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
8570			}
8571			if (size == (sizeof(user64_namespace_handler_data))) {
8572				nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
8573			}
8574			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
8575		}
8576	}
8577	else {
8578		/* 32 bit userland structures */
8579		nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
8580		nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
8581		nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
8582
8583		if (size > (sizeof(user32_namespace_handler_info))) {
8584			if (size >= (sizeof(user32_namespace_handler_info_ext))) {
8585				nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
8586			}
8587			if (size == (sizeof(user32_namespace_handler_data))) {
8588				nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
8589			}
8590			/* Otherwise the fields were pre-zeroed when we did the bzero above. */
8591		}
8592	}
8593
8594	return wait_for_namespace_event(&nhd, nspace_type);
8595}
8596
8597/*
8598 * Make a filesystem-specific control call:
8599 */
8600/* ARGSUSED */
8601static int
8602fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
8603{
8604	int error=0;
8605	boolean_t is64bit;
8606	u_int size;
8607#define STK_PARAMS 128
8608	char stkbuf[STK_PARAMS];
8609	caddr_t data, memp;
8610	vnode_t vp = *arg_vp;
8611
8612	size = IOCPARM_LEN(cmd);
8613	if (size > IOCPARM_MAX) return (EINVAL);
8614
8615	is64bit = proc_is64bit(p);
8616
8617	memp = NULL;
8618	if (size > sizeof (stkbuf)) {
8619		if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
8620		data = memp;
8621	} else {
8622		data = &stkbuf[0];
8623	}
8624
8625	if (cmd & IOC_IN) {
8626		if (size) {
8627			error = copyin(udata, data, size);
8628			if (error) goto FSCtl_Exit;
8629		} else {
8630			if (is64bit) {
8631				*(user_addr_t *)data = udata;
8632			}
8633			else {
8634				*(uint32_t *)data = (uint32_t)udata;
8635			}
8636		}
8637	} else if ((cmd & IOC_OUT) && size) {
8638		/*
8639		 * Zero the buffer so the user always
8640		 * gets back something deterministic.
8641		 */
8642		bzero(data, size);
8643	} else if (cmd & IOC_VOID) {
8644		if (is64bit) {
8645			*(user_addr_t *)data = udata;
8646		}
8647		else {
8648			*(uint32_t *)data = (uint32_t)udata;
8649		}
8650	}
8651
8652	/* Check to see if it's a generic command */
8653	if (IOCBASECMD(cmd) == FSCTL_SYNC_VOLUME) {
8654		mount_t mp = vp->v_mount;
8655		int arg = *(uint32_t*)data;
8656
8657		/* record vid of vp so we can drop it below. */
8658		uint32_t vvid = vp->v_id;
8659
8660		/*
8661		 * Then grab mount_iterref so that we can release the vnode.
8662		 * Without this, a thread may call vnode_iterate_prepare then
8663		 * get into a deadlock because we've never released the root vp
8664		 */
8665		error = mount_iterref (mp, 0);
8666		if (error)  {
8667			goto FSCtl_Exit;
8668		}
8669		vnode_put(vp);
8670
8671		/* issue the sync for this volume */
8672		(void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
8673
8674		/*
8675		 * Then release the mount_iterref once we're done syncing; it's not
8676		 * needed for the VNOP_IOCTL below
8677		 */
8678		mount_iterdrop(mp);
8679
8680		if (arg & FSCTL_SYNC_FULLSYNC) {
8681			/* re-obtain vnode iocount on the root vp, if possible */
8682			error = vnode_getwithvid (vp, vvid);
8683			if (error == 0) {
8684				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
8685				vnode_put (vp);
8686			}
8687		}
8688		/* mark the argument VP as having been released */
8689		*arg_vp = NULL;
8690
8691	} else if (IOCBASECMD(cmd) == FSCTL_SET_PACKAGE_EXTS) {
8692		user_addr_t ext_strings;
8693		uint32_t    num_entries;
8694		uint32_t    max_width;
8695
8696		if (   (is64bit && size != sizeof(user64_package_ext_info))
8697		   || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
8698
8699			// either you're 64-bit and passed a 64-bit struct or
8700			// you're 32-bit and passed a 32-bit struct.  otherwise
8701			// it's not ok.
8702			error = EINVAL;
8703			goto FSCtl_Exit;
8704		}
8705
8706		if (is64bit) {
8707			ext_strings = ((user64_package_ext_info *)data)->strings;
8708			num_entries = ((user64_package_ext_info *)data)->num_entries;
8709			max_width   = ((user64_package_ext_info *)data)->max_width;
8710		} else {
8711			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
8712			num_entries = ((user32_package_ext_info *)data)->num_entries;
8713			max_width   = ((user32_package_ext_info *)data)->max_width;
8714		}
8715
8716		error = set_package_extensions_table(ext_strings, num_entries, max_width);
8717
8718
8719	}
8720
8721	/* namespace handlers */
8722	else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_GET) {
8723		error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
8724	}
8725
8726	/* Snapshot handlers */
8727	else if (IOCBASECMD(cmd) == FSCTL_OLD_SNAPSHOT_HANDLER_GET) {
8728		error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
8729	} else if (IOCBASECMD(cmd) == FSCTL_SNAPSHOT_HANDLER_GET_EXT) {
8730		error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
8731	}
8732
8733	/* Tracked File Handlers */
8734	else if (IOCBASECMD(cmd) == FSCTL_TRACKED_HANDLER_GET) {
8735		error = process_namespace_fsctl(NSPACE_HANDLER_TRACK, is64bit, size, data);
8736	}
8737	else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_GETDATA) {
8738		error = process_namespace_fsctl(NSPACE_HANDLER_TRACK, is64bit, size, data);
8739	} else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UPDATE) {
8740		uint32_t token, val;
8741		int i;
8742
8743		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
8744			goto FSCtl_Exit;
8745		}
8746
8747		if (!nspace_is_special_process(p)) {
8748			error = EINVAL;
8749			goto FSCtl_Exit;
8750		}
8751
8752		token = ((uint32_t *)data)[0];
8753		val   = ((uint32_t *)data)[1];
8754
8755		lck_mtx_lock(&nspace_handler_lock);
8756
8757		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8758			if (nspace_items[i].token == token) {
8759				break;
8760			}
8761		}
8762
8763		if (i >= MAX_NSPACE_ITEMS) {
8764			error = ENOENT;
8765		} else {
8766			//
8767			// if this bit is set, when resolve_nspace_item() times out
8768			// it will loop and go back to sleep.
8769			//
8770			nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
8771		}
8772
8773		lck_mtx_unlock(&nspace_handler_lock);
8774
8775		if (error) {
8776			printf("nspace-handler-update: did not find token %u\n", token);
8777		}
8778
8779	} else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UNBLOCK) {
8780		uint32_t token, val;
8781		int i;
8782
8783		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
8784			goto FSCtl_Exit;
8785		}
8786
8787		if (!nspace_is_special_process(p)) {
8788			error = EINVAL;
8789			goto FSCtl_Exit;
8790		}
8791
8792		token = ((uint32_t *)data)[0];
8793		val   = ((uint32_t *)data)[1];
8794
8795		lck_mtx_lock(&nspace_handler_lock);
8796
8797		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8798			if (nspace_items[i].token == token) {
8799				break;
8800			}
8801		}
8802
8803		if (i >= MAX_NSPACE_ITEMS) {
8804			printf("nspace-handler-unblock: did not find token %u\n", token);
8805			error = ENOENT;
8806		} else {
8807			if (val == 0 && nspace_items[i].vp) {
8808				vnode_lock_spin(nspace_items[i].vp);
8809				nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8810				vnode_unlock(nspace_items[i].vp);
8811			}
8812
8813			nspace_items[i].vp = NULL;
8814			nspace_items[i].arg = NULL;
8815			nspace_items[i].op = 0;
8816			nspace_items[i].vid = 0;
8817			nspace_items[i].flags = NSPACE_ITEM_DONE;
8818			nspace_items[i].token = 0;
8819
8820			wakeup((caddr_t)&(nspace_items[i].vp));
8821		}
8822
8823		lck_mtx_unlock(&nspace_handler_lock);
8824
8825	} else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_CANCEL) {
8826		uint32_t token, val;
8827		int i;
8828
8829		if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
8830			goto FSCtl_Exit;
8831		}
8832
8833		if (!nspace_is_special_process(p)) {
8834			error = EINVAL;
8835			goto FSCtl_Exit;
8836		}
8837
8838		token = ((uint32_t *)data)[0];
8839		val   = ((uint32_t *)data)[1];
8840
8841		lck_mtx_lock(&nspace_handler_lock);
8842
8843		for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8844			if (nspace_items[i].token == token) {
8845				break;
8846			}
8847		}
8848
8849		if (i >= MAX_NSPACE_ITEMS) {
8850			printf("nspace-handler-cancel: did not find token %u\n", token);
8851			error = ENOENT;
8852		} else {
8853			if (nspace_items[i].vp) {
8854				vnode_lock_spin(nspace_items[i].vp);
8855				nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8856				vnode_unlock(nspace_items[i].vp);
8857			}
8858
8859			nspace_items[i].vp = NULL;
8860			nspace_items[i].arg = NULL;
8861			nspace_items[i].vid = 0;
8862			nspace_items[i].token = val;
8863			nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
8864			nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
8865
8866			wakeup((caddr_t)&(nspace_items[i].vp));
8867		}
8868
8869		lck_mtx_unlock(&nspace_handler_lock);
8870	} else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) {
8871		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8872			goto FSCtl_Exit;
8873		}
8874
8875		// we explicitly do not do the namespace_handler_proc check here
8876
8877		lck_mtx_lock(&nspace_handler_lock);
8878		snapshot_timestamp = ((uint32_t *)data)[0];
8879		wakeup(&nspace_item_idx);
8880		lck_mtx_unlock(&nspace_handler_lock);
8881		printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
8882
8883	} else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) {
8884		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8885			goto FSCtl_Exit;
8886		}
8887
8888		lck_mtx_lock(&nspace_handler_lock);
8889		nspace_allow_virtual_devs = ((uint32_t *)data)[0];
8890		lck_mtx_unlock(&nspace_handler_lock);
8891		printf("nspace-snapshot-handler will%s allow events on disk-images\n",
8892		       nspace_allow_virtual_devs ? "" : " NOT");
8893		error = 0;
8894
8895	} else if (IOCBASECMD(cmd) == FSCTL_SET_FSTYPENAME_OVERRIDE) {
8896		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8897			goto FSCtl_Exit;
8898		}
8899		if (vp->v_mount) {
8900			mount_lock(vp->v_mount);
8901			if (data[0] != 0) {
8902				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
8903				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
8904				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
8905					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
8906					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
8907				}
8908			} else {
8909				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
8910					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
8911				}
8912				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
8913				vp->v_mount->fstypename_override[0] = '\0';
8914			}
8915			mount_unlock(vp->v_mount);
8916		}
8917	} else {
8918		/* Invoke the filesystem-specific code */
8919		error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
8920	}
8921
8922
8923	/*
8924	 * Copy any data to user, size was
8925	 * already set and checked above.
8926	 */
8927	if (error == 0 && (cmd & IOC_OUT) && size)
8928		error = copyout(data, udata, size);
8929
8930FSCtl_Exit:
8931	if (memp) kfree(memp, size);
8932
8933	return error;
8934}
8935
8936/* ARGSUSED */
8937int
8938fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
8939{
8940	int error;
8941	struct nameidata nd;
8942	u_long nameiflags;
8943	vnode_t vp = NULL;
8944	vfs_context_t ctx = vfs_context_current();
8945
8946	AUDIT_ARG(cmd, uap->cmd);
8947	AUDIT_ARG(value32, uap->options);
8948	/* Get the vnode for the file we are getting info on:  */
8949	nameiflags = 0;
8950	if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8951	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
8952	       UIO_USERSPACE, uap->path, ctx);
8953	if ((error = namei(&nd))) goto done;
8954	vp = nd.ni_vp;
8955	nameidone(&nd);
8956
8957#if CONFIG_MACF
8958	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
8959	if (error) {
8960		goto done;
8961	}
8962#endif
8963
8964	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
8965
8966done:
8967	if (vp)
8968		vnode_put(vp);
8969	return error;
8970}
8971/* ARGSUSED */
8972int
8973ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
8974{
8975	int error;
8976	vnode_t vp = NULL;
8977	vfs_context_t ctx = vfs_context_current();
8978	int fd = -1;
8979
8980	AUDIT_ARG(fd, uap->fd);
8981	AUDIT_ARG(cmd, uap->cmd);
8982	AUDIT_ARG(value32, uap->options);
8983
8984	/* Get the vnode for the file we are getting info on:  */
8985	if ((error = file_vnode(uap->fd, &vp)))
8986		goto done;
8987	fd = uap->fd;
8988	if ((error = vnode_getwithref(vp))) {
8989		goto done;
8990	}
8991
8992#if CONFIG_MACF
8993	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
8994	if (error) {
8995		goto done;
8996	}
8997#endif
8998
8999	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9000
9001done:
9002	if (fd != -1)
9003		file_drop(fd);
9004
9005	if (vp)
9006		vnode_put(vp);
9007	return error;
9008}
9009/* end of fsctl system call */
9010
9011/*
9012 * An in-kernel sync for power management to call.
9013 */
9014__private_extern__ int
9015sync_internal(void)
9016{
9017	int error;
9018
9019	struct sync_args data;
9020
9021	int retval[2];
9022
9023
9024	error = sync(current_proc(), &data, &retval[0]);
9025
9026
9027	return (error);
9028} /* end of sync_internal call */
9029
9030
9031/*
9032 *  Retrieve the data of an extended attribute.
9033 */
9034int
9035getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
9036{
9037	vnode_t vp;
9038	struct nameidata nd;
9039	char attrname[XATTR_MAXNAMELEN+1];
9040	vfs_context_t ctx = vfs_context_current();
9041	uio_t auio = NULL;
9042	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9043	size_t attrsize = 0;
9044	size_t namelen;
9045	u_int32_t nameiflags;
9046	int error;
9047	char uio_buf[ UIO_SIZEOF(1) ];
9048
9049	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9050		return (EINVAL);
9051
9052	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9053	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
9054	if ((error = namei(&nd))) {
9055		return (error);
9056	}
9057	vp = nd.ni_vp;
9058	nameidone(&nd);
9059
9060	if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen)) != 0) {
9061		goto out;
9062	}
9063	if (xattr_protected(attrname)) {
9064		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
9065			error = EPERM;
9066			goto out;
9067		}
9068	}
9069	/*
9070	 * the specific check for 0xffffffff is a hack to preserve
9071	 * binary compatibility in K64 with applications that discovered
9072	 * that passing in a buf pointer and a size of -1 resulted in
9073	 * just the size of the indicated extended attribute being returned.
9074	 * this isn't part of the documented behavior, but because of the
9075	 * original implementation's check for "uap->size > 0", this behavior
9076	 * was allowed. In K32 that check turned into a signed comparison
9077	 * even though uap->size is unsigned...  in K64, we blow by that
9078	 * check because uap->size is unsigned and doesn't get sign smeared
9079	 * in the munger for a 32 bit user app.  we also need to add a
9080	 * check to limit the maximum size of the buffer being passed in...
9081	 * unfortunately, the underlying filesystems seem to just malloc
9082	 * the requested size even if the actual extended attribute is tiny.
9083	 * because that malloc is for kernel wired memory, we have to put a
9084	 * sane limit on it.
9085	 *
9086	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
9087	 * U64 running on K64 will yield -1 (64 bits wide)
9088	 * U32/U64 running on K32 will yield -1 (32 bits wide)
9089	 */
9090	if (uap->size == 0xffffffff || uap->size == (size_t)-1)
9091		goto no_uio;
9092
9093	if (uap->value) {
9094		if (uap->size > (size_t)XATTR_MAXSIZE)
9095			uap->size = XATTR_MAXSIZE;
9096
9097		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9098		                            &uio_buf[0], sizeof(uio_buf));
9099		uio_addiov(auio, uap->value, uap->size);
9100	}
9101no_uio:
9102	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
9103out:
9104	vnode_put(vp);
9105
9106	if (auio) {
9107		*retval = uap->size - uio_resid(auio);
9108	} else {
9109		*retval = (user_ssize_t)attrsize;
9110	}
9111
9112	return (error);
9113}
9114
9115/*
9116 * Retrieve the data of an extended attribute.
9117 */
9118int
9119fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
9120{
9121	vnode_t vp;
9122	char attrname[XATTR_MAXNAMELEN+1];
9123	uio_t auio = NULL;
9124	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9125	size_t attrsize = 0;
9126	size_t namelen;
9127	int error;
9128	char uio_buf[ UIO_SIZEOF(1) ];
9129
9130	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9131		return (EINVAL);
9132
9133	if ( (error = file_vnode(uap->fd, &vp)) ) {
9134		return (error);
9135	}
9136	if ( (error = vnode_getwithref(vp)) ) {
9137		file_drop(uap->fd);
9138		return(error);
9139	}
9140	if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen)) != 0) {
9141		goto out;
9142	}
9143	if (xattr_protected(attrname)) {
9144		error = EPERM;
9145		goto out;
9146	}
9147	if (uap->value && uap->size > 0) {
9148		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9149		                            &uio_buf[0], sizeof(uio_buf));
9150		uio_addiov(auio, uap->value, uap->size);
9151	}
9152
9153	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
9154out:
9155	(void)vnode_put(vp);
9156	file_drop(uap->fd);
9157
9158	if (auio) {
9159		*retval = uap->size - uio_resid(auio);
9160	} else {
9161		*retval = (user_ssize_t)attrsize;
9162	}
9163	return (error);
9164}
9165
9166/*
9167 * Set the data of an extended attribute.
9168 */
9169int
9170setxattr(proc_t p, struct setxattr_args *uap, int *retval)
9171{
9172	vnode_t vp;
9173	struct nameidata nd;
9174	char attrname[XATTR_MAXNAMELEN+1];
9175	vfs_context_t ctx = vfs_context_current();
9176	uio_t auio = NULL;
9177	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9178	size_t namelen;
9179	u_int32_t nameiflags;
9180	int error;
9181	char uio_buf[ UIO_SIZEOF(1) ];
9182
9183	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9184		return (EINVAL);
9185
9186	if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen)) != 0) {
9187		if (error == EPERM) {
9188			/* if the string won't fit in attrname, copyinstr emits EPERM */
9189			return (ENAMETOOLONG);
9190		}
9191		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9192		return error;
9193	}
9194	if (xattr_protected(attrname))
9195		return(EPERM);
9196	if (uap->size != 0 && uap->value == 0) {
9197		return (EINVAL);
9198	}
9199
9200	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9201	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
9202	if ((error = namei(&nd))) {
9203		return (error);
9204	}
9205	vp = nd.ni_vp;
9206	nameidone(&nd);
9207
9208	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9209	                            &uio_buf[0], sizeof(uio_buf));
9210	uio_addiov(auio, uap->value, uap->size);
9211
9212	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
9213#if CONFIG_FSE
9214	if (error == 0) {
9215		add_fsevent(FSE_XATTR_MODIFIED, ctx,
9216		    FSE_ARG_VNODE, vp,
9217		    FSE_ARG_DONE);
9218	}
9219#endif
9220	vnode_put(vp);
9221	*retval = 0;
9222	return (error);
9223}
9224
9225/*
9226 * Set the data of an extended attribute.
9227 */
9228int
9229fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
9230{
9231	vnode_t vp;
9232	char attrname[XATTR_MAXNAMELEN+1];
9233	uio_t auio = NULL;
9234	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9235	size_t namelen;
9236	int error;
9237	char uio_buf[ UIO_SIZEOF(1) ];
9238#if CONFIG_FSE
9239	vfs_context_t ctx = vfs_context_current();
9240#endif
9241
9242	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9243		return (EINVAL);
9244
9245	if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen)) != 0) {
9246		return (error);
9247	}
9248	if (xattr_protected(attrname))
9249		return(EPERM);
9250	if (uap->size != 0 && uap->value == 0) {
9251		return (EINVAL);
9252	}
9253	if ( (error = file_vnode(uap->fd, &vp)) ) {
9254		return (error);
9255	}
9256	if ( (error = vnode_getwithref(vp)) ) {
9257		file_drop(uap->fd);
9258		return(error);
9259	}
9260	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9261	                            &uio_buf[0], sizeof(uio_buf));
9262	uio_addiov(auio, uap->value, uap->size);
9263
9264	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
9265#if CONFIG_FSE
9266	if (error == 0) {
9267		add_fsevent(FSE_XATTR_MODIFIED, ctx,
9268		    FSE_ARG_VNODE, vp,
9269		    FSE_ARG_DONE);
9270	}
9271#endif
9272	vnode_put(vp);
9273	file_drop(uap->fd);
9274	*retval = 0;
9275	return (error);
9276}
9277
9278/*
9279 * Remove an extended attribute.
9280 * XXX Code duplication here.
9281 */
9282int
9283removexattr(proc_t p, struct removexattr_args *uap, int *retval)
9284{
9285	vnode_t vp;
9286	struct nameidata nd;
9287	char attrname[XATTR_MAXNAMELEN+1];
9288	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9289	vfs_context_t ctx = vfs_context_current();
9290	size_t namelen;
9291	u_int32_t nameiflags;
9292	int error;
9293
9294	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9295		return (EINVAL);
9296
9297	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9298	if (error != 0) {
9299		return (error);
9300	}
9301	if (xattr_protected(attrname))
9302		return(EPERM);
9303	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9304	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
9305	if ((error = namei(&nd))) {
9306		return (error);
9307	}
9308	vp = nd.ni_vp;
9309	nameidone(&nd);
9310
9311	error = vn_removexattr(vp, attrname, uap->options, ctx);
9312#if CONFIG_FSE
9313	if (error == 0) {
9314		add_fsevent(FSE_XATTR_REMOVED, ctx,
9315		    FSE_ARG_VNODE, vp,
9316		    FSE_ARG_DONE);
9317	}
9318#endif
9319	vnode_put(vp);
9320	*retval = 0;
9321	return (error);
9322}
9323
9324/*
9325 * Remove an extended attribute.
9326 * XXX Code duplication here.
9327 */
9328int
9329fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
9330{
9331	vnode_t vp;
9332	char attrname[XATTR_MAXNAMELEN+1];
9333	size_t namelen;
9334	int error;
9335#if CONFIG_FSE
9336	vfs_context_t ctx = vfs_context_current();
9337#endif
9338
9339	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9340		return (EINVAL);
9341
9342	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
9343	if (error != 0) {
9344		return (error);
9345	}
9346	if (xattr_protected(attrname))
9347		return(EPERM);
9348	if ( (error = file_vnode(uap->fd, &vp)) ) {
9349		return (error);
9350	}
9351	if ( (error = vnode_getwithref(vp)) ) {
9352		file_drop(uap->fd);
9353		return(error);
9354	}
9355
9356	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
9357#if CONFIG_FSE
9358	if (error == 0) {
9359		add_fsevent(FSE_XATTR_REMOVED, ctx,
9360		    FSE_ARG_VNODE, vp,
9361		    FSE_ARG_DONE);
9362	}
9363#endif
9364	vnode_put(vp);
9365	file_drop(uap->fd);
9366	*retval = 0;
9367	return (error);
9368}
9369
9370/*
9371 * Retrieve the list of extended attribute names.
9372 * XXX Code duplication here.
9373 */
9374int
9375listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
9376{
9377	vnode_t vp;
9378	struct nameidata nd;
9379	vfs_context_t ctx = vfs_context_current();
9380	uio_t auio = NULL;
9381	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9382	size_t attrsize = 0;
9383	u_int32_t nameiflags;
9384	int error;
9385	char uio_buf[ UIO_SIZEOF(1) ];
9386
9387	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9388		return (EINVAL);
9389
9390	nameiflags = ((uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW) | NOTRIGGER;
9391	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
9392	if ((error = namei(&nd))) {
9393		return (error);
9394	}
9395	vp = nd.ni_vp;
9396	nameidone(&nd);
9397	if (uap->namebuf != 0 && uap->bufsize > 0) {
9398		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
9399		                            &uio_buf[0], sizeof(uio_buf));
9400		uio_addiov(auio, uap->namebuf, uap->bufsize);
9401	}
9402
9403	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
9404
9405	vnode_put(vp);
9406	if (auio) {
9407		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
9408	} else {
9409		*retval = (user_ssize_t)attrsize;
9410	}
9411	return (error);
9412}
9413
9414/*
9415 * Retrieve the list of extended attribute names.
9416 * XXX Code duplication here.
9417 */
9418int
9419flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
9420{
9421	vnode_t vp;
9422	uio_t auio = NULL;
9423	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9424	size_t attrsize = 0;
9425	int error;
9426	char uio_buf[ UIO_SIZEOF(1) ];
9427
9428	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9429		return (EINVAL);
9430
9431	if ( (error = file_vnode(uap->fd, &vp)) ) {
9432		return (error);
9433	}
9434	if ( (error = vnode_getwithref(vp)) ) {
9435		file_drop(uap->fd);
9436		return(error);
9437	}
9438	if (uap->namebuf != 0 && uap->bufsize > 0) {
9439		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
9440		                            &uio_buf[0], sizeof(uio_buf));
9441		uio_addiov(auio, uap->namebuf, uap->bufsize);
9442	}
9443
9444	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
9445
9446	vnode_put(vp);
9447	file_drop(uap->fd);
9448	if (auio) {
9449		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
9450	} else {
9451		*retval = (user_ssize_t)attrsize;
9452	}
9453	return (error);
9454}
9455
9456/*
9457 * Obtain the full pathname of a file system object by id.
9458 *
9459 * This is a private SPI used by the File Manager.
9460 */
9461__private_extern__
9462int
9463fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
9464{
9465	vnode_t vp;
9466	struct mount *mp = NULL;
9467	vfs_context_t ctx = vfs_context_current();
9468	fsid_t fsid;
9469	char *realpath;
9470	int bpflags;
9471	int length;
9472	int error;
9473
9474	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
9475		return (error);
9476	}
9477	AUDIT_ARG(value32, fsid.val[0]);
9478	AUDIT_ARG(value64, uap->objid);
9479	/* Restrict output buffer size for now. */
9480	if (uap->bufsize > PAGE_SIZE) {
9481		return (EINVAL);
9482	}
9483	MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
9484	if (realpath == NULL) {
9485		return (ENOMEM);
9486	}
9487	/* Find the target mountpoint. */
9488	if ((mp = mount_lookupby_volfsid(fsid.val[0], 1)) == NULL) {
9489		error = ENOTSUP;  /* unexpected failure */
9490		goto out;
9491	}
9492unionget:
9493	/* Find the target vnode. */
9494	if (uap->objid == 2) {
9495		error = VFS_ROOT(mp, &vp, ctx);
9496	} else {
9497		error = VFS_VGET(mp, (ino64_t)uap->objid, &vp, ctx);
9498	}
9499
9500	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
9501		/*
9502		 * If the fileid isn't found and we're in a union
9503		 * mount volume, then see if the fileid is in the
9504		 * mounted-on volume.
9505		 */
9506		struct mount *tmp = mp;
9507		mp = vnode_mount(tmp->mnt_vnodecovered);
9508		vfs_unbusy(tmp);
9509		if (vfs_busy(mp, LK_NOWAIT) == 0)
9510			goto unionget;
9511	} else
9512		vfs_unbusy(mp);
9513
9514	if (error) {
9515		goto out;
9516	}
9517#if CONFIG_MACF
9518	error = mac_vnode_check_fsgetpath(ctx, vp);
9519	if (error) {
9520		vnode_put(vp);
9521		goto out;
9522	}
9523#endif
9524	/* Obtain the absolute path to this vnode. */
9525	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
9526	bpflags |= BUILDPATH_CHECK_MOVED;
9527	error = build_path(vp, realpath, uap->bufsize, &length, bpflags, ctx);
9528	vnode_put(vp);
9529	if (error) {
9530		goto out;
9531	}
9532	AUDIT_ARG(text, realpath);
9533
9534	if (kdebug_enable) {
9535		long dbg_parms[NUMPARMS];
9536		int  dbg_namelen;
9537
9538		dbg_namelen = (int)sizeof(dbg_parms);
9539
9540		if (length < dbg_namelen) {
9541			memcpy((char *)dbg_parms, realpath, length);
9542			memset((char *)dbg_parms + length, 0, dbg_namelen - length);
9543
9544			dbg_namelen = length;
9545		} else
9546			memcpy((char *)dbg_parms, realpath + (length - dbg_namelen), dbg_namelen);
9547
9548		kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
9549	}
9550	error = copyout((caddr_t)realpath, uap->buf, length);
9551
9552	*retval = (user_ssize_t)length; /* may be superseded by error */
9553out:
9554	if (realpath) {
9555		FREE(realpath, M_TEMP);
9556	}
9557	return (error);
9558}
9559
9560/*
9561 * Common routine to handle various flavors of statfs data heading out
9562 *	to user space.
9563 *
9564 * Returns:	0			Success
9565 *		EFAULT
9566 */
9567static int
9568munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
9569    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
9570    boolean_t partial_copy)
9571{
9572	int		error;
9573	int		my_size, copy_size;
9574
9575	if (is_64_bit) {
9576		struct user64_statfs sfs;
9577		my_size = copy_size = sizeof(sfs);
9578		bzero(&sfs, my_size);
9579		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
9580		sfs.f_type = mp->mnt_vtable->vfc_typenum;
9581		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
9582		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
9583		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
9584		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
9585		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
9586		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
9587		sfs.f_files = (user64_long_t)sfsp->f_files;
9588		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
9589		sfs.f_fsid = sfsp->f_fsid;
9590		sfs.f_owner = sfsp->f_owner;
9591		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
9592			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
9593		} else {
9594			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
9595		}
9596		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
9597		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
9598
9599		if (partial_copy) {
9600			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
9601		}
9602		error = copyout((caddr_t)&sfs, bufp, copy_size);
9603	}
9604	else {
9605		struct user32_statfs sfs;
9606
9607		my_size = copy_size = sizeof(sfs);
9608		bzero(&sfs, my_size);
9609
9610		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
9611		sfs.f_type = mp->mnt_vtable->vfc_typenum;
9612		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
9613
9614		/*
9615		 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
9616		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
9617		 * to reflect the filesystem size as best we can.
9618		 */
9619		if ((sfsp->f_blocks > INT_MAX)
9620			/* Hack for 4061702 . I think the real fix is for Carbon to
9621			 * look for some volume capability and not depend on hidden
9622			 * semantics agreed between a FS and carbon.
9623			 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
9624			 * for Carbon to set bNoVolumeSizes volume attribute.
9625			 * Without this the webdavfs files cannot be copied onto
9626			 * disk as they look huge. This change should not affect
9627			 * XSAN as they should not be setting these to -1.
9628			 */
9629			 && (sfsp->f_blocks != 0xffffffffffffffffULL)
9630			 && (sfsp->f_bfree != 0xffffffffffffffffULL)
9631			 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
9632			int		shift;
9633
9634			/*
9635			 * Work out how far we have to shift the block count down to make it fit.
9636			 * Note that it's possible to have to shift so far that the resulting
9637			 * blocksize would be unreportably large.  At that point, we will clip
9638			 * any values that don't fit.
9639			 *
9640			 * For safety's sake, we also ensure that f_iosize is never reported as
9641			 * being smaller than f_bsize.
9642			 */
9643			for (shift = 0; shift < 32; shift++) {
9644				if ((sfsp->f_blocks >> shift) <= INT_MAX)
9645					break;
9646				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
9647					break;
9648			}
9649#define __SHIFT_OR_CLIP(x, s)	((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
9650			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
9651			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
9652			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
9653#undef __SHIFT_OR_CLIP
9654			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
9655			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
9656		} else {
9657			/* filesystem is small enough to be reported honestly */
9658			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
9659			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
9660			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
9661			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
9662			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
9663		}
9664		sfs.f_files = (user32_long_t)sfsp->f_files;
9665		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
9666		sfs.f_fsid = sfsp->f_fsid;
9667		sfs.f_owner = sfsp->f_owner;
9668		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
9669			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
9670		} else {
9671			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
9672		}
9673		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
9674		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
9675
9676		if (partial_copy) {
9677			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
9678		}
9679		error = copyout((caddr_t)&sfs, bufp, copy_size);
9680	}
9681
9682	if (sizep != NULL) {
9683		*sizep = my_size;
9684	}
9685	return(error);
9686}
9687
9688/*
9689 * copy stat structure into user_stat structure.
9690 */
9691void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
9692{
9693	bzero(usbp, sizeof(*usbp));
9694
9695	usbp->st_dev = sbp->st_dev;
9696	usbp->st_ino = sbp->st_ino;
9697	usbp->st_mode = sbp->st_mode;
9698	usbp->st_nlink = sbp->st_nlink;
9699	usbp->st_uid = sbp->st_uid;
9700	usbp->st_gid = sbp->st_gid;
9701	usbp->st_rdev = sbp->st_rdev;
9702#ifndef _POSIX_C_SOURCE
9703	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
9704	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
9705	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
9706	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
9707	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
9708	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
9709#else
9710	usbp->st_atime = sbp->st_atime;
9711	usbp->st_atimensec = sbp->st_atimensec;
9712	usbp->st_mtime = sbp->st_mtime;
9713	usbp->st_mtimensec = sbp->st_mtimensec;
9714	usbp->st_ctime = sbp->st_ctime;
9715	usbp->st_ctimensec = sbp->st_ctimensec;
9716#endif
9717	usbp->st_size = sbp->st_size;
9718	usbp->st_blocks = sbp->st_blocks;
9719	usbp->st_blksize = sbp->st_blksize;
9720	usbp->st_flags = sbp->st_flags;
9721	usbp->st_gen = sbp->st_gen;
9722	usbp->st_lspare = sbp->st_lspare;
9723	usbp->st_qspare[0] = sbp->st_qspare[0];
9724	usbp->st_qspare[1] = sbp->st_qspare[1];
9725}
9726
9727void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
9728{
9729	bzero(usbp, sizeof(*usbp));
9730
9731	usbp->st_dev = sbp->st_dev;
9732	usbp->st_ino = sbp->st_ino;
9733	usbp->st_mode = sbp->st_mode;
9734	usbp->st_nlink = sbp->st_nlink;
9735	usbp->st_uid = sbp->st_uid;
9736	usbp->st_gid = sbp->st_gid;
9737	usbp->st_rdev = sbp->st_rdev;
9738#ifndef _POSIX_C_SOURCE
9739	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
9740	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
9741	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
9742	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
9743	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
9744	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
9745#else
9746	usbp->st_atime = sbp->st_atime;
9747	usbp->st_atimensec = sbp->st_atimensec;
9748	usbp->st_mtime = sbp->st_mtime;
9749	usbp->st_mtimensec = sbp->st_mtimensec;
9750	usbp->st_ctime = sbp->st_ctime;
9751	usbp->st_ctimensec = sbp->st_ctimensec;
9752#endif
9753	usbp->st_size = sbp->st_size;
9754	usbp->st_blocks = sbp->st_blocks;
9755	usbp->st_blksize = sbp->st_blksize;
9756	usbp->st_flags = sbp->st_flags;
9757	usbp->st_gen = sbp->st_gen;
9758	usbp->st_lspare = sbp->st_lspare;
9759	usbp->st_qspare[0] = sbp->st_qspare[0];
9760	usbp->st_qspare[1] = sbp->st_qspare[1];
9761}
9762
9763/*
9764 * copy stat64 structure into user_stat64 structure.
9765 */
void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
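
/*
 * Compared with the plain stat copies above, the stat64 variants also carry
 * the file's birth (creation) time, copied as st_birthtimespec /
 * st_birthtime alongside the access, modification and change times.
 */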

void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}

/*
 * Purge the buffer cache to simulate a cold start.
 */
static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
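	/*
	 * Push any dirty pages for this vnode to the backing store and then
	 * invalidate the cached pages, so the next access must go back to
	 * disk -- the per-vnode equivalent of a cold cache.
	 */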
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}

static int vfs_purge_callback(mount_t mp, __unused void * arg)
{
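	/*
	 * Apply the per-vnode purge to every vnode associated with this
	 * mount point.
	 */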
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}

int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get()))
		return EPERM;

	vfs_iterate(0/* flags */, vfs_purge_callback, NULL);

	return 0;
}
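
/*
 * Illustrative sketch only: from user space this is reachable as a
 * superuser-only syscall (enforced by the kauth_cred_issuser() check
 * above). Assuming the generated SYS_vfs_purge constant is available from
 * <sys/syscall.h>, a caller such as a cold-cache benchmarking tool might
 * look like:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	if (syscall(SYS_vfs_purge) != 0)
 *		perror("vfs_purge");
 */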