zfs_ctldir.c revision 212694
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this is only the 'snapshot' directory, but this may expand in the
 * future.  The elements are built using the GFS primitives, as the hierarchy
 * does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab.  We have three
 * types of objects:
 *
 * 	ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to look up '..' and
 * act as a mountpoint for the snapshot.  Whenever we look up a specific
 * snapshot, we perform an automount of the underlying filesystem and return
 * the corresponding vnode.
 *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from userland.  The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
 * unmounts any snapshots within the snapshot directory.
 *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (i.e., '.zfs/snapshot/<snapname>') are all GFS nodes and
 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
 *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
 * (i.e., snapshots) are ZFS nodes and have their own unique vfs_t.
 * However, vnodes within these mounted-on file systems have their v_vfsp
 * fields set to the head filesystem to make NFS happy (see
 * zfsctl_snapdir_lookup()).  We VFS_HOLD the head filesystem's vfs_t
 * so that it cannot be freed until all snapshots have been unmounted.
 */
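
/*
 * Illustration (names hypothetical): a simple userland access such as
 *
 *	ls /tank/fs/.zfs/snapshot/monday
 *
 * drives zfsctl_snapdir_lookup(), which mounts the dataset
 * "tank/fs@monday" on the GFS node for 'monday' and returns the root
 * vnode of the freshly mounted snapshot filesystem.
 */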

#include <sys/zfs_context.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/namei.h>
#include <sys/gfs.h>
#include <sys/stat.h>
#include <sys/dmu.h>
#include <sys/dsl_deleg.h>
#include <sys/mount.h>
#include <sys/sunddi.h>

#include "zfs_namecheck.h"

typedef struct zfsctl_node {
	gfs_dir_t	zc_gfs_private;
	uint64_t	zc_id;
	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
} zfsctl_node_t;

typedef struct zfsctl_snapdir {
	zfsctl_node_t	sd_node;
	kmutex_t	sd_lock;
	avl_tree_t	sd_snaps;
} zfsctl_snapdir_t;

typedef struct {
	char		*se_name;
	vnode_t		*se_root;
	avl_node_t	se_node;
} zfs_snapentry_t;

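/*
 * AVL comparison callback for the sd_snaps tree: orders snapshot
 * entries by name.  AVL requires a strict -1/0/+1 result, so the raw
 * strcmp() value is normalized below.
 */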
static int
snapentry_compare(const void *a, const void *b)
{
	const zfs_snapentry_t *sa = a;
	const zfs_snapentry_t *sb = b;
	int ret = strcmp(sa->se_name, sb->se_name);

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

static struct vop_vector zfsctl_ops_root;
static struct vop_vector zfsctl_ops_snapdir;
static struct vop_vector zfsctl_ops_snapshot;
static struct vop_vector zfsctl_ops_shares;
static struct vop_vector zfsctl_ops_shares_dir;

static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);

/*
 * Root directory elements.  We have only two entries: snapshot and shares.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
	{ NULL }
};

/* include . and .. in the calculation */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)
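
/*
 * Note: the entry array above has three slots (snapshot, shares and the
 * NULL terminator).  The terminator slot stands in for one of '.'/'..'
 * and the +1 accounts for the other, so NROOT_ENTRIES evaluates to 4.
 */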

/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine and sets up the
 * vnode ops vectors that we'll be using.  (On FreeBSD the vop vectors are
 * statically initialized, so these routines are currently empty.)
 */
void
zfsctl_init(void)
{
}

void
zfsctl_fini(void)
{
}

boolean_t
zfsctl_is_node(vnode_t *vp)
{
	return (vn_matchops(vp, zfsctl_ops_root) ||
	    vn_matchops(vp, zfsctl_ops_snapdir) ||
	    vn_matchops(vp, zfsctl_ops_snapshot) ||
	    vn_matchops(vp, zfsctl_ops_shares) ||
	    vn_matchops(vp, zfsctl_ops_shares_dir));
}

/*
 * Return the inode number associated with the 'snapshot' or
 * 'shares' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	ASSERT(index <= 2);

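	/* Index 0 is the 'snapshot' entry; anything else maps to 'shares'. */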
	if (index == 0)
		return (ZFSCTL_INO_SNAPDIR);

	return (zfsvfs->z_shares_dir);
}

/*
 * Create the '.zfs' directory.  This directory is cached as part of the VFS
 * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
 * therefore checks against a vfs_count of 2 instead of 1.  This reference
 * is removed when the ctldir is destroyed in the unmount.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	vnode_t *vp, *rvp;
	zfsctl_node_t *zcp;

	ASSERT(zfsvfs->z_ctldir == NULL);

	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = ZFSCTL_INO_ROOT;

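	/*
	 * Seed the ctime/mtime of the control tree from the creation
	 * time of the filesystem's root znode.
	 */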
	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
	VN_URELE(rvp);

	/*
	 * We're only faking the fact that we have a root of a filesystem for
	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
	 * for us.
	 */
	vp->v_vflag &= ~VV_ROOT;

	zfsvfs->z_ctldir = vp;

	VOP_UNLOCK(vp, 0);
}

/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}

/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
vnode_t *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	VN_HOLD(zp->z_zfsvfs->z_ctldir);
	return (zp->z_zfsvfs->z_ctldir);
}

/*
 * Common open routine.  Disallow any write access.
 */
/* ARGSUSED */
static int
zfsctl_common_open(struct vop_open_args *ap)
{
	int flags = ap->a_mode;

	if (flags & FWRITE)
		return (EACCES);

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(struct vop_close_args *ap)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
/* ARGSUSED */
static int
zfsctl_common_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int  a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	int mode = ap->a_accmode;

#ifdef TODO
	if (flags & V_ACE_MASK) {
		if (accmode & ACE_ALL_WRITE_PERMS)
			return (EACCES);
	} else {
#endif
	if (mode & VWRITE)
		return (EACCES);
#ifdef TODO
	}
#endif

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	zfsctl_node_t	*zcp = vp->v_data;
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
	vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_flags = 0;
}

/*ARGSUSED*/
static int
zfsctl_common_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t		*vp = ap->a_vp;
	fid_t		*fidp = (void *)ap->a_fid;
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

	fidp->fid_len = SHORT_FID_LEN;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

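	/* Pack the object number one byte at a time, LSB first. */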
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*ARGSUSED*/
static int
zfsctl_shares_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t		*vp = ap->a_vp;
	fid_t		*fidp = (void *)ap->a_fid;
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	znode_t		*dzp;
	int		error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (ENOTSUP);
	}

	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_FID(ZTOV(dzp), fidp);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

static int
zfsctl_common_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);
	VI_LOCK(vp);
	vp->v_data = NULL;
	VI_UNLOCK(vp);
	return (0);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/shares		objectid(shares directory)
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

#define	ZFSCTL_INO_SNAP(id)	(id)

/*
 * Get root directory attributes.
 */
/* ARGSUSED */
static int
zfsctl_root_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	ZFS_ENTER(zfsvfs);
	vap->va_nodeid = ZFSCTL_INO_ROOT;
	vap->va_nlink = vap->va_size = NROOT_ENTRIES;

	zfsctl_common_getattr(vp, vap);
	ZFS_EXIT(zfsvfs);

	return (0);
}

#ifdef sun
static int
zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	/*
	 * We only care about ACL_ENABLED so that libsec can
	 * display ACLs correctly and not default to POSIX draft.
	 */
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	return (fs_pathconf(vp, cmd, valp, cr, ct));
}
#endif	/* sun */

#ifdef sun
static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
	{ VOPNAME_PATHCONF,	{ .vop_pathconf = zfsctl_pathconf }	},
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
	{ NULL }
};
#endif	/* sun */

/*
 * Special case the handling of "..".
 */
/* ARGSUSED */
int
zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;

	/*
	 * No extended attributes allowed under .zfs
	 */
	if (flags & LOOKUP_XATTR)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);

	if (strcmp(nm, "..") == 0) {
		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
		if (err == 0)
			VOP_UNLOCK(*vpp, 0);
	} else {
		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
		    cr, ct, direntflags, realpnp);
	}

	ZFS_EXIT(zfsvfs);

	return (err);
}

/*
 * Special case the handling of "..".
 */
/* ARGSUSED */
int
zfsctl_freebsd_root_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	vnode_t *dvp = ap->a_dvp;
	vnode_t **vpp = ap->a_vpp;
	cred_t *cr = ap->a_cnp->cn_cred;
	int flags = ap->a_cnp->cn_flags;
	int nameiop = ap->a_cnp->cn_nameiop;
	char nm[NAME_MAX + 1];
	int err;

	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
		return (EOPNOTSUPP);

	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);

	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL);
	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);

	return (err);
}

static struct vop_vector zfsctl_ops_root = {
	.vop_default =	&default_vnodeops,
	.vop_open =	zfsctl_common_open,
	.vop_close =	zfsctl_common_close,
	.vop_ioctl =	VOP_EINVAL,
	.vop_getattr =	zfsctl_root_getattr,
	.vop_access =	zfsctl_common_access,
	.vop_readdir =	gfs_vop_readdir,
	.vop_lookup =	zfsctl_freebsd_root_lookup,
	.vop_inactive =	gfs_vop_inactive,
	.vop_reclaim =	zfsctl_common_reclaim,
	.vop_fid =	zfsctl_common_fid,
};

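/*
 * Build the full "<dataset>@<snapshot>" name for the snapshot 'name'
 * under the filesystem vp belongs to; e.g. (names hypothetical) dataset
 * "tank/fs" plus snapshot "monday" yields "tank/fs@monday".
 */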
static int
zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
{
	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;

	if (snapshot_namecheck(name, NULL, NULL) != 0)
		return (EILSEQ);
	dmu_objset_name(os, zname);
	if (strlen(zname) + 1 + strlen(name) >= len)
		return (ENAMETOOLONG);
	(void) strcat(zname, "@");
	(void) strcat(zname, name);
	return (0);
}

static int
zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
{
	vnode_t *svp = sep->se_root;
	int error;

	ASSERT(vn_ismntpt(svp));

	/* this will be dropped by dounmount() */
	if ((error = vn_vfswlock(svp)) != 0)
		return (error);

	return (dounmount(vn_mountedvfs(svp), fflags, curthread));
}

#if 0
static void
zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
{
	avl_index_t where;
	vfs_t *vfsp;
	refstr_t *pathref;
	char newpath[MAXNAMELEN];
	char *tail;

	ASSERT(MUTEX_HELD(&sdp->sd_lock));
	ASSERT(sep != NULL);

	vfsp = vn_mountedvfs(sep->se_root);
	ASSERT(vfsp != NULL);

	vfs_lock_wait(vfsp);

	/*
	 * Change the name in the AVL tree.
	 */
	avl_remove(&sdp->sd_snaps, sep);
	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
	avl_insert(&sdp->sd_snaps, sep, where);

	/*
	 * Change the current mountpoint info:
	 * 	- update the tail of the mntpoint path
	 *	- update the tail of the resource path
	 */
	pathref = vfs_getmntpoint(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '/')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setmntpoint(vfsp, newpath);

	pathref = vfs_getresource(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '@')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setresource(vfsp, newpath);

	vfs_unlock(vfsp);
}
#endif

#if 0
/*ARGSUSED*/
static int
zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	zfsctl_snapdir_t *sdp = sdvp->v_data;
	zfs_snapentry_t search, *sep;
	zfsvfs_t *zfsvfs;
	avl_index_t where;
	char from[MAXNAMELEN], to[MAXNAMELEN];
	char real[MAXNAMELEN];
	int err;

	zfsvfs = sdvp->v_vfsp->vfs_data;
	ZFS_ENTER(zfsvfs);

	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
		    MAXNAMELEN, NULL);
		if (err == 0) {
			snm = real;
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	ZFS_EXIT(zfsvfs);

	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
	if (!err)
		err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
	if (!err)
		err = zfs_secpolicy_rename_perms(from, to, cr);
	if (err)
		return (err);

	/*
	 * Cannot move snapshots out of the snapdir.
	 */
	if (sdvp != tdvp)
		return (EINVAL);

	if (strcmp(snm, tnm) == 0)
		return (0);

	mutex_enter(&sdp->sd_lock);

	search.se_name = (char *)snm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
		mutex_exit(&sdp->sd_lock);
		return (ENOENT);
	}

	err = dmu_objset_rename(from, to, B_FALSE);
	if (err == 0)
		zfsctl_rename_snap(sdp, sep, tnm);

	mutex_exit(&sdp->sd_lock);

	return (err);
}
#endif

#if 0
/* ARGSUSED */
static int
zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	zfs_snapentry_t *sep;
	zfs_snapentry_t search;
	zfsvfs_t *zfsvfs;
	char snapname[MAXNAMELEN];
	char real[MAXNAMELEN];
	int err;

	zfsvfs = dvp->v_vfsp->vfs_data;
	ZFS_ENTER(zfsvfs);

	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {

		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
		    MAXNAMELEN, NULL);
		if (err == 0) {
			name = real;
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	ZFS_EXIT(zfsvfs);

	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
	if (!err)
		err = zfs_secpolicy_destroy_perms(snapname, cr);
	if (err)
		return (err);

	mutex_enter(&sdp->sd_lock);

	search.se_name = name;
	sep = avl_find(&sdp->sd_snaps, &search, NULL);
	if (sep) {
		avl_remove(&sdp->sd_snaps, sep);
		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
		if (err) {
			avl_index_t where;

			if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
				avl_insert(&sdp->sd_snaps, sep, where);
		} else
			err = dmu_objset_destroy(snapname);
	} else {
		err = ENOENT;
	}

	mutex_exit(&sdp->sd_lock);

	return (err);
}
#endif

/*
 * This creates a snapshot under '.zfs/snapshot'.
 */
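/*
 * For example (names hypothetical), "mkdir /tank/fs/.zfs/snapshot/backup"
 * creates the snapshot "tank/fs@backup" via dmu_objset_snapshot() and
 * then looks up the newly created directory.
 */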
/* ARGSUSED */
static int
zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	char name[MAXNAMELEN];
	int err;
	static enum symfollow follow = NO_FOLLOW;
	static enum uio_seg seg = UIO_SYSSPACE;

	if (snapshot_namecheck(dirname, NULL, NULL) != 0)
		return (EILSEQ);

	dmu_objset_name(zfsvfs->z_os, name);

	*vpp = NULL;

	err = zfs_secpolicy_snapshot_perms(name, cr);
	if (err)
		return (err);

	err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
	if (err)
		return (err);
	err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);

	return (err);
}

static int
zfsctl_freebsd_snapdir_mkdir(ap)
        struct vop_mkdir_args /* {
                struct vnode *a_dvp;
                struct vnode **a_vpp;
                struct componentname *a_cnp;
                struct vattr *a_vap;
        } */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL,
	    ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL));
}

/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
 * Perform a mount of the associated dataset on top of the vnode.
 */
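/*
 * The sequence below: consult the sd_snaps AVL cache and traverse into
 * an existing mount when possible; otherwise validate the snapshot
 * name, create a fresh GFS node for it, and mount the snapshot dataset
 * on top of that node at "<mountpoint>/.zfs/snapshot/<name>".
 */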
/* ARGSUSED */
int
zfsctl_snapdir_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	vnode_t *dvp = ap->a_dvp;
	vnode_t **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];
	zfsctl_snapdir_t *sdp = dvp->v_data;
	objset_t *snap;
	char snapname[MAXNAMELEN];
	char real[MAXNAMELEN];
	char *mountpoint;
	zfs_snapentry_t *sep, search;
	size_t mountpoint_len;
	avl_index_t where;
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;
	int flags = 0;

	/*
	 * No extended attributes allowed under .zfs
	 */
	if (flags & LOOKUP_XATTR)
		return (EINVAL);
	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);

	ASSERT(dvp->v_type == VDIR);

	*vpp = NULL;

	/*
	 * If we get a recursive call, that means we got called
	 * from the domount() code while it was trying to look up the
	 * spec (which looks like a local path for zfs).  We need to
	 * add some flag to domount() to tell it not to do this lookup.
	 */
	if (MUTEX_HELD(&sdp->sd_lock))
		return (ENOENT);

	ZFS_ENTER(zfsvfs);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	if (flags & FIGNORECASE) {
		boolean_t conflict = B_FALSE;

		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
		    MAXNAMELEN, &conflict);
		if (err == 0) {
			strlcpy(nm, real, sizeof(nm));
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
#if 0
		if (realpnp)
			(void) strlcpy(realpnp->pn_buf, nm,
			    realpnp->pn_bufsize);
		if (conflict && direntflags)
			*direntflags = ED_CASE_CONFLICT;
#endif
	}

	mutex_enter(&sdp->sd_lock);
	search.se_name = (char *)nm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
		*vpp = sep->se_root;
		VN_HOLD(*vpp);
		err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY);
		if (err) {
			VN_RELE(*vpp);
			*vpp = NULL;
		} else if (*vpp == sep->se_root) {
			/*
			 * The snapshot was unmounted behind our backs,
			 * try to remount it.
			 */
			goto domount;
		} else {
			/*
			 * VROOT was set during the traverse call.  We need
			 * to clear it since we're pretending to be part
			 * of our parent's vfs.
			 */
			(*vpp)->v_flag &= ~VROOT;
		}
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	/*
	 * The requested snapshot is not currently mounted, look it up.
	 */
	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
	if (err) {
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		/*
		 * Handle "ls *" and "ls ?" gracefully by mapping EILSEQ to
		 * ENOENT, since the shell ultimately passes a literal "*"
		 * or "?" as the name to look up when nothing matches.
		 */
		return (err == EILSEQ ? ENOENT : err);
	}
	if (dmu_objset_open(snapname, DMU_OST_ZFS,
	    DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
		mutex_exit(&sdp->sd_lock);
		/* Translate errors and add SAVENAME when needed. */
		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
			err = EJUSTRETURN;
			cnp->cn_flags |= SAVENAME;
		} else {
			err = ENOENT;
		}
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
	VN_HOLD(*vpp);
	avl_insert(&sdp->sd_snaps, sep, where);

	dmu_objset_close(snap);
domount:
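	/* The mount point is "<fs mountpoint>/.zfs/snapshot/<snapname>". */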
	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
	(void) snprintf(mountpoint, mountpoint_len,
	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0);
	kmem_free(mountpoint, mountpoint_len);
	if (err == 0) {
		/*
		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
		 *
		 * This is where we lie about our v_vfsp in order to
		 * make .zfs/snapshot/<snapname> accessible over NFS
		 * without requiring manual mounts of <snapname>.
		 */
		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
	}
	mutex_exit(&sdp->sd_lock);
	ZFS_EXIT(zfsvfs);
	if (err != 0)
		*vpp = NULL;
	return (err);
}

/* ARGSUSED */
int
zfsctl_shares_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	vnode_t *dvp = ap->a_dvp;
	vnode_t **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	char nm[NAME_MAX + 1];
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);

	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (ENOTSUP);
	}
	/*
	 * Only release the shares directory vnode when zfs_zget()
	 * succeeded; dzp is uninitialized on failure.
	 */
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);

	return (error);
}

/* ARGSUSED */
static int
zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    offset_t *offp, offset_t *nextp, void *data, int flags)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	char snapname[MAXNAMELEN];
	uint64_t id, cookie;
	boolean_t case_conflict;
	int error;

	ZFS_ENTER(zfsvfs);

	cookie = *offp;
	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
	    &cookie, &case_conflict);
	if (error) {
		ZFS_EXIT(zfsvfs);
		if (error == ENOENT) {
			*eofp = 1;
			return (0);
		}
		return (error);
	}

	if (flags & V_RDDIR_ENTFLAGS) {
		edirent_t *eodp = dp;

		(void) strcpy(eodp->ed_name, snapname);
		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
	} else {
		struct dirent64 *odp = dp;

		(void) strcpy(odp->d_name, snapname);
		odp->d_ino = ZFSCTL_INO_SNAP(id);
	}
	*nextp = cookie;

	ZFS_EXIT(zfsvfs);

	return (0);
}

/* ARGSUSED */
static int
zfsctl_shares_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	uio_t *uiop = ap->a_uio;
	cred_t *cr = ap->a_cred;
	int *eofp = ap->a_eofflag;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (ENOTSUP);
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
		error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies);
		VN_URELE(ZTOV(dzp));
	} else {
		*eofp = 1;
		error = ENOENT;
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * pvp is the '.zfs' directory (zfsctl_node_t).
 * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
 *
 * This function is the callback to create a GFS vnode for '.zfs/snapshot'
 * when a lookup is performed on .zfs for "snapshot".
 */
vnode_t *
zfsctl_mknode_snapdir(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_snapdir_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
	    zfsctl_snapdir_readdir_cb, NULL);
	sdp = vp->v_data;
	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&sdp->sd_snaps, snapentry_compare,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
	VOP_UNLOCK(vp, 0);
	return (vp);
}

vnode_t *
zfsctl_mknode_shares(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_node_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
	    &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
	    NULL, NULL);
	sdp = vp->v_data;
	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	VOP_UNLOCK(vp, 0);
	return (vp);
}

/* ARGSUSED */
static int
zfsctl_shares_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cr = ap->a_cred;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);
	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (ENOTSUP);
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(ZTOV(dzp), vap, cr);
		VN_URELE(ZTOV(dzp));
	}
	ZFS_EXIT(zfsvfs);
	return (error);
}

/* ARGSUSED */
static int
zfsctl_snapdir_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_snapdir_t *sdp = vp->v_data;

	ZFS_ENTER(zfsvfs);
	zfsctl_common_getattr(vp, vap);
	vap->va_nodeid = gfs_file_inode(vp);
	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
	ZFS_EXIT(zfsvfs);

	return (0);
}

/* ARGSUSED */
static int
zfsctl_snapdir_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	zfsctl_snapdir_t *sdp = vp->v_data;
	zfs_snapentry_t *sep;

	/*
	 * On forced unmount we have to free snapshots from here.
	 */
	mutex_enter(&sdp->sd_lock);
	while ((sep = avl_first(&sdp->sd_snaps)) != NULL) {
		avl_remove(&sdp->sd_snaps, sep);
		kmem_free(sep->se_name, strlen(sep->se_name) + 1);
		kmem_free(sep, sizeof (zfs_snapentry_t));
	}
	mutex_exit(&sdp->sd_lock);
	gfs_dir_inactive(vp);
	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
	mutex_destroy(&sdp->sd_lock);
	avl_destroy(&sdp->sd_snaps);
	kmem_free(sdp, sizeof (zfsctl_snapdir_t));

	return (0);
}

static struct vop_vector zfsctl_ops_snapdir = {
	.vop_default =	&default_vnodeops,
	.vop_open =	zfsctl_common_open,
	.vop_close =	zfsctl_common_close,
	.vop_ioctl =	VOP_EINVAL,
	.vop_getattr =	zfsctl_snapdir_getattr,
	.vop_access =	zfsctl_common_access,
	.vop_mkdir =	zfsctl_freebsd_snapdir_mkdir,
	.vop_readdir =	gfs_vop_readdir,
	.vop_lookup =	zfsctl_snapdir_lookup,
	.vop_inactive =	zfsctl_snapdir_inactive,
	.vop_reclaim =	zfsctl_common_reclaim,
	.vop_fid =	zfsctl_common_fid,
};

static struct vop_vector zfsctl_ops_shares = {
	.vop_default =	&default_vnodeops,
	.vop_open =	zfsctl_common_open,
	.vop_close =	zfsctl_common_close,
	.vop_ioctl =	VOP_EINVAL,
	.vop_getattr =	zfsctl_shares_getattr,
	.vop_access =	zfsctl_common_access,
	.vop_readdir =	zfsctl_shares_readdir,
	.vop_lookup =	zfsctl_shares_lookup,
	.vop_inactive =	gfs_vop_inactive,
	.vop_reclaim =	zfsctl_common_reclaim,
	.vop_fid =	zfsctl_shares_fid,
};

/*
 * pvp is the GFS vnode '.zfs/snapshot'.
 *
 * This creates a GFS node under '.zfs/snapshot' representing each
 * snapshot.  This newly created GFS node is what we mount snapshot
 * vfs_t's on top of.
 */
static vnode_t *
zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
{
	vnode_t *vp;
	zfsctl_node_t *zcp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
	VN_HOLD(vp);
	zcp = vp->v_data;
	zcp->zc_id = objset;
	VOP_UNLOCK(vp, 0);

	return (vp);
}

static int
zfsctl_snapshot_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	cred_t *cr = ap->a_td->td_ucred;
	struct vop_inactive_args iap;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	int locked;
	vnode_t *dvp;

	if (vp->v_count > 0)
		goto end;

	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
	sdp = dvp->v_data;
	VOP_UNLOCK(dvp, 0);

	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
		mutex_enter(&sdp->sd_lock);

	ASSERT(!vn_ismntpt(vp));

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		if (sep->se_root == vp) {
			avl_remove(&sdp->sd_snaps, sep);
			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
			kmem_free(sep, sizeof (zfs_snapentry_t));
			break;
		}
		sep = next;
	}
	ASSERT(sep != NULL);

	if (!locked)
		mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);
end:

	/*
	 * Dispose of the vnode for the snapshot mount point.
	 * This is safe to do because once this entry has been removed
	 * from the AVL tree, it can't be found again, so cannot become
	 * "active".  If we look up the same name again we will end up
	 * creating a new vnode.
	 */
	iap.a_vp = vp;
	return (gfs_vop_inactive(&iap));
}

static int
zfsctl_traverse_begin(vnode_t **vpp, int lktype)
{

	VN_HOLD(*vpp);
	/* The snapshot should already be mounted, but just in case. */
	if (vn_mountedvfs(*vpp) == NULL)
		return (ENOENT);
	return (traverse(vpp, lktype));
}

static void
zfsctl_traverse_end(vnode_t *vp, int err)
{

	if (err == 0)
		vput(vp);
	else
		VN_RELE(vp);
}

static int
zfsctl_snapshot_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	int err;

	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
	if (err == 0)
		err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred);
	zfsctl_traverse_end(vp, err);
	return (err);
}

static int
zfsctl_snapshot_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	int err;

	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
	if (err == 0)
		err = VOP_VPTOFH(vp, (void *)ap->a_fid);
	zfsctl_traverse_end(vp, err);
	return (err);
}

static int
zfsctl_snapshot_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	vnode_t *dvp = ap->a_dvp;
	vnode_t **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	cred_t *cr = ap->a_cnp->cn_cred;
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int error;

	if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' ||
	    cnp->cn_nameptr[1] != '.') {
		return (ENOENT);
	}

	ASSERT(dvp->v_type == VDIR);
	ASSERT(zfsvfs->z_ctldir != NULL);

	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp,
	    NULL, 0, NULL, cr, NULL, NULL, NULL);
	if (error == 0)
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	return (error);
}

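/*
 * vop_vptocnp: map a snapshot root vnode back to its name under
 * .zfs/snapshot so that reverse lookups (e.g. vn_fullpath()) work.
 */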
static int
zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
{
	zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data;
	vnode_t *dvp, *vp;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);
	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		vp = sep->se_root;
		if (vp == ap->a_vp)
			break;
		sep = AVL_NEXT(&sdp->sd_snaps, sep);
	}
	if (sep == NULL) {
		mutex_exit(&sdp->sd_lock);
		error = ENOENT;
	} else {
		size_t len;

		len = strlen(sep->se_name);
		*ap->a_buflen -= len;
		bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len);
		mutex_exit(&sdp->sd_lock);
		vhold(dvp);
		*ap->a_vpp = dvp;
	}
	VN_RELE(dvp);

	return (error);
}

/*
 * These VPs should never see the light of day.  They should always
 * be covered.
 */
static struct vop_vector zfsctl_ops_snapshot = {
	.vop_default =	&default_vnodeops,
	.vop_inactive =	zfsctl_snapshot_inactive,
	.vop_lookup =	zfsctl_snapshot_lookup,
	.vop_reclaim =	zfsctl_common_reclaim,
	.vop_getattr =	zfsctl_snapshot_getattr,
	.vop_fid =	zfsctl_snapshot_fid,
	.vop_vptocnp =	zfsctl_snapshot_vptocnp,
};

int
zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp, *vp;
	zfsctl_snapdir_t *sdp;
	zfsctl_node_t *zcp;
	zfs_snapentry_t *sep;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);
	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		vp = sep->se_root;
		zcp = vp->v_data;
		if (zcp->zc_id == objsetid)
			break;

		sep = AVL_NEXT(&sdp->sd_snaps, sep);
	}

	if (sep != NULL) {
		VN_HOLD(vp);
		/*
		 * Return the mounted root rather than the covered mount point.
		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
		 * and returns the ZFS vnode mounted on top of the GFS node.
		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
		 */
		error = traverse(&vp, LK_SHARED | LK_RETRY);
		if (error == 0) {
			if (vp == sep->se_root)
				error = EINVAL;
			else
				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
		}
		mutex_exit(&sdp->sd_lock);
		if (error == 0)
			VN_URELE(vp);
		else
			VN_RELE(vp);
	} else {
		error = EINVAL;
		mutex_exit(&sdp->sd_lock);
	}

	VN_RELE(dvp);

	return (error);
}

/*
 * Unmount any snapshots for the given filesystem.  This is called from
 * zfs_umount() - if we have a ctldir, then go through and unmount all the
 * snapshots.
 */
int
zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, cr, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		/*
		 * If this snapshot is not mounted, then it must
		 * have just been unmounted by somebody else, and
		 * will be cleaned up by zfsctl_snapdir_inactive().
		 */
		if (vn_ismntpt(sep->se_root)) {
			error = zfsctl_unmount_snap(sep, fflags, cr);
			if (error) {
				avl_index_t where;

				/*
				 * Before reinserting the snapshot into the
				 * tree, check whether it was actually removed.
				 * For example, when the snapshot mount point
				 * is busy we get an error here, but there is
				 * no need to reinsert the snapshot.
				 */
				if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
					avl_insert(&sdp->sd_snaps, sep, where);
				break;
			}
		}
		sep = next;
	}

	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	return (error);
}