/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>
#include <sys/random.h>
#include <sys/policy.h>
#ifdef __FreeBSD__
#include <sys/kcondvar.h>
#include <sys/callb.h>
#include <sys/smp.h>
#endif
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/extdirent.h>

/*
 * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
 * of names after deciding which is the appropriate lookup interface.
 */
static int
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
    boolean_t exact, uint64_t *zoid)
{
	int error;

	if (zfsvfs->z_norm) {
		matchtype_t mt = exact ? MT_EXACT : MT_FIRST;

		/*
		 * In the non-mixed case we only expect there would ever
		 * be one match, but we need to use the normalizing lookup.
		 */
		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
		    zoid, mt, NULL, 0, NULL);
	} else {
		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
	}
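	/*
	 * The 8-byte ZAP value packs the entry's dirent type into its
	 * upper bits (see zfs_dirent() below); keep only the object number.
	 */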
	*zoid = ZFS_DIRENT_OBJ(*zoid);

	return (error);
}

/*
 * Look up a directory entry under a locked vnode.
 * dvp being locked gives us a guarantee that there are no concurrent
 * modifications of the directory and, thus, if a node can be found in
 * the directory, then it must not be unlinked.
 *
 * Input arguments:
 *	dzp	- znode for directory
 *	name	- name of entry to look up
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
 *		  ZXATTR: we want dzp's xattr directory
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..'.
 */
int
zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
{
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	boolean_t	exact;
	uint64_t	zoid;
	vnode_t		*vp = NULL;
	int		error = 0;

	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);

	*zpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if ((name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
		return (SET_ERROR(EEXIST));

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect how we perform zap lookups.
	 *
	 * Decide if exact matches should be requested when performing
	 * a zap lookup on file systems supporting case-insensitive
	 * access.
	 *
	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
	 * because in that case MT_EXACT and MT_FIRST should produce exactly
	 * the same result.
	 */
	exact = zfsvfs->z_case == ZFS_CASE_MIXED;

	if (dzp->z_unlinked && !(flag & ZXATTR))
		return (ENOENT);
	if (flag & ZXATTR) {
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? ENOENT : 0);
	} else {
		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			return (SET_ERROR(EEXIST));
		}
		error = zfs_zget(zfsvfs, zoid, zpp);
		if (error)
			return (error);
		ASSERT(!(*zpp)->z_unlinked);
	}

	return (0);
}

static int
zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	znode_t *zp;
	uint64_t parent;
	int error;

	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));

	if (dzp->z_unlinked)
		return (ENOENT);

	if ((error = sa_lookup(dzp->z_sa_hdl,
	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
		return (error);

	error = zfs_zget(zfsvfs, parent, &zp);
	if (error == 0)
		*zpp = zp;
	return (error);
}

int
zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	znode_t *zp;
	int error = 0;

	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));

	if (dzp->z_unlinked)
		return (SET_ERROR(ENOENT));

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		*zpp = dzp;
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		error = zfs_dd_lookup(dzp, zpp);
	} else {
		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
		if (error == 0) {
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
			*zpp = zp;
		}
	}
	return (error);
}

/*
 * Unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the unlinked set is using a fat zap (i.e. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp->z_unlinked);
	ASSERT(zp->z_links == 0);

	VERIFY3U(0, ==,
	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
}
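
/*
 * A typical caller pattern, sketched here for reference, shows the
 * trade-off described above: the hold on the unlinked set is taken with
 * add == FALSE and name == NULL even though zfs_unlinked_add() may end
 * up adding an entry (compare zfs_purgedir() and zfs_rmnode() below):
 *
 *	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 *	...
 *	if (unlinked)
 *		zfs_unlinked_add(zp, tx);
 */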

/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	zap_cursor_t	zc;
	zap_attribute_t zap;
	dmu_object_info_t doi;
	znode_t		*zp;
	int		error;

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, &zap) == 0;
	    zap_cursor_advance(&zc)) {

		/*
		 * See what kind of object we have in the list.
		 */

		error = dmu_object_info(zfsvfs->z_os,
		    zap.za_first_integer, &doi);
		if (error != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 */
		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);

		/*
		 * We may pick up znodes that are already marked for deletion.
		 * This could happen during the purge of an extended attribute
		 * directory.  All we need to do is skip over them, since they
		 * are already in the system marked z_unlinked.
		 */
		if (error != 0)
			continue;

		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
		zp->z_unlinked = B_TRUE;
		vput(ZTOV(zp));
	}
	zap_cursor_fini(&zc);
}

/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted. If we encounter
 * an error, return a count of at least one so that the directory stays
 * in the unlinked set.
 *
 * NOTE: this function assumes that the directory is inactive,
 *	so there is no need to lock its entries before deletion.
 *	Also, it assumes the directory contents are *only* regular
 *	files and symlinks.
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	znode_t		*xzp;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	int skipped = 0;
	int error;

	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
	    zap_cursor_advance(&zc)) {
		error = zfs_zget(zfsvfs,
		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
		if (error) {
			skipped += 1;
			continue;
		}

		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
		ASSERT((ZTOV(xzp)->v_type == VREG) ||
		    (ZTOV(xzp)->v_type == VLNK));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		/* Is this really needed? */
		zfs_sa_upgrade_txholds(tx, xzp);
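		/*
		 * This transaction only frees space, so mark it net-free;
		 * that should let it be assigned even when the pool is low
		 * on free space.
		 */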
		dmu_tx_mark_netfree(tx);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			vput(ZTOV(xzp));
			skipped += 1;
			continue;
		}

		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
		if (error)
			skipped += 1;
		dmu_tx_commit(tx);

		vput(ZTOV(xzp));
	}
	zap_cursor_fini(&zc);
	if (error != ENOENT)
		skipped += 1;
	return (skipped);
}

void
zfs_rmnode(znode_t *zp)
{
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os = zfsvfs->z_os;
	znode_t		*xzp = NULL;
	dmu_tx_t	*tx;
	uint64_t	acl_obj;
	uint64_t	xattr_obj;
	int		error;

	ASSERT(zp->z_links == 0);
#ifndef __NetBSD__
	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
#endif

	/*
	 * If this is an attribute directory, purge its contents.
	 */
	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
	    (zp->z_pflags & ZFS_XATTR)) {
		if (zfs_purgedir(zp) != 0) {
			/*
			 * Not enough space to delete some xattrs.
			 * Leave it in the unlinked set.
			 */
			zfs_znode_dmu_fini(zp);
			zfs_znode_free(zp);
			return;
		}
	} else {
		/*
		 * Free up all the data in the file.  We don't do this for
		 * XATTR directories because we need truncate and remove to be
		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
		 * we crash here we'll end up with an inconsistent truncated
		 * zap object in the delete queue.  Note a truncated file is
		 * harmless since it only contains user data.
		 */
		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
		if (error) {
			/*
			 * Not enough space.  Leave the file in the unlinked
			 * set.
			 */
			zfs_znode_dmu_fini(zp);
			zfs_znode_free(zp);
			return;
		}
	}

	/*
	 * If the file has extended attributes, we're going to unlink
	 * the xattr dir.
	 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT3S(error, ==, 0);
		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
	}

	acl_obj = zfs_external_acl(zp);

	/*
	 * Set up the final transaction.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	if (xzp) {
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}
	if (acl_obj)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/*
		 * Not enough space to delete the file.  Leave it in the
		 * unlinked set, leaking it until the fs is remounted (at
		 * which point we'll call zfs_unlinked_drain() to process it).
		 */
		dmu_tx_abort(tx);
		zfs_znode_dmu_fini(zp);
		zfs_znode_free(zp);
		goto out;
	}

	if (xzp) {
		ASSERT(error == 0);
		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
		xzp->z_links = 0;	/* no more links to it */
		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
		    &xzp->z_links, sizeof (xzp->z_links), tx));
		zfs_unlinked_add(xzp, tx);
	}

	/* Remove this znode from the unlinked set */
	VERIFY3U(0, ==,
	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

	zfs_znode_delete(zp, tx);

	dmu_tx_commit(tx);
out:
	if (xzp)
		vput(ZTOV(xzp));
}

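/*
 * Construct the on-disk directory entry value for zp: the object number,
 * with the entry's d_type (derived from the file mode) packed into the
 * top four bits when the ZPL version is new enough to record it.  The
 * lookup path strips those bits back off with ZFS_DIRENT_OBJ().
 */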
static uint64_t
zfs_dirent(znode_t *zp, uint64_t mode)
{
	uint64_t de = zp->z_id;

	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
		de |= IFTODT(mode) << 60;
	return (de);
}

/*
 * Link zp into dzp.  Can only fail if zp has been unlinked.
 */
int
zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	uint64_t value;
	int zp_is_dir = (vp->v_type == VDIR);
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	int count = 0;
	int error;

	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
#if 0
	if (zp_is_dir) {
		error = 0;
		if (dzp->z_links >= LINK_MAX)
			error = SET_ERROR(EMLINK);
		return (error);
	}
#endif
	if (!(flag & ZRENAMING)) {
		if (zp->z_unlinked) {	/* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			return (SET_ERROR(ENOENT));
		}
#if 0
		if (zp->z_links >= LINK_MAX) {
			return (SET_ERROR(EMLINK));
		}
#endif
		zp->z_links++;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
		    &zp->z_links, sizeof (zp->z_links));

	} else {
		ASSERT(zp->z_unlinked == 0);
	}
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &dzp->z_id, sizeof (dzp->z_id));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (!(flag & ZNEW)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime, B_TRUE);
	}
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	dzp->z_size++;
	dzp->z_links += zp_is_dir;
	count = 0;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &dzp->z_links, sizeof (dzp->z_links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	value = zfs_dirent(zp, zp->z_mode);
	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
	    8, 1, &value, tx);
	VERIFY0(error);

	return (0);
}

static int
zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag)
{
	int error;

	if (zp->z_zfsvfs->z_norm) {
		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
			error = zap_remove_norm(zp->z_zfsvfs->z_os,
			    dzp->z_id, name, MT_EXACT, tx);
		else
			error = zap_remove_norm(zp->z_zfsvfs->z_os,
			    dzp->z_id, name, MT_FIRST, tx);
	} else {
		error = zap_remove(zp->z_zfsvfs->z_os,
		    dzp->z_id, name, tx);
	}

	return (error);
}

/*
 * Unlink zp from dzp, and mark zp for deletion if this was the last link.
 * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 * and it's the caller's job to do it.
 */
int
zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
    int flag, boolean_t *unlinkedp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	int zp_is_dir = (vp->v_type == VDIR);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	int count = 0;
	int error;

	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);

	if (!(flag & ZRENAMING)) {

		if (zp_is_dir && !zfs_dirempty(zp)) {
#ifdef illumos
			return (SET_ERROR(EEXIST));
#else
			return (SET_ERROR(ENOTEMPTY));
#endif
		}

		/*
		 * If we get here, we are going to try to remove the object.
		 * First try removing the name from the directory; if that
		 * fails, return the error.
		 */
		error = zfs_dropname(dzp, name, zp, tx, flag);
		if (error != 0) {
			return (error);
		}

		if (zp->z_links <= zp_is_dir) {
			zfs_panic_recover("zfs: link count on vnode %p is %u, "
			    "should be at least %u", zp->z_vnode,
			    (int)zp->z_links,
			    zp_is_dir + 1);
			zp->z_links = zp_is_dir + 1;
		}
		if (--zp->z_links == zp_is_dir) {
			zp->z_unlinked = B_TRUE;
			zp->z_links = 0;
			unlinked = B_TRUE;
		} else {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
			    NULL, &ctime, sizeof (ctime));
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
			    B_TRUE);
		}
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
		    NULL, &zp->z_links, sizeof (zp->z_links));
		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		count = 0;
		ASSERT0(error);
	} else {
		ASSERT(zp->z_unlinked == 0);
		error = zfs_dropname(dzp, name, zp, tx, flag);
		if (error != 0)
			return (error);
	}

	dzp->z_size--;		/* one dirent removed */
	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &dzp->z_links, sizeof (dzp->z_links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
	    NULL, ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
	    NULL, mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}

/*
 * Indicate whether the directory is empty.
 */
boolean_t
zfs_dirempty(znode_t *dzp)
{
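	/*
	 * A directory's z_size is initialized to 2, accounting for the
	 * implicit "." and ".." entries, so a size of 2 means no real
	 * entries remain.
	 */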
	return (dzp->z_size == 2);
}

int
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	uint64_t parent;

	*xvpp = NULL;

	/*
	 * In FreeBSD, access checking for creating an EA is done
	 * in zfs_setextattr().
	 */
#ifndef __FreeBSD_kernel__
	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
		return (error);
#endif

	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
	    &acl_ids)) != 0)
		return (error);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		return (SET_ERROR(EDQUOT));
	}

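	/*
	 * Reserve a vnode up front, presumably so that allocating the new
	 * vnode in zfs_mknode() below does not have to block on vnode
	 * reclamation while the transaction is open.
	 */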
	getnewvnode_reserve(1);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		return (error);
	}
	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

#ifdef DEBUG
	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent));
	ASSERT(error == 0 && parent == zp->z_id);
#endif

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
	    sizeof (xzp->z_id), tx));

	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
	    xzp, "", NULL, acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	*xvpp = ZTOV(xzp);

	return (0);
}

/*
 * Return a znode for the extended attribute directory for zp.
 * ** If the directory does not already exist, it is created **
 *
 *	IN:	zp	- znode to obtain attribute directory from
 *		cr	- credentials of caller
 *		flags	- flags from the VOP_LOOKUP call
 *
 *	OUT:	xvpp	- pointer to extended attribute directory vnode
 *
 *	RETURN:	0 on success
 *		error number on failure
 */
int
zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
{
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	znode_t		*xzp;
	vattr_t		va;
	int		error;
top:
	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
	if (error)
		return (error);

	if (xzp != NULL) {
		*xvpp = ZTOV(xzp);
		return (0);
	}

	if (!(flags & CREATE_XATTR_DIR)) {
#ifdef illumos
		return (SET_ERROR(ENOENT));
#else
		return (SET_ERROR(ENOATTR));
#endif
	}

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		return (SET_ERROR(EROFS));
	}

	/*
	 * The ability to 'create' files in an attribute
	 * directory comes from the write_xattr permission on the base file.
	 *
	 * The ability to 'search' an attribute directory requires
	 * read_xattr permission on the base file.
	 *
	 * Once in a directory the ability to read/write attributes
	 * is controlled by the permissions on the attribute file.
	 */
	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
	va.va_type = VDIR;
	va.va_mode = S_IFDIR | S_ISVTX | 0777;
	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);

	error = zfs_make_xattrdir(zp, &va, xvpp, cr);

	if (error == ERESTART) {
		/* NB: we already did dmu_tx_wait() if necessary */
		goto top;
	}
	if (error == 0)
		VOP_UNLOCK(*xvpp, 0);

	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 *
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if:
 *
 *	you own the directory,
 *	you own the entry,
 *	the entry is a plain file and you have write access,
 *	or you are privileged (checked in secpolicy...).
 *
 * The function returns 0 if remove access is granted.
 */
int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
	uid_t		uid;
	uid_t		downer;
	uid_t		fowner;
	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;

	if (zdp->z_zfsvfs->z_replay)
		return (0);

	if ((zdp->z_mode & S_ISVTX) == 0)
		return (0);

	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);

	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
	    (ZTOV(zp)->v_type == VREG &&
	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
		return (0);
	else
		return (secpolicy_vnode_remove(ZTOV(zp), cr));
}