zfs_log.c revision 6514:852c82a1989c
1132718Skan/*
2132718Skan * CDDL HEADER START
3132718Skan *
4132718Skan * The contents of this file are subject to the terms of the
5132718Skan * Common Development and Distribution License (the "License").
6132718Skan * You may not use this file except in compliance with the License.
7132718Skan *
8132718Skan * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9132718Skan * or http://www.opensolaris.org/os/licensing.
10132718Skan * See the License for the specific language governing permissions
11132718Skan * and limitations under the License.
12132718Skan *
13132718Skan * When distributing Covered Code, include this CDDL HEADER in each
14132718Skan * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15132718Skan * If applicable, add the following below this CDDL HEADER, with the
16132718Skan * fields enclosed by brackets "[]" replaced with your own identifying
17132718Skan * information: Portions Copyright [yyyy] [name of copyright owner]
18132718Skan *
19132718Skan * CDDL HEADER END
20132718Skan */
21132718Skan/*
22132718Skan * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23132718Skan * Use is subject to license terms.
24132718Skan */
25132718Skan
26132718Skan#pragma ident	"%Z%%M%	%I%	%E% SMI"
27132718Skan
28132718Skan#include <sys/types.h>
29132718Skan#include <sys/param.h>
30132718Skan#include <sys/systm.h>
31132718Skan#include <sys/sysmacros.h>
32132718Skan#include <sys/cmn_err.h>
33132718Skan#include <sys/kmem.h>
34132718Skan#include <sys/thread.h>
35132718Skan#include <sys/file.h>
36132718Skan#include <sys/vfs.h>
37132718Skan#include <sys/zfs_znode.h>
38132718Skan#include <sys/zfs_dir.h>
39132718Skan#include <sys/zil.h>
40132718Skan#include <sys/zil_impl.h>
41132718Skan#include <sys/byteorder.h>
42132718Skan#include <sys/policy.h>
43132718Skan#include <sys/stat.h>
44132718Skan#include <sys/mode.h>
45132718Skan#include <sys/acl.h>
46132718Skan#include <sys/dmu.h>
47132718Skan#include <sys/spa.h>
48132718Skan#include <sys/zfs_fuid.h>
49132718Skan#include <sys/ddi.h>
50132718Skan
/*
 * All the functions in this file are used to construct the log entries
 * to record transactions. They allocate an intent log transaction
 * structure (itx_t) and save within it all the information necessary to
 * possibly replay the transaction. The itx is then assigned a sequence
 * number and inserted in the in-memory list anchored in the zilog.
 */
58132718Skan
59132718Skanint
60132718Skanzfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
61132718Skan{
62132718Skan	int isxvattr = (vap->va_mask & AT_XVATTR);
63132718Skan	switch (type) {
64132718Skan	case Z_FILE:
65132718Skan		if (vsecp == NULL && !isxvattr)
66132718Skan			return (TX_CREATE);
67132718Skan		if (vsecp && isxvattr)
68132718Skan			return (TX_CREATE_ACL_ATTR);
69132718Skan		if (vsecp)
70132718Skan			return (TX_CREATE_ACL);
71132718Skan		else
72132718Skan			return (TX_CREATE_ATTR);
73132718Skan		/*NOTREACHED*/
74132718Skan	case Z_DIR:
75132718Skan		if (vsecp == NULL && !isxvattr)
76132718Skan			return (TX_MKDIR);
77132718Skan		if (vsecp && isxvattr)
78132718Skan			return (TX_MKDIR_ACL_ATTR);
79132718Skan		if (vsecp)
80132718Skan			return (TX_MKDIR_ACL);
81132718Skan		else
82132718Skan			return (TX_MKDIR_ATTR);
83132718Skan	case Z_XATTRDIR:
84132718Skan		return (TX_MKXATTR);
85132718Skan	}
86132718Skan	ASSERT(0);
87132718Skan	return (TX_MAX_TYPE);
88132718Skan}
89132718Skan
90132718Skan/*
91132718Skan * build up the log data necessary for logging xvattr_t
92132718Skan * First lr_attr_t is initialized.  following the lr_attr_t
93132718Skan * is the mapsize and attribute bitmap copied from the xvattr_t.
94132718Skan * Following the bitmap and bitmapsize two 64 bit words are reserved
95132718Skan * for the create time which may be set.  Following the create time
96132718Skan * records a single 64 bit integer which has the bits to set on
97132718Skan * replay for the xvattr.
98132718Skan */
99132718Skanstatic void
100132718Skanzfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
101132718Skan{
102132718Skan	uint32_t	*bitmap;
103132718Skan	uint64_t	*attrs;
104132718Skan	uint64_t	*crtime;
105132718Skan	xoptattr_t	*xoap;
106132718Skan	void		*scanstamp;
107132718Skan	int		i;
108132718Skan
109132718Skan	xoap = xva_getxoptattr(xvap);
110132718Skan	ASSERT(xoap);
111132718Skan
112132718Skan	lrattr->lr_attr_masksize = xvap->xva_mapsize;
113132718Skan	bitmap = &lrattr->lr_attr_bitmap;
114132718Skan	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
115132718Skan		*bitmap = xvap->xva_reqattrmap[i];
116132718Skan	}
117132718Skan
118132718Skan	/* Now pack the attributes up in a single uint64_t */
119132718Skan	attrs = (uint64_t *)bitmap;
120132718Skan	crtime = attrs + 1;
121132718Skan	scanstamp = (caddr_t)(crtime + 2);
122132718Skan	*attrs = 0;
123132718Skan	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
124132718Skan		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
125132718Skan		    XAT0_READONLY;
126132718Skan	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
127132718Skan		*attrs |= (xoap->xoa_hidden == 0) ? 0 :
128132718Skan		    XAT0_HIDDEN;
129132718Skan	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
130132718Skan		*attrs |= (xoap->xoa_system == 0) ? 0 :
131132718Skan		    XAT0_SYSTEM;
132132718Skan	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
133132718Skan		*attrs |= (xoap->xoa_archive == 0) ? 0 :
134132718Skan		    XAT0_ARCHIVE;
135132718Skan	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
136132718Skan		*attrs |= (xoap->xoa_immutable == 0) ? 0 :
137132718Skan		    XAT0_IMMUTABLE;
138132718Skan	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
139132718Skan		*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
140132718Skan		    XAT0_NOUNLINK;
141132718Skan	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
142132718Skan		*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
143132718Skan		    XAT0_APPENDONLY;
144132718Skan	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
145132718Skan		*attrs |= (xoap->xoa_opaque == 0) ? 0 :
146132718Skan		    XAT0_APPENDONLY;
147132718Skan	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
148132718Skan		*attrs |= (xoap->xoa_nodump == 0) ? 0 :
149132718Skan		    XAT0_NODUMP;
150132718Skan	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
151132718Skan		*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
152132718Skan		    XAT0_AV_QUARANTINED;
153132718Skan	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
154132718Skan		*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
155132718Skan		    XAT0_AV_MODIFIED;
156132718Skan	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
157132718Skan		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
158132718Skan	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
159132718Skan		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
160132718Skan}
161132718Skan
162132718Skanstatic void *
163132718Skanzfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
164132718Skan{
165132718Skan	zfs_fuid_t *zfuid;
166132718Skan	uint64_t *fuidloc = start;
167132718Skan
168132718Skan	/* First copy in the ACE FUIDs */
169132718Skan	for (zfuid = list_head(&fuidp->z_fuids); zfuid;
170132718Skan	    zfuid = list_next(&fuidp->z_fuids, zfuid)) {
171132718Skan		*fuidloc++ = zfuid->z_logfuid;
172132718Skan	}
173132718Skan	return (fuidloc);
174132718Skan}
175132718Skan
176132718Skan
177132718Skanstatic void *
178132718Skanzfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
179132718Skan{
180132718Skan	zfs_fuid_domain_t *zdomain;
181132718Skan
182132718Skan	/* now copy in the domain info, if any */
183132718Skan	if (fuidp->z_domain_str_sz != 0) {
184132718Skan		for (zdomain = list_head(&fuidp->z_domains); zdomain;
185132718Skan		    zdomain = list_next(&fuidp->z_domains, zdomain)) {
186132718Skan			bcopy((void *)zdomain->z_domain, start,
187132718Skan			    strlen(zdomain->z_domain) + 1);
188132718Skan			start = (caddr_t)start +
189132718Skan			    strlen(zdomain->z_domain) + 1;
190132718Skan		}
191132718Skan	}
192132718Skan	return (start);
193132718Skan}
194132718Skan
195132718Skan/*
196132718Skan * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
197132718Skan * TX_MKDIR_ATTR and TX_MKXATTR
198132718Skan * transactions.
199132718Skan *
200132718Skan * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
201132718Skan * domain information appended prior to the name.  In this case the
202132718Skan * uid/gid in the log record will be a log centric FUID.
203132718Skan *
204132718Skan * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
205132718Skan * may contain attributes, ACL and optional fuid information.
206132718Skan *
207132718Skan * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
208132718Skan * and ACL and normal users/groups in the ACEs.
209132718Skan *
210132718Skan * There may be an optional xvattr attribute information similar
211132718Skan * to zfs_log_setattr.
212132718Skan *
213132718Skan * Also, after the file name "domain" strings may be appended.
214132718Skan */
215132718Skanvoid
216132718Skanzfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
217132718Skan    znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
218132718Skan    zfs_fuid_info_t *fuidp, vattr_t *vap)
219132718Skan{
220132718Skan	itx_t *itx;
221132718Skan	uint64_t seq;
222132718Skan	lr_create_t *lr;
223132718Skan	lr_acl_create_t *lracl;
224132718Skan	size_t aclsize;
225132718Skan	size_t xvatsize = 0;
226132718Skan	size_t txsize;
227132718Skan	xvattr_t *xvap = (xvattr_t *)vap;
228132718Skan	void *end;
229132718Skan	size_t lrsize;
230132718Skan	size_t namesize = strlen(name) + 1;
231132718Skan	size_t fuidsz = 0;
232132718Skan
233132718Skan	if (zilog == NULL)
234132718Skan		return;
235132718Skan
236132718Skan	/*
237132718Skan	 * If we have FUIDs present then add in space for
238132718Skan	 * domains and ACE fuid's if any.
239132718Skan	 */
240132718Skan	if (fuidp) {
241132718Skan		fuidsz += fuidp->z_domain_str_sz;
242132718Skan		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
243132718Skan	}
244132718Skan
245132718Skan	if (vap->va_mask & AT_XVATTR)
246132718Skan		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
247132718Skan
248132718Skan	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
249132718Skan	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
250132718Skan	    (int)txtype == TX_MKXATTR) {
251132718Skan		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
252132718Skan		lrsize = sizeof (*lr);
253132718Skan	} else {
254132718Skan		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
255132718Skan		txsize =
256132718Skan		    sizeof (lr_acl_create_t) + namesize + fuidsz +
257132718Skan		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
258132718Skan		lrsize = sizeof (lr_acl_create_t);
259132718Skan	}
260132718Skan
261132718Skan	itx = zil_itx_create(txtype, txsize);
262132718Skan
263132718Skan	lr = (lr_create_t *)&itx->itx_lr;
264132718Skan	lr->lr_doid = dzp->z_id;
265132718Skan	lr->lr_foid = zp->z_id;
266132718Skan	lr->lr_mode = zp->z_phys->zp_mode;
267132718Skan	if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
268132718Skan		lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
269132718Skan	} else {
270132718Skan		lr->lr_uid = fuidp->z_fuid_owner;
271132718Skan	}
272132718Skan	if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
273132718Skan		lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
274132718Skan	} else {
275132718Skan		lr->lr_gid = fuidp->z_fuid_group;
276132718Skan	}
277132718Skan	lr->lr_gen = zp->z_phys->zp_gen;
278132718Skan	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
279132718Skan	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
280132718Skan	lr->lr_rdev = zp->z_phys->zp_rdev;
281132718Skan
282132718Skan	/*
283132718Skan	 * Fill in xvattr info if any
284132718Skan	 */
285132718Skan	if (vap->va_mask & AT_XVATTR) {
286132718Skan		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
287132718Skan		end = (caddr_t)lr + lrsize + xvatsize;
288132718Skan	} else {
289132718Skan		end = (caddr_t)lr + lrsize;
290132718Skan	}
291132718Skan
292132718Skan	/* Now fill in any ACL info */
293132718Skan
294132718Skan	if (vsecp) {
295132718Skan		lracl = (lr_acl_create_t *)&itx->itx_lr;
296132718Skan		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
297132718Skan		lracl->lr_acl_bytes = aclsize;
298132718Skan		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
299132718Skan		lracl->lr_fuidcnt  = fuidp ? fuidp->z_fuid_cnt : 0;
300132718Skan		if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
301132718Skan			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
302132718Skan		else
303132718Skan			lracl->lr_acl_flags = 0;
304132718Skan
305132718Skan		bcopy(vsecp->vsa_aclentp, end, aclsize);
306132718Skan		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
307132718Skan	}
308132718Skan
309132718Skan	/* drop in FUID info */
310132718Skan	if (fuidp) {
311132718Skan		end = zfs_log_fuid_ids(fuidp, end);
312132718Skan		end = zfs_log_fuid_domains(fuidp, end);
313132718Skan	}
314132718Skan	/*
315132718Skan	 * Now place file name in log record
316132718Skan	 */
317132718Skan	bcopy(name, end, namesize);
318132718Skan
319132718Skan	seq = zil_itx_assign(zilog, itx, tx);
320132718Skan	dzp->z_last_itx = seq;
321132718Skan	zp->z_last_itx = seq;
322132718Skan}
323132718Skan
324132718Skan/*
325132718Skan * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
326132718Skan */
327132718Skanvoid
328132718Skanzfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
329132718Skan	znode_t *dzp, char *name)
330132718Skan{
331132718Skan	itx_t *itx;
332132718Skan	uint64_t seq;
333132718Skan	lr_remove_t *lr;
334132718Skan	size_t namesize = strlen(name) + 1;
335132718Skan
336132718Skan	if (zilog == NULL)
337132718Skan		return;
338132718Skan
339132718Skan	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
340132718Skan	lr = (lr_remove_t *)&itx->itx_lr;
341132718Skan	lr->lr_doid = dzp->z_id;
342132718Skan	bcopy(name, (char *)(lr + 1), namesize);
343132718Skan
344132718Skan	seq = zil_itx_assign(zilog, itx, tx);
345132718Skan	dzp->z_last_itx = seq;
346132718Skan}
347132718Skan
348132718Skan/*
349132718Skan * zfs_log_link() handles TX_LINK transactions.
350132718Skan */
351132718Skanvoid
352132718Skanzfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
353132718Skan	znode_t *dzp, znode_t *zp, char *name)
354132718Skan{
355132718Skan	itx_t *itx;
356132718Skan	uint64_t seq;
357132718Skan	lr_link_t *lr;
358132718Skan	size_t namesize = strlen(name) + 1;
359132718Skan
360132718Skan	if (zilog == NULL)
361132718Skan		return;
362132718Skan
363132718Skan	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
364132718Skan	lr = (lr_link_t *)&itx->itx_lr;
365132718Skan	lr->lr_doid = dzp->z_id;
366132718Skan	lr->lr_link_obj = zp->z_id;
367132718Skan	bcopy(name, (char *)(lr + 1), namesize);
368132718Skan
369132718Skan	seq = zil_itx_assign(zilog, itx, tx);
370132718Skan	dzp->z_last_itx = seq;
371132718Skan	zp->z_last_itx = seq;
372132718Skan}
373132718Skan
374132718Skan/*
375132718Skan * zfs_log_symlink() handles TX_SYMLINK transactions.
376132718Skan */
377132718Skanvoid
378132718Skanzfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
379132718Skan    znode_t *dzp, znode_t *zp, char *name, char *link)
380132718Skan{
381132718Skan	itx_t *itx;
382132718Skan	uint64_t seq;
383132718Skan	lr_create_t *lr;
384132718Skan	size_t namesize = strlen(name) + 1;
385132718Skan	size_t linksize = strlen(link) + 1;
386132718Skan
387132718Skan	if (zilog == NULL)
388132718Skan		return;
389132718Skan
390132718Skan	itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
391132718Skan	lr = (lr_create_t *)&itx->itx_lr;
392132718Skan	lr->lr_doid = dzp->z_id;
393132718Skan	lr->lr_foid = zp->z_id;
394132718Skan	lr->lr_mode = zp->z_phys->zp_mode;
395132718Skan	lr->lr_uid = zp->z_phys->zp_uid;
396132718Skan	lr->lr_gid = zp->z_phys->zp_gid;
397132718Skan	lr->lr_gen = zp->z_phys->zp_gen;
398132718Skan	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
399132718Skan	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
400132718Skan	bcopy(name, (char *)(lr + 1), namesize);
401132718Skan	bcopy(link, (char *)(lr + 1) + namesize, linksize);
402132718Skan
403132718Skan	seq = zil_itx_assign(zilog, itx, tx);
404132718Skan	dzp->z_last_itx = seq;
405132718Skan	zp->z_last_itx = seq;
406132718Skan}
407132718Skan
408132718Skan/*
409132718Skan * zfs_log_rename() handles TX_RENAME transactions.
410132718Skan */
411132718Skanvoid
412132718Skanzfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
413132718Skan	znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
414132718Skan{
415132718Skan	itx_t *itx;
416132718Skan	uint64_t seq;
417132718Skan	lr_rename_t *lr;
418132718Skan	size_t snamesize = strlen(sname) + 1;
419132718Skan	size_t dnamesize = strlen(dname) + 1;
420132718Skan
421132718Skan	if (zilog == NULL)
422132718Skan		return;
423132718Skan
424132718Skan	itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
425132718Skan	lr = (lr_rename_t *)&itx->itx_lr;
426132718Skan	lr->lr_sdoid = sdzp->z_id;
427132718Skan	lr->lr_tdoid = tdzp->z_id;
428132718Skan	bcopy(sname, (char *)(lr + 1), snamesize);
429132718Skan	bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
430132718Skan
431132718Skan	seq = zil_itx_assign(zilog, itx, tx);
432132718Skan	sdzp->z_last_itx = seq;
433132718Skan	tdzp->z_last_itx = seq;
434132718Skan	szp->z_last_itx = seq;
435132718Skan}
436132718Skan
437132718Skan/*
438132718Skan * zfs_log_write() handles TX_WRITE transactions.
439132718Skan */
440132718Skanssize_t zfs_immediate_write_sz = 32768;
441132718Skan
442132718Skan#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
443132718Skan    sizeof (lr_write_t))
444132718Skan
445132718Skanvoid
446132718Skanzfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
447132718Skan	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
448132718Skan{
449132718Skan	itx_wr_state_t write_state;
450132718Skan	boolean_t slogging;
451132718Skan	uintptr_t fsync_cnt;
452132718Skan
453132718Skan	if (zilog == NULL || zp->z_unlinked)
454132718Skan		return;
455132718Skan
456132718Skan	/*
457132718Skan	 * Writes are handled in three different ways:
458132718Skan	 *
459132718Skan	 * WR_INDIRECT:
460132718Skan	 *    If the write is greater than zfs_immediate_write_sz and there are
461132718Skan	 *    no separate logs in this pool then later *if* we need to log the
462132718Skan	 *    write then dmu_sync() is used to immediately write the block and
463132718Skan	 *    its block pointer is put in the log record.
464132718Skan	 * WR_COPIED:
465132718Skan	 *    If we know we'll immediately be committing the
466132718Skan	 *    transaction (FSYNC or FDSYNC), the we allocate a larger
467132718Skan	 *    log record here for the data and copy the data in.
468132718Skan	 * WR_NEED_COPY:
469132718Skan	 *    Otherwise we don't allocate a buffer, and *if* we need to
470132718Skan	 *    flush the write later then a buffer is allocated and
471132718Skan	 *    we retrieve the data using the dmu.
472132718Skan	 */
473132718Skan	slogging = spa_has_slogs(zilog->zl_spa);
474132718Skan	if (resid > zfs_immediate_write_sz && !slogging)
475132718Skan		write_state = WR_INDIRECT;
476132718Skan	else if (ioflag & (FSYNC | FDSYNC))
477132718Skan		write_state = WR_COPIED;
478132718Skan	else
479132718Skan		write_state = WR_NEED_COPY;
480132718Skan
481132718Skan	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
482132718Skan		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
483132718Skan	}
484132718Skan
485132718Skan	while (resid) {
486132718Skan		itx_t *itx;
487132718Skan		lr_write_t *lr;
488132718Skan		ssize_t len;
489132718Skan
490132718Skan		/*
491132718Skan		 * If there are slogs and the write would overflow the largest
492132718Skan		 * block, then because we don't want to use the main pool
493132718Skan		 * to dmu_sync, we have to split the write.
494132718Skan		 */
495132718Skan		if (slogging && resid > ZIL_MAX_LOG_DATA)
496132718Skan			len = SPA_MAXBLOCKSIZE >> 1;
497132718Skan		else
498132718Skan			len = resid;
499132718Skan
500132718Skan		itx = zil_itx_create(txtype, sizeof (*lr) +
501132718Skan		    (write_state == WR_COPIED ? len : 0));
502132718Skan		lr = (lr_write_t *)&itx->itx_lr;
503132718Skan		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
504132718Skan		    zp->z_id, off, len, lr + 1) != 0) {
505132718Skan			kmem_free(itx, offsetof(itx_t, itx_lr) +
506132718Skan			    itx->itx_lr.lrc_reclen);
507132718Skan			itx = zil_itx_create(txtype, sizeof (*lr));
508132718Skan			lr = (lr_write_t *)&itx->itx_lr;
509132718Skan			write_state = WR_NEED_COPY;
510132718Skan		}
511132718Skan
512132718Skan		itx->itx_wr_state = write_state;
513132718Skan		if (write_state == WR_NEED_COPY)
514132718Skan			itx->itx_sod += len;
515132718Skan		lr->lr_foid = zp->z_id;
516132718Skan		lr->lr_offset = off;
517132718Skan		lr->lr_length = len;
518132718Skan		lr->lr_blkoff = 0;
519132718Skan		BP_ZERO(&lr->lr_blkptr);
520132718Skan
521132718Skan		itx->itx_private = zp->z_zfsvfs;
522132718Skan
523132718Skan		if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
524132718Skan		    (ioflag & (FSYNC | FDSYNC)))
525132718Skan			itx->itx_sync = B_TRUE;
526132718Skan		else
527132718Skan			itx->itx_sync = B_FALSE;
528132718Skan
529132718Skan		zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
530132718Skan
531132718Skan		off += len;
532132718Skan		resid -= len;
533132718Skan	}
534132718Skan}
535132718Skan
536132718Skan/*
537132718Skan * zfs_log_truncate() handles TX_TRUNCATE transactions.
538132718Skan */
539132718Skanvoid
540132718Skanzfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
541132718Skan	znode_t *zp, uint64_t off, uint64_t len)
542132718Skan{
543132718Skan	itx_t *itx;
544132718Skan	uint64_t seq;
545132718Skan	lr_truncate_t *lr;
546132718Skan
547132718Skan	if (zilog == NULL || zp->z_unlinked)
548132718Skan		return;
549132718Skan
550132718Skan	itx = zil_itx_create(txtype, sizeof (*lr));
551132718Skan	lr = (lr_truncate_t *)&itx->itx_lr;
552132718Skan	lr->lr_foid = zp->z_id;
553132718Skan	lr->lr_offset = off;
554132718Skan	lr->lr_length = len;
555132718Skan
556132718Skan	itx->itx_sync = (zp->z_sync_cnt != 0);
557132718Skan	seq = zil_itx_assign(zilog, itx, tx);
558132718Skan	zp->z_last_itx = seq;
559132718Skan}
560132718Skan
561132718Skan/*
562132718Skan * zfs_log_setattr() handles TX_SETATTR transactions.
563132718Skan */
564132718Skanvoid
565132718Skanzfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
566132718Skan	znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
567132718Skan{
568132718Skan	itx_t		*itx;
569132718Skan	uint64_t	seq;
570132718Skan	lr_setattr_t	*lr;
571132718Skan	xvattr_t	*xvap = (xvattr_t *)vap;
572132718Skan	size_t		recsize = sizeof (lr_setattr_t);
573132718Skan	void		*start;
574132718Skan
575132718Skan
576132718Skan	if (zilog == NULL || zp->z_unlinked)
577132718Skan		return;
578132718Skan
579132718Skan	/*
580132718Skan	 * If XVATTR set, then log record size needs to allow
581132718Skan	 * for lr_attr_t + xvattr mask, mapsize and create time
582132718Skan	 * plus actual attribute values
583132718Skan	 */
584132718Skan	if (vap->va_mask & AT_XVATTR)
585132718Skan		recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
586132718Skan
587132718Skan	if (fuidp)
588132718Skan		recsize += fuidp->z_domain_str_sz;
589132718Skan
590132718Skan	itx = zil_itx_create(txtype, recsize);
591132718Skan	lr = (lr_setattr_t *)&itx->itx_lr;
592132718Skan	lr->lr_foid = zp->z_id;
593132718Skan	lr->lr_mask = (uint64_t)mask_applied;
594132718Skan	lr->lr_mode = (uint64_t)vap->va_mode;
595132718Skan	if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
596132718Skan		lr->lr_uid = fuidp->z_fuid_owner;
597132718Skan	else
598132718Skan		lr->lr_uid = (uint64_t)vap->va_uid;
599132718Skan
600132718Skan	if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
601132718Skan		lr->lr_gid = fuidp->z_fuid_group;
602132718Skan	else
603132718Skan		lr->lr_gid = (uint64_t)vap->va_gid;
604132718Skan
605132718Skan	lr->lr_size = (uint64_t)vap->va_size;
606132718Skan	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
607132718Skan	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
608132718Skan	start = (lr_setattr_t *)(lr + 1);
609132718Skan	if (vap->va_mask & AT_XVATTR) {
610132718Skan		zfs_log_xvattr((lr_attr_t *)start, xvap);
611132718Skan		start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
612132718Skan	}
613132718Skan
614132718Skan	/*
615132718Skan	 * Now stick on domain information if any on end
616132718Skan	 */
617132718Skan
618132718Skan	if (fuidp)
619132718Skan		(void) zfs_log_fuid_domains(fuidp, start);
620132718Skan
621132718Skan	itx->itx_sync = (zp->z_sync_cnt != 0);
622132718Skan	seq = zil_itx_assign(zilog, itx, tx);
623132718Skan	zp->z_last_itx = seq;
624132718Skan}
625132718Skan
626132718Skan/*
627132718Skan * zfs_log_acl() handles TX_ACL transactions.
628132718Skan */
629132718Skanvoid
630132718Skanzfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
631132718Skan    vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
632132718Skan{
633132718Skan	itx_t *itx;
634132718Skan	uint64_t seq;
635132718Skan	lr_acl_v0_t *lrv0;
636132718Skan	lr_acl_t *lr;
637132718Skan	int txtype;
638132718Skan	int lrsize;
639132718Skan	size_t txsize;
640132718Skan	size_t aclbytes = vsecp->vsa_aclentsz;
641132718Skan
642132718Skan	if (zilog == NULL || zp->z_unlinked)
643132718Skan		return;
644132718Skan
645132718Skan	txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
646132718Skan	    TX_ACL_V0 : TX_ACL;
647132718Skan
648132718Skan	if (txtype == TX_ACL)
649132718Skan		lrsize = sizeof (*lr);
650132718Skan	else
651132718Skan		lrsize = sizeof (*lrv0);
652132718Skan
653132718Skan	txsize = lrsize +
654132718Skan	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
655132718Skan	    (fuidp ? fuidp->z_domain_str_sz : 0) +
656132718Skan	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
657132718Skan
658132718Skan	itx = zil_itx_create(txtype, txsize);
659132718Skan
660132718Skan	lr = (lr_acl_t *)&itx->itx_lr;
661132718Skan	lr->lr_foid = zp->z_id;
662132718Skan	if (txtype == TX_ACL) {
663132718Skan		lr->lr_acl_bytes = aclbytes;
664132718Skan		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
665132718Skan		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
666132718Skan		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
667132718Skan			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
668132718Skan		else
669132718Skan			lr->lr_acl_flags = 0;
670132718Skan	}
671132718Skan	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
672132718Skan
673132718Skan	if (txtype == TX_ACL_V0) {
674132718Skan		lrv0 = (lr_acl_v0_t *)lr;
675132718Skan		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
676132718Skan	} else {
677132718Skan		void *start = (ace_t *)(lr + 1);
678132718Skan
679132718Skan		bcopy(vsecp->vsa_aclentp, start, aclbytes);
680132718Skan
681132718Skan		start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
682132718Skan
683132718Skan		if (fuidp) {
684132718Skan			start = zfs_log_fuid_ids(fuidp, start);
685132718Skan			(void) zfs_log_fuid_domains(fuidp, start);
686132718Skan		}
687132718Skan	}
688132718Skan
689132718Skan	itx->itx_sync = (zp->z_sync_cnt != 0);
690132718Skan	seq = zil_itx_assign(zilog, itx, tx);
691132718Skan	zp->z_last_itx = seq;
692132718Skan}
693132718Skan