zfs_replay.c revision 209962
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysmacros.h>
32#include <sys/cmn_err.h>
33#include <sys/kmem.h>
34#include <sys/file.h>
35#include <sys/fcntl.h>
36#include <sys/vfs.h>
37#include <sys/fs/zfs.h>
38#include <sys/zfs_znode.h>
39#include <sys/zfs_dir.h>
40#include <sys/zfs_acl.h>
41#include <sys/zfs_fuid.h>
42#include <sys/spa.h>
43#include <sys/zil.h>
44#include <sys/byteorder.h>
45#include <sys/stat.h>
46#include <sys/acl.h>
47#include <sys/atomic.h>
48#include <sys/cred.h>
49#include <sys/namei.h>
50
51/*
52 * Functions to replay ZFS intent log (ZIL) records
53 * The functions are called through a function vector (zfs_replay_vector)
54 * which is indexed by the transaction type.
55 */
56
57static void
58zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
59	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
60{
61	VATTR_NULL(vap);
62	vap->va_mask = (uint_t)mask;
63	if (mask & AT_TYPE)
64		vap->va_type = IFTOVT(mode);
65	if (mask & AT_MODE)
66		vap->va_mode = mode & MODEMASK;
67	if (mask & AT_UID)
68		vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
69	if (mask & AT_GID)
70		vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
71	vap->va_rdev = zfs_cmpldev(rdev);
72	vap->va_nodeid = nodeid;
73}
74
75/* ARGSUSED */
76static int
77zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
78{
79	return (ENOTSUP);
80}
81
82static void
83zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
84{
85	xoptattr_t *xoap = NULL;
86	uint64_t *attrs;
87	uint64_t *crtime;
88	uint32_t *bitmap;
89	void *scanstamp;
90	int i;
91
92	xvap->xva_vattr.va_mask |= AT_XVATTR;
93	if ((xoap = xva_getxoptattr(xvap)) == NULL) {
94		xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
95		return;
96	}
97
98	ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
99
100	bitmap = &lrattr->lr_attr_bitmap;
101	for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
102		xvap->xva_reqattrmap[i] = *bitmap;
103
104	attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
105	crtime = attrs + 1;
106	scanstamp = (caddr_t)(crtime + 2);
107
108	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
109		xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
110	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
111		xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
112	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
113		xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
114	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
115		xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
116	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
117		xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
118	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
119		xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
120	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
121		xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
122	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
123		xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
124	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
125		xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
126	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
127		xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
128	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
129		xoap->xoa_av_quarantined =
130		    ((*attrs & XAT0_AV_QUARANTINED) != 0);
131	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
132		ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
133	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
134		bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
135}
136
/*
 * Count how many distinct FUID domains a uid/gid pair refers to.
 *
 * A non-zero FUID_INDEX() on the uid counts as one domain; the gid adds
 * a second one only when its index is non-zero and differs from the
 * uid's.  The result is therefore 0, 1 or 2.
 */
static int
zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
{
	uint64_t owner_idx = FUID_INDEX(uid);
	uint64_t group_idx = FUID_INDEX(gid);
	int count = 0;

	if (owner_idx != 0)
		count = 1;

	if (group_idx != 0 && group_idx != owner_idx)
		count++;

	return (count);
}
153
154static void *
155zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
156    int domcnt)
157{
158	int i;
159
160	for (i = 0; i != domcnt; i++) {
161		fuid_infop->z_domain_table[i] = start;
162		start = (caddr_t)start + strlen(start) + 1;
163	}
164
165	return (start);
166}
167
168/*
169 * Set the uid/gid in the fuid_info structure.
170 */
171static void
172zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
173{
174	/*
175	 * If owner or group are log specific FUIDs then slurp up
176	 * domain information and build zfs_fuid_info_t
177	 */
178	if (IS_EPHEMERAL(uid))
179		fuid_infop->z_fuid_owner = uid;
180
181	if (IS_EPHEMERAL(gid))
182		fuid_infop->z_fuid_group = gid;
183}
184
185/*
186 * Load fuid domains into fuid_info_t
187 */
188static zfs_fuid_info_t *
189zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
190{
191	int domcnt;
192
193	zfs_fuid_info_t *fuid_infop;
194
195	fuid_infop = zfs_fuid_info_alloc();
196
197	domcnt = zfs_replay_domain_cnt(uid, gid);
198
199	if (domcnt == 0)
200		return (fuid_infop);
201
202	fuid_infop->z_domain_table =
203	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
204
205	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
206
207	fuid_infop->z_domain_cnt = domcnt;
208	*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
209	return (fuid_infop);
210}
211
212/*
213 * load zfs_fuid_t's and fuid_domains into fuid_info_t
214 */
215static zfs_fuid_info_t *
216zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
217    uint64_t gid)
218{
219	uint64_t *log_fuid = (uint64_t *)start;
220	zfs_fuid_info_t *fuid_infop;
221	int i;
222
223	fuid_infop = zfs_fuid_info_alloc();
224	fuid_infop->z_domain_cnt = domcnt;
225
226	fuid_infop->z_domain_table =
227	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
228
229	for (i = 0; i != idcnt; i++) {
230		zfs_fuid_t *zfuid;
231
232		zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
233		zfuid->z_logfuid = *log_fuid;
234		zfuid->z_id = -1;
235		zfuid->z_domidx = 0;
236		list_insert_tail(&fuid_infop->z_fuids, zfuid);
237		log_fuid++;
238	}
239
240	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
241
242	*end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
243	return (fuid_infop);
244}
245
246static void
247zfs_replay_swap_attrs(lr_attr_t *lrattr)
248{
249	/* swap the lr_attr structure */
250	byteswap_uint32_array(lrattr, sizeof (*lrattr));
251	/* swap the bitmap */
252	byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
253	    sizeof (uint32_t));
254	/* swap the attributes, create time + 64 bit word for attributes */
255	byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
256	    (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
257}
258
259/*
260 * Replay file create with optional ACL, xvattr information as well
261 * as option FUID information.
262 */
263static int
264zfs_replay_create_acl(zfsvfs_t *zfsvfs,
265    lr_acl_create_t *lracl, boolean_t byteswap)
266{
267	char *name = NULL;		/* location determined later */
268	lr_create_t *lr = (lr_create_t *)lracl;
269	znode_t *dzp;
270	vnode_t *vp = NULL;
271	xvattr_t xva;
272	int vflg = 0;
273	vsecattr_t vsec = { 0 };
274	lr_attr_t *lrattr;
275	void *aclstart;
276	void *fuidstart;
277	size_t xvatlen = 0;
278	uint64_t txtype;
279	int error;
280
281	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
282	if (byteswap) {
283		byteswap_uint64_array(lracl, sizeof (*lracl));
284		if (txtype == TX_CREATE_ACL_ATTR ||
285		    txtype == TX_MKDIR_ACL_ATTR) {
286			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
287			zfs_replay_swap_attrs(lrattr);
288			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
289		}
290
291		aclstart = (caddr_t)(lracl + 1) + xvatlen;
292		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
293		/* swap fuids */
294		if (lracl->lr_fuidcnt) {
295			byteswap_uint64_array((caddr_t)aclstart +
296			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
297			    lracl->lr_fuidcnt * sizeof (uint64_t));
298		}
299	}
300
301	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
302		return (error);
303
304	xva_init(&xva);
305	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
306	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
307
308	/*
309	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
310	 * eventually end up in zfs_mknode(), which assigns the object's
311	 * creation time and generation number.  The generic VOP_CREATE()
312	 * doesn't have either concept, so we smuggle the values inside
313	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
314	 */
315	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
316	xva.xva_vattr.va_nblocks = lr->lr_gen;
317
318	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
319	if (error != ENOENT)
320		goto bail;
321
322	if (lr->lr_common.lrc_txtype & TX_CI)
323		vflg |= FIGNORECASE;
324	switch (txtype) {
325	case TX_CREATE_ACL:
326		aclstart = (caddr_t)(lracl + 1);
327		fuidstart = (caddr_t)aclstart +
328		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
329		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
330		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
331		    lr->lr_uid, lr->lr_gid);
332		/*FALLTHROUGH*/
333	case TX_CREATE_ACL_ATTR:
334		if (name == NULL) {
335			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
336			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
337			xva.xva_vattr.va_mask |= AT_XVATTR;
338			zfs_replay_xvattr(lrattr, &xva);
339		}
340		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
341		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
342		vsec.vsa_aclcnt = lracl->lr_aclcnt;
343		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
344		vsec.vsa_aclflags = lracl->lr_acl_flags;
345		if (zfsvfs->z_fuid_replay == NULL) {
346			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
347			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
348			zfsvfs->z_fuid_replay =
349			    zfs_replay_fuids(fuidstart,
350			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
351			    lr->lr_uid, lr->lr_gid);
352		}
353
354#ifdef TODO
355		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
356		    0, 0, &vp, kcred, vflg, NULL, &vsec);
357#else
358		panic("%s:%u: unsupported condition", __func__, __LINE__);
359#endif
360		break;
361	case TX_MKDIR_ACL:
362		aclstart = (caddr_t)(lracl + 1);
363		fuidstart = (caddr_t)aclstart +
364		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
365		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
366		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
367		    lr->lr_uid, lr->lr_gid);
368		/*FALLTHROUGH*/
369	case TX_MKDIR_ACL_ATTR:
370		if (name == NULL) {
371			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
372			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
373			zfs_replay_xvattr(lrattr, &xva);
374		}
375		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
376		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
377		vsec.vsa_aclcnt = lracl->lr_aclcnt;
378		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
379		vsec.vsa_aclflags = lracl->lr_acl_flags;
380		if (zfsvfs->z_fuid_replay == NULL) {
381			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
382			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
383			zfsvfs->z_fuid_replay =
384			    zfs_replay_fuids(fuidstart,
385			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
386			    lr->lr_uid, lr->lr_gid);
387		}
388#ifdef TODO
389		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
390		    &vp, kcred, NULL, vflg, &vsec);
391#else
392		panic("%s:%u: unsupported condition", __func__, __LINE__);
393#endif
394		break;
395	default:
396		error = ENOTSUP;
397	}
398
399bail:
400	if (error == 0 && vp != NULL)
401		VN_RELE(vp);
402
403	VN_RELE(ZTOV(dzp));
404
405	if (zfsvfs->z_fuid_replay)
406		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
407	zfsvfs->z_fuid_replay = NULL;
408
409	return (error);
410}
411
412static int
413zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
414{
415	char *name = NULL;		/* location determined later */
416	char *link;			/* symlink content follows name */
417	znode_t *dzp;
418	vnode_t *vp = NULL;
419	xvattr_t xva;
420	int vflg = 0;
421	size_t lrsize = sizeof (lr_create_t);
422	lr_attr_t *lrattr;
423	void *start;
424	size_t xvatlen;
425	uint64_t txtype;
426	struct componentname cn;
427	int error;
428
429	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
430	if (byteswap) {
431		byteswap_uint64_array(lr, sizeof (*lr));
432		if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
433			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
434	}
435
436
437	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
438		return (error);
439
440	xva_init(&xva);
441	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
442	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
443
444	/*
445	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
446	 * eventually end up in zfs_mknode(), which assigns the object's
447	 * creation time and generation number.  The generic VOP_CREATE()
448	 * doesn't have either concept, so we smuggle the values inside
449	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
450	 */
451	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
452	xva.xva_vattr.va_nblocks = lr->lr_gen;
453
454	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
455	if (error != ENOENT)
456		goto out;
457
458	if (lr->lr_common.lrc_txtype & TX_CI)
459		vflg |= FIGNORECASE;
460
461	/*
462	 * Symlinks don't have fuid info, and CIFS never creates
463	 * symlinks.
464	 *
465	 * The _ATTR versions will grab the fuid info in their subcases.
466	 */
467	if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
468	    (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
469	    (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
470		start = (lr + 1);
471		zfsvfs->z_fuid_replay =
472		    zfs_replay_fuid_domain(start, &start,
473		    lr->lr_uid, lr->lr_gid);
474	}
475
476	cn.cn_cred = kcred;
477	cn.cn_thread = curthread;
478	cn.cn_flags = SAVENAME;
479
480	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
481	switch (txtype) {
482	case TX_CREATE_ATTR:
483		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
484		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
485		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
486		start = (caddr_t)(lr + 1) + xvatlen;
487		zfsvfs->z_fuid_replay =
488		    zfs_replay_fuid_domain(start, &start,
489		    lr->lr_uid, lr->lr_gid);
490		name = (char *)start;
491
492		/*FALLTHROUGH*/
493	case TX_CREATE:
494		if (name == NULL)
495			name = (char *)start;
496
497		cn.cn_nameptr = name;
498		error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
499		break;
500	case TX_MKDIR_ATTR:
501		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
502		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
503		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
504		start = (caddr_t)(lr + 1) + xvatlen;
505		zfsvfs->z_fuid_replay =
506		    zfs_replay_fuid_domain(start, &start,
507		    lr->lr_uid, lr->lr_gid);
508		name = (char *)start;
509
510		/*FALLTHROUGH*/
511	case TX_MKDIR:
512		if (name == NULL)
513			name = (char *)(lr + 1);
514
515		cn.cn_nameptr = name;
516		error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
517		break;
518	case TX_MKXATTR:
519		name = (char *)(lr + 1);
520		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
521		break;
522	case TX_SYMLINK:
523		name = (char *)(lr + 1);
524		link = name + strlen(name) + 1;
525		cn.cn_nameptr = name;
526		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/);
527		break;
528	default:
529		error = ENOTSUP;
530	}
531	VOP_UNLOCK(ZTOV(dzp), 0);
532
533out:
534	if (error == 0 && vp != NULL) {
535		VOP_UNLOCK(vp, 0);
536		VN_RELE(vp);
537	}
538
539	VN_RELE(ZTOV(dzp));
540
541	if (zfsvfs->z_fuid_replay)
542		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
543	zfsvfs->z_fuid_replay = NULL;
544	return (error);
545}
546
547static int
548zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
549{
550	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
551	znode_t *dzp;
552	struct componentname cn;
553	vnode_t *vp;
554	int error;
555	int vflg = 0;
556
557	if (byteswap)
558		byteswap_uint64_array(lr, sizeof (*lr));
559
560	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
561		return (error);
562
563	if (lr->lr_common.lrc_txtype & TX_CI)
564		vflg |= FIGNORECASE;
565	cn.cn_nameptr = name;
566	cn.cn_namelen = strlen(name);
567	cn.cn_nameiop = DELETE;
568	cn.cn_flags = ISLASTCN | SAVENAME;
569	cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
570	cn.cn_cred = kcred;
571	cn.cn_thread = curthread;
572	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
573	error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
574	if (error != 0) {
575		VOP_UNLOCK(ZTOV(dzp), 0);
576		goto fail;
577	}
578
579	switch ((int)lr->lr_common.lrc_txtype) {
580	case TX_REMOVE:
581		error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/);
582		break;
583	case TX_RMDIR:
584		error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/);
585		break;
586	default:
587		error = ENOTSUP;
588	}
589	vput(vp);
590	VOP_UNLOCK(ZTOV(dzp), 0);
591fail:
592	VN_RELE(ZTOV(dzp));
593
594	return (error);
595}
596
597static int
598zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
599{
600	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
601	znode_t *dzp, *zp;
602	struct componentname cn;
603	int error;
604	int vflg = 0;
605
606	if (byteswap)
607		byteswap_uint64_array(lr, sizeof (*lr));
608
609	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
610		return (error);
611
612	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
613		VN_RELE(ZTOV(dzp));
614		return (error);
615	}
616
617	if (lr->lr_common.lrc_txtype & TX_CI)
618		vflg |= FIGNORECASE;
619	cn.cn_nameptr = name;
620	cn.cn_cred = kcred;
621	cn.cn_thread = curthread;
622	cn.cn_flags = SAVENAME;
623
624	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
625	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
626	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/);
627	VOP_UNLOCK(ZTOV(zp), 0);
628	VOP_UNLOCK(ZTOV(dzp), 0);
629
630	VN_RELE(ZTOV(zp));
631	VN_RELE(ZTOV(dzp));
632
633	return (error);
634}
635
636static int
637zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
638{
639	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
640	char *tname = sname + strlen(sname) + 1;
641	znode_t *sdzp, *tdzp;
642	struct componentname scn, tcn;
643	vnode_t *svp, *tvp;
644	kthread_t *td = curthread;
645	int error;
646	int vflg = 0;
647
648	if (byteswap)
649		byteswap_uint64_array(lr, sizeof (*lr));
650
651	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
652		return (error);
653
654	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
655		VN_RELE(ZTOV(sdzp));
656		return (error);
657	}
658
659	if (lr->lr_common.lrc_txtype & TX_CI)
660		vflg |= FIGNORECASE;
661	svp = tvp = NULL;
662
663	scn.cn_nameptr = sname;
664	scn.cn_namelen = strlen(sname);
665	scn.cn_nameiop = DELETE;
666	scn.cn_flags = ISLASTCN | SAVENAME;
667	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
668	scn.cn_cred = kcred;
669	scn.cn_thread = td;
670	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
671	error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
672	VOP_UNLOCK(ZTOV(sdzp), 0);
673	if (error != 0)
674		goto fail;
675	VOP_UNLOCK(svp, 0);
676
677	tcn.cn_nameptr = tname;
678	tcn.cn_namelen = strlen(tname);
679	tcn.cn_nameiop = RENAME;
680	tcn.cn_flags = ISLASTCN | SAVENAME;
681	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
682	tcn.cn_cred = kcred;
683	tcn.cn_thread = td;
684	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
685	error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
686	if (error == EJUSTRETURN)
687		tvp = NULL;
688	else if (error != 0) {
689		VOP_UNLOCK(ZTOV(tdzp), 0);
690		goto fail;
691	}
692
693	error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/);
694	return (error);
695fail:
696	if (svp != NULL)
697		vrele(svp);
698	if (tvp != NULL)
699		vrele(tvp);
700	VN_RELE(ZTOV(tdzp));
701	VN_RELE(ZTOV(sdzp));
702
703	return (error);
704}
705
706static int
707zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
708{
709	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
710	znode_t	*zp;
711	int error;
712	ssize_t resid;
713	uint64_t orig_eof, eod;
714
715	if (byteswap)
716		byteswap_uint64_array(lr, sizeof (*lr));
717
718	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
719		/*
720		 * As we can log writes out of order, it's possible the
721		 * file has been removed. In this case just drop the write
722		 * and return success.
723		 */
724		if (error == ENOENT)
725			error = 0;
726		return (error);
727	}
728	orig_eof = zp->z_phys->zp_size;
729	eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
730
731	/* If it's a dmu_sync() block get the data and write the whole block */
732	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
733		zil_get_replay_data(zfsvfs->z_log, lr);
734
735	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
736	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
737
738	/*
739	 * This may be a write from a dmu_sync() for a whole block,
740	 * and may extend beyond the current end of the file.
741	 * We can't just replay what was written for this TX_WRITE as
742	 * a future TX_WRITE2 may extend the eof and the data for that
743	 * write needs to be there. So we write the whole block and
744	 * reduce the eof.
745	 */
746	if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
747		zp->z_phys->zp_size = eod;
748
749	VN_RELE(ZTOV(zp));
750
751	return (error);
752}
753
754/*
755 * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
756 * meaning the pool block is already being synced. So now that we always write
757 * out full blocks, all we have to do is expand the eof if
758 * the file is grown.
759 */
760static int
761zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
762{
763	znode_t	*zp;
764	int error;
765	uint64_t end;
766
767	if (byteswap)
768		byteswap_uint64_array(lr, sizeof (*lr));
769
770	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
771		/*
772		 * As we can log writes out of order, it's possible the
773		 * file has been removed. In this case just drop the write
774		 * and return success.
775		 */
776		if (error == ENOENT)
777			error = 0;
778		return (error);
779	}
780
781	end = lr->lr_offset + lr->lr_length;
782	if (end > zp->z_phys->zp_size) {
783		ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
784		zp->z_phys->zp_size = end;
785	}
786
787	VN_RELE(ZTOV(zp));
788
789	return (error);
790}
791
792static int
793zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
794{
795
796	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
797	return (EOPNOTSUPP);
798}
799
800static int
801zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
802{
803	znode_t *zp;
804	xvattr_t xva;
805	vattr_t *vap = &xva.xva_vattr;
806	vnode_t *vp;
807	int error;
808	void *start;
809
810	xva_init(&xva);
811	if (byteswap) {
812		byteswap_uint64_array(lr, sizeof (*lr));
813
814		if ((lr->lr_mask & AT_XVATTR) &&
815		    zfsvfs->z_version >= ZPL_VERSION_INITIAL)
816			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
817	}
818
819	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
820		/*
821		 * As we can log setattrs out of order, it's possible the
822		 * file has been removed. In this case just drop the setattr
823		 * and return success.
824		 */
825		if (error == ENOENT)
826			error = 0;
827		return (error);
828	}
829
830	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
831	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
832
833	vap->va_size = lr->lr_size;
834	ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
835	ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
836
837	/*
838	 * Fill in xvattr_t portions if necessary.
839	 */
840
841	start = (lr_setattr_t *)(lr + 1);
842	if (vap->va_mask & AT_XVATTR) {
843		zfs_replay_xvattr((lr_attr_t *)start, &xva);
844		start = (caddr_t)start +
845		    ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
846	} else
847		xva.xva_vattr.va_mask &= ~AT_XVATTR;
848
849	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
850	    lr->lr_uid, lr->lr_gid);
851
852	vp = ZTOV(zp);
853	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
854	error = VOP_SETATTR(vp, vap, kcred);
855	VOP_UNLOCK(vp, 0);
856
857	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
858	zfsvfs->z_fuid_replay = NULL;
859	VN_RELE(vp);
860
861	return (error);
862}
863
864static int
865zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
866{
867	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
868	vsecattr_t vsa;
869	znode_t *zp;
870	int error;
871
872	if (byteswap) {
873		byteswap_uint64_array(lr, sizeof (*lr));
874		zfs_oldace_byteswap(ace, lr->lr_aclcnt);
875	}
876
877	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
878		/*
879		 * As we can log acls out of order, it's possible the
880		 * file has been removed. In this case just drop the acl
881		 * and return success.
882		 */
883		if (error == ENOENT)
884			error = 0;
885		return (error);
886	}
887
888	bzero(&vsa, sizeof (vsa));
889	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
890	vsa.vsa_aclcnt = lr->lr_aclcnt;
891	vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
892	vsa.vsa_aclflags = 0;
893	vsa.vsa_aclentp = ace;
894
895#ifdef TODO
896	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
897#else
898	panic("%s:%u: unsupported condition", __func__, __LINE__);
899#endif
900
901	VN_RELE(ZTOV(zp));
902
903	return (error);
904}
905
906/*
907 * Replaying ACLs is complicated by FUID support.
908 * The log record may contain some optional data
909 * to be used for replaying FUID's.  These pieces
910 * are the actual FUIDs that were created initially.
911 * The FUID table index may no longer be valid and
912 * during zfs_create() a new index may be assigned.
913 * Because of this the log will contain the original
914 * doman+rid in order to create a new FUID.
915 *
916 * The individual ACEs may contain an ephemeral uid/gid which is no
917 * longer valid and will need to be replaced with an actual FUID.
918 *
919 */
920static int
921zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
922{
923	ace_t *ace = (ace_t *)(lr + 1);
924	vsecattr_t vsa;
925	znode_t *zp;
926	int error;
927
928	if (byteswap) {
929		byteswap_uint64_array(lr, sizeof (*lr));
930		zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
931		if (lr->lr_fuidcnt) {
932			byteswap_uint64_array((caddr_t)ace +
933			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
934			    lr->lr_fuidcnt * sizeof (uint64_t));
935		}
936	}
937
938	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
939		/*
940		 * As we can log acls out of order, it's possible the
941		 * file has been removed. In this case just drop the acl
942		 * and return success.
943		 */
944		if (error == ENOENT)
945			error = 0;
946		return (error);
947	}
948
949#ifdef TODO
950	bzero(&vsa, sizeof (vsa));
951	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
952	vsa.vsa_aclcnt = lr->lr_aclcnt;
953	vsa.vsa_aclentp = ace;
954	vsa.vsa_aclentsz = lr->lr_acl_bytes;
955	vsa.vsa_aclflags = lr->lr_acl_flags;
956
957	if (lr->lr_fuidcnt) {
958		void *fuidstart = (caddr_t)ace +
959		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);
960
961		zfsvfs->z_fuid_replay =
962		    zfs_replay_fuids(fuidstart, &fuidstart,
963		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
964	}
965
966	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
967
968	if (zfsvfs->z_fuid_replay)
969		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
970#else
971	error = EOPNOTSUPP;
972#endif
973
974	zfsvfs->z_fuid_replay = NULL;
975	VN_RELE(ZTOV(zp));
976
977	return (error);
978}
979
980/*
981 * Callback vectors for replaying records
982 */
983zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
984	zfs_replay_error,	/* 0 no such transaction type */
985	zfs_replay_create,	/* TX_CREATE */
986	zfs_replay_create,	/* TX_MKDIR */
987	zfs_replay_create,	/* TX_MKXATTR */
988	zfs_replay_create,	/* TX_SYMLINK */
989	zfs_replay_remove,	/* TX_REMOVE */
990	zfs_replay_remove,	/* TX_RMDIR */
991	zfs_replay_link,	/* TX_LINK */
992	zfs_replay_rename,	/* TX_RENAME */
993	zfs_replay_write,	/* TX_WRITE */
994	zfs_replay_truncate,	/* TX_TRUNCATE */
995	zfs_replay_setattr,	/* TX_SETATTR */
996	zfs_replay_acl_v0,	/* TX_ACL_V0 */
997	zfs_replay_acl,		/* TX_ACL */
998	zfs_replay_create_acl,	/* TX_CREATE_ACL */
999	zfs_replay_create,	/* TX_CREATE_ATTR */
1000	zfs_replay_create_acl,	/* TX_CREATE_ACL_ATTR */
1001	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
1002	zfs_replay_create,	/* TX_MKDIR_ATTR */
1003	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
1004	zfs_replay_write2,	/* TX_WRITE2 */
1005};
1006