zfs_replay.c revision 169884
1132718Skan/*
2132718Skan * CDDL HEADER START
3132718Skan *
4132718Skan * The contents of this file are subject to the terms of the
5132718Skan * Common Development and Distribution License (the "License").
6132718Skan * You may not use this file except in compliance with the License.
7132718Skan *
8132718Skan * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9132718Skan * or http://www.opensolaris.org/os/licensing.
10132718Skan * See the License for the specific language governing permissions
11132718Skan * and limitations under the License.
12132718Skan *
13132718Skan * When distributing Covered Code, include this CDDL HEADER in each
14132718Skan * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15132718Skan * If applicable, add the following below this CDDL HEADER, with the
16132718Skan * fields enclosed by brackets "[]" replaced with your own identifying
17132718Skan * information: Portions Copyright [yyyy] [name of copyright owner]
18132718Skan *
19132718Skan * CDDL HEADER END
20132718Skan */
21132718Skan/*
22132718Skan * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23132718Skan * Use is subject to license terms.
24132718Skan */
25132718Skan
26132718Skan#pragma ident	"%Z%%M%	%I%	%E% SMI"
27132718Skan
28132718Skan#include <sys/types.h>
29132718Skan#include <sys/param.h>
30132718Skan#include <sys/systm.h>
31132718Skan#include <sys/sysmacros.h>
32132718Skan#include <sys/cmn_err.h>
33132718Skan#include <sys/kmem.h>
34132718Skan#include <sys/file.h>
35132718Skan#include <sys/fcntl.h>
36132718Skan#include <sys/vfs.h>
37132718Skan#include <sys/fs/zfs.h>
38132718Skan#include <sys/zfs_znode.h>
39132718Skan#include <sys/zfs_dir.h>
40132718Skan#include <sys/zfs_acl.h>
41132718Skan#include <sys/spa.h>
42132718Skan#include <sys/zil.h>
43132718Skan#include <sys/byteorder.h>
44132718Skan#include <sys/stat.h>
45132718Skan#include <sys/acl.h>
46132718Skan#include <sys/atomic.h>
47132718Skan#include <sys/cred.h>
48132718Skan#include <sys/namei.h>
49132718Skan
50132718Skan/*
51132718Skan * Functions to replay ZFS intent log (ZIL) records
52132718Skan * The functions are called through a function vector (zfs_replay_vector)
53132718Skan * which is indexed by the transaction type.
54132718Skan */
55132718Skan
56132718Skanstatic void
57132718Skanzfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
58132718Skan	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
59132718Skan{
60132718Skan	VATTR_NULL(vap);
61132718Skan	vap->va_mask = (uint_t)mask;
62132718Skan	vap->va_type = IFTOVT(mode);
63132718Skan	vap->va_mode = mode & MODEMASK;
64132718Skan	vap->va_uid = (uid_t)uid;
65132718Skan	vap->va_gid = (gid_t)gid;
66132718Skan	vap->va_rdev = zfs_cmpldev(rdev);
67132718Skan	vap->va_nodeid = nodeid;
68132718Skan}
69132718Skan
70132718Skan/* ARGSUSED */
71132718Skanstatic int
72132718Skanzfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
73132718Skan{
74132718Skan	return (ENOTSUP);
75132718Skan}
76132718Skan
77132718Skanstatic int
78132718Skanzfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
79132718Skan{
80132718Skan	char *name = (char *)(lr + 1);	/* name follows lr_create_t */
81132718Skan	char *link;			/* symlink content follows name */
82132718Skan	znode_t *dzp;
83132718Skan	vnode_t *vp = NULL;
84132718Skan	vattr_t va;
85132718Skan	struct componentname cn;
86132718Skan	int error;
87132718Skan
88132718Skan	if (byteswap)
89132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
90132718Skan
91132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
92132718Skan		return (error);
93132718Skan
94132718Skan	zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
95132718Skan	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
96132718Skan
97132718Skan	/*
98132718Skan	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
99132718Skan	 * eventually end up in zfs_mknode(), which assigns the object's
100132718Skan	 * creation time and generation number.  The generic VOP_CREATE()
101132718Skan	 * doesn't have either concept, so we smuggle the values inside
102132718Skan	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
103132718Skan	 */
104132718Skan	ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
105132718Skan	va.va_nblocks = lr->lr_gen;
106132718Skan
107132718Skan	cn.cn_nameptr = name;
108132718Skan	cn.cn_cred = kcred;
109132718Skan	cn.cn_thread = curthread;
110132718Skan	cn.cn_flags = SAVENAME;
111132718Skan
112132718Skan	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
113132718Skan	switch ((int)lr->lr_common.lrc_txtype) {
114132718Skan	case TX_CREATE:
115132718Skan		error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
116132718Skan		break;
117132718Skan	case TX_MKDIR:
118132718Skan		error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
119132718Skan		break;
120132718Skan	case TX_MKXATTR:
121132718Skan		error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
122132718Skan		break;
123132718Skan	case TX_SYMLINK:
124132718Skan		link = name + strlen(name) + 1;
125132718Skan		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
126132718Skan		break;
127132718Skan	default:
128132718Skan		error = ENOTSUP;
129132718Skan	}
130132718Skan	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
131132718Skan
132132718Skan	if (error == 0 && vp != NULL) {
133132718Skan		VOP_UNLOCK(vp, 0, curthread);
134132718Skan		VN_RELE(vp);
135132718Skan	}
136132718Skan
137132718Skan	VN_RELE(ZTOV(dzp));
138132718Skan
139132718Skan	return (error);
140132718Skan}
141132718Skan
142132718Skanstatic int
143132718Skanzfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
144132718Skan{
145132718Skan	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
146132718Skan	znode_t *dzp;
147132718Skan	struct componentname cn;
148132718Skan	vnode_t *vp;
149132718Skan	int error;
150132718Skan
151132718Skan	if (byteswap)
152132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
153132718Skan
154132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
155132718Skan		return (error);
156132718Skan
157132718Skan	cn.cn_nameptr = name;
158132718Skan	cn.cn_namelen = strlen(name);
159132718Skan	cn.cn_nameiop = DELETE;
160132718Skan	cn.cn_flags = ISLASTCN | SAVENAME;
161132718Skan	cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
162132718Skan	cn.cn_cred = kcred;
163132718Skan	cn.cn_thread = curthread;
164132718Skan	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
165132718Skan	error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
166132718Skan	if (error != 0) {
167132718Skan		VOP_UNLOCK(ZTOV(dzp), 0, curthread);
168132718Skan		goto fail;
169132718Skan	}
170132718Skan
171132718Skan	switch ((int)lr->lr_common.lrc_txtype) {
172132718Skan	case TX_REMOVE:
173132718Skan		error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
174132718Skan		break;
175132718Skan	case TX_RMDIR:
176132718Skan		error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
177132718Skan		break;
178132718Skan	default:
179132718Skan		error = ENOTSUP;
180132718Skan	}
181132718Skan	vput(vp);
182132718Skan	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
183132718Skanfail:
184132718Skan	VN_RELE(ZTOV(dzp));
185132718Skan
186132718Skan	return (error);
187132718Skan}
188132718Skan
189132718Skanstatic int
190132718Skanzfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
191132718Skan{
192132718Skan	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
193132718Skan	znode_t *dzp, *zp;
194132718Skan	struct componentname cn;
195132718Skan	int error;
196132718Skan
197132718Skan	if (byteswap)
198132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
199132718Skan
200132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
201132718Skan		return (error);
202132718Skan
203132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
204132718Skan		VN_RELE(ZTOV(dzp));
205132718Skan		return (error);
206132718Skan	}
207132718Skan
208132718Skan	cn.cn_nameptr = name;
209132718Skan	cn.cn_cred = kcred;
210132718Skan	cn.cn_thread = curthread;
211132718Skan	cn.cn_flags = SAVENAME;
212132718Skan
213132718Skan	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
214132718Skan	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread);
215132718Skan	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
216132718Skan	VOP_UNLOCK(ZTOV(zp), 0, curthread);
217132718Skan	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
218132718Skan
219132718Skan	VN_RELE(ZTOV(zp));
220132718Skan	VN_RELE(ZTOV(dzp));
221132718Skan
222132718Skan	return (error);
223132718Skan}
224132718Skan
225132718Skanstatic int
226132718Skanzfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
227132718Skan{
228132718Skan	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
229132718Skan	char *tname = sname + strlen(sname) + 1;
230132718Skan	znode_t *sdzp, *tdzp;
231132718Skan	struct componentname scn, tcn;
232132718Skan	vnode_t *svp, *tvp;
233132718Skan	kthread_t *td = curthread;
234132718Skan	int error;
235132718Skan
236132718Skan	if (byteswap)
237132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
238132718Skan
239132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
240132718Skan		return (error);
241132718Skan
242132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
243132718Skan		VN_RELE(ZTOV(sdzp));
244132718Skan		return (error);
245132718Skan	}
246132718Skan
247132718Skan	svp = tvp = NULL;
248132718Skan
249132718Skan	scn.cn_nameptr = sname;
250132718Skan	scn.cn_namelen = strlen(sname);
251132718Skan	scn.cn_nameiop = DELETE;
252132718Skan	scn.cn_flags = ISLASTCN | SAVENAME;
253132718Skan	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
254132718Skan	scn.cn_cred = kcred;
255132718Skan	scn.cn_thread = td;
256132718Skan	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td);
257132718Skan	error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
258132718Skan	VOP_UNLOCK(ZTOV(sdzp), 0, td);
259132718Skan	if (error != 0)
260132718Skan		goto fail;
261132718Skan	VOP_UNLOCK(svp, 0, td);
262132718Skan
263132718Skan	tcn.cn_nameptr = tname;
264132718Skan	tcn.cn_namelen = strlen(tname);
265132718Skan	tcn.cn_nameiop = RENAME;
266132718Skan	tcn.cn_flags = ISLASTCN | SAVENAME;
267132718Skan	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
268132718Skan	tcn.cn_cred = kcred;
269132718Skan	tcn.cn_thread = td;
270132718Skan	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td);
271132718Skan	error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
272132718Skan	if (error == EJUSTRETURN)
273132718Skan		tvp = NULL;
274132718Skan	else if (error != 0) {
275132718Skan		VOP_UNLOCK(ZTOV(tdzp), 0, td);
276132718Skan		goto fail;
277132718Skan	}
278132718Skan
279132718Skan	error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
280132718Skan	return (error);
281132718Skanfail:
282132718Skan	if (svp != NULL)
283132718Skan		vrele(svp);
284132718Skan	if (tvp != NULL)
285132718Skan		vrele(tvp);
286132718Skan	VN_RELE(ZTOV(tdzp));
287132718Skan	VN_RELE(ZTOV(sdzp));
288132718Skan
289132718Skan	return (error);
290132718Skan}
291132718Skan
292132718Skanstatic int
293132718Skanzfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
294132718Skan{
295132718Skan	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
296132718Skan	znode_t	*zp;
297132718Skan	int error;
298132718Skan	ssize_t resid;
299132718Skan
300132718Skan	if (byteswap)
301132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
302132718Skan
303132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
304132718Skan		/*
305132718Skan		 * As we can log writes out of order, it's possible the
306132718Skan		 * file has been removed. In this case just drop the write
307132718Skan		 * and return success.
308132718Skan		 */
309132718Skan		if (error == ENOENT)
310132718Skan			error = 0;
311132718Skan		return (error);
312132718Skan	}
313132718Skan
314132718Skan	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
315132718Skan	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
316132718Skan
317132718Skan	VN_RELE(ZTOV(zp));
318132718Skan
319132718Skan	return (error);
320132718Skan}
321132718Skan
322132718Skanstatic int
323132718Skanzfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
324132718Skan{
325132718Skan
326132718Skan	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
327132718Skan	return (EOPNOTSUPP);
328132718Skan}
329132718Skan
330132718Skanstatic int
331132718Skanzfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
332132718Skan{
333132718Skan	znode_t *zp;
334132718Skan	vattr_t va;
335132718Skan	vnode_t *vp;
336132718Skan	int error;
337132718Skan
338132718Skan	if (byteswap)
339132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
340132718Skan
341132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
342132718Skan		/*
343132718Skan		 * As we can log setattrs out of order, it's possible the
344132718Skan		 * file has been removed. In this case just drop the setattr
345132718Skan		 * and return success.
346132718Skan		 */
347132718Skan		if (error == ENOENT)
348132718Skan			error = 0;
349132718Skan		return (error);
350132718Skan	}
351132718Skan
352132718Skan	zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
353132718Skan	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
354132718Skan
355132718Skan	va.va_size = lr->lr_size;
356132718Skan	ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
357132718Skan	ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
358132718Skan
359132718Skan	vp = ZTOV(zp);
360132718Skan	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
361132718Skan	error = VOP_SETATTR(vp, &va, kcred, curthread);
362132718Skan	VOP_UNLOCK(vp, 0, curthread);
363132718Skan	VN_RELE(vp);
364132718Skan
365132718Skan	return (error);
366132718Skan}
367132718Skan
368132718Skanstatic int
369132718Skanzfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
370132718Skan{
371132718Skan	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
372132718Skan#ifdef TODO
373132718Skan	vsecattr_t vsa;
374132718Skan#endif
375132718Skan	znode_t *zp;
376132718Skan	int error;
377132718Skan
378132718Skan	if (byteswap) {
379132718Skan		byteswap_uint64_array(lr, sizeof (*lr));
380132718Skan		zfs_ace_byteswap(ace, lr->lr_aclcnt);
381132718Skan	}
382132718Skan
383132718Skan	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
384132718Skan		/*
385132718Skan		 * As we can log acls out of order, it's possible the
386132718Skan		 * file has been removed. In this case just drop the acl
387132718Skan		 * and return success.
388132718Skan		 */
389132718Skan		if (error == ENOENT)
390132718Skan			error = 0;
391132718Skan		return (error);
392132718Skan	}
393132718Skan
394132718Skan#ifdef TODO
395132718Skan	bzero(&vsa, sizeof (vsa));
396132718Skan	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
397132718Skan	vsa.vsa_aclcnt = lr->lr_aclcnt;
398132718Skan	vsa.vsa_aclentp = ace;
399132718Skan
400132718Skan	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
401132718Skan#else
402132718Skan	error = EOPNOTSUPP;
403132718Skan#endif
404132718Skan
405132718Skan	VN_RELE(ZTOV(zp));
406132718Skan
407132718Skan	return (error);
408132718Skan}
409132718Skan
410132718Skan/*
411132718Skan * Callback vectors for replaying records
412132718Skan */
413132718Skanzil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
414132718Skan	zfs_replay_error,	/* 0 no such transaction type */
415132718Skan	zfs_replay_create,	/* TX_CREATE */
416132718Skan	zfs_replay_create,	/* TX_MKDIR */
417132718Skan	zfs_replay_create,	/* TX_MKXATTR */
418132718Skan	zfs_replay_create,	/* TX_SYMLINK */
419132718Skan	zfs_replay_remove,	/* TX_REMOVE */
420132718Skan	zfs_replay_remove,	/* TX_RMDIR */
421132718Skan	zfs_replay_link,	/* TX_LINK */
422132718Skan	zfs_replay_rename,	/* TX_RENAME */
423132718Skan	zfs_replay_write,	/* TX_WRITE */
424132718Skan	zfs_replay_truncate,	/* TX_TRUNCATE */
425132718Skan	zfs_replay_setattr,	/* TX_SETATTR */
426132718Skan	zfs_replay_acl,		/* TX_ACL */
427132718Skan};
428132718Skan