/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
 */

/*
 * Routines to manage the on-disk persistent error log.
 *
 * Each pool stores a log of all logical data errors seen during normal
 * operation.  This is actually the union of two distinct logs: the last log,
 * and the current log.  All errors seen are logged to the current log.  When a
 * scrub completes, the current log becomes the last log, the last log is thrown
 * out, and the current log is reinitialized.  This way, if an error is somehow
 * corrected, a new scrub will show that it no longer exists, and it will be
 * deleted from the log when the scrub completes.
 *
 * The log is stored using a ZAP object whose key is a string form of the
 * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
 * optional 'objset:object' human-readable string describing the data.  When an
 * error is first logged, this string will be empty, indicating that no name is
 * known.  This prevents us from having to issue a potentially large amount of
 * I/O to discover the object name during an error path.  Instead, we do the
 * calculation when the data is requested, storing the result so future queries
 * will be faster.
 *
 * This log is then shipped into an nvlist where the key is the dataset name and
 * the value is the object name.  Userland is then responsible for uniquifying
 * this list and displaying it to the user.
 */

51168404Spjd#include <sys/dmu_tx.h>
52168404Spjd#include <sys/spa.h>
53168404Spjd#include <sys/spa_impl.h>
54168404Spjd#include <sys/zap.h>
55168404Spjd#include <sys/zio.h>
56168404Spjd
57168404Spjd
58168404Spjd/*
59168404Spjd * Convert a bookmark to a string.
60168404Spjd */
61168404Spjdstatic void
62268123Sdelphijbookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
63168404Spjd{
64168404Spjd	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
65168404Spjd	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
66168404Spjd	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
67168404Spjd}
68168404Spjd
/*
 * Parse the string form "objset:object:level:blkid" (hex fields, as
 * produced by bookmark_to_name()) back into a bookmark.
 */
#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
	char *cp = buf;

	zb->zb_objset = zfs_strtonum(cp, &cp);
	ASSERT(*cp == ':');
	zb->zb_object = zfs_strtonum(cp + 1, &cp);
	ASSERT(*cp == ':');
	zb->zb_level = (int)zfs_strtonum(cp + 1, &cp);
	ASSERT(*cp == ':');
	zb->zb_blkid = zfs_strtonum(cp + 1, &cp);
	ASSERT(*cp == '\0');
}
#endif
86168404Spjd
87168404Spjd/*
88168404Spjd * Log an uncorrectable error to the persistent error log.  We add it to the
89168404Spjd * spa's list of pending errors.  The changes are actually synced out to disk
90168404Spjd * during spa_errlog_sync().
91168404Spjd */
92168404Spjdvoid
93168404Spjdspa_log_error(spa_t *spa, zio_t *zio)
94168404Spjd{
95268123Sdelphij	zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
96168404Spjd	spa_error_entry_t search;
97168404Spjd	spa_error_entry_t *new;
98168404Spjd	avl_tree_t *tree;
99168404Spjd	avl_index_t where;
100168404Spjd
101168404Spjd	/*
102168404Spjd	 * If we are trying to import a pool, ignore any errors, as we won't be
103168404Spjd	 * writing to the pool any time soon.
104168404Spjd	 */
105219089Spjd	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
106168404Spjd		return;
107168404Spjd
108168404Spjd	mutex_enter(&spa->spa_errlist_lock);
109168404Spjd
110168404Spjd	/*
111168404Spjd	 * If we have had a request to rotate the log, log it to the next list
112168404Spjd	 * instead of the current one.
113168404Spjd	 */
114168404Spjd	if (spa->spa_scrub_active || spa->spa_scrub_finished)
115168404Spjd		tree = &spa->spa_errlist_scrub;
116168404Spjd	else
117168404Spjd		tree = &spa->spa_errlist_last;
118168404Spjd
119168404Spjd	search.se_bookmark = *zb;
120168404Spjd	if (avl_find(tree, &search, &where) != NULL) {
121168404Spjd		mutex_exit(&spa->spa_errlist_lock);
122168404Spjd		return;
123168404Spjd	}
124168404Spjd
125168404Spjd	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
126168404Spjd	new->se_bookmark = *zb;
127168404Spjd	avl_insert(tree, new, where);
128168404Spjd
129168404Spjd	mutex_exit(&spa->spa_errlist_lock);
130168404Spjd}
131168404Spjd
132168404Spjd/*
133168404Spjd * Return the number of errors currently in the error log.  This is actually the
134168404Spjd * sum of both the last log and the current log, since we don't know the union
135168404Spjd * of these logs until we reach userland.
136168404Spjd */
137168404Spjduint64_t
138168404Spjdspa_get_errlog_size(spa_t *spa)
139168404Spjd{
140168404Spjd	uint64_t total = 0, count;
141168404Spjd
142168404Spjd	mutex_enter(&spa->spa_errlog_lock);
143168404Spjd	if (spa->spa_errlog_scrub != 0 &&
144168404Spjd	    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
145168404Spjd	    &count) == 0)
146168404Spjd		total += count;
147168404Spjd
148168404Spjd	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
149168404Spjd	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
150168404Spjd	    &count) == 0)
151168404Spjd		total += count;
152168404Spjd	mutex_exit(&spa->spa_errlog_lock);
153168404Spjd
154168404Spjd	mutex_enter(&spa->spa_errlist_lock);
155168404Spjd	total += avl_numnodes(&spa->spa_errlist_last);
156168404Spjd	total += avl_numnodes(&spa->spa_errlist_scrub);
157168404Spjd	mutex_exit(&spa->spa_errlist_lock);
158168404Spjd
159168404Spjd	return (total);
160168404Spjd}
161168404Spjd
162168404Spjd#ifdef _KERNEL
163168404Spjdstatic int
164168404Spjdprocess_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
165168404Spjd{
166168404Spjd	zap_cursor_t zc;
167168404Spjd	zap_attribute_t za;
168268123Sdelphij	zbookmark_phys_t zb;
169168404Spjd
170168404Spjd	if (obj == 0)
171168404Spjd		return (0);
172168404Spjd
173168404Spjd	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
174168404Spjd	    zap_cursor_retrieve(&zc, &za) == 0;
175168404Spjd	    zap_cursor_advance(&zc)) {
176168404Spjd
177168404Spjd		if (*count == 0) {
178168404Spjd			zap_cursor_fini(&zc);
179249195Smm			return (SET_ERROR(ENOMEM));
180168404Spjd		}
181168404Spjd
182168404Spjd		name_to_bookmark(za.za_name, &zb);
183168404Spjd
184168404Spjd		if (copyout(&zb, (char *)addr +
185268123Sdelphij		    (*count - 1) * sizeof (zbookmark_phys_t),
186268123Sdelphij		    sizeof (zbookmark_phys_t)) != 0) {
187251632Sdelphij			zap_cursor_fini(&zc);
188249195Smm			return (SET_ERROR(EFAULT));
189251632Sdelphij		}
190168404Spjd
191168404Spjd		*count -= 1;
192168404Spjd	}
193168404Spjd
194168404Spjd	zap_cursor_fini(&zc);
195168404Spjd
196168404Spjd	return (0);
197168404Spjd}
198168404Spjd
199168404Spjdstatic int
200168404Spjdprocess_error_list(avl_tree_t *list, void *addr, size_t *count)
201168404Spjd{
202168404Spjd	spa_error_entry_t *se;
203168404Spjd
204168404Spjd	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
205168404Spjd
206168404Spjd		if (*count == 0)
207249195Smm			return (SET_ERROR(ENOMEM));
208168404Spjd
209168404Spjd		if (copyout(&se->se_bookmark, (char *)addr +
210268123Sdelphij		    (*count - 1) * sizeof (zbookmark_phys_t),
211268123Sdelphij		    sizeof (zbookmark_phys_t)) != 0)
212249195Smm			return (SET_ERROR(EFAULT));
213168404Spjd
214168404Spjd		*count -= 1;
215168404Spjd	}
216168404Spjd
217168404Spjd	return (0);
218168404Spjd}
219168404Spjd#endif
220168404Spjd
221168404Spjd/*
222168404Spjd * Copy all known errors to userland as an array of bookmarks.  This is
223168404Spjd * actually a union of the on-disk last log and current log, as well as any
224168404Spjd * pending error requests.
225168404Spjd *
226168404Spjd * Because the act of reading the on-disk log could cause errors to be
227168404Spjd * generated, we have two separate locks: one for the error log and one for the
228168404Spjd * in-core error lists.  We only need the error list lock to log and error, so
229168404Spjd * we grab the error log lock while we read the on-disk logs, and only pick up
230168404Spjd * the error list lock when we are finished.
231168404Spjd */
232168404Spjdint
233168404Spjdspa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
234168404Spjd{
235168404Spjd	int ret = 0;
236168404Spjd
237168404Spjd#ifdef _KERNEL
238168404Spjd	mutex_enter(&spa->spa_errlog_lock);
239168404Spjd
240168404Spjd	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
241168404Spjd
242168404Spjd	if (!ret && !spa->spa_scrub_finished)
243168404Spjd		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
244168404Spjd		    count);
245168404Spjd
246168404Spjd	mutex_enter(&spa->spa_errlist_lock);
247168404Spjd	if (!ret)
248168404Spjd		ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
249168404Spjd		    count);
250168404Spjd	if (!ret)
251168404Spjd		ret = process_error_list(&spa->spa_errlist_last, uaddr,
252168404Spjd		    count);
253168404Spjd	mutex_exit(&spa->spa_errlist_lock);
254168404Spjd
255168404Spjd	mutex_exit(&spa->spa_errlog_lock);
256168404Spjd#endif
257168404Spjd
258168404Spjd	return (ret);
259168404Spjd}
260168404Spjd
261168404Spjd/*
262168404Spjd * Called when a scrub completes.  This simply set a bit which tells which AVL
263168404Spjd * tree to add new errors.  spa_errlog_sync() is responsible for actually
264168404Spjd * syncing the changes to the underlying objects.
265168404Spjd */
266168404Spjdvoid
267168404Spjdspa_errlog_rotate(spa_t *spa)
268168404Spjd{
269168404Spjd	mutex_enter(&spa->spa_errlist_lock);
270168404Spjd	spa->spa_scrub_finished = B_TRUE;
271168404Spjd	mutex_exit(&spa->spa_errlist_lock);
272168404Spjd}
273168404Spjd
274168404Spjd/*
275168404Spjd * Discard any pending errors from the spa_t.  Called when unloading a faulted
276168404Spjd * pool, as the errors encountered during the open cannot be synced to disk.
277168404Spjd */
278168404Spjdvoid
279168404Spjdspa_errlog_drain(spa_t *spa)
280168404Spjd{
281168404Spjd	spa_error_entry_t *se;
282168404Spjd	void *cookie;
283168404Spjd
284168404Spjd	mutex_enter(&spa->spa_errlist_lock);
285168404Spjd
286168404Spjd	cookie = NULL;
287168404Spjd	while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
288168404Spjd	    &cookie)) != NULL)
289168404Spjd		kmem_free(se, sizeof (spa_error_entry_t));
290168404Spjd	cookie = NULL;
291168404Spjd	while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
292168404Spjd	    &cookie)) != NULL)
293168404Spjd		kmem_free(se, sizeof (spa_error_entry_t));
294168404Spjd
295168404Spjd	mutex_exit(&spa->spa_errlist_lock);
296168404Spjd}
297168404Spjd
298168404Spjd/*
299168404Spjd * Process a list of errors into the current on-disk log.
300168404Spjd */
301168404Spjdstatic void
302168404Spjdsync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
303168404Spjd{
304168404Spjd	spa_error_entry_t *se;
305168404Spjd	char buf[64];
306168404Spjd	void *cookie;
307168404Spjd
308168404Spjd	if (avl_numnodes(t) != 0) {
309168404Spjd		/* create log if necessary */
310168404Spjd		if (*obj == 0)
311168404Spjd			*obj = zap_create(spa->spa_meta_objset,
312168404Spjd			    DMU_OT_ERROR_LOG, DMU_OT_NONE,
313168404Spjd			    0, tx);
314168404Spjd
315168404Spjd		/* add errors to the current log */
316168404Spjd		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
317168404Spjd			char *name = se->se_name ? se->se_name : "";
318168404Spjd
319168404Spjd			bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
320168404Spjd
321168404Spjd			(void) zap_update(spa->spa_meta_objset,
322168404Spjd			    *obj, buf, 1, strlen(name) + 1, name, tx);
323168404Spjd		}
324168404Spjd
325168404Spjd		/* purge the error list */
326168404Spjd		cookie = NULL;
327168404Spjd		while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
328168404Spjd			kmem_free(se, sizeof (spa_error_entry_t));
329168404Spjd	}
330168404Spjd}
331168404Spjd
332168404Spjd/*
333168404Spjd * Sync the error log out to disk.  This is a little tricky because the act of
334168404Spjd * writing the error log requires the spa_errlist_lock.  So, we need to lock the
335168404Spjd * error lists, take a copy of the lists, and then reinitialize them.  Then, we
336168404Spjd * drop the error list lock and take the error log lock, at which point we
337168404Spjd * do the errlog processing.  Then, if we encounter an I/O error during this
338168404Spjd * process, we can successfully add the error to the list.  Note that this will
339168404Spjd * result in the perpetual recycling of errors, but it is an unlikely situation
340168404Spjd * and not a performance critical operation.
341168404Spjd */
342168404Spjdvoid
343168404Spjdspa_errlog_sync(spa_t *spa, uint64_t txg)
344168404Spjd{
345168404Spjd	dmu_tx_t *tx;
346168404Spjd	avl_tree_t scrub, last;
347168404Spjd	int scrub_finished;
348168404Spjd
349168404Spjd	mutex_enter(&spa->spa_errlist_lock);
350168404Spjd
351168404Spjd	/*
352168404Spjd	 * Bail out early under normal circumstances.
353168404Spjd	 */
354168404Spjd	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
355168404Spjd	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
356168404Spjd	    !spa->spa_scrub_finished) {
357168404Spjd		mutex_exit(&spa->spa_errlist_lock);
358168404Spjd		return;
359168404Spjd	}
360168404Spjd
361168404Spjd	spa_get_errlists(spa, &last, &scrub);
362168404Spjd	scrub_finished = spa->spa_scrub_finished;
363168404Spjd	spa->spa_scrub_finished = B_FALSE;
364168404Spjd
365168404Spjd	mutex_exit(&spa->spa_errlist_lock);
366168404Spjd	mutex_enter(&spa->spa_errlog_lock);
367168404Spjd
368168404Spjd	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
369168404Spjd
370168404Spjd	/*
371168404Spjd	 * Sync out the current list of errors.
372168404Spjd	 */
373168404Spjd	sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
374168404Spjd
375168404Spjd	/*
376168404Spjd	 * Rotate the log if necessary.
377168404Spjd	 */
378168404Spjd	if (scrub_finished) {
379168404Spjd		if (spa->spa_errlog_last != 0)
380168404Spjd			VERIFY(dmu_object_free(spa->spa_meta_objset,
381168404Spjd			    spa->spa_errlog_last, tx) == 0);
382168404Spjd		spa->spa_errlog_last = spa->spa_errlog_scrub;
383168404Spjd		spa->spa_errlog_scrub = 0;
384168404Spjd
385168404Spjd		sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
386168404Spjd	}
387168404Spjd
388168404Spjd	/*
389168404Spjd	 * Sync out any pending scrub errors.
390168404Spjd	 */
391168404Spjd	sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
392168404Spjd
393168404Spjd	/*
394168404Spjd	 * Update the MOS to reflect the new values.
395168404Spjd	 */
396168404Spjd	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
397168404Spjd	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
398168404Spjd	    &spa->spa_errlog_last, tx);
399168404Spjd	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
400168404Spjd	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
401168404Spjd	    &spa->spa_errlog_scrub, tx);
402168404Spjd
403168404Spjd	dmu_tx_commit(tx);
404168404Spjd
405168404Spjd	mutex_exit(&spa->spa_errlog_lock);
406168404Spjd}
407