/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/sysevent.h>

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 * 	EREPORT			POOL	VDEV	IO
 * 	block			X	X	X
 * 	data			X		X
 * 	device			X	X
 * 	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t.  The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *      +---------------+
 * 	| Aggregate I/O |	No associated logical data or device
 * 	+---------------+
 *              |
 *              V
 * 	+---------------+	Reads associated with a piece of logical data.
 * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
 * 	+---------------+	mirrors, gang blocks, retries, etc.
 *              |
 *              V
 * 	+---------------+	Reads associated with a particular device, but
 * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
 * 	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the
 * rest of ZIO.  Typically, 'physical I/O' simply means that there is no
 * attached block pointer.  But I/O with no associated block pointer can still
 * be related to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that when it is first set, subsequent failures
 * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs.  Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO.  When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */
#ifdef _KERNEL
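/*
 * Build the ereport and detector nvlists for the given subclass, applying the
 * suppression rules described above (tryimport/recovery, speculative I/Os,
 * DTL reads, and so on).  On success the nvlists are handed back through
 * ereport_out and detector_out; if the event is suppressed, the output
 * pointers are left untouched (callers pass them in initialized to NULL).
 */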
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
	nvlist_t *ereport, *detector;

	uint64_t ena;
	char class[64];

	/*
	 * If we are doing a spa_tryimport() or in recovery mode,
	 * ignore errors.
	 */
	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
	    spa_load_state(spa) == SPA_LOAD_RECOVER)
		return;

	/*
	 * If we are in the middle of opening a pool, and the previous attempt
	 * failed, don't bother logging any new ereports - we're just going to
	 * get the same diagnosis anyway.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE &&
	    spa->spa_last_open_failed)
		return;

	if (zio != NULL) {
		/*
		 * If this is not a read or write zio, ignore the error.  This
		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
		 */
		if (zio->io_type != ZIO_TYPE_READ &&
		    zio->io_type != ZIO_TYPE_WRITE)
			return;

		/*
		 * Ignore any errors from speculative I/Os, as failure is an
		 * expected result.
		 */
		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
			return;

		/*
		 * If this I/O is not a retry I/O, don't post an ereport.
		 * Otherwise, we risk making bad diagnoses based on B_FAILFAST
		 * I/Os.
		 */
		if (zio->io_error == EIO &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
			return;

		if (vd != NULL) {
			/*
			 * If the vdev has already been marked as failing due
			 * to a failed probe, then ignore any subsequent I/O
			 * errors, as the DE will automatically fault the vdev
			 * on the first such failure.  This also catches cases
			 * where vdev_remove_wanted is set and the device has
			 * not yet been asynchronously placed into the REMOVED
			 * state.
			 */
			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
				return;

			/*
			 * Ignore checksum errors for reads from DTL regions of
			 * leaf vdevs.
			 */
			if (zio->io_type == ZIO_TYPE_READ &&
			    zio->io_error == ECKSUM &&
			    vd->vdev_ops->vdev_op_leaf &&
			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
				return;
		}
	}

	/*
	 * For probe failure, we want to avoid posting ereports if we've
	 * already removed the device in the meantime.
	 */
	if (vd != NULL &&
	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
		return;

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
		return;

	if ((detector = fm_nvlist_create(NULL)) == NULL) {
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		return;
	}

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Determine the ENA to use for this event.  If we are in a loading
	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE) {
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
		ena = zio->io_logical->io_ena;
	} else {
		ena = fm_ena_generate(0, FM_ENA_FMT1);
	}

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	(void) snprintf(class, sizeof (class), "%s.%s",
	    ZFS_ERROR_CLASS, subclass);

	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
	    vd != NULL ? vd->vdev_guid : 0);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * Generic payload members common to all ereports.
	 */
	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
	    spa_load_state(spa), NULL);

	if (spa != NULL) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
		    DATA_TYPE_STRING,
		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
		    FM_EREPORT_FAILMODE_WAIT :
		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
		    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
		    NULL);
	}

	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
		if (vd->vdev_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
		if (vd->vdev_devid != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
		if (vd->vdev_fru != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
			    DATA_TYPE_STRING, vd->vdev_fru, NULL);

		if (pvd != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
			    DATA_TYPE_UINT64, pvd->vdev_guid,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
			    NULL);
			if (pvd->vdev_path)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
			if (pvd->vdev_devid)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
		}
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, stateoroffset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, size, NULL);
			else
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, zio->io_offset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, zio->io_size, NULL);
		}

		/*
		 * Payload for I/Os with corresponding logical information.
		 */
		if (zio->io_logical != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_objset,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_object,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
			    DATA_TYPE_INT64,
			    zio->io_logical->io_bookmark.zb_level,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_blkid, NULL);
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    DATA_TYPE_UINT64, stateoroffset, NULL);
	}

	mutex_exit(&spa->spa_errlist_lock);

	*ereport_out = ereport;
	*detector_out = detector;
}

/* if it's <= 128 bytes, save the corruption directly */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

#define	MAX_RANGES		16

typedef struct zfs_ecksum_info {
	/* histograms of set and cleared bits by bit number in a 64-bit word */
	uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
	uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];

	/* inline arrays of bits set and cleared. */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared.  The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	size_t	zei_range_count;
	uint32_t zei_mingap;
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;

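/*
 * Record the set bits of value_arg in a 64-entry per-bit-position histogram
 * (stored most-significant bit first) and add the number of bits found to
 * *count.
 */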
static void
update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
{
	size_t i;
	size_t bits = 0;
	uint64_t value = BE_64(value_arg);

	/* We store the bits in big-endian (largest-first) order */
	for (i = 0; i < 64; i++) {
		if (value & (1ull << i)) {
			hist[63 - i]++;
			++bits;
		}
	}
	/* update the count of bits changed */
	*count += bits;
}

/*
 * We've now filled up the range array, and need to increase "mingap" and
 * shrink the range list accordingly.  zei_mingap is always the smallest
 * distance between array entries, so we set the new_allowed_gap to be
 * one greater than that.  We then go through the list, joining together
 * any ranges which are closer than the new_allowed_gap.
 *
 * By construction, there will be at least one.  We also update zei_mingap
 * to the new smallest gap, to prepare for our next invocation.
 */
static void
shrink_ranges(zfs_ecksum_info_t *eip)
{
	uint32_t mingap = UINT32_MAX;
	uint32_t new_allowed_gap = eip->zei_mingap + 1;

	size_t idx, output;
	size_t max = eip->zei_range_count;

	struct zei_ranges *r = eip->zei_ranges;

	ASSERT3U(eip->zei_range_count, >, 0);
	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);

	output = idx = 0;
	while (idx < max - 1) {
		uint32_t start = r[idx].zr_start;
		uint32_t end = r[idx].zr_end;

		while (idx < max - 1) {
			idx++;

			uint32_t nstart = r[idx].zr_start;
			uint32_t nend = r[idx].zr_end;

			uint32_t gap = nstart - end;
			if (gap < new_allowed_gap) {
				end = nend;
				continue;
			}
			if (gap < mingap)
				mingap = gap;
			break;
		}
		r[output].zr_start = start;
		r[output].zr_end = end;
		output++;
	}
	ASSERT3U(output, <, eip->zei_range_count);
	eip->zei_range_count = output;
	eip->zei_mingap = mingap;
	eip->zei_allowed_mingap = new_allowed_gap;
}

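/*
 * Append the word range [start, end) to the range list.  If the gap to the
 * previous range is smaller than the currently allowed minimum, the previous
 * range is simply extended instead; if the list is already full, it is first
 * compacted via shrink_ranges().
 */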
static void
add_range(zfs_ecksum_info_t *eip, int start, int end)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;

	if (count >= MAX_RANGES) {
		shrink_ranges(eip);
		count = eip->zei_range_count;
	}
	if (count == 0) {
		eip->zei_mingap = UINT32_MAX;
		eip->zei_allowed_mingap = 1;
	} else {
		int gap = start - r[count - 1].zr_end;

		if (gap < eip->zei_allowed_mingap) {
			r[count - 1].zr_end = end;
			return;
		}
		if (gap < eip->zei_mingap)
			eip->zei_mingap = gap;
	}
	r[count].zr_start = start;
	r[count].zr_end = end;
	eip->zei_range_count++;
}

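/*
 * Return the total number of 64-bit words covered by the current range list.
 */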
static size_t
range_total_size(zfs_ecksum_info_t *eip)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;
	size_t result = 0;
	size_t idx;

	for (idx = 0; idx < count; idx++)
		result += (r[idx].zr_end - r[idx].zr_start);

	return (result);
}

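/*
 * Compare the good and bad buffers, attach the checksum values and the
 * range/bit information described above to the ereport, and return the
 * allocated zfs_ecksum_info_t (which the caller must free).  Returns NULL
 * if the buffers are identical and drop_if_identical is set.
 */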
static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
    boolean_t drop_if_identical)
{
	const uint64_t *good = (const uint64_t *)goodbuf;
	const uint64_t *bad = (const uint64_t *)badbuf;

	uint64_t allset = 0;
	uint64_t allcleared = 0;

	size_t nui64s = size / sizeof (uint64_t);

	size_t inline_size;
	int no_inline = 0;
	size_t idx;
	size_t range;

	size_t offset = 0;
	ssize_t start = -1;

	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);

	/* don't do any annotation for injected checksum errors */
	if (info != NULL && info->zbc_injected)
		return (eip);

	if (info != NULL && info->zbc_has_cksum) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_expected) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_expected,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_actual) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_actual,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
		    DATA_TYPE_STRING,
		    info->zbc_checksum_name,
		    NULL);

		if (info->zbc_byteswapped) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
			    DATA_TYPE_BOOLEAN, 1,
			    NULL);
		}
	}

	if (badbuf == NULL || goodbuf == NULL)
		return (eip);

	ASSERT3U(nui64s, <=, UINT32_MAX);
	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, <=, UINT32_MAX);

	/* build up the range list by comparing the two buffers. */
	for (idx = 0; idx < nui64s; idx++) {
		if (good[idx] == bad[idx]) {
			if (start == -1)
				continue;

			add_range(eip, start, idx);
			start = -1;
		} else {
			if (start != -1)
				continue;

			start = idx;
		}
	}
	if (start != -1)
		add_range(eip, start, idx);

	/* See if it will fit in our inline buffers */
	inline_size = range_total_size(eip);
	if (inline_size > ZFM_MAX_INLINE)
		no_inline = 1;

	/*
	 * If there is no change and we want to drop if the buffers are
	 * identical, do so.
	 */
	if (inline_size == 0 && drop_if_identical) {
		kmem_free(eip, sizeof (*eip));
		return (NULL);
	}

	/*
	 * Now walk through the ranges, filling in the details of the
	 * differences.  Also convert our uint64_t-array offsets to byte
	 * offsets.
	 */
	for (range = 0; range < eip->zei_range_count; range++) {
		size_t start = eip->zei_ranges[range].zr_start;
		size_t end = eip->zei_ranges[range].zr_end;

		for (idx = start; idx < end; idx++) {
			uint64_t set, cleared;

			// bits set in bad, but not in good
			set = ((~good[idx]) & bad[idx]);
			// bits set in good, but not in bad
			cleared = (good[idx] & (~bad[idx]));

			allset |= set;
			allcleared |= cleared;

			if (!no_inline) {
				ASSERT3U(offset, <, inline_size);
				eip->zei_bits_set[offset] = set;
				eip->zei_bits_cleared[offset] = cleared;
				offset++;
			}

			update_histogram(set, eip->zei_histogram_set,
			    &eip->zei_range_sets[range]);
			update_histogram(cleared, eip->zei_histogram_cleared,
			    &eip->zei_range_clears[range]);
		}

		/* convert to byte offsets */
		eip->zei_ranges[range].zr_start	*= sizeof (uint64_t);
		eip->zei_ranges[range].zr_end	*= sizeof (uint64_t);
	}
	eip->zei_allowed_mingap	*= sizeof (uint64_t);
	inline_size		*= sizeof (uint64_t);

	/* fill in ereport */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
	    (uint32_t *)eip->zei_ranges,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
	    NULL);

	if (!no_inline) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_cleared,
		    NULL);
	} else {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
		    DATA_TYPE_UINT32_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
		    DATA_TYPE_UINT32_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
		    NULL);
	}
	return (eip);
}
#endif

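/*
 * Build and post an ereport of the given subclass, then free the associated
 * nvlists.  This is a no-op when the event is suppressed by
 * zfs_ereport_start() or outside the kernel.
 */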
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;

	zfs_ereport_start(&ereport, &detector,
	    subclass, spa, vd, zio, stateoroffset, size);

	if (ereport == NULL)
		return;

	fm_ereport_post(ereport, EVCH_SLEEP);

	fm_nvlist_destroy(ereport, FM_NVA_FREE);
	fm_nvlist_destroy(detector, FM_NVA_FREE);
#endif
}

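/*
 * Allocate a checksum error report, build the corresponding ereport, and hang
 * the report off the logical zio's io_cksum_report list so that it can be
 * annotated and posted (or freed) once the logical I/O completes.
 */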
void
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
    zio_bad_cksum_t *info)
{
	zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);

	if (zio->io_vsd != NULL)
		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
	else
		zio_vsd_default_cksum_report(zio, report, arg);

	/* copy the checksum failure information if it was provided */
	if (info != NULL) {
		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
		bcopy(info, report->zcr_ckinfo, sizeof (*info));
	}

	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
	report->zcr_length = length;

#ifdef _KERNEL
	zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

	if (report->zcr_ereport == NULL) {
		report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
		if (report->zcr_ckinfo != NULL) {
			kmem_free(report->zcr_ckinfo,
			    sizeof (*report->zcr_ckinfo));
		}
		kmem_free(report, sizeof (*report));
		return;
	}
#endif

	mutex_enter(&spa->spa_errlist_lock);
	report->zcr_next = zio->io_logical->io_cksum_report;
	zio->io_logical->io_cksum_report = report;
	mutex_exit(&spa->spa_errlist_lock);
}

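/*
 * Annotate a previously started checksum report with the differences between
 * the good and bad copies of the data and post the ereport, unless the
 * buffers turn out to be identical and drop_if_identical is set.  The
 * ereport and detector nvlists are freed in either case.
 */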
void
zfs_ereport_finish_checksum(zio_cksum_report_t *report,
    const void *good_data, const void *bad_data, boolean_t drop_if_identical)
{
#ifdef _KERNEL
	zfs_ecksum_info_t *info = NULL;
	info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
	    good_data, bad_data, report->zcr_length, drop_if_identical);

	if (info != NULL)
		fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);

	fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
	fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
	report->zcr_ereport = report->zcr_detector = NULL;

	if (info != NULL)
		kmem_free(info, sizeof (*info));
#endif
}

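/*
 * Discard a checksum report without posting it, releasing the ereport and
 * detector nvlists (if any), the per-I/O callback data, the copied checksum
 * information, and the report itself.
 */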
void
zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
{
#ifdef _KERNEL
	if (rpt->zcr_ereport != NULL) {
		fm_nvlist_destroy(rpt->zcr_ereport,
		    FM_NVA_FREE);
		fm_nvlist_destroy(rpt->zcr_detector,
		    FM_NVA_FREE);
	}
#endif
	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);

	if (rpt->zcr_ckinfo != NULL)
		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));

	kmem_free(rpt, sizeof (*rpt));
}

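/*
 * Post the checksum ereport in its current, unannotated form without
 * consuming the report.
 */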
void
zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
{
#ifdef _KERNEL
	fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
#endif
}

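/*
 * Build, annotate, and post a checksum ereport in a single call, for callers
 * that already have both the good and bad copies of the data available.
 */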
void
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length,
    const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
{
#ifdef _KERNEL
	nvlist_t *ereport = NULL;
	nvlist_t *detector = NULL;
	zfs_ecksum_info_t *info;

	zfs_ereport_start(&ereport, &detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

	if (ereport == NULL)
		return;

	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
	    B_FALSE);

	if (info != NULL)
		fm_ereport_post(ereport, EVCH_SLEEP);

	fm_nvlist_destroy(ereport, FM_NVA_FREE);
	fm_nvlist_destroy(detector, FM_NVA_FREE);

	if (info != NULL)
		kmem_free(info, sizeof (*info));
#endif
}

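/*
 * Common code for posting the 'resource.fs.zfs.*' events below: build a
 * resource nvlist identifying the pool (and vdev, if one is given) and
 * post it.
 */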
static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	nvlist_t *resource;
	char class[64];

	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
		return;

	if ((resource = fm_nvlist_create(NULL)) == NULL)
		return;

	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
	    ZFS_ERROR_CLASS, name);
	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
	VERIFY(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
	if (vd)
		VERIFY(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);

	fm_ereport_post(resource, EVCH_SLEEP);

	fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 * has been removed from the system.  This will cause the DE to ignore any
 * recent I/O errors, inferring that they are due to the asynchronous device
 * removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}

/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
}