/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault.  These are kept in a global list.  Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field.  If this is set,
 * it means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

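/*
 * Count of registered injection handlers; incremented in
 * zio_inject_fault() and decremented in zio_clear_fault().  A nonzero
 * value indicates that fault injection is active.
 */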
uint32_t zio_injection_enabled = 0;

/*
 * Describes a single zinject handler registered on the system.  It also
 * contains the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
	int			zi_id;
	spa_t			*zi_spa;
	zinject_record_t	zi_record;
	uint64_t		*zi_lanes;
	int			zi_next_lane;
	list_node_t		zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as the inject_delay_count. Any time a
 * handler is inserted into or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus, modifications to this count must be made while holding
 * the inject_lock as RW_WRITER, and reads of this count must hold it
 * as (at least) RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(); refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values, where
	 * a value in the range [1, 100] is treated as a percentage; larger
	 * values are scaled against ZI_PERCENTAGE_MAX.
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (spa_get_random(maximum) < frequency);
}

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (freq_triggered(record->zi_freq));
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
	    error == record->zi_error) {
		return (freq_triggered(record->zi_freq));
	}

	return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0)
			panic("Panic requested in function %s\n", tag);
	}

	rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}

/*
 * Determine if the I/O in question should return failure.  Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

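	/*
	 * Only I/O that targets the front or back label regions of the
	 * device is eligible for label injection; anything in between is
	 * regular data and is ignored here.
	 */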
	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

/*ARGSUSED*/
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio __maybe_unused = private;
	uint8_t *buffer = data;
	uint_t byte = spa_get_random(len);

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << spa_get_random(8);

	return (1);	/* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during
	 * device open (i.e. zio == NULL).
	 */
	if (zio != NULL) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
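			/*
			 * Failfast handlers are not applied to device opens
			 * (zio == NULL) or to I/O that is already being
			 * retried; those are allowed to proceed unharmed.
			 */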
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes.  For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

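/*
 * Determine whether this zio should be artificially delayed.  Returns
 * the absolute hrtime at which the zio should be completed, or zero if
 * no delay is to be applied.
 */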
hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mtx unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}


	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen; the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler, as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after the zi_timer delay has elapsed.
		 * Thus, we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * one zi_timer delay after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

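/*
 * Translate the byte range in a zinject record into the corresponding
 * range of block IDs for the object's data blocks (and, if requested,
 * for the indirect blocks at zi_level).
 */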
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

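		/*
		 * Scale the level-0 block IDs up to the requested indirect
		 * level; each indirect level fans out by the number of
		 * block pointers per indirect block.
		 */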
		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}

/*
 * Create a new handler for the given record.  We add it to the list, adding
 * a reference to the spa_t in the process.  We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was given in bytes, calculate the actual
	 * block IDs.
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * spa_inject_addref() will add an injection reference, which
		 * will prevent the pool from being removed from the namespace
		 * while still allowing it to be unloaded.
		 */
		if ((spa = spa_inject_addref(name)) == NULL)
			return (SET_ERROR(ENOENT));

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

		handler->zi_spa = spa;
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer.  Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function.  Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif