1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23296510Smav * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26168404Spjd/*
27168404Spjd * ZFS fault injection
28168404Spjd *
29168404Spjd * To handle fault injection, we keep track of a series of zinject_record_t
30168404Spjd * structures which describe which logical block(s) should be injected with a
31168404Spjd * fault.  These are kept in a global list.  Each record corresponds to a given
32168404Spjd * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
33168404Spjd * or exported while the injection record exists.
34168404Spjd *
35168404Spjd * Device level injection is done using the 'zi_guid' field.  If this is set, it
36168404Spjd * means that the error is destined for a particular device, not a piece of
37168404Spjd * data.
38168404Spjd *
39168404Spjd * This is a rather poor data structure and algorithm, but we don't expect more
40168404Spjd * than a few faults at any one time, so it should be sufficient for our needs.
41168404Spjd */
42168404Spjd
43168404Spjd#include <sys/arc.h>
44168404Spjd#include <sys/zio_impl.h>
45168404Spjd#include <sys/zfs_ioctl.h>
46168404Spjd#include <sys/vdev_impl.h>
47219089Spjd#include <sys/dmu_objset.h>
48185029Spjd#include <sys/fs/zfs.h>
49168404Spjd
50168404Spjduint32_t zio_injection_enabled;
51168404Spjd
52296510Smav/*
53296510Smav * Data describing each zinject handler registered on the system, and
54296510Smav * contains the list node linking the handler in the global zinject
55296510Smav * handler list.
56296510Smav */
57168404Spjdtypedef struct inject_handler {
58168404Spjd	int			zi_id;
59168404Spjd	spa_t			*zi_spa;
60168404Spjd	zinject_record_t	zi_record;
61296510Smav	uint64_t		*zi_lanes;
62296510Smav	int			zi_next_lane;
63168404Spjd	list_node_t		zi_link;
64168404Spjd} inject_handler_t;
65168404Spjd
66296510Smav/*
67296510Smav * List of all zinject handlers registered on the system, protected by
68296510Smav * the inject_lock defined below.
69296510Smav */
70168404Spjdstatic list_t inject_handlers;
71296510Smav
72296510Smav/*
73296510Smav * This protects insertion into, and traversal of, the inject handler
74296510Smav * list defined above; as well as the inject_delay_count. Any time a
75296510Smav * handler is inserted or removed from the list, this lock should be
76296510Smav * taken as a RW_WRITER; and any time traversal is done over the list
77296510Smav * (without modification to it) this lock should be taken as a RW_READER.
78296510Smav */
79168404Spjdstatic krwlock_t inject_lock;
80296510Smav
81296510Smav/*
82296510Smav * This holds the number of zinject delay handlers that have been
83296510Smav * registered on the system. It is protected by the inject_lock defined
84296510Smav * above. Thus modifications to this count must be a RW_WRITER of the
85296510Smav * inject_lock, and reads of this count must be (at least) a RW_READER
86296510Smav * of the lock.
87296510Smav */
88296510Smavstatic int inject_delay_count = 0;
89296510Smav
90296510Smav/*
91296510Smav * This lock is used only in zio_handle_io_delay(), refer to the comment
92296510Smav * in that function for more details.
93296510Smav */
94296510Smavstatic kmutex_t inject_delay_mtx;
95296510Smav
96296510Smav/*
97296510Smav * Used to assign unique identifying numbers to each new zinject handler.
98296510Smav */
99168404Spjdstatic int inject_next_id = 1;
100168404Spjd
101168404Spjd/*
102168404Spjd * Returns true if the given record matches the I/O in progress.
103168404Spjd */
104168404Spjdstatic boolean_t
105268123Sdelphijzio_match_handler(zbookmark_phys_t *zb, uint64_t type,
106168404Spjd    zinject_record_t *record, int error)
107168404Spjd{
108168404Spjd	/*
109168404Spjd	 * Check for a match against the MOS, which is based on type
110168404Spjd	 */
111219089Spjd	if (zb->zb_objset == DMU_META_OBJSET &&
112219089Spjd	    record->zi_objset == DMU_META_OBJSET &&
113219089Spjd	    record->zi_object == DMU_META_DNODE_OBJECT) {
114168404Spjd		if (record->zi_type == DMU_OT_NONE ||
115168404Spjd		    type == record->zi_type)
116168404Spjd			return (record->zi_freq == 0 ||
117168404Spjd			    spa_get_random(100) < record->zi_freq);
118168404Spjd		else
119168404Spjd			return (B_FALSE);
120168404Spjd	}
121168404Spjd
122168404Spjd	/*
123168404Spjd	 * Check for an exact match.
124168404Spjd	 */
125168404Spjd	if (zb->zb_objset == record->zi_objset &&
126168404Spjd	    zb->zb_object == record->zi_object &&
127168404Spjd	    zb->zb_level == record->zi_level &&
128168404Spjd	    zb->zb_blkid >= record->zi_start &&
129168404Spjd	    zb->zb_blkid <= record->zi_end &&
130168404Spjd	    error == record->zi_error)
131168404Spjd		return (record->zi_freq == 0 ||
132168404Spjd		    spa_get_random(100) < record->zi_freq);
133168404Spjd
134168404Spjd	return (B_FALSE);
135168404Spjd}
136168404Spjd
137168404Spjd/*
138219089Spjd * Panic the system when a config change happens in the function
139219089Spjd * specified by tag.
140219089Spjd */
141219089Spjdvoid
142219089Spjdzio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
143219089Spjd{
144219089Spjd	inject_handler_t *handler;
145219089Spjd
146219089Spjd	rw_enter(&inject_lock, RW_READER);
147219089Spjd
148219089Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
149219089Spjd	    handler = list_next(&inject_handlers, handler)) {
150219089Spjd
151219089Spjd		if (spa != handler->zi_spa)
152219089Spjd			continue;
153219089Spjd
154219089Spjd		if (handler->zi_record.zi_type == type &&
155219089Spjd		    strcmp(tag, handler->zi_record.zi_func) == 0)
156219089Spjd			panic("Panic requested in function %s\n", tag);
157219089Spjd	}
158219089Spjd
159219089Spjd	rw_exit(&inject_lock);
160219089Spjd}
161219089Spjd
162219089Spjd/*
163168404Spjd * Determine if the I/O in question should return failure.  Returns the errno
164168404Spjd * to be returned to the caller.
165168404Spjd */
166168404Spjdint
167168404Spjdzio_handle_fault_injection(zio_t *zio, int error)
168168404Spjd{
169168404Spjd	int ret = 0;
170168404Spjd	inject_handler_t *handler;
171168404Spjd
172168404Spjd	/*
173168404Spjd	 * Ignore I/O not associated with any logical data.
174168404Spjd	 */
175168404Spjd	if (zio->io_logical == NULL)
176168404Spjd		return (0);
177168404Spjd
178168404Spjd	/*
179168404Spjd	 * Currently, we only support fault injection on reads.
180168404Spjd	 */
181168404Spjd	if (zio->io_type != ZIO_TYPE_READ)
182168404Spjd		return (0);
183168404Spjd
184168404Spjd	rw_enter(&inject_lock, RW_READER);
185168404Spjd
186168404Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
187168404Spjd	    handler = list_next(&inject_handlers, handler)) {
188168404Spjd
189247265Smm		if (zio->io_spa != handler->zi_spa ||
190247265Smm		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
191168404Spjd			continue;
192168404Spjd
193168404Spjd		/* If this handler matches, return EIO */
194168404Spjd		if (zio_match_handler(&zio->io_logical->io_bookmark,
195168404Spjd		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
196168404Spjd		    &handler->zi_record, error)) {
197168404Spjd			ret = error;
198168404Spjd			break;
199168404Spjd		}
200168404Spjd	}
201168404Spjd
202168404Spjd	rw_exit(&inject_lock);
203168404Spjd
204168404Spjd	return (ret);
205168404Spjd}
206168404Spjd
207185029Spjd/*
208185029Spjd * Determine if the zio is part of a label update and has an injection
209185029Spjd * handler associated with that portion of the label. Currently, we
210185029Spjd * allow error injection in either the nvlist or the uberblock region of
211185029Spjd * of the vdev label.
212185029Spjd */
213168404Spjdint
214185029Spjdzio_handle_label_injection(zio_t *zio, int error)
215185029Spjd{
216185029Spjd	inject_handler_t *handler;
217185029Spjd	vdev_t *vd = zio->io_vd;
218185029Spjd	uint64_t offset = zio->io_offset;
219185029Spjd	int label;
220185029Spjd	int ret = 0;
221185029Spjd
222219089Spjd	if (offset >= VDEV_LABEL_START_SIZE &&
223185029Spjd	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
224185029Spjd		return (0);
225185029Spjd
226185029Spjd	rw_enter(&inject_lock, RW_READER);
227185029Spjd
228185029Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
229185029Spjd	    handler = list_next(&inject_handlers, handler)) {
230185029Spjd		uint64_t start = handler->zi_record.zi_start;
231185029Spjd		uint64_t end = handler->zi_record.zi_end;
232185029Spjd
233247265Smm		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
234185029Spjd			continue;
235185029Spjd
236185029Spjd		/*
237185029Spjd		 * The injection region is the relative offsets within a
238185029Spjd		 * vdev label. We must determine the label which is being
239185029Spjd		 * updated and adjust our region accordingly.
240185029Spjd		 */
241185029Spjd		label = vdev_label_number(vd->vdev_psize, offset);
242185029Spjd		start = vdev_label_offset(vd->vdev_psize, label, start);
243185029Spjd		end = vdev_label_offset(vd->vdev_psize, label, end);
244185029Spjd
245185029Spjd		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
246185029Spjd		    (offset >= start && offset <= end)) {
247185029Spjd			ret = error;
248185029Spjd			break;
249185029Spjd		}
250185029Spjd	}
251185029Spjd	rw_exit(&inject_lock);
252185029Spjd	return (ret);
253185029Spjd}
254185029Spjd
255185029Spjd
256185029Spjdint
257213198Smmzio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
258168404Spjd{
259168404Spjd	inject_handler_t *handler;
260168404Spjd	int ret = 0;
261168404Spjd
262219089Spjd	/*
263219089Spjd	 * We skip over faults in the labels unless it's during
264219089Spjd	 * device open (i.e. zio == NULL).
265219089Spjd	 */
266219089Spjd	if (zio != NULL) {
267219089Spjd		uint64_t offset = zio->io_offset;
268219089Spjd
269219089Spjd		if (offset < VDEV_LABEL_START_SIZE ||
270219089Spjd		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
271219089Spjd			return (0);
272219089Spjd	}
273219089Spjd
274168404Spjd	rw_enter(&inject_lock, RW_READER);
275168404Spjd
276168404Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
277168404Spjd	    handler = list_next(&inject_handlers, handler)) {
278168404Spjd
279247265Smm		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
280185029Spjd			continue;
281185029Spjd
282168404Spjd		if (vd->vdev_guid == handler->zi_record.zi_guid) {
283213198Smm			if (handler->zi_record.zi_failfast &&
284213198Smm			    (zio == NULL || (zio->io_flags &
285213198Smm			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
286213198Smm				continue;
287213198Smm			}
288213198Smm
289219089Spjd			/* Handle type specific I/O failures */
290219089Spjd			if (zio != NULL &&
291219089Spjd			    handler->zi_record.zi_iotype != ZIO_TYPES &&
292219089Spjd			    handler->zi_record.zi_iotype != zio->io_type)
293219089Spjd				continue;
294219089Spjd
295168404Spjd			if (handler->zi_record.zi_error == error) {
296168404Spjd				/*
297168404Spjd				 * For a failed open, pretend like the device
298168404Spjd				 * has gone away.
299168404Spjd				 */
300168404Spjd				if (error == ENXIO)
301168404Spjd					vd->vdev_stat.vs_aux =
302168404Spjd					    VDEV_AUX_OPEN_FAILED;
303219089Spjd
304219089Spjd				/*
305219089Spjd				 * Treat these errors as if they had been
306219089Spjd				 * retried so that all the appropriate stats
307219089Spjd				 * and FMA events are generated.
308219089Spjd				 */
309219089Spjd				if (!handler->zi_record.zi_failfast &&
310219089Spjd				    zio != NULL)
311219089Spjd					zio->io_flags |= ZIO_FLAG_IO_RETRY;
312219089Spjd
313168404Spjd				ret = error;
314168404Spjd				break;
315168404Spjd			}
316168404Spjd			if (handler->zi_record.zi_error == ENXIO) {
317249195Smm				ret = SET_ERROR(EIO);
318168404Spjd				break;
319168404Spjd			}
320168404Spjd		}
321168404Spjd	}
322168404Spjd
323168404Spjd	rw_exit(&inject_lock);
324168404Spjd
325168404Spjd	return (ret);
326168404Spjd}
327168404Spjd
328168404Spjd/*
329219089Spjd * Simulate hardware that ignores cache flushes.  For requested number
330219089Spjd * of seconds nix the actual writing to disk.
331219089Spjd */
332219089Spjdvoid
333219089Spjdzio_handle_ignored_writes(zio_t *zio)
334219089Spjd{
335219089Spjd	inject_handler_t *handler;
336219089Spjd
337219089Spjd	rw_enter(&inject_lock, RW_READER);
338219089Spjd
339219089Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
340219089Spjd	    handler = list_next(&inject_handlers, handler)) {
341219089Spjd
342219089Spjd		/* Ignore errors not destined for this pool */
343247265Smm		if (zio->io_spa != handler->zi_spa ||
344247265Smm		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
345219089Spjd			continue;
346219089Spjd
347219089Spjd		/*
348219089Spjd		 * Positive duration implies # of seconds, negative
349219089Spjd		 * a number of txgs
350219089Spjd		 */
351219089Spjd		if (handler->zi_record.zi_timer == 0) {
352219089Spjd			if (handler->zi_record.zi_duration > 0)
353219089Spjd				handler->zi_record.zi_timer = ddi_get_lbolt64();
354219089Spjd			else
355219089Spjd				handler->zi_record.zi_timer = zio->io_txg;
356219089Spjd		}
357219089Spjd
358219089Spjd		/* Have a "problem" writing 60% of the time */
359219089Spjd		if (spa_get_random(100) < 60)
360219089Spjd			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
361219089Spjd		break;
362219089Spjd	}
363219089Spjd
364219089Spjd	rw_exit(&inject_lock);
365219089Spjd}
366219089Spjd
367219089Spjdvoid
368219089Spjdspa_handle_ignored_writes(spa_t *spa)
369219089Spjd{
370219089Spjd	inject_handler_t *handler;
371219089Spjd
372219089Spjd	if (zio_injection_enabled == 0)
373219089Spjd		return;
374219089Spjd
375219089Spjd	rw_enter(&inject_lock, RW_READER);
376219089Spjd
377219089Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
378219089Spjd	    handler = list_next(&inject_handlers, handler)) {
379219089Spjd
380247265Smm		if (spa != handler->zi_spa ||
381247265Smm		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
382219089Spjd			continue;
383219089Spjd
384219089Spjd		if (handler->zi_record.zi_duration > 0) {
385219089Spjd			VERIFY(handler->zi_record.zi_timer == 0 ||
386219089Spjd			    handler->zi_record.zi_timer +
387219089Spjd			    handler->zi_record.zi_duration * hz >
388219089Spjd			    ddi_get_lbolt64());
389219089Spjd		} else {
390219089Spjd			/* duration is negative so the subtraction here adds */
391219089Spjd			VERIFY(handler->zi_record.zi_timer == 0 ||
392219089Spjd			    handler->zi_record.zi_timer -
393219089Spjd			    handler->zi_record.zi_duration >=
394219089Spjd			    spa_syncing_txg(spa));
395219089Spjd		}
396219089Spjd	}
397219089Spjd
398219089Spjd	rw_exit(&inject_lock);
399219089Spjd}
400219089Spjd
401296510Smavhrtime_t
402247265Smmzio_handle_io_delay(zio_t *zio)
403247265Smm{
404247265Smm	vdev_t *vd = zio->io_vd;
405296510Smav	inject_handler_t *min_handler = NULL;
406296510Smav	hrtime_t min_target = 0;
407247265Smm
408296510Smav	rw_enter(&inject_lock, RW_READER);
409296510Smav
410296510Smav	/*
411296510Smav	 * inject_delay_count is a subset of zio_injection_enabled that
412296510Smav	 * is only incremented for delay handlers. These checks are
413296510Smav	 * mainly added to remind the reader why we're not explicitly
414296510Smav	 * checking zio_injection_enabled like the other functions.
415296510Smav	 */
416296510Smav	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
417296510Smav	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
418296510Smav
419296510Smav	/*
420296510Smav	 * If there aren't any inject delay handlers registered, then we
421296510Smav	 * can short circuit and simply return 0 here. A value of zero
422296510Smav	 * informs zio_delay_interrupt() that this request should not be
423296510Smav	 * delayed. This short circuit keeps us from acquiring the
424296510Smav	 * inject_delay_mutex unnecessarily.
425296510Smav	 */
426296510Smav	if (inject_delay_count == 0) {
427296510Smav		rw_exit(&inject_lock);
428247265Smm		return (0);
429296510Smav	}
430247265Smm
431296510Smav	/*
432296510Smav	 * Each inject handler has a number of "lanes" associated with
433296510Smav	 * it. Each lane is able to handle requests independently of one
434296510Smav	 * another, and at a latency defined by the inject handler
435296510Smav	 * record's zi_timer field. Thus if a handler in configured with
436296510Smav	 * a single lane with a 10ms latency, it will delay requests
437296510Smav	 * such that only a single request is completed every 10ms. So,
438296510Smav	 * if more than one request is attempted per each 10ms interval,
439296510Smav	 * the average latency of the requests will be greater than
440296510Smav	 * 10ms; but if only a single request is submitted each 10ms
441296510Smav	 * interval the average latency will be 10ms.
442296510Smav	 *
443296510Smav	 * We need to acquire this mutex to prevent multiple concurrent
444296510Smav	 * threads being assigned to the same lane of a given inject
445296510Smav	 * handler. The mutex allows us to perform the following two
446296510Smav	 * operations atomically:
447296510Smav	 *
448296510Smav	 *	1. determine the minimum handler and minimum target
449296510Smav	 *	   value of all the possible handlers
450296510Smav	 *	2. update that minimum handler's lane array
451296510Smav	 *
452296510Smav	 * Without atomicity, two (or more) threads could pick the same
453296510Smav	 * lane in step (1), and then conflict with each other in step
454296510Smav	 * (2). This could allow a single lane handler to process
455296510Smav	 * multiple requests simultaneously, which shouldn't be possible.
456296510Smav	 */
457296510Smav	mutex_enter(&inject_delay_mtx);
458247265Smm
459296510Smav	for (inject_handler_t *handler = list_head(&inject_handlers);
460296510Smav	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
461247265Smm		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
462247265Smm			continue;
463247265Smm
464296510Smav		if (vd->vdev_guid != handler->zi_record.zi_guid)
465296510Smav			continue;
466296510Smav
467296510Smav		/*
468296510Smav		 * Defensive; should never happen as the array allocation
469296510Smav		 * occurs prior to inserting this handler on the list.
470296510Smav		 */
471296510Smav		ASSERT3P(handler->zi_lanes, !=, NULL);
472296510Smav
473296510Smav		/*
474296510Smav		 * This should never happen, the zinject command should
475296510Smav		 * prevent a user from setting an IO delay with zero lanes.
476296510Smav		 */
477296510Smav		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
478296510Smav
479296510Smav		ASSERT3U(handler->zi_record.zi_nlanes, >,
480296510Smav		    handler->zi_next_lane);
481296510Smav
482296510Smav		/*
483296510Smav		 * We want to issue this IO to the lane that will become
484296510Smav		 * idle the soonest, so we compare the soonest this
485296510Smav		 * specific handler can complete the IO with all other
486296510Smav		 * handlers, to find the lowest value of all possible
487296510Smav		 * lanes. We then use this lane to submit the request.
488296510Smav		 *
489296510Smav		 * Since each handler has a constant value for its
490296510Smav		 * delay, we can just use the "next" lane for that
491296510Smav		 * handler; as it will always be the lane with the
492296510Smav		 * lowest value for that particular handler (i.e. the
493296510Smav		 * lane that will become idle the soonest). This saves a
494296510Smav		 * scan of each handler's lanes array.
495296510Smav		 *
496296510Smav		 * There's two cases to consider when determining when
497296510Smav		 * this specific IO request should complete. If this
498296510Smav		 * lane is idle, we want to "submit" the request now so
499296510Smav		 * it will complete after zi_timer milliseconds. Thus,
500296510Smav		 * we set the target to now + zi_timer.
501296510Smav		 *
502296510Smav		 * If the lane is busy, we want this request to complete
503296510Smav		 * zi_timer milliseconds after the lane becomes idle.
504296510Smav		 * Since the 'zi_lanes' array holds the time at which
505296510Smav		 * each lane will become idle, we use that value to
506296510Smav		 * determine when this request should complete.
507296510Smav		 */
508296510Smav		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
509296510Smav		hrtime_t busy = handler->zi_record.zi_timer +
510296510Smav		    handler->zi_lanes[handler->zi_next_lane];
511296510Smav		hrtime_t target = MAX(idle, busy);
512296510Smav
513296510Smav		if (min_handler == NULL) {
514296510Smav			min_handler = handler;
515296510Smav			min_target = target;
516296510Smav			continue;
517247265Smm		}
518247265Smm
519296510Smav		ASSERT3P(min_handler, !=, NULL);
520296510Smav		ASSERT3U(min_target, !=, 0);
521296510Smav
522296510Smav		/*
523296510Smav		 * We don't yet increment the "next lane" variable since
524296510Smav		 * we still might find a lower value lane in another
525296510Smav		 * handler during any remaining iterations. Once we're
526296510Smav		 * sure we've selected the absolute minimum, we'll claim
527296510Smav		 * the lane and increment the handler's "next lane"
528296510Smav		 * field below.
529296510Smav		 */
530296510Smav
531296510Smav		if (target < min_target) {
532296510Smav			min_handler = handler;
533296510Smav			min_target = target;
534296510Smav		}
535247265Smm	}
536296510Smav
537296510Smav	/*
538296510Smav	 * 'min_handler' will be NULL if no IO delays are registered for
539296510Smav	 * this vdev, otherwise it will point to the handler containing
540296510Smav	 * the lane that will become idle the soonest.
541296510Smav	 */
542296510Smav	if (min_handler != NULL) {
543296510Smav		ASSERT3U(min_target, !=, 0);
544296510Smav		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
545296510Smav
546296510Smav		/*
547296510Smav		 * If we've used all possible lanes for this handler,
548296510Smav		 * loop back and start using the first lane again;
549296510Smav		 * otherwise, just increment the lane index.
550296510Smav		 */
551296510Smav		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
552296510Smav		    min_handler->zi_record.zi_nlanes;
553296510Smav	}
554296510Smav
555296510Smav	mutex_exit(&inject_delay_mtx);
556247265Smm	rw_exit(&inject_lock);
557296510Smav
558296510Smav	return (min_target);
559247265Smm}
560247265Smm
561219089Spjd/*
562168404Spjd * Create a new handler for the given record.  We add it to the list, adding
563168404Spjd * a reference to the spa_t in the process.  We increment zio_injection_enabled,
564168404Spjd * which is the switch to trigger all fault injection.
565168404Spjd */
566168404Spjdint
567168404Spjdzio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
568168404Spjd{
569168404Spjd	inject_handler_t *handler;
570168404Spjd	int error;
571168404Spjd	spa_t *spa;
572168404Spjd
573168404Spjd	/*
574168404Spjd	 * If this is pool-wide metadata, make sure we unload the corresponding
575168404Spjd	 * spa_t, so that the next attempt to load it will trigger the fault.
576168404Spjd	 * We call spa_reset() to unload the pool appropriately.
577168404Spjd	 */
578168404Spjd	if (flags & ZINJECT_UNLOAD_SPA)
579168404Spjd		if ((error = spa_reset(name)) != 0)
580168404Spjd			return (error);
581168404Spjd
582296510Smav	if (record->zi_cmd == ZINJECT_DELAY_IO) {
583296510Smav		/*
584296510Smav		 * A value of zero for the number of lanes or for the
585296510Smav		 * delay time doesn't make sense.
586296510Smav		 */
587296510Smav		if (record->zi_timer == 0 || record->zi_nlanes == 0)
588296510Smav			return (SET_ERROR(EINVAL));
589296510Smav
590296510Smav		/*
591296510Smav		 * The number of lanes is directly mapped to the size of
592296510Smav		 * an array used by the handler. Thus, to ensure the
593296510Smav		 * user doesn't trigger an allocation that's "too large"
594296510Smav		 * we cap the number of lanes here.
595296510Smav		 */
596296510Smav		if (record->zi_nlanes >= UINT16_MAX)
597296510Smav			return (SET_ERROR(EINVAL));
598296510Smav	}
599296510Smav
600168404Spjd	if (!(flags & ZINJECT_NULL)) {
601168404Spjd		/*
602168404Spjd		 * spa_inject_ref() will add an injection reference, which will
603168404Spjd		 * prevent the pool from being removed from the namespace while
604168404Spjd		 * still allowing it to be unloaded.
605168404Spjd		 */
606168404Spjd		if ((spa = spa_inject_addref(name)) == NULL)
607249195Smm			return (SET_ERROR(ENOENT));
608168404Spjd
609168404Spjd		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
610168404Spjd
611296510Smav		handler->zi_spa = spa;
612296510Smav		handler->zi_record = *record;
613296510Smav
614296510Smav		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
615296510Smav			handler->zi_lanes = kmem_zalloc(
616296510Smav			    sizeof (*handler->zi_lanes) *
617296510Smav			    handler->zi_record.zi_nlanes, KM_SLEEP);
618296510Smav			handler->zi_next_lane = 0;
619296510Smav		} else {
620296510Smav			handler->zi_lanes = NULL;
621296510Smav			handler->zi_next_lane = 0;
622296510Smav		}
623296510Smav
624168404Spjd		rw_enter(&inject_lock, RW_WRITER);
625168404Spjd
626296510Smav		/*
627296510Smav		 * We can't move this increment into the conditional
628296510Smav		 * above because we need to hold the RW_WRITER lock of
629296510Smav		 * inject_lock, and we don't want to hold that while
630296510Smav		 * allocating the handler's zi_lanes array.
631296510Smav		 */
632296510Smav		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
633296510Smav			ASSERT3S(inject_delay_count, >=, 0);
634296510Smav			inject_delay_count++;
635296510Smav			ASSERT3S(inject_delay_count, >, 0);
636296510Smav		}
637296510Smav
638168404Spjd		*id = handler->zi_id = inject_next_id++;
639168404Spjd		list_insert_tail(&inject_handlers, handler);
640270247Sdelphij		atomic_inc_32(&zio_injection_enabled);
641168404Spjd
642168404Spjd		rw_exit(&inject_lock);
643168404Spjd	}
644168404Spjd
645168404Spjd	/*
646168404Spjd	 * Flush the ARC, so that any attempts to read this data will end up
647168404Spjd	 * going to the ZIO layer.  Note that this is a little overkill, but
648168404Spjd	 * we don't have the necessary ARC interfaces to do anything else, and
649168404Spjd	 * fault injection isn't a performance critical path.
650168404Spjd	 */
651168404Spjd	if (flags & ZINJECT_FLUSH_ARC)
652286763Smav		/*
653286763Smav		 * We must use FALSE to ensure arc_flush returns, since
654286763Smav		 * we're not preventing concurrent ARC insertions.
655286763Smav		 */
656286763Smav		arc_flush(NULL, FALSE);
657168404Spjd
658168404Spjd	return (0);
659168404Spjd}
660168404Spjd
661168404Spjd/*
662168404Spjd * Returns the next record with an ID greater than that supplied to the
663168404Spjd * function.  Used to iterate over all handlers in the system.
664168404Spjd */
665168404Spjdint
666168404Spjdzio_inject_list_next(int *id, char *name, size_t buflen,
667168404Spjd    zinject_record_t *record)
668168404Spjd{
669168404Spjd	inject_handler_t *handler;
670168404Spjd	int ret;
671168404Spjd
672168404Spjd	mutex_enter(&spa_namespace_lock);
673168404Spjd	rw_enter(&inject_lock, RW_READER);
674168404Spjd
675168404Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
676168404Spjd	    handler = list_next(&inject_handlers, handler))
677168404Spjd		if (handler->zi_id > *id)
678168404Spjd			break;
679168404Spjd
680168404Spjd	if (handler) {
681168404Spjd		*record = handler->zi_record;
682168404Spjd		*id = handler->zi_id;
683168404Spjd		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
684168404Spjd		ret = 0;
685168404Spjd	} else {
686249195Smm		ret = SET_ERROR(ENOENT);
687168404Spjd	}
688168404Spjd
689168404Spjd	rw_exit(&inject_lock);
690168404Spjd	mutex_exit(&spa_namespace_lock);
691168404Spjd
692168404Spjd	return (ret);
693168404Spjd}
694168404Spjd
695168404Spjd/*
696168404Spjd * Clear the fault handler with the given identifier, or return ENOENT if none
697168404Spjd * exists.
698168404Spjd */
699168404Spjdint
700168404Spjdzio_clear_fault(int id)
701168404Spjd{
702168404Spjd	inject_handler_t *handler;
703168404Spjd
704168404Spjd	rw_enter(&inject_lock, RW_WRITER);
705168404Spjd
706168404Spjd	for (handler = list_head(&inject_handlers); handler != NULL;
707168404Spjd	    handler = list_next(&inject_handlers, handler))
708168404Spjd		if (handler->zi_id == id)
709168404Spjd			break;
710168404Spjd
711168404Spjd	if (handler == NULL) {
712219089Spjd		rw_exit(&inject_lock);
713249195Smm		return (SET_ERROR(ENOENT));
714168404Spjd	}
715168404Spjd
716296510Smav	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
717296510Smav		ASSERT3S(inject_delay_count, >, 0);
718296510Smav		inject_delay_count--;
719296510Smav		ASSERT3S(inject_delay_count, >=, 0);
720296510Smav	}
721296510Smav
722219089Spjd	list_remove(&inject_handlers, handler);
723168404Spjd	rw_exit(&inject_lock);
724168404Spjd
725296510Smav	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
726296510Smav		ASSERT3P(handler->zi_lanes, !=, NULL);
727296510Smav		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
728296510Smav		    handler->zi_record.zi_nlanes);
729296510Smav	} else {
730296510Smav		ASSERT3P(handler->zi_lanes, ==, NULL);
731296510Smav	}
732296510Smav
733219089Spjd	spa_inject_delref(handler->zi_spa);
734219089Spjd	kmem_free(handler, sizeof (inject_handler_t));
735270247Sdelphij	atomic_dec_32(&zio_injection_enabled);
736219089Spjd
737219089Spjd	return (0);
738168404Spjd}
739168404Spjd
740168404Spjdvoid
741168404Spjdzio_inject_init(void)
742168404Spjd{
743185029Spjd	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
744296510Smav	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
745168404Spjd	list_create(&inject_handlers, sizeof (inject_handler_t),
746168404Spjd	    offsetof(inject_handler_t, zi_link));
747168404Spjd}
748168404Spjd
749168404Spjdvoid
750168404Spjdzio_inject_fini(void)
751168404Spjd{
752168404Spjd	list_destroy(&inject_handlers);
753296510Smav	mutex_destroy(&inject_delay_mtx);
754185029Spjd	rw_destroy(&inject_lock);
755168404Spjd}
756