1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23296510Smav * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24168404Spjd */ 25168404Spjd 26168404Spjd/* 27168404Spjd * ZFS fault injection 28168404Spjd * 29168404Spjd * To handle fault injection, we keep track of a series of zinject_record_t 30168404Spjd * structures which describe which logical block(s) should be injected with a 31168404Spjd * fault. These are kept in a global list. Each record corresponds to a given 32168404Spjd * spa_t and maintains a special hold on the spa_t so that it cannot be deleted 33168404Spjd * or exported while the injection record exists. 34168404Spjd * 35168404Spjd * Device level injection is done using the 'zi_guid' field. If this is set, it 36168404Spjd * means that the error is destined for a particular device, not a piece of 37168404Spjd * data. 38168404Spjd * 39168404Spjd * This is a rather poor data structure and algorithm, but we don't expect more 40168404Spjd * than a few faults at any one time, so it should be sufficient for our needs. 41168404Spjd */ 42168404Spjd 43168404Spjd#include <sys/arc.h> 44168404Spjd#include <sys/zio_impl.h> 45168404Spjd#include <sys/zfs_ioctl.h> 46168404Spjd#include <sys/vdev_impl.h> 47219089Spjd#include <sys/dmu_objset.h> 48185029Spjd#include <sys/fs/zfs.h> 49168404Spjd 50168404Spjduint32_t zio_injection_enabled; 51168404Spjd 52296510Smav/* 53296510Smav * Data describing each zinject handler registered on the system, and 54296510Smav * contains the list node linking the handler in the global zinject 55296510Smav * handler list. 56296510Smav */ 57168404Spjdtypedef struct inject_handler { 58168404Spjd int zi_id; 59168404Spjd spa_t *zi_spa; 60168404Spjd zinject_record_t zi_record; 61296510Smav uint64_t *zi_lanes; 62296510Smav int zi_next_lane; 63168404Spjd list_node_t zi_link; 64168404Spjd} inject_handler_t; 65168404Spjd 66296510Smav/* 67296510Smav * List of all zinject handlers registered on the system, protected by 68296510Smav * the inject_lock defined below. 69296510Smav */ 70168404Spjdstatic list_t inject_handlers; 71296510Smav 72296510Smav/* 73296510Smav * This protects insertion into, and traversal of, the inject handler 74296510Smav * list defined above; as well as the inject_delay_count. Any time a 75296510Smav * handler is inserted or removed from the list, this lock should be 76296510Smav * taken as a RW_WRITER; and any time traversal is done over the list 77296510Smav * (without modification to it) this lock should be taken as a RW_READER. 78296510Smav */ 79168404Spjdstatic krwlock_t inject_lock; 80296510Smav 81296510Smav/* 82296510Smav * This holds the number of zinject delay handlers that have been 83296510Smav * registered on the system. It is protected by the inject_lock defined 84296510Smav * above. Thus modifications to this count must be a RW_WRITER of the 85296510Smav * inject_lock, and reads of this count must be (at least) a RW_READER 86296510Smav * of the lock. 87296510Smav */ 88296510Smavstatic int inject_delay_count = 0; 89296510Smav 90296510Smav/* 91296510Smav * This lock is used only in zio_handle_io_delay(), refer to the comment 92296510Smav * in that function for more details. 93296510Smav */ 94296510Smavstatic kmutex_t inject_delay_mtx; 95296510Smav 96296510Smav/* 97296510Smav * Used to assign unique identifying numbers to each new zinject handler. 98296510Smav */ 99168404Spjdstatic int inject_next_id = 1; 100168404Spjd 101168404Spjd/* 102168404Spjd * Returns true if the given record matches the I/O in progress. 103168404Spjd */ 104168404Spjdstatic boolean_t 105268123Sdelphijzio_match_handler(zbookmark_phys_t *zb, uint64_t type, 106168404Spjd zinject_record_t *record, int error) 107168404Spjd{ 108168404Spjd /* 109168404Spjd * Check for a match against the MOS, which is based on type 110168404Spjd */ 111219089Spjd if (zb->zb_objset == DMU_META_OBJSET && 112219089Spjd record->zi_objset == DMU_META_OBJSET && 113219089Spjd record->zi_object == DMU_META_DNODE_OBJECT) { 114168404Spjd if (record->zi_type == DMU_OT_NONE || 115168404Spjd type == record->zi_type) 116168404Spjd return (record->zi_freq == 0 || 117168404Spjd spa_get_random(100) < record->zi_freq); 118168404Spjd else 119168404Spjd return (B_FALSE); 120168404Spjd } 121168404Spjd 122168404Spjd /* 123168404Spjd * Check for an exact match. 124168404Spjd */ 125168404Spjd if (zb->zb_objset == record->zi_objset && 126168404Spjd zb->zb_object == record->zi_object && 127168404Spjd zb->zb_level == record->zi_level && 128168404Spjd zb->zb_blkid >= record->zi_start && 129168404Spjd zb->zb_blkid <= record->zi_end && 130168404Spjd error == record->zi_error) 131168404Spjd return (record->zi_freq == 0 || 132168404Spjd spa_get_random(100) < record->zi_freq); 133168404Spjd 134168404Spjd return (B_FALSE); 135168404Spjd} 136168404Spjd 137168404Spjd/* 138219089Spjd * Panic the system when a config change happens in the function 139219089Spjd * specified by tag. 140219089Spjd */ 141219089Spjdvoid 142219089Spjdzio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) 143219089Spjd{ 144219089Spjd inject_handler_t *handler; 145219089Spjd 146219089Spjd rw_enter(&inject_lock, RW_READER); 147219089Spjd 148219089Spjd for (handler = list_head(&inject_handlers); handler != NULL; 149219089Spjd handler = list_next(&inject_handlers, handler)) { 150219089Spjd 151219089Spjd if (spa != handler->zi_spa) 152219089Spjd continue; 153219089Spjd 154219089Spjd if (handler->zi_record.zi_type == type && 155219089Spjd strcmp(tag, handler->zi_record.zi_func) == 0) 156219089Spjd panic("Panic requested in function %s\n", tag); 157219089Spjd } 158219089Spjd 159219089Spjd rw_exit(&inject_lock); 160219089Spjd} 161219089Spjd 162219089Spjd/* 163168404Spjd * Determine if the I/O in question should return failure. Returns the errno 164168404Spjd * to be returned to the caller. 165168404Spjd */ 166168404Spjdint 167168404Spjdzio_handle_fault_injection(zio_t *zio, int error) 168168404Spjd{ 169168404Spjd int ret = 0; 170168404Spjd inject_handler_t *handler; 171168404Spjd 172168404Spjd /* 173168404Spjd * Ignore I/O not associated with any logical data. 174168404Spjd */ 175168404Spjd if (zio->io_logical == NULL) 176168404Spjd return (0); 177168404Spjd 178168404Spjd /* 179168404Spjd * Currently, we only support fault injection on reads. 180168404Spjd */ 181168404Spjd if (zio->io_type != ZIO_TYPE_READ) 182168404Spjd return (0); 183168404Spjd 184168404Spjd rw_enter(&inject_lock, RW_READER); 185168404Spjd 186168404Spjd for (handler = list_head(&inject_handlers); handler != NULL; 187168404Spjd handler = list_next(&inject_handlers, handler)) { 188168404Spjd 189247265Smm if (zio->io_spa != handler->zi_spa || 190247265Smm handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) 191168404Spjd continue; 192168404Spjd 193168404Spjd /* If this handler matches, return EIO */ 194168404Spjd if (zio_match_handler(&zio->io_logical->io_bookmark, 195168404Spjd zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, 196168404Spjd &handler->zi_record, error)) { 197168404Spjd ret = error; 198168404Spjd break; 199168404Spjd } 200168404Spjd } 201168404Spjd 202168404Spjd rw_exit(&inject_lock); 203168404Spjd 204168404Spjd return (ret); 205168404Spjd} 206168404Spjd 207185029Spjd/* 208185029Spjd * Determine if the zio is part of a label update and has an injection 209185029Spjd * handler associated with that portion of the label. Currently, we 210185029Spjd * allow error injection in either the nvlist or the uberblock region of 211185029Spjd * of the vdev label. 212185029Spjd */ 213168404Spjdint 214185029Spjdzio_handle_label_injection(zio_t *zio, int error) 215185029Spjd{ 216185029Spjd inject_handler_t *handler; 217185029Spjd vdev_t *vd = zio->io_vd; 218185029Spjd uint64_t offset = zio->io_offset; 219185029Spjd int label; 220185029Spjd int ret = 0; 221185029Spjd 222219089Spjd if (offset >= VDEV_LABEL_START_SIZE && 223185029Spjd offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) 224185029Spjd return (0); 225185029Spjd 226185029Spjd rw_enter(&inject_lock, RW_READER); 227185029Spjd 228185029Spjd for (handler = list_head(&inject_handlers); handler != NULL; 229185029Spjd handler = list_next(&inject_handlers, handler)) { 230185029Spjd uint64_t start = handler->zi_record.zi_start; 231185029Spjd uint64_t end = handler->zi_record.zi_end; 232185029Spjd 233247265Smm if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) 234185029Spjd continue; 235185029Spjd 236185029Spjd /* 237185029Spjd * The injection region is the relative offsets within a 238185029Spjd * vdev label. We must determine the label which is being 239185029Spjd * updated and adjust our region accordingly. 240185029Spjd */ 241185029Spjd label = vdev_label_number(vd->vdev_psize, offset); 242185029Spjd start = vdev_label_offset(vd->vdev_psize, label, start); 243185029Spjd end = vdev_label_offset(vd->vdev_psize, label, end); 244185029Spjd 245185029Spjd if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && 246185029Spjd (offset >= start && offset <= end)) { 247185029Spjd ret = error; 248185029Spjd break; 249185029Spjd } 250185029Spjd } 251185029Spjd rw_exit(&inject_lock); 252185029Spjd return (ret); 253185029Spjd} 254185029Spjd 255185029Spjd 256185029Spjdint 257213198Smmzio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) 258168404Spjd{ 259168404Spjd inject_handler_t *handler; 260168404Spjd int ret = 0; 261168404Spjd 262219089Spjd /* 263219089Spjd * We skip over faults in the labels unless it's during 264219089Spjd * device open (i.e. zio == NULL). 265219089Spjd */ 266219089Spjd if (zio != NULL) { 267219089Spjd uint64_t offset = zio->io_offset; 268219089Spjd 269219089Spjd if (offset < VDEV_LABEL_START_SIZE || 270219089Spjd offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) 271219089Spjd return (0); 272219089Spjd } 273219089Spjd 274168404Spjd rw_enter(&inject_lock, RW_READER); 275168404Spjd 276168404Spjd for (handler = list_head(&inject_handlers); handler != NULL; 277168404Spjd handler = list_next(&inject_handlers, handler)) { 278168404Spjd 279247265Smm if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) 280185029Spjd continue; 281185029Spjd 282168404Spjd if (vd->vdev_guid == handler->zi_record.zi_guid) { 283213198Smm if (handler->zi_record.zi_failfast && 284213198Smm (zio == NULL || (zio->io_flags & 285213198Smm (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { 286213198Smm continue; 287213198Smm } 288213198Smm 289219089Spjd /* Handle type specific I/O failures */ 290219089Spjd if (zio != NULL && 291219089Spjd handler->zi_record.zi_iotype != ZIO_TYPES && 292219089Spjd handler->zi_record.zi_iotype != zio->io_type) 293219089Spjd continue; 294219089Spjd 295168404Spjd if (handler->zi_record.zi_error == error) { 296168404Spjd /* 297168404Spjd * For a failed open, pretend like the device 298168404Spjd * has gone away. 299168404Spjd */ 300168404Spjd if (error == ENXIO) 301168404Spjd vd->vdev_stat.vs_aux = 302168404Spjd VDEV_AUX_OPEN_FAILED; 303219089Spjd 304219089Spjd /* 305219089Spjd * Treat these errors as if they had been 306219089Spjd * retried so that all the appropriate stats 307219089Spjd * and FMA events are generated. 308219089Spjd */ 309219089Spjd if (!handler->zi_record.zi_failfast && 310219089Spjd zio != NULL) 311219089Spjd zio->io_flags |= ZIO_FLAG_IO_RETRY; 312219089Spjd 313168404Spjd ret = error; 314168404Spjd break; 315168404Spjd } 316168404Spjd if (handler->zi_record.zi_error == ENXIO) { 317249195Smm ret = SET_ERROR(EIO); 318168404Spjd break; 319168404Spjd } 320168404Spjd } 321168404Spjd } 322168404Spjd 323168404Spjd rw_exit(&inject_lock); 324168404Spjd 325168404Spjd return (ret); 326168404Spjd} 327168404Spjd 328168404Spjd/* 329219089Spjd * Simulate hardware that ignores cache flushes. For requested number 330219089Spjd * of seconds nix the actual writing to disk. 331219089Spjd */ 332219089Spjdvoid 333219089Spjdzio_handle_ignored_writes(zio_t *zio) 334219089Spjd{ 335219089Spjd inject_handler_t *handler; 336219089Spjd 337219089Spjd rw_enter(&inject_lock, RW_READER); 338219089Spjd 339219089Spjd for (handler = list_head(&inject_handlers); handler != NULL; 340219089Spjd handler = list_next(&inject_handlers, handler)) { 341219089Spjd 342219089Spjd /* Ignore errors not destined for this pool */ 343247265Smm if (zio->io_spa != handler->zi_spa || 344247265Smm handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) 345219089Spjd continue; 346219089Spjd 347219089Spjd /* 348219089Spjd * Positive duration implies # of seconds, negative 349219089Spjd * a number of txgs 350219089Spjd */ 351219089Spjd if (handler->zi_record.zi_timer == 0) { 352219089Spjd if (handler->zi_record.zi_duration > 0) 353219089Spjd handler->zi_record.zi_timer = ddi_get_lbolt64(); 354219089Spjd else 355219089Spjd handler->zi_record.zi_timer = zio->io_txg; 356219089Spjd } 357219089Spjd 358219089Spjd /* Have a "problem" writing 60% of the time */ 359219089Spjd if (spa_get_random(100) < 60) 360219089Spjd zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 361219089Spjd break; 362219089Spjd } 363219089Spjd 364219089Spjd rw_exit(&inject_lock); 365219089Spjd} 366219089Spjd 367219089Spjdvoid 368219089Spjdspa_handle_ignored_writes(spa_t *spa) 369219089Spjd{ 370219089Spjd inject_handler_t *handler; 371219089Spjd 372219089Spjd if (zio_injection_enabled == 0) 373219089Spjd return; 374219089Spjd 375219089Spjd rw_enter(&inject_lock, RW_READER); 376219089Spjd 377219089Spjd for (handler = list_head(&inject_handlers); handler != NULL; 378219089Spjd handler = list_next(&inject_handlers, handler)) { 379219089Spjd 380247265Smm if (spa != handler->zi_spa || 381247265Smm handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) 382219089Spjd continue; 383219089Spjd 384219089Spjd if (handler->zi_record.zi_duration > 0) { 385219089Spjd VERIFY(handler->zi_record.zi_timer == 0 || 386219089Spjd handler->zi_record.zi_timer + 387219089Spjd handler->zi_record.zi_duration * hz > 388219089Spjd ddi_get_lbolt64()); 389219089Spjd } else { 390219089Spjd /* duration is negative so the subtraction here adds */ 391219089Spjd VERIFY(handler->zi_record.zi_timer == 0 || 392219089Spjd handler->zi_record.zi_timer - 393219089Spjd handler->zi_record.zi_duration >= 394219089Spjd spa_syncing_txg(spa)); 395219089Spjd } 396219089Spjd } 397219089Spjd 398219089Spjd rw_exit(&inject_lock); 399219089Spjd} 400219089Spjd 401296510Smavhrtime_t 402247265Smmzio_handle_io_delay(zio_t *zio) 403247265Smm{ 404247265Smm vdev_t *vd = zio->io_vd; 405296510Smav inject_handler_t *min_handler = NULL; 406296510Smav hrtime_t min_target = 0; 407247265Smm 408296510Smav rw_enter(&inject_lock, RW_READER); 409296510Smav 410296510Smav /* 411296510Smav * inject_delay_count is a subset of zio_injection_enabled that 412296510Smav * is only incremented for delay handlers. These checks are 413296510Smav * mainly added to remind the reader why we're not explicitly 414296510Smav * checking zio_injection_enabled like the other functions. 415296510Smav */ 416296510Smav IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); 417296510Smav IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); 418296510Smav 419296510Smav /* 420296510Smav * If there aren't any inject delay handlers registered, then we 421296510Smav * can short circuit and simply return 0 here. A value of zero 422296510Smav * informs zio_delay_interrupt() that this request should not be 423296510Smav * delayed. This short circuit keeps us from acquiring the 424296510Smav * inject_delay_mutex unnecessarily. 425296510Smav */ 426296510Smav if (inject_delay_count == 0) { 427296510Smav rw_exit(&inject_lock); 428247265Smm return (0); 429296510Smav } 430247265Smm 431296510Smav /* 432296510Smav * Each inject handler has a number of "lanes" associated with 433296510Smav * it. Each lane is able to handle requests independently of one 434296510Smav * another, and at a latency defined by the inject handler 435296510Smav * record's zi_timer field. Thus if a handler in configured with 436296510Smav * a single lane with a 10ms latency, it will delay requests 437296510Smav * such that only a single request is completed every 10ms. So, 438296510Smav * if more than one request is attempted per each 10ms interval, 439296510Smav * the average latency of the requests will be greater than 440296510Smav * 10ms; but if only a single request is submitted each 10ms 441296510Smav * interval the average latency will be 10ms. 442296510Smav * 443296510Smav * We need to acquire this mutex to prevent multiple concurrent 444296510Smav * threads being assigned to the same lane of a given inject 445296510Smav * handler. The mutex allows us to perform the following two 446296510Smav * operations atomically: 447296510Smav * 448296510Smav * 1. determine the minimum handler and minimum target 449296510Smav * value of all the possible handlers 450296510Smav * 2. update that minimum handler's lane array 451296510Smav * 452296510Smav * Without atomicity, two (or more) threads could pick the same 453296510Smav * lane in step (1), and then conflict with each other in step 454296510Smav * (2). This could allow a single lane handler to process 455296510Smav * multiple requests simultaneously, which shouldn't be possible. 456296510Smav */ 457296510Smav mutex_enter(&inject_delay_mtx); 458247265Smm 459296510Smav for (inject_handler_t *handler = list_head(&inject_handlers); 460296510Smav handler != NULL; handler = list_next(&inject_handlers, handler)) { 461247265Smm if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) 462247265Smm continue; 463247265Smm 464296510Smav if (vd->vdev_guid != handler->zi_record.zi_guid) 465296510Smav continue; 466296510Smav 467296510Smav /* 468296510Smav * Defensive; should never happen as the array allocation 469296510Smav * occurs prior to inserting this handler on the list. 470296510Smav */ 471296510Smav ASSERT3P(handler->zi_lanes, !=, NULL); 472296510Smav 473296510Smav /* 474296510Smav * This should never happen, the zinject command should 475296510Smav * prevent a user from setting an IO delay with zero lanes. 476296510Smav */ 477296510Smav ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); 478296510Smav 479296510Smav ASSERT3U(handler->zi_record.zi_nlanes, >, 480296510Smav handler->zi_next_lane); 481296510Smav 482296510Smav /* 483296510Smav * We want to issue this IO to the lane that will become 484296510Smav * idle the soonest, so we compare the soonest this 485296510Smav * specific handler can complete the IO with all other 486296510Smav * handlers, to find the lowest value of all possible 487296510Smav * lanes. We then use this lane to submit the request. 488296510Smav * 489296510Smav * Since each handler has a constant value for its 490296510Smav * delay, we can just use the "next" lane for that 491296510Smav * handler; as it will always be the lane with the 492296510Smav * lowest value for that particular handler (i.e. the 493296510Smav * lane that will become idle the soonest). This saves a 494296510Smav * scan of each handler's lanes array. 495296510Smav * 496296510Smav * There's two cases to consider when determining when 497296510Smav * this specific IO request should complete. If this 498296510Smav * lane is idle, we want to "submit" the request now so 499296510Smav * it will complete after zi_timer milliseconds. Thus, 500296510Smav * we set the target to now + zi_timer. 501296510Smav * 502296510Smav * If the lane is busy, we want this request to complete 503296510Smav * zi_timer milliseconds after the lane becomes idle. 504296510Smav * Since the 'zi_lanes' array holds the time at which 505296510Smav * each lane will become idle, we use that value to 506296510Smav * determine when this request should complete. 507296510Smav */ 508296510Smav hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); 509296510Smav hrtime_t busy = handler->zi_record.zi_timer + 510296510Smav handler->zi_lanes[handler->zi_next_lane]; 511296510Smav hrtime_t target = MAX(idle, busy); 512296510Smav 513296510Smav if (min_handler == NULL) { 514296510Smav min_handler = handler; 515296510Smav min_target = target; 516296510Smav continue; 517247265Smm } 518247265Smm 519296510Smav ASSERT3P(min_handler, !=, NULL); 520296510Smav ASSERT3U(min_target, !=, 0); 521296510Smav 522296510Smav /* 523296510Smav * We don't yet increment the "next lane" variable since 524296510Smav * we still might find a lower value lane in another 525296510Smav * handler during any remaining iterations. Once we're 526296510Smav * sure we've selected the absolute minimum, we'll claim 527296510Smav * the lane and increment the handler's "next lane" 528296510Smav * field below. 529296510Smav */ 530296510Smav 531296510Smav if (target < min_target) { 532296510Smav min_handler = handler; 533296510Smav min_target = target; 534296510Smav } 535247265Smm } 536296510Smav 537296510Smav /* 538296510Smav * 'min_handler' will be NULL if no IO delays are registered for 539296510Smav * this vdev, otherwise it will point to the handler containing 540296510Smav * the lane that will become idle the soonest. 541296510Smav */ 542296510Smav if (min_handler != NULL) { 543296510Smav ASSERT3U(min_target, !=, 0); 544296510Smav min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; 545296510Smav 546296510Smav /* 547296510Smav * If we've used all possible lanes for this handler, 548296510Smav * loop back and start using the first lane again; 549296510Smav * otherwise, just increment the lane index. 550296510Smav */ 551296510Smav min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % 552296510Smav min_handler->zi_record.zi_nlanes; 553296510Smav } 554296510Smav 555296510Smav mutex_exit(&inject_delay_mtx); 556247265Smm rw_exit(&inject_lock); 557296510Smav 558296510Smav return (min_target); 559247265Smm} 560247265Smm 561219089Spjd/* 562168404Spjd * Create a new handler for the given record. We add it to the list, adding 563168404Spjd * a reference to the spa_t in the process. We increment zio_injection_enabled, 564168404Spjd * which is the switch to trigger all fault injection. 565168404Spjd */ 566168404Spjdint 567168404Spjdzio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) 568168404Spjd{ 569168404Spjd inject_handler_t *handler; 570168404Spjd int error; 571168404Spjd spa_t *spa; 572168404Spjd 573168404Spjd /* 574168404Spjd * If this is pool-wide metadata, make sure we unload the corresponding 575168404Spjd * spa_t, so that the next attempt to load it will trigger the fault. 576168404Spjd * We call spa_reset() to unload the pool appropriately. 577168404Spjd */ 578168404Spjd if (flags & ZINJECT_UNLOAD_SPA) 579168404Spjd if ((error = spa_reset(name)) != 0) 580168404Spjd return (error); 581168404Spjd 582296510Smav if (record->zi_cmd == ZINJECT_DELAY_IO) { 583296510Smav /* 584296510Smav * A value of zero for the number of lanes or for the 585296510Smav * delay time doesn't make sense. 586296510Smav */ 587296510Smav if (record->zi_timer == 0 || record->zi_nlanes == 0) 588296510Smav return (SET_ERROR(EINVAL)); 589296510Smav 590296510Smav /* 591296510Smav * The number of lanes is directly mapped to the size of 592296510Smav * an array used by the handler. Thus, to ensure the 593296510Smav * user doesn't trigger an allocation that's "too large" 594296510Smav * we cap the number of lanes here. 595296510Smav */ 596296510Smav if (record->zi_nlanes >= UINT16_MAX) 597296510Smav return (SET_ERROR(EINVAL)); 598296510Smav } 599296510Smav 600168404Spjd if (!(flags & ZINJECT_NULL)) { 601168404Spjd /* 602168404Spjd * spa_inject_ref() will add an injection reference, which will 603168404Spjd * prevent the pool from being removed from the namespace while 604168404Spjd * still allowing it to be unloaded. 605168404Spjd */ 606168404Spjd if ((spa = spa_inject_addref(name)) == NULL) 607249195Smm return (SET_ERROR(ENOENT)); 608168404Spjd 609168404Spjd handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); 610168404Spjd 611296510Smav handler->zi_spa = spa; 612296510Smav handler->zi_record = *record; 613296510Smav 614296510Smav if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 615296510Smav handler->zi_lanes = kmem_zalloc( 616296510Smav sizeof (*handler->zi_lanes) * 617296510Smav handler->zi_record.zi_nlanes, KM_SLEEP); 618296510Smav handler->zi_next_lane = 0; 619296510Smav } else { 620296510Smav handler->zi_lanes = NULL; 621296510Smav handler->zi_next_lane = 0; 622296510Smav } 623296510Smav 624168404Spjd rw_enter(&inject_lock, RW_WRITER); 625168404Spjd 626296510Smav /* 627296510Smav * We can't move this increment into the conditional 628296510Smav * above because we need to hold the RW_WRITER lock of 629296510Smav * inject_lock, and we don't want to hold that while 630296510Smav * allocating the handler's zi_lanes array. 631296510Smav */ 632296510Smav if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 633296510Smav ASSERT3S(inject_delay_count, >=, 0); 634296510Smav inject_delay_count++; 635296510Smav ASSERT3S(inject_delay_count, >, 0); 636296510Smav } 637296510Smav 638168404Spjd *id = handler->zi_id = inject_next_id++; 639168404Spjd list_insert_tail(&inject_handlers, handler); 640270247Sdelphij atomic_inc_32(&zio_injection_enabled); 641168404Spjd 642168404Spjd rw_exit(&inject_lock); 643168404Spjd } 644168404Spjd 645168404Spjd /* 646168404Spjd * Flush the ARC, so that any attempts to read this data will end up 647168404Spjd * going to the ZIO layer. Note that this is a little overkill, but 648168404Spjd * we don't have the necessary ARC interfaces to do anything else, and 649168404Spjd * fault injection isn't a performance critical path. 650168404Spjd */ 651168404Spjd if (flags & ZINJECT_FLUSH_ARC) 652286763Smav /* 653286763Smav * We must use FALSE to ensure arc_flush returns, since 654286763Smav * we're not preventing concurrent ARC insertions. 655286763Smav */ 656286763Smav arc_flush(NULL, FALSE); 657168404Spjd 658168404Spjd return (0); 659168404Spjd} 660168404Spjd 661168404Spjd/* 662168404Spjd * Returns the next record with an ID greater than that supplied to the 663168404Spjd * function. Used to iterate over all handlers in the system. 664168404Spjd */ 665168404Spjdint 666168404Spjdzio_inject_list_next(int *id, char *name, size_t buflen, 667168404Spjd zinject_record_t *record) 668168404Spjd{ 669168404Spjd inject_handler_t *handler; 670168404Spjd int ret; 671168404Spjd 672168404Spjd mutex_enter(&spa_namespace_lock); 673168404Spjd rw_enter(&inject_lock, RW_READER); 674168404Spjd 675168404Spjd for (handler = list_head(&inject_handlers); handler != NULL; 676168404Spjd handler = list_next(&inject_handlers, handler)) 677168404Spjd if (handler->zi_id > *id) 678168404Spjd break; 679168404Spjd 680168404Spjd if (handler) { 681168404Spjd *record = handler->zi_record; 682168404Spjd *id = handler->zi_id; 683168404Spjd (void) strncpy(name, spa_name(handler->zi_spa), buflen); 684168404Spjd ret = 0; 685168404Spjd } else { 686249195Smm ret = SET_ERROR(ENOENT); 687168404Spjd } 688168404Spjd 689168404Spjd rw_exit(&inject_lock); 690168404Spjd mutex_exit(&spa_namespace_lock); 691168404Spjd 692168404Spjd return (ret); 693168404Spjd} 694168404Spjd 695168404Spjd/* 696168404Spjd * Clear the fault handler with the given identifier, or return ENOENT if none 697168404Spjd * exists. 698168404Spjd */ 699168404Spjdint 700168404Spjdzio_clear_fault(int id) 701168404Spjd{ 702168404Spjd inject_handler_t *handler; 703168404Spjd 704168404Spjd rw_enter(&inject_lock, RW_WRITER); 705168404Spjd 706168404Spjd for (handler = list_head(&inject_handlers); handler != NULL; 707168404Spjd handler = list_next(&inject_handlers, handler)) 708168404Spjd if (handler->zi_id == id) 709168404Spjd break; 710168404Spjd 711168404Spjd if (handler == NULL) { 712219089Spjd rw_exit(&inject_lock); 713249195Smm return (SET_ERROR(ENOENT)); 714168404Spjd } 715168404Spjd 716296510Smav if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 717296510Smav ASSERT3S(inject_delay_count, >, 0); 718296510Smav inject_delay_count--; 719296510Smav ASSERT3S(inject_delay_count, >=, 0); 720296510Smav } 721296510Smav 722219089Spjd list_remove(&inject_handlers, handler); 723168404Spjd rw_exit(&inject_lock); 724168404Spjd 725296510Smav if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 726296510Smav ASSERT3P(handler->zi_lanes, !=, NULL); 727296510Smav kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * 728296510Smav handler->zi_record.zi_nlanes); 729296510Smav } else { 730296510Smav ASSERT3P(handler->zi_lanes, ==, NULL); 731296510Smav } 732296510Smav 733219089Spjd spa_inject_delref(handler->zi_spa); 734219089Spjd kmem_free(handler, sizeof (inject_handler_t)); 735270247Sdelphij atomic_dec_32(&zio_injection_enabled); 736219089Spjd 737219089Spjd return (0); 738168404Spjd} 739168404Spjd 740168404Spjdvoid 741168404Spjdzio_inject_init(void) 742168404Spjd{ 743185029Spjd rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); 744296510Smav mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); 745168404Spjd list_create(&inject_handlers, sizeof (inject_handler_t), 746168404Spjd offsetof(inject_handler_t, zi_link)); 747168404Spjd} 748168404Spjd 749168404Spjdvoid 750168404Spjdzio_inject_fini(void) 751168404Spjd{ 752168404Spjd list_destroy(&inject_handlers); 753296510Smav mutex_destroy(&inject_delay_mtx); 754185029Spjd rw_destroy(&inject_lock); 755168404Spjd} 756