#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#

verify_runnable "both"

function cleanup
{
	# Remove all outstanding zinject handlers before tearing down the
	# pool, then delete the file-backed vdevs used by this test.
	log_must zinject -c all
	destroy_pool $TESTPOOL
	rm -f $DATA_DEVS $SPARE_DEVS
}

log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

#
# Create $TESTPOOL with the given top-level vdev type and hot spares, and
# set the globals SPARE1/SPARE2 to the two spares the ZED is expected to
# kick in (in order).  Factored out because both test loops below need the
# exact same pool layouts.
#
# $1 - vdev type: "mirror", "raidz", "raidz2", "raidz3" or "draid2:1s"
#
function setup_test_pool # type
{
	typeset type=$1

	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi
}

# First pass: fault the two devices one after the other.
for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	# 1. Create a pool with hot spares; sets SPARE1/SPARE2
	setup_test_pool "$type"

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub, once any scrub/resilver already in progress
	# has finished (only one can run at a time).
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Let any scrub or resilver in progress complete before clearing
	# the injected faults below.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	# 1. Create a pool with hot spares; sets SPARE1/SPARE2
	setup_test_pool "$type"

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# (backgrounded so both handlers are armed concurrently)
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"