#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#

verify_runnable "both"

#
# Remove every zinject handler, destroy the test pool and delete the
# file-backed vdevs. Registered with log_onexit below and also called at the
# end of every loop iteration so that each layout starts from scratch.
#
function cleanup
{
	log_must zinject -c all
	destroy_pool "$TESTPOOL"
	# $DATA_DEVS and $SPARE_DEVS are space-separated lists and are
	# word-split on purpose.
	rm -f $DATA_DEVS $SPARE_DEVS
}

#
# Create the backing files and $TESTPOOL for one vdev layout.
#
# $1 - vdev type: "mirror", "raidz", "raidz2", "raidz3" or "draid2:1s"
#
# Sets the globals SPARE1 and SPARE2 to the names of the two spares that the
# rest of the test waits on.
#
function spare_multi_create_pool
{
	typeset pooltype=$1

	case "$pooltype" in
	"draid2:1s")
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		log_must truncate -s "$MINVDEVSIZE" $DATA_DEVS "$SPARE_DEV1"
		log_must zpool create -f "$TESTPOOL" "$pooltype" \
		    "$SAFE_DEV1" "$SAFE_DEV2" "$FAULT_DEV1" "$FAULT_DEV2" \
		    "$SAFE_DEV3" "$SAFE_DEV4" spare "$SPARE_DEV1"
		SPARE1=$SPARE_DEV1
		# Second spare is referenced by name rather than by a file
		# path; presumably the pool's distributed spare.
		SPARE2="draid2-0-0"
		;;
	mirror)
		# 1. Create a 3-way mirror pool with two hot spares
		log_must truncate -s "$MINVDEVSIZE" $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f "$TESTPOOL" "$pooltype" \
		    "$FAULT_DEV1" "$FAULT_DEV2" "$SAFE_DEV1" spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	*)
		# 1. Create a raidz pool with two hot spares
		log_must truncate -s "$MINVDEVSIZE" $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f "$TESTPOOL" "$pooltype" $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	esac
}

#
# Poll once a second until neither a scrub nor a resilver is reported as
# running on $TESTPOOL.
#
function spare_multi_wait_scan_idle
{
	while is_pool_scrubbing "$TESTPOOL" || \
	    is_pool_resilvering "$TESTPOOL"; do
		sleep 1
	done
}

log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdevs: two that get faulted, four that stay healthy, two spares.
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

# First pass: fault the two devices one after the other.
for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	# 1. Create the pool and its hot spares (sets SPARE1 and SPARE2)
	spare_multi_create_pool "$type"

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d "$FAULT_DEV1" -e io -T all -f 100 "$TESTPOOL"

	# 3. Start a scrub
	log_must zpool scrub "$TESTPOOL"

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV1" "FAULTED" 60
	log_must wait_vdev_state "$TESTPOOL" "$SPARE1" "ONLINE" 60
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE1" "INUSE"
	log_must check_state "$TESTPOOL" "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d "$FAULT_DEV2" -e io -T all -f 100 "$TESTPOOL"

	# 6. Start a scrub, once the previous scrub/resilver has finished
	spare_multi_wait_scan_idle
	log_must zpool scrub "$TESTPOOL"

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV2" "FAULTED" 60
	log_must wait_vdev_state "$TESTPOOL" "$SPARE2" "ONLINE" 60
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE2" "INUSE"
	log_must check_state "$TESTPOOL" "" "DEGRADED"

	# Let the scrub/resilver finish before the faults are cleared
	spare_multi_wait_scan_idle

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear "$TESTPOOL" "$FAULT_DEV1"
	log_must zpool clear "$TESTPOOL" "$FAULT_DEV2"

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV1" "ONLINE" 60
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV2" "ONLINE" 60
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE1" "AVAIL"
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE2" "AVAIL"
	log_must check_state "$TESTPOOL" "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	# 1. Create the pool and its hot spares (sets SPARE1 and SPARE2)
	spare_multi_create_pool "$type"

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# NOTE(review): the trailing '&' backgrounds each zinject, so the
	# status checked here is presumably that of launching the job and not
	# of zinject itself - confirm against log_must.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub "$TESTPOOL"

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV1" "FAULTED" 60
	log_must wait_vdev_state "$TESTPOOL" "$FAULT_DEV2" "FAULTED" 60
	log_must wait_vdev_state "$TESTPOOL" "$SPARE1" "ONLINE" 60
	log_must wait_vdev_state "$TESTPOOL" "$SPARE2" "ONLINE" 60
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE1" "INUSE"
	log_must wait_hotspare_state "$TESTPOOL" "$SPARE2" "INUSE"
	log_must check_state "$TESTPOOL" "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear "$TESTPOOL" "$FAULT_DEV1"
	log_must zpool clear "$TESTPOOL" "$FAULT_DEV2"

	# Cleanup
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"