#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

# Source the shared test-suite library and the fault-test configuration.
# NOTE(review): the log_*/wait_*/check_state helpers and $TESTPOOL,
# $TEST_BASE_DIR, $MINVDEVSIZE used below are presumably provided by these
# two files -- confirm against the suite.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#

# Declare the contexts this test may run in; "both" is passed through to the
# suite helper unchanged.
verify_runnable "both"

#
# Tear down everything one test iteration created: remove all zinject
# handlers, destroy the test pool and delete the file-backed vdevs.
#
# Globals read: TESTPOOL, DATA_DEVS, SPARE_DEVS
#
function cleanup
{
	log_must zinject -c all
	destroy_pool $TESTPOOL
	# DATA_DEVS and SPARE_DEVS are space-separated path lists; they are
	# left unquoted on purpose so each path becomes its own argument.
	rm -f $DATA_DEVS $SPARE_DEVS
}

# State the assertion under test and register cleanup as the exit handler.
log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdevs used by the pools below, all under $TEST_BASE_DIR.
# FAULT_DEV* are the devices that receive injected I/O errors; SAFE_DEV* are
# never targeted by zinject; SPARE_DEV* are the hot spares.
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
# Space-separated lists, expanded unquoted wherever one argument per device
# is required.
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

# First pass: fault the two devices one after the other and check that a hot
# spare is activated for each. Runs once per vdev layout; SPARE1/SPARE2 name
# the spares expected to be activated first and second for that layout.
for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		# log_must: fail here, not later in zpool create, if the
		# backing files cannot be created.
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub
	# Wait for the scrub/resilver from the first fault to finish before
	# requesting another scrub.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Let any scrub/resilver finish before the faults are cleared.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		# log_must: fail here, not later in zpool create, if the
		# backing files cannot be created.
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# Both injections are started in the background (trailing '&' inside
	# the eval) so neither waits on the other.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup
	cleanup
done

# Every layout in both passes completed without a log_must failure.
log_pass "ZED successfully handles multiple faulted devices"