#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#
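
# For reference, while a hot spare is in use 'zpool status' groups it with
# the faulted device under a "spare" vdev, roughly (illustrative, trimmed):
#
#	spare-0          DEGRADED
#	  fault-dev1     FAULTED
#	  spare-dev1     ONLINE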

verify_runnable "both"

function cleanup
{
	log_must zinject -c all
	destroy_pool $TESTPOOL
	rm -f $DATA_DEVS $SPARE_DEVS
}

log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

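# File-backed vdevs: two devices that will have errors injected, four that
# stay healthy, and two hot spares.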
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
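		# Distributed spares are named draid<parity>-<vdev id>-<spare id>,
		# so the single distributed spare of this draid2 top-level vdev
		# is "draid2-0-0".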
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	# (-e io injects I/O errors, -T all matches every I/O type, and
	# -f 100 fails 100% of matching I/Os)
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	# (wait_vdev_state and wait_hotspare_state are libtest.shlib helpers
	# that poll until the expected state is reached or the timeout expires)
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"
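
	# The pool remains DEGRADED while the spare is INUSE; the spare
	# stands in for the faulted device but does not replace it.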

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub
	# A scrub or resilver may still be running from the auto-spare above;
	# wait for it to finish first, since 'zpool scrub' fails if one is
	# already in progress.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Wait for the scrub and any resilver to complete before clearing
	# the faults below.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2
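
	# Once the faults are cleared and the devices are healthy again, the
	# spares are expected to detach automatically and return to AVAIL.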

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		log_must truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# (both handlers are registered in the background so the two devices
	# start failing at nearly the same time)
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"