| #!/bin/bash |
| # |
| # |
| # HealthSMART OCF RA. Checks the S.M.A.R.T. status of all given |
| # drives and writes the #health-smart status into the CIB |
| # |
| # Copyright (c) 2009 Michael Schwartzkopff, 2010 Matthew Richardson |
| # |
| # All Rights Reserved. |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of version 2 of the GNU General Public License as |
| # published by the Free Software Foundation. |
| # |
| # This program is distributed in the hope that it would be useful, but |
| # WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| # |
| # Further, this software is distributed without any warranty that it is |
| # free of the rightful claim of any third person regarding infringement |
| # or the like. Any license provided herein, whether implied or |
| # otherwise, applies only to this software file. Patent licenses, if |
| # any, provided herein do not apply to combinations of this program with |
| # other software, or any other product whatsoever. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with this program; if not, write the Free Software Foundation, |
| # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. |
| # |
| ####################################################################### |
| |
| ####################################################################### |
| # Initialization: |
| |
# Locate and source the OCF shell function library. OCF_FUNCTIONS may be
# preset by the caller; otherwise it is derived from OCF_ROOT. The expansions
# are quoted so a root path containing whitespace or glob characters is used
# verbatim.
: "${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}"
. "${OCF_FUNCTIONS}"

# The requested action defaults to the first command line argument.
: "${__OCF_ACTION=$1}"

# External tools used by this agent.
SMARTCTL=/usr/sbin/smartctl
ATTRDUP=/usr/sbin/attrd_updater
| |
| ####################################################################### |
| |
#######################################
# Print the resource agent meta-data XML.
# Globals:   HA_VARRUN, OCF_RESOURCE_INSTANCE (read; expanded into the
#            default of the "state" parameter)
# Arguments: none
# Outputs:   the XML document on stdout
#######################################
meta_data() {
  # The here-document delimiter is unquoted on purpose: the default state
  # file path below is built from shell variables.
  cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="HealthSMART" version="0.1">
<version>1.0</version>

<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>

<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>

<parameter name="drives" unique="0">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>

<parameter name="devices" unique="0">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>

<parameter name="temp_lower_limit" unique="0">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>

<parameter name="temp_upper_limit" unique="0">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>

<parameter name="temp_warning" unique="0">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>

</parameters>

<actions>
<action name="start" timeout="10" />
<action name="stop" timeout="10" />
<action name="monitor" timeout="10" interval="10" start-delay="0" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="10" />
</actions>
</resource-agent>
END
}
| |
| ####################################################################### |
| |
#######################################
# Compare one temperature reading against the configured limits and, when it
# is out of range, write the matching colour to the "#health-smart" attribute.
# Globals:   lower_red_limit, upper_red_limit, lower_yellow_limit,
#            upper_yellow_limit, DRIVE, DEVICE, ATTRDUP (all read)
# Arguments: $1 - temperature in deg C (integer)
# Returns:   0 when the reading is inside all limits, or when no usable
#            reading was supplied; 1 when "red" or "yellow" was written
#######################################
check_temperature() {
  local temp=$1
  local colour verdict

  # A drive that does not report the temperature attribute yields an empty
  # argument. Comparing it numerically would only produce test(1) syntax
  # errors, so report it once and leave the attribute untouched.
  if ! [[ $temp =~ ^-?[0-9]+$ ]]; then
    ocf_log warn "Drive ${DRIVE} ${DEVICE} reported no usable temperature: '${temp}'"
    return 0
  fi

  # [ ... ] is used instead of (( ... )) so that zero-padded readings such
  # as "08" are compared as decimal numbers rather than parsed as octal.
  if [ "$temp" -lt "${lower_red_limit}" ]; then
    colour="red"
    verdict="too cold"
  elif [ "$temp" -gt "${upper_red_limit}" ]; then
    colour="red"
    verdict="too hot"
  elif [ "$temp" -lt "${lower_yellow_limit}" ]; then
    colour="yellow"
    verdict="quite cold"
  elif [ "$temp" -gt "${upper_yellow_limit}" ]; then
    colour="yellow"
    verdict="quite hot"
  else
    return 0
  fi

  ocf_log info "Drive ${DRIVE} ${DEVICE} ${verdict}: ${temp} C"
  "$ATTRDUP" -n "#health-smart" -U "$colour" -d "5s"
  return 1
}
| |
| |
#######################################
# Derive the temperature limits and the drive list from the resource
# parameters and verify that smartctl is usable for every configured drive.
# Globals:   OCF_RESKEY_temp_warning, OCF_RESKEY_temp_lower_limit,
#            OCF_RESKEY_temp_upper_limit, OCF_RESKEY_drives,
#            OCF_RESKEY_devices, SMARTCTL (read)
#            yellow_threshold, lower_red_limit, lower_yellow_limit,
#            upper_red_limit, upper_yellow_limit, DRIVES (written)
# Arguments: none
# Returns:   0 on success. Exits the shell with OCF_ERR_INSTALLED when
#            smartctl is not executable or a drive does not report
#            "SMART support is: Enabled".
#######################################
init_smart() {
  local DRIVE DEVICE

  # Temperature defaults; an unset or empty parameter selects the default.
  yellow_threshold=${OCF_RESKEY_temp_warning:-5}
  lower_red_limit=${OCF_RESKEY_temp_lower_limit:-0}
  upper_red_limit=${OCF_RESKEY_temp_upper_limit:-60}
  lower_yellow_limit=$((lower_red_limit + yellow_threshold))
  upper_yellow_limit=$((upper_red_limit - yellow_threshold))

  # Disk defaults
  DRIVES=${OCF_RESKEY_drives:-/dev/sda}

  # Test for presence of smartctl
  if [ ! -x "$SMARTCTL" ]; then
    ocf_log err "${SMARTCTL} not installed."
    exit $OCF_ERR_INSTALLED
  fi

  # DRIVES and OCF_RESKEY_devices are SPACE separated lists, so they are
  # deliberately left unquoted in the loop headers to be word-split.
  for DRIVE in $DRIVES; do
    if [ -n "${OCF_RESKEY_devices}" ]; then
      for DEVICE in ${OCF_RESKEY_devices}; do
        if ! "$SMARTCTL" -d "$DEVICE" -i "$DRIVE" \
          | grep -q "SMART support is: Enabled"; then
          ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
          exit $OCF_ERR_INSTALLED
        fi
      done
    else
      if ! "$SMARTCTL" -i "$DRIVE" \
        | grep -q "SMART support is: Enabled"; then
        ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
        exit $OCF_ERR_INSTALLED
      fi
    fi
  done
}
| |
#######################################
# Print a short usage summary for this agent.
# Arguments: none
# Outputs:   three lines of help text on stdout
#######################################
HealthSMART_usage() {
  printf '%s\n' \
    "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
    "" \
    "Expects to have a fully populated OCF RA-compliant environment set."
}
| |
#######################################
# Start the resource: if monitor does not already report it as running,
# create the state file that marks it as started.
# Globals:   OCF_RESKEY_state, OCF_SUCCESS (read)
# Arguments: none
# Returns:   OCF_SUCCESS when already running; otherwise the exit status of
#            touch(1) on the state file
#######################################
HealthSMART_start() {
  local rc

  HealthSMART_monitor
  rc=$?
  if [ "$rc" -eq "$OCF_SUCCESS" ]; then
    return $OCF_SUCCESS
  fi

  # Quoted so that a state path containing whitespace creates exactly the
  # intended file.
  touch "${OCF_RESKEY_state}"
}
| |
#######################################
# Stop the resource by removing its state file.
# Globals:   OCF_RESKEY_state, OCF_SUCCESS, OCF_ERR_GENERIC (read)
# Arguments: none
# Returns:   OCF_SUCCESS once the state file is gone (including when it
#            never existed), OCF_ERR_GENERIC if it could not be removed
#######################################
HealthSMART_stop() {
  # Monitor is still run once so the health attribute is refreshed as
  # before, but inside a subshell: its initialisation may call `exit`
  # (smartctl missing, SMART disabled), and a stop must not abort with an
  # error while leaving the state file behind.
  ( HealthSMART_monitor )

  if ! rm -f "${OCF_RESKEY_state}"; then
    ocf_log err "Could not remove state file ${OCF_RESKEY_state}"
    return $OCF_ERR_GENERIC
  fi
  return $OCF_SUCCESS
}
| |
#######################################
# Monitor the resource and publish the drive health in "#health-smart".
#
# Monitor must differentiate between running (OCF_SUCCESS) and cleanly
# stopped (OCF_NOT_RUNNING); the presence of the state file decides which.
# While running, every drive is checked for its overall S.M.A.R.T. verdict
# and then for its temperature. The first failing check decides the colour
# and ends the run; if nothing fails the attribute is set to "green".
#
# Globals:   OCF_RESKEY_state, OCF_RESKEY_devices, DRIVES, SMARTCTL,
#            ATTRDUP, OCF_SUCCESS, OCF_NOT_RUNNING (read)
# Arguments: none
# Returns:   OCF_NOT_RUNNING when the state file is absent, otherwise
#            OCF_SUCCESS (a bad drive is reported through the attribute,
#            not through the return code)
#######################################
HealthSMART_monitor() {
  # Visible to check_temperature, which reads DRIVE and DEVICE for logging.
  local DRIVE DEVICE
  local passed="SMART overall-health self-assessment test result: PASSED"

  init_smart

  # Quoted: an unquoted path with whitespace turns the test into a syntax
  # error and a running resource would be reported as stopped.
  if [ ! -f "${OCF_RESKEY_state}" ]; then
    return $OCF_NOT_RUNNING
  fi

  # DRIVES and OCF_RESKEY_devices are SPACE separated lists and are left
  # unquoted in the loop headers on purpose.
  for DRIVE in $DRIVES; do
    # Check overall S.M.A.R.T. status
    if [ -n "${OCF_RESKEY_devices}" ]; then
      for DEVICE in ${OCF_RESKEY_devices}; do
        if ! "$SMARTCTL" -d "$DEVICE" -H "$DRIVE" | grep -q "$passed"; then
          "$ATTRDUP" -n "#health-smart" -U "red" -d "5s"
          return $OCF_SUCCESS
        fi
      done
    else
      if ! "$SMARTCTL" -H "$DRIVE" | grep -q "$passed"; then
        "$ATTRDUP" -n "#health-smart" -U "red" -d "5s"
        return $OCF_SUCCESS
      fi
    fi

    # Check drive temperature(s): column 10 of the line that starts with
    # attribute id 194. awk stops at the first match so that exactly one
    # value is handed over; check_temperature sets the attribute itself
    # and returns non-zero when the reading is out of range.
    if [ -n "${OCF_RESKEY_devices}" ]; then
      for DEVICE in ${OCF_RESKEY_devices}; do
        if ! check_temperature "$("$SMARTCTL" -d "$DEVICE" -A "$DRIVE" \
          | awk '/^194/ { print $10; exit }')"; then
          return $OCF_SUCCESS
        fi
      done
    else
      if ! check_temperature "$("$SMARTCTL" -A "$DRIVE" \
        | awk '/^194/ { print $10; exit }')"; then
        return $OCF_SUCCESS
      fi
    fi
  done

  "$ATTRDUP" -n "#health-smart" -U "green" -d "5s"
  return $OCF_SUCCESS
}
| |
#######################################
# Validate the configuration: run the S.M.A.R.T. initialisation checks and
# make sure the directory holding the state file is writable.
# Globals:   OCF_RESKEY_state, OCF_SUCCESS, OCF_ERR_ARGS (read)
# Arguments: none
# Returns:   OCF_SUCCESS when a file can be created in the state directory,
#            OCF_ERR_ARGS otherwise
#######################################
HealthSMART_validate() {
  local state_dir probe

  init_smart

  # Is the state directory writable? Create a uniquely named probe file:
  # a fixed name (such as the PID) could collide with an existing file,
  # which would then be deleted by the cleanup below.
  state_dir=$(dirname "$OCF_RESKEY_state")
  if ! probe=$(mktemp "${state_dir}/.HealthSMART.XXXXXX"); then
    ocf_log err "State directory ${state_dir} is not writable"
    return $OCF_ERR_ARGS
  fi
  rm -f "$probe"

  return $OCF_SUCCESS
}
| |
# Defaults for meta attributes that the cluster manager normally provides.
: "${OCF_RESKEY_CRM_meta_interval=0}"
: "${OCF_RESKEY_CRM_meta_globally_unique:=true}"

# Derive the state file location when the "state" parameter is not set.
if [ -z "${OCF_RESKEY_state}" ]; then
  if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
    state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"

    # Strip off the trailing clone marker (":<number>" before ".state").
    # The sed script is quoted so the shell neither removes the backslash
    # in front of the dot nor treats the bracket expressions as a glob.
    OCF_RESKEY_state=$(printf '%s\n' "$state" \
      | sed 's/:[0-9][0-9]*\.state/.state/')
  else
    OCF_RESKEY_state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
  fi
fi
| |
# Dispatch the requested action. meta-data, usage/help and unknown actions
# leave the script directly; every other action falls through to the common
# exit path below so its return code is logged first.
case "$__OCF_ACTION" in
  start)
    HealthSMART_start
    ;;
  stop)
    HealthSMART_stop
    ;;
  monitor)
    HealthSMART_monitor
    ;;
  validate-all)
    HealthSMART_validate
    ;;
  meta-data)
    meta_data
    exit $OCF_SUCCESS
    ;;
  usage | help)
    HealthSMART_usage
    exit $OCF_SUCCESS
    ;;
  *)
    HealthSMART_usage
    exit $OCF_ERR_UNIMPLEMENTED
    ;;
esac

# Status of the action function that ran in the selected arm.
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc