| #!/bin/sh |
| |
| # Copyright (C) 2010 Andrew Beekhof <andrew@beekhof.net> |
| # |
| # This program is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # This software is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public |
| # License along with this library; if not, write to the Free Software |
| # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| # |
| |
| |
# Normalize the command line with enhanced getopt(1) so long options and
# option/value grouping work; the result is re-injected into the
# positional parameters and consumed by the while/case loop below.
TEMP=`getopt \
    -o hv?xl:f:t:n:T:L:p:c:dSACHu:D:MVse: \
    --long help,corosync,cts:,cts-log:,dest:,heartbeat,node:,nodes:,openais,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \
    -n 'crm_report' -- "$@"`
# The quotes around $TEMP are essential
eval set -- "$TEMP"

progname=$(basename "$0")
rsh="ssh -T"            # command used to reach other nodes (-e/--rsh)
tests=""                # CTS test numbers/ranges (-T/--cts)
nodes=""                # cluster node list (-n/--nodes, or auto-detected)
compress=1              # 1 = tar up the result, 0 = leave directory (-d)
cluster="any"           # cluster stack; "any" means auto-detect
ssh_user="root"         # remote user for data collection (-u/--user)
search_logs=1           # 0 disables automatic log detection (-M)
sos_mode=0              # defaults for sosreport integration (--sos-mode)
# Quote "$0" so an installation path containing whitespace does not
# break the helper-script lookup below.
report_data=$(dirname "$0")
maxdepth=5              # find(1) depth when locating files (-D)

extra_logs=""           # user-supplied logs (-l/--logfile)
sanitize_patterns="passw.*"     # variable names to mask in output (-p)
log_patterns="CRIT: ERROR:"     # patterns extracted for analysis (-L)
| |
# Print the help text to stdout.  The here-document is expanded at call
# time, so $progname and $log_patterns reflect their current values.
usage() {
    cat <<USAGE_EOF
$progname - Create archive of everything needed when reporting cluster problems


Usage: $progname [options] [DEST]

Required option:
  -f, --from TIME       time prior to problems beginning
                        (as "YYYY-M-D H:M:S" including the quotes)

Options:
  -V                    increase verbosity (may be specified multiple times)
  -h, --help            display this message
  -v, --version         display software version
  --features            display software features
  -t, --to TIME         time at which all problems were resolved
                        (as "YYYY-M-D H:M:S" including the quotes; default "now")
  -T, --cts TEST        CTS test or set of tests to extract
  --cts-log             CTS master logfile
  -n, --nodes NODES     node names for this cluster (only needed if cluster is
                        not active on this host; accepts -n "a b" or -n a -n b)
  -M                    do not search for cluster logs
  -l, --logfile FILE    log file to collect (in addition to detected logs if -M
                        is not specified; may be specified multiple times)
  -p PATT               additional regular expression to match variables to be
                        masked in output (default: "passw.*")
  -L PATT               additional regular expression to match in log files for
                        analysis (default: $log_patterns)
  -S, --single-node     don't attempt to collect data from other nodes
  -c, --cluster TYPE    force the cluster type instead of detecting
                        (one of corosync, cman, openais, heartbeat)
  -A, --openais         force the cluster type to be openais
  -C, --corosync        force the cluster type to be corosync
  -H, --heartbeat       force the cluster type to be heartbeat
  -u, --user USER       username to use when collecting data from other nodes
                        (default root)
  -D, --max-depth       search depth to use when attempting to locate files
  -e, --rsh             command to use to run commands on other nodes
                        (default ssh -T)
  -d, --as-directory    leave result as a directory tree instead of archiving
  --sos-mode            use defaults suitable for being called by sosreport tool
                        (behavior subject to change and not useful to end users)
  DEST, --dest DEST     custom destination directory or file name

$progname works best when run from a cluster node on a running cluster,
but can be run from a stopped cluster node or a Pacemaker Remote node.

If neither --nodes nor --single-node is given, $progname will guess the
node list, but may have trouble detecting Pacemaker Remote nodes.
Unless --single-node is given, the node names (whether specified by --nodes
or detected automatically) must be resolvable and reachable via the command
specified by -e/--rsh using the user specified by -u/--user.

Examples:
  $progname -f "2011-12-14 13:05:00" unexplained-apache-failure
  $progname -f 2011-12-14 -t 2011-12-15 something-that-took-multiple-days
  $progname -f 13:05:00 -t 13:12:00 brief-outage
USAGE_EOF
}
| |
# Handle the informational options that exit immediately.  At this point
# getopt has normalized the command line, so a leading bare "--" means no
# arguments at all were supplied and we show the usage text.
case "$1" in
-v|--version) echo "$progname @VERSION@-@BUILD_VERSION@"; exit 0;;
--features) echo "@VERSION@-@BUILD_VERSION@: @PCMK_FEATURES@"; exit 0;;
--|-h|--help) usage; exit 0;;
esac

# Prefer helpers in the same directory if they exist, to simplify development
if [ ! -f $report_data/report.common ]; then
    report_data=@datadir@/@PACKAGE@
else
    echo "Using local helpers"
fi

# Pull in shared functions and variables used throughout this script
# (log/debug/fatal, get_time, findmsg, $host, and file-name constants
# such as $CIB_F, $HALOG_F, $ANALYSIS_F, $EVENTS_F -- see report.common).
. $report_data/report.common
| |
# Consume the getopt-normalized argument list.  Each branch shifts off
# the flag it handled (plus its value where applicable); "--" marks the
# end of options, optionally followed by the destination name.
while true; do
    case "$1" in
        -x) set -x; shift;;
        -V) verbose=`expr $verbose + 1`; shift;;
        -T|--cts) tests="$tests $2"; shift; shift;;
        --cts-log) ctslog="$2"; shift; shift;;
        -f|--from) start_time=`get_time "$2"`; shift; shift;;
        -t|--to) end_time=`get_time "$2"`; shift; shift;;
        -n|--node|--nodes) nodes="$nodes $2"; shift; shift;;
        -S|--single-node) nodes="$host"; shift;;
        -l|--logfile) extra_logs="$extra_logs $2"; shift; shift;;
        -p) sanitize_patterns="$sanitize_patterns $2"; shift; shift;;
        # Spaces in -L patterns are converted to \W so they survive
        # later unquoted word-splitting of $log_patterns.
        -L) log_patterns="$log_patterns `echo $2 | sed 's/ /\\\W/g'`"; shift; shift;;
        -d|--as-directory) compress=0; shift;;
        -A|--openais) cluster="openais"; shift;;
        -C|--corosync) cluster="corosync"; shift;;
        -H|--heartbeat) cluster="heartbeat"; shift;;
        -c|--cluster) cluster="$2"; shift; shift;;
        -e|--rsh) rsh="$2"; shift; shift;;
        -u|--user) ssh_user="$2"; shift; shift;;
        -D|--max-depth) maxdepth="$2"; shift; shift;;
        -M) search_logs=0; shift;;
        --sos-mode) sos_mode=1; nodes="$host"; shift;;
        --dest) DESTDIR="$2"; shift; shift;;
        # Quote "$2": getopt delivers a destination containing whitespace
        # as a single word, and the previously unquoted test/assignment
        # broke on it.
        --) if [ ! -z "$2" ]; then DESTDIR="$2"; fi; break;;
        -h|--help) usage; exit 0;;
        # Options for compatibility with hb_report
        -s) shift;;

        *) echo "Unknown argument: $1"; usage; exit 1;;
    esac
done
| |
| |
# Collect the report from every node in $nodes, assemble it under a local
# scratch directory, analyze the results, and (unless -d was given)
# compress them into a tarball.
#   $1 label     - report name (directory/tarball basename)
#   $2 start     - start of the interesting period (epoch seconds)
#   $3 end       - end of the interesting period (epoch seconds)
#   $4 masterlog - optional CTS/master log file covering all nodes
collect_data() {
    label="$1"
    start=`expr $2 - 10`        # pad the window by 10s on each side
    end=`expr $3 + 10`
    masterlog=$4

    if [ "x$DESTDIR" != x ]; then
        # Absolute destinations are used as-is; relative ones are
        # anchored at the current working directory.
        echo $DESTDIR | grep -e "^/" -qs
        if [ $? = 0 ]; then
            l_base=$DESTDIR
        else
            l_base="`pwd`/$DESTDIR"
        fi
        debug "Using custom scratch dir: $l_base"
        r_base=`basename $l_base`
    else
        l_base=$HOME/$label
        r_base=$label
    fi

    # Refuse to clobber an existing report.
    if [ -e $l_base ]; then
        fatal "Output directory $l_base already exists, specify an alternate name with --dest"
    fi
    mkdir -p $l_base

    if [ "x$masterlog" != "x" ]; then
        dumplogset "$masterlog" $start $end > "$l_base/$HALOG_F"
    fi

    # Build a per-node environment file and run the collector: locally
    # for this host, or via $rsh (untarring the streamed results) for
    # remote nodes.
    for node in $nodes; do
        cat <<EOF >$l_base/.env
LABEL="$label"
REPORT_HOME="$r_base"
REPORT_MASTER="$host"
REPORT_TARGET="$node"
LOG_START=$start
LOG_END=$end
REMOVE=1
SANITIZE="$sanitize_patterns"
CLUSTER=$cluster
LOG_PATTERNS="$log_patterns"
EXTRA_LOGS="$extra_logs"
SEARCH_LOGS=$search_logs
SOS_MODE=$sos_mode
verbose=$verbose
maxdepth=$maxdepth
EOF

        if [ $host = $node ]; then
            cat <<EOF >>$l_base/.env
REPORT_HOME="$l_base"
EOF
            cat $l_base/.env $report_data/report.common $report_data/report.collector > $l_base/collector
            bash $l_base/collector
        else
            cat $l_base/.env $report_data/report.common $report_data/report.collector \
                | $rsh -l $ssh_user $node -- "mkdir -p $r_base; cat > $r_base/collector; bash $r_base/collector" | (cd $l_base && tar mxf -)
        fi
    done

    analyze $l_base > $l_base/$ANALYSIS_F
    if [ -f $l_base/$HALOG_F ]; then
        node_events $l_base/$HALOG_F > $l_base/$EVENTS_F
    fi

    # Merge the per-node analyses and events into the combined files.
    for node in $nodes; do
        cat $l_base/$node/$ANALYSIS_F >> $l_base/$ANALYSIS_F
        if [ -s $l_base/$node/$EVENTS_F ]; then
            cat $l_base/$node/$EVENTS_F >> $l_base/$EVENTS_F
        elif [ -s $l_base/$HALOG_F ]; then
            # No per-node events were collected; extract this node's
            # entries (field 4 is the node name) from the combined list.
            # Fixed: this previously matched the whole $nodes list and
            # wrote to $n, which is never set in this function.
            awk "\$4==\"$node\"" $l_base/$EVENTS_F >> $l_base/$node/$EVENTS_F
        fi
    done

    log " "
    if [ $compress = 1 ]; then
        fname=`shrink $l_base`
        rm -rf $l_base
        log "Collected results are available in $fname"
        log " "
        log "Please create a bug entry at"
        log "    @BUG_URL@"
        log "Include a description of your problem and attach this tarball"
        log " "
        log "Thank you for taking time to create this report."
    else
        log "Collected results are available in $l_base"
    fi
    log " "
}
| |
| # |
| # check if files have same content in the cluster |
| # |
# Compare two collected CIB files with crm_diff(8).  The collector drops
# a RUNNING or STOPPED marker next to each CIB; a comparison is only
# meaningful when both copies came from nodes in the same state.
cibdiff() {
    dir_a=$(dirname $1)
    dir_b=$(dirname $2)

    if [ -f "$dir_a/RUNNING" ] && [ ! -f "$dir_b/RUNNING" ]; then
        DIFF_OK=0
    elif [ -f "$dir_a/STOPPED" ] && [ ! -f "$dir_b/STOPPED" ]; then
        DIFF_OK=0
    else
        DIFF_OK=1
    fi

    if [ $DIFF_OK -eq 1 ]; then
        # crm_diff understands CIB XML semantics; fall back to a note
        # when it is not installed.
        if which crm_diff > /dev/null 2>&1; then
            crm_diff -c -n $1 -o $2
        else
            info "crm_diff(8) not found, cannot diff CIBs"
        fi
    else
        echo "can't compare cibs from running and stopped systems"
    fi
}
| |
# Diff the same file as collected from two nodes, picking a comparison
# strategy by file name.  Returns non-zero when either file is missing
# or the contents differ.
diffcheck() {
    if [ ! -f "$1" ]; then
        echo "$1 does not exist"
        return 1
    fi
    if [ ! -f "$2" ]; then
        echo "$2 does not exist"
        return 1
    fi
    case `basename $1` in
        $CIB_F)  cibdiff $1 $2;;
        $B_CONF) diff -u $1 $2;; # confdiff?
        *)       diff -u $1 $2;;
    esac
}
| |
| # |
| # remove duplicates if files are same, make links instead |
| # |
# De-duplicate identical per-node copies of file $2 under report root $1:
# keep a single copy at the top level and replace each node's copy with
# a relative symlink to it.
consolidate() {
    for node_dir in $nodes; do
        if [ -f "$1/$2" ]; then
            # A shared copy already exists; drop this node's duplicate.
            rm "$1/$node_dir/$2"
        else
            # First node seen: promote its copy to the shared location.
            mv "$1/$node_dir/$2" "$1"
        fi
        ln -s "../$2" "$1/$node_dir"
    done
}
| |
# Compare file $2 across all nodes in $nodes by diffing every node's
# copy against the first node's.  Returns the accumulated sum of the
# diffcheck exit codes (0 == all copies identical).
analyze_one() {
    rc=0
    node0=""
    for peer in $nodes; do
        if [ -z "$node0" ]; then
            # Remember the first node as the reference copy.
            node0=$peer
            continue
        fi
        diffcheck $1/$node0/$2 $1/$peer/$2
        rc=$(($rc+$?))
    done
    return $rc
}
| |
# For each interesting cluster file, report whether every node collected
# an identical copy; identical files (except the CIB, which stays
# per-node) are consolidated into one shared copy at the report root $1.
analyze() {
    flist="$HOSTCACHE $MEMBERSHIP_F $CIB_F $CRM_MON_F $B_CONF logd.cf $SYSINFO_F"
    for f in $flist; do
        printf "Diff $f... "
        # Skip files that no node collected.
        if ! ls $1/*/$f >/dev/null 2>&1; then
            echo "no $1/*/$f :/"
            continue
        fi
        if analyze_one $1 $f; then
            echo "OK"
            [ "$f" != $CIB_F ] && consolidate $1 $f
        else
            echo ""
        fi
    done
}
| |
# Extract data for one or more CTS (Cluster Test Suite) runs.  $tests
# holds comma/space-separated test numbers or ranges ("3" or "3-7");
# each set becomes its own report via collect_data.
do_cts() {
    test_sets=`echo $tests | tr ',' ' '`
    for test_set in $test_sets; do

        start_time=0
        start_test=`echo $test_set | tr '-' ' ' | awk '{print $1}'`

        end_time=0
        end_test=`echo $test_set | tr '-' ' ' | awk '{print $2}'`

        # A single test number: collect up to the start of the next test.
        if [ x$end_test = x ]; then
            msg="Extracting test $start_test"
            label="CTS-$start_test-`date +"%b-%d-%Y"`"
            end_test=`expr $start_test + 1`
        else
            msg="Extracting tests $start_test to $end_test"
            label="CTS-$start_test-$end_test-`date +"%b-%d-%Y"`"
            end_test=`expr $end_test + 1`
        fi

        # Test 0 means "from the very beginning of the CTS run".
        if [ $start_test = 0 ]; then
            start_pat="BEGINNING [0-9].* TESTS"
        else
            start_pat="Running test.*\[ *$start_test\]"
        fi

        # Locate the CTS control log if the user did not supply one.
        if [ x$ctslog = x ]; then
            ctslog=`findmsg 1 "$start_pat"`

            if [ x$ctslog = x ]; then
                fatal "No CTS control file detected"
            else
                log "Using CTS control file: $ctslog"
            fi
        fi

        # Convert the last matching start/end log lines to epoch times.
        line=`grep -n "$start_pat" $ctslog | tail -1 | sed 's/:.*//'`
        if [ ! -z "$line" ]; then
            start_time=`linetime $ctslog $line`
        fi

        line=`grep -n "Running test.*\[ *$end_test\]" $ctslog | tail -1 | sed 's/:.*//'`
        if [ ! -z "$line" ]; then
            end_time=`linetime $ctslog $line`
        fi

        # Derive the node list from CTS log messages if not given.
        if [ -z "$nodes" ]; then
            nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '`
            log "Calculated node list: $nodes"
        fi

        # end_time stays 0 (or precedes start) when the test never
        # finished; collect everything up to the present instead.
        if [ $end_time -lt $start_time ]; then
            debug "Test didn't complete, grabbing everything up to now"
            end_time=`date +%s`
        fi

        if [ $start_time != 0 ];then
            log "$msg (`time2str $start_time` to `time2str $end_time`)"
            collect_data $label $start_time $end_time $ctslog
        else
            fatal "$msg failed: not found"
        fi
    done
}
| |
# Read CIB XML on stdin and print the value of every uname="..."
# attribute, space-separated on a single line.
node_names_from_xml() {
    awk '/uname/ {
        for (f = 1; f <= NF; f++) {
            if ($f ~ /^uname=/) {
                name = $f
                sub("uname=.", "", name)   # drop the attribute name and opening quote
                sub("\".*", "", name)      # drop the closing quote and anything after
                print name
                next
            }
        }
    }' | tr '\n' ' '
}
| |
# Print the cluster's node names (space-separated), trying sources from
# most to least authoritative: live CIB, on-disk CIB, heartbeat
# hostcache, ha.cf, and finally membership messages in the logs.
# $1 is the cluster type.  NOTE(review): this assigns the global
# $cluster from its argument; callers pass $cluster back in, so the net
# effect is nil, but it would clobber any other caller's value.
getnodes() {
    cluster="$1"

    # 1. Live (cluster nodes or Pacemaker Remote nodes)
    # TODO: This will not detect Pacemaker Remote nodes unless they
    # have ever had a permanent node attribute set, because it only
    # searches the nodes section. It should also search the config
    # for resources that create Pacemaker Remote nodes.
    cib_nodes=$(cibadmin -Ql -o nodes 2>/dev/null)
    if [ $? -eq 0 ]; then
        debug "Querying CIB for nodes"
        echo "$cib_nodes" | node_names_from_xml
        return
    fi

    # 2. Saved
    if [ -f "@CRM_CONFIG_DIR@/cib.xml" ]; then
        debug "Querying on-disk CIB for nodes"
        grep "node " "@CRM_CONFIG_DIR@/cib.xml" | node_names_from_xml
        return
    fi

    # 3. hostcache (heartbeat clusters; first column is the node name)
    if [ -z "$HA_STATE_DIR" ]; then
        HA_STATE_DIR=/var/lib/heartbeat
    fi
    if [ -f "$HA_STATE_DIR/hostcache" ]; then
        debug "Reading nodes from $HA_STATE_DIR/hostcache"
        awk '{print $1}' "$HA_STATE_DIR/hostcache"
        return
    fi

    # 4. ha.cf
    if [ "x$cluster" = "xheartbeat" ]; then
        cluster_cf=$(find_cluster_cf $cluster)
        debug "Reading nodes from $cluster_cf"
        getcfvar $cluster node "$cluster_cf"
        return
    fi

    # 5. logs
    # TODO: This has multiple issues:
    # * It looks for messages from crm_update_peer(), which is used only by
    #   heartbeat and legacy plugin clusters; it should work with CMAN and
    #   corosync2 clusters as well.
    # * It does a findmsg for "crm_update_peer" (which will hit
    #   "crm_update_peer_proc" etc.), but then greps for "crm_update_peer:".
    # * It always uses grep, even though $logfile might be compressed.
    #   For this reason and efficiency, it would nice if findmsg could
    #   optionally print the matches instead of the file names.
    # * It would be nice to skip this step for Pacemaker Remote nodes since their
    #   logs will not have node names, but it is nontrivial to know that.
    #   Cluster nodes generally won't get here, but stopped Pacemaker Remote
    #   nodes will.
    logfile=$(findmsg 1 "crm_update_peer")
    debug "Looking for nodes in $logfile"
    if [ ! -z "$logfile" ]; then
        grep crm_update_peer: "$logfile" \
            | sed s/.*crm_update_peer// \
            | sed s/://g \
            | awk '{print $2}' \
            | grep -v "(null)" \
            | sort -u \
            | tr '\n' ' '
    fi
}
| |
# ---- main -----------------------------------------------------------
# Archiving needs tar; fail early rather than after collection.
if [ $compress -eq 1 ]; then
    require_tar
fi

if [ "x$tests" != "x" ]; then
    do_cts

elif [ "x$start_time" != "x" ]; then
    masterlog=""

    # NOTE(review): this warning is dead code as written --
    # $sanitize_patterns defaults to "passw.*" above and -p only ever
    # appends to it, so it is never empty here.
    if [ -z "$sanitize_patterns" ]; then
        log "WARNING: The tarball produced by this program may contain"
        log "         sensitive information such as passwords."
        log ""
        log "We will attempt to remove such information if you use the"
        log "-p option. For example: -p \"pass.*\" -p \"user.*\""
        log ""
        log "However, doing this may reduce the ability for the recipients"
        log "to diagnose issues and generally provide assistance."
        log ""
        log "IT IS YOUR RESPONSIBILITY TO PROTECT SENSITIVE DATA FROM EXPOSURE"
        log ""
    fi

    # If user didn't specify a cluster stack, make a best guess if possible.
    if [ -z "$cluster" ] || [ "$cluster" = "any" ]; then
        cluster=$(get_cluster_type)
    fi

    # If user didn't specify node(s), make a best guess if possible.
    if [ -z "$nodes" ]; then
        nodes=`getnodes $cluster`
        if [ -n "$nodes" ]; then
            log "Calculated node list: $nodes"
        else
            fatal "Cannot determine nodes; specify --nodes or --single-node"
        fi
    fi

    # NOTE(review): substring match -- a host named "node1" would also
    # match a list containing "node11"; confirm whether that matters here.
    if
        echo $nodes | grep -qs $host
    then
        debug "We are a cluster node"
    else
        # This host is not in the node list, so treat its log (CTS or
        # crmd messages) as the master log covering all nodes.
        debug "We are a log master"
        masterlog=`findmsg 1 "crmd\\|CTS"`
    fi


    # Default the end of the window to "now" when -t was not given.
    if [ -z $end_time ]; then
        end_time=`perl -e 'print time()'`
    fi
    label="pcmk-`date +"%a-%d-%b-%Y"`"
    log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)"
    collect_data $label $start_time $end_time $masterlog
else
    fatal "Not sure what to do, no tests or time ranges to extract"
fi
| |
| # vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80: |