import sys, os
from cts.CTSvars import *
patternvariants = {}
class BasePatterns:
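    '''
    Base class for cluster-variant patterns: command-line templates,
    log regexes to search for, and per-component pattern lists that the
    concrete variants below extend or override. Instances register
    themselves in the module-level patternvariants dict by name.
    '''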
def __init__(self, name):
self.name = name
patternvariants[name] = self
self.ignore = [
"avoid confusing Valgrind",
# Logging bug in some versions of libvirtd
r"libvirtd.*: internal error: Failed to parse PCI config address",
            # pcs can log this when a node is fenced, but fencing is OK in some
# tests (and we will catch it in pacemaker logs when not OK)
r"pcs.daemon:No response from: .* request: get_configs, error:",
]
self.BadNews = []
self.components = {}
self.commands = {
"StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null",
"CibQuery" : "cibadmin -Ql",
"CibAddXml" : "cibadmin --modify -c --xml-text %s",
"CibDelXpath" : "cibadmin --delete --xpath %s",
# 300,000 == 5 minutes
"RscRunning" : CTSvars.CRM_DAEMON_DIR + "/lrmd_test -R -r %s",
"CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
"TmpDir" : "/tmp",
"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",
# tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
# tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
# tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
# tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
"ReduceCommCmd" : "",
"RestoreCommCmd" : "tc qdisc del dev lo root",
"UUIDQueryCmd" : "crmadmin -N",
"SetCheckInterval" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-recheck-interval-setting\" name=\"cluster-recheck-interval\" value=\"%s\"/></cluster_property_set>'",
"ClearCheckInterval" : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"",
"MaintenanceModeOn" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
"StandbyCmd" : "crm_attribute -VQ -U %s -n standby -l forever -v %s 2>/dev/null",
"StandbyQueryCmd" : "crm_attribute -QG -U %s -n standby -l forever -d off 2>/dev/null",
}
self.search = {
"Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE",
            # This won't work if we have multiple partitions
"Pat:Local_started" : "%s\W.*The local CRM is operational",
"Pat:Slave_started" : "%s\W.*State transition.*-> S_NOT_DC",
"Pat:Master_started": "%s\W.*State transition.*-> S_IDLE",
"Pat:We_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete",
"Pat:Logd_stopped" : "%s\W.*logd:.*Exiting write process",
"Pat:They_stopped" : "%s\W.*LOST:.* %s ",
"Pat:They_dead" : "node %s.*: is dead",
"Pat:TransitionComplete" : "Transition status: Complete: complete",
"Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s",
"Pat:Fencing_ok" : r"stonith.*:\s*Operation .* targeting %s on .* for .*@.*: OK",
"Pat:Fencing_recover" : r"pengine.*: Recover %s",
"Pat:Fencing_active" : r"pengine.*: Resource %s is active on .* nodes",
"Pat:Fencing_probe" : r"crmd.*: Result of probe operation for %s on .*: Error",
"Pat:RscOpOK" : r"crmd.*:\s+Result of %s operation for %s.*: (0 \()?ok",
"Pat:RscRemoteOpOK" : r"crmd.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
"Pat:NodeFenced" : r"crmd.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
"Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0",
}
def get_component(self, key):
if key in self.components:
return self.components[key]
print("Unknown component '%s' for %s" % (key, self.name))
return []
def get_patterns(self, key):
if key == "BadNews":
return self.BadNews
elif key == "BadNewsIgnore":
return self.ignore
elif key == "Commands":
return self.commands
elif key == "Search":
return self.search
elif key == "Components":
return self.components
def __getitem__(self, key):
if key == "Name":
return self.name
elif key in self.commands:
return self.commands[key]
elif key in self.search:
return self.search[key]
else:
print("Unknown template '%s' for %s" % (key, self.name))
return None
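# A minimal usage sketch of the lookups defined above (illustrative only;
# "node1" is a hypothetical node name, not something defined in this file):
#
#     base = BasePatterns("crm-base")
#     status_cmd = base["StatusCmd"] % "node1"   # shell command with the node filled in
#     idle_pat = base["Pat:DC_IDLE"]             # regex later searched for in cluster logs
#     bad_news = base.get_patterns("BadNews")    # patterns flagging unexpected log lines
#
# Unknown keys are reported with a warning and __getitem__ returns None rather
# than raising, so callers are expected to check the result.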
class crm_lha(BasePatterns):
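    '''
    Patterns for clusters running on the heartbeat (Linux-HA) stack.
    '''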
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"StartCmd" : "service heartbeat start > /dev/null 2>&1",
"StopCmd" : "service heartbeat stop > /dev/null 2>&1",
"EpochCmd" : "crm_node -H -e",
"QuorumCmd" : "crm_node -H -q",
"PartitionCmd" : "crm_node -H -p",
})
self.search.update({
# Patterns to look for in the log files for various occasions...
"Pat:ChildKilled" : "%s\W.*heartbeat.*%s.*killed by signal 9",
"Pat:ChildRespawn" : "%s\W.*heartbeat.*Respawning client.*%s",
"Pat:ChildExit" : "(ERROR|error): Client .* exited with return code",
})
self.BadNews = [
r"error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|lrmd|crmd):.*, exiting",
r"WARN.*Ignoring HA message.*vote.*not in our membership list",
r"pengine.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r"No need to invoke the TE",
r"global_timer_callback:",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
]
self.ignore = self.ignore + [
r"(ERROR|error):.*\s+assert\s+at\s+crm_glib_handler:"
"(ERROR|error): Message hist queue is filling up",
"stonithd.*CRIT: external_hostlist:.*'vmware gethosts' returned an empty hostlist",
"stonithd.*(ERROR|error): Could not list nodes for stonith RA external/vmware.",
"pengine.*Preventing .* from re-starting",
]
class crm_cs_v0(BasePatterns):
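    '''
    Patterns for clusters running Pacemaker on corosync, driven through
    the --openais crm_node interfaces.
    '''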
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"EpochCmd" : "crm_node -e --openais",
"QuorumCmd" : "crm_node -q --openais",
"PartitionCmd" : "crm_node -p --openais",
"StartCmd" : "service corosync start",
"StopCmd" : "service corosync stop",
})
self.search.update({
# The next pattern is too early
# "Pat:We_stopped" : "%s.*Service engine unloaded: Pacemaker Cluster Manager",
# The next pattern would be preferred, but it doesn't always come out
# "Pat:We_stopped" : "%s.*Corosync Cluster Engine exiting with status",
"Pat:We_stopped" : "%s\W.*Service engine unloaded: corosync cluster quorum service",
"Pat:They_stopped" : "%s\W.*crmd.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "corosync:.*Node %s is now: lost",
"Pat:ChildExit" : "Child process .* exited",
"Pat:ChildKilled" : "%s\W.*corosync.*Child process %s terminated with signal 9",
"Pat:ChildRespawn" : "%s\W.*corosync.*Respawning failed child process: %s",
"Pat:InfraUp" : "%s\W.*corosync.*Initializing transport",
"Pat:PacemakerUp" : "%s\W.*corosync.*CRM: Initialized",
})
self.ignore = self.ignore + [
r"crm_mon:",
r"crmadmin:",
r"update_trace_data",
r"async_notify:.*strange, client not found",
r"Parse error: Ignoring unknown option .*nodename",
r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:",
r"Child process .* terminated with signal 9",
r"getinfo response error: 1$",
"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro failed",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* closed .I/O condition=17",
]
self.BadNews = [
r"error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|lrmd|crmd):.*, exiting",
r"(WARN|warn).*Ignoring HA message.*vote.*not in our membership list",
r"pengine.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)",
r"Child process .* terminated with signal",
r"pengine.*Recover .*\(.* -\> .*\)",
r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
r"Peer is not part of our cluster",
r"We appear to be in an election loop",
r"Unknown node -> we will not deliver message",
r"(Blackbox dump requested|Problem detected)",
r"pacemakerd.*Could not connect to Cluster Configuration Database API",
r"Receiving messages from a node we think is dead",
r"share the same cluster nodeid",
r"share the same name",
#r"crm_ipc_send:.*Request .* failed",
#r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received",
# Not inherently bad, but worth tracking
#r"No need to invoke the TE",
#r"ping.*: DEBUG: Updated connected = 0",
#r"Digest mis-match:",
r"crmd:.*Transition failed: terminated",
r"Local CIB .* differs from .*:",
r"warn.*:\s*Continuing but .* will NOT be used",
r"warn.*:\s*Cluster configuration file .* is corrupt",
#r"Executing .* fencing operation",
#r"fence_pcmk.* Call to fence",
#r"fence_pcmk",
r"cman killed by node",
r"Election storm",
r"stalled the FSA with pending inputs",
]
self.components["common-ignore"] = [
"Pending action:",
"error: crm_log_message_adv:",
r"resource( was|s were) active at shutdown",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"Sending message to CIB service FAILED",
"apply_xml_diff:.*Diff application failed!",
r"crmd.*:\s*Action A_RECOVER .* not supported",
"unconfirmed_actions:.*Waiting on .* unconfirmed actions",
"cib_native_msgready:.*Message pending on command channel",
r"crmd.*:\s*Performing A_EXIT_1 - forcefully exiting the CRMd",
"verify_stopped:.*Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
"error: attrd_connection_destroy:.*Lost connection to attrd",
r".*:\s*Executing .* fencing operation \(.*\) on ",
r".*:\s*Requesting fencing \([^)]+\) of node ",
r"(Blackbox dump requested|Problem detected)",
# "error: native_create_actions: Resource .*stonith::.* is active on 2 nodes attempting recovery",
# "error: process_pe_message: Transition .* ERRORs found during PE processing",
]
self.components["corosync-ignore"] = [
r"error:.*Connection to the CPG API failed: Library error",
r"\[[0-9]+\] exited with status [0-9]+ \(",
r"pacemakerd.*error:.*Child process .* exited",
r"cib.*error:.*Corosync connection lost",
r"stonith-ng.*error:.*Corosync connection terminated",
r"error:.*Child process cib .* exited: Invalid argument",
r"error:.*Child process attrd .* exited: Transport endpoint is not connected",
r"error:.*Child process crmd .* exited: Link has been severed",
r"lrmd.*error:.*Connection to stonith-ng.* (failed|closed)",
r"lrmd.*error:.*LRMD lost STONITH connection",
r"crmd.*State transition .* S_RECOVERY",
r"crmd.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
r"crmd.*error:.*Connection to cman failed",
r"crmd.*error:.*Could not recover from internal error",
r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
r"error:.*STONITH connection failed",
r"error: Connection to stonith-ng.* (failed|closed)",
r"crit: Fencing daemon connection failed",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pengine.* Calculated transition .*/pe-error",
]
self.components["corosync"] = [
r"pacemakerd.*error:.*Connection destroyed",
r"attrd.*:\s*(crit|error):.*Lost connection to (Corosync|CIB) service",
r"stonith.*:\s*(Corosync connection terminated|Shutting down)",
r"cib.*:\s*Corosync connection lost!\s+Exiting.",
r"crmd.*:\s*(connection terminated|Disconnected from Corosync)",
r"pengine.*Scheduling Node .* for STONITH",
r"crmd.*:\s*Peer .* was terminated \(.*\) by .* for .*:\s*OK",
]
self.components["cib-ignore"] = [
"lrmd.*Connection to stonith-ng failed",
"lrmd.*Connection to stonith-ng.* closed",
"lrmd.*LRMD lost STONITH connection",
"lrmd.*STONITH connection failed, finalizing .* pending operations",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pengine.* Calculated transition .*/pe-error",
]
self.components["cib"] = [
"State transition .* S_RECOVERY",
"Respawning .* crmd",
"Respawning .* attrd",
"Connection to cib_.* failed",
"Connection to cib_.* closed",
r"crmd.*:.*Connection to the CIB terminated...",
r"attrd.*:.*(Lost connection to CIB service|Connection to the CIB terminated)",
r"crmd\[[0-9]+\] exited with status 2",
r"attrd\[[0-9]+\] exited with status 1",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*I_ERROR.*crmd_cib_connection_destroy",
"crmd.*Could not recover from internal error",
]
self.components["lrmd"] = [
"State transition .* S_RECOVERY",
"LRM Connection failed",
"Respawning .* crmd",
"Connection to lrmd failed",
"Connection to lrmd.* closed",
"crmd.*I_ERROR.*lrm_connection_destroy",
r"crmd\[[0-9]+\] exited with status 2",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*Could not recover from internal error",
]
self.components["lrmd-ignore"] = [
r"attrd.*Connection to lrmd (failed|closed)",
r"(attrd|controld).*Could not execute alert",
]
self.components["crmd"] = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# Only if the node wasn't the DC: "State transition S_IDLE",
"State transition .* -> S_IDLE",
]
self.components["crmd-ignore"] = []
self.components["attrd"] = []
self.components["attrd-ignore"] = []
self.components["pengine"] = [
"State transition .* S_RECOVERY",
"Respawning .* crmd",
r"crmd\[[0-9]+\] exited with status 2",
"Connection to pengine failed",
"Connection to pengine.* closed",
"Connection to the Policy Engine failed",
"crmd.*I_ERROR.*save_cib_contents",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*Could not recover from internal error",
]
self.components["pengine-ignore"] = []
self.components["stonith"] = [
"Connection to stonith-ng failed",
"LRMD lost STONITH connection",
"Connection to stonith-ng.* closed",
"Fencing daemon connection failed",
r"crmd.*: Fencer successfully connected",
]
self.components["stonith-ignore"] = [
r"pengine.*: Recover Fencing",
r"Updating failcount for Fencing",
r"error:.*Connection to stonith-ng failed",
r"error:.*Connection to stonith-ng.*closed \(I/O condition=17\)",
r"crit:.*Fencing daemon connection failed",
r"error:.*Sign-in failed: triggered a retry",
"STONITH connection failed, finalizing .* pending operations.",
r"crmd.*:\s+Result of .* operation for Fencing.*Error",
# This is overbroad, but we don't have a way to say that only
# certain transition errors are acceptable (if the fencer respawns,
# fence devices may appear multiply active). We have to rely on
# other causes of a transition error logging their own error
# message, which is the usual practice.
r"pengine.* Calculated transition .*/pe-error",
]
self.components["stonith-ignore"].extend(self.components["common-ignore"])
class crm_mcp(crm_cs_v0):
'''
The crm version 4 cluster manager class.
It implements the things we need to talk to and manipulate
crm clusters running on top of native corosync (no plugins)
'''
def __init__(self, name):
crm_cs_v0.__init__(self, name)
self.commands.update({
"StartCmd" : "service corosync start && service pacemaker start",
"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker_remoted ] || service pacemaker_remote stop; service corosync stop",
"EpochCmd" : "crm_node -e",
"QuorumCmd" : "crm_node -q",
"PartitionCmd" : "crm_node -p",
})
self.search.update({
# Close enough... "Corosync Cluster Engine exiting normally" isn't printed
# reliably and there's little interest in doing anything about it
"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*crmd.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "crmd.*Node %s(\[|\s).*state is now lost",
"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
# "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
# if self.Env["have_systemd"]:
# self.update({
# # When systemd is in use, we can look for this instead
# "Pat:We_stopped" : "%s.*Stopped Corosync Cluster Engine",
# })
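# Several patterns above take two substitutions (node name, then child daemon);
# an illustrative fill-in with hypothetical values:
#
#     killed = crm_mcp("crm-mcp")["Pat:ChildKilled"] % ("node1", "crmd")
#
# which yields a regex matching pacemakerd's report of a terminated crmd
# child on node1.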
class crm_mcp_docker(crm_mcp):
'''
    The crm version 4 cluster manager class as used in Docker-based
    deployments: identical to crm_mcp except that the cluster is started
    and stopped via the pcmk_start and pcmk_stop wrappers.
'''
def __init__(self, name):
crm_mcp.__init__(self, name)
self.commands.update({
"StartCmd" : "pcmk_start",
"StopCmd" : "pcmk_stop",
})
class crm_cman(crm_cs_v0):
'''
The crm version 3 cluster manager class.
It implements the things we need to talk to and manipulate
    crm clusters running on top of cman
'''
def __init__(self, name):
crm_cs_v0.__init__(self, name)
self.commands.update({
"StartCmd" : "service pacemaker start",
"StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker_remoted ] || service pacemaker_remote stop",
"EpochCmd" : "crm_node -e --cman",
"QuorumCmd" : "crm_node -q --cman",
"PartitionCmd" : "crm_node -p --cman",
})
self.search.update({
"Pat:We_stopped" : "%s.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*crmd.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "crmd.*Node %s(\[|\s).*state is now lost",
# "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
class PatternSelector:
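    '''
    Instantiates the pattern classes for the requested cluster variant (or
    all of them when no name is given) and looks up templates, patterns,
    and components by variant name.
    '''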
def __init__(self, name=None):
self.name = name
self.base = BasePatterns("crm-base")
if not name:
crm_cs_v0("crm-plugin-v0")
crm_cman("crm-cman")
crm_mcp("crm-mcp")
crm_lha("crm-lha")
elif name == "crm-lha":
crm_lha(name)
elif name == "crm-plugin-v0":
crm_cs_v0(name)
elif name == "crm-cman":
crm_cman(name)
elif name == "crm-mcp":
crm_mcp(name)
elif name == "crm-mcp-docker":
crm_mcp_docker(name)
def get_variant(self, variant):
if variant in patternvariants:
return patternvariants[variant]
print("defaulting to crm-base for %s" % variant)
return self.base
def get_patterns(self, variant, kind):
return self.get_variant(variant).get_patterns(kind)
def get_template(self, variant, key):
v = self.get_variant(variant)
return v[key]
def get_component(self, variant, kind):
return self.get_variant(variant).get_component(kind)
def __getitem__(self, key):
return self.get_template(self.name, key)
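# A minimal, illustrative use of PatternSelector (variant and node names here
# are examples only):
#
#     selector = PatternSelector("crm-mcp")
#     start_cmd = selector["StartCmd"]                        # template for the chosen variant
#     stopped = selector.get_template("crm-mcp", "Pat:We_stopped") % "node1"
#     cib_fail = selector.get_component("crm-mcp", "cib")     # patterns for a killed cib
#
# Asking for an unregistered variant falls back to the "crm-base" patterns
# with a warning.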
# python cts/CTSpatt.py -k crm-mcp -t StartCmd
if __name__ == '__main__':
pdir=os.path.dirname(sys.path[0])
sys.path.insert(0, pdir) # So that things work from the source directory
kind=None
template=None
skipthis=None
args=sys.argv[1:]
for i in range(0, len(args)):
if skipthis:
skipthis=None
continue
elif args[i] == "-k" or args[i] == "--kind":
skipthis=1
kind = args[i+1]
elif args[i] == "-t" or args[i] == "--template":
skipthis=1
template = args[i+1]
else:
print("Illegal argument " + args[i])
print(PatternSelector(kind)[template])