From 83b90c02d6655db7706e510a7a2594dca2e75b58 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Thu, 28 Sep 2017 14:08:40 +0100 Subject: [PATCH] cluster-servant: only report online if quorate or never had quorum Currently, the sbd-cluster servant does nothing for a corosync/cman cluster each time notify_timer_cb fires. So this servant can never cause fencing after connecting. Instead, query the QUORUM service and only report 'online' status when quorate or when quorum has never been attained. This allows sbd to be used for fencing upon loss of quorum when pacemaker is not running. --- configure.ac | 1 + src/sbd-cluster.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 46429d1..933d865 100644 --- a/configure.ac +++ b/configure.ac @@ -54,6 +54,7 @@ AC_CHECK_LIB(aio, io_setup, , missing="yes") AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set, , missing="yes") AC_CHECK_LIB(cib, cib_new, , missing="yes") AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") +AC_CHECK_LIB(quorum, quorum_initialize, , missing="yes") AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c index 0ed56e7..11bc33c 100644 --- a/src/sbd-cluster.c +++ b/src/sbd-cluster.c @@ -33,6 +33,10 @@ #include #include +#if SUPPORT_COROSYNC +#include +#endif + #include "sbd.h" //undef SUPPORT_PLUGIN @@ -85,9 +89,18 @@ sbd_cpg_membership_dispatch(cpg_handle_t handle, } #endif +#if SUPPORT_COROSYNC +static quorum_handle_t q_handle; +static uint32_t q_type; +#endif + static gboolean notify_timer_cb(gpointer data) { + int is_quorate; + int err; + static int ever_had_quorum = FALSE; + cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":""); if(remote_node) { @@ -102,7 +115,20 @@ notify_timer_cb(gpointer data) case pcmk_cluster_corosync: case pcmk_cluster_cman: - /* TODO - Make a CPG call and only call notify_parent() when we get a reply */ +#if SUPPORT_COROSYNC + /* Report healthy if we're quorate or we've never seen quorum */ + err = quorum_getquorate(q_handle, &is_quorate); + if (err != CS_OK) { + set_servant_health(pcmk_health_transient, LOG_INFO, "Unable to dispatch quorum status: %d", err); + } else if (is_quorate) { + set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); + ever_had_quorum = TRUE; + } else if (ever_had_quorum) { + set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); + } else { + set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); + } +#endif notify_parent(); break; @@ -117,6 +143,7 @@ static void sbd_membership_connect(void) { bool connected = false; + int err; cl_log(LOG_NOTICE, "Attempting cluster connection"); @@ -128,6 +155,7 @@ sbd_membership_connect(void) #if SUPPORT_COROSYNC cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch; + q_handle = 0; #endif while(connected == false) { @@ -146,6 +174,18 @@ sbd_membership_connect(void) if(crm_cluster_connect(&cluster)) { connected = true; } + +#if SUPPORT_COROSYNC + /* Connect to quorum service so we can use q_handle */ + cl_log(LOG_INFO, "Attempting quorum connection"); + err = quorum_initialize(&q_handle, NULL, &q_type); + if (err != CS_OK) { + cl_log(LOG_ERR, "Cannot initialize QUORUM service: %d\n", err); + q_handle = 0; + crm_cluster_disconnect(&cluster); + connected = false; + } +#endif } if(connected == false) { @@ -163,11 +203,23 @@ sbd_membership_connect(void) static void sbd_membership_destroy(gpointer user_data) { + int err; + cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type())); set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated"); notify_parent(); +#if SUPPORT_COROSYNC + /* Best effort attempt to disconnect from quorum service */ + cl_log(LOG_INFO, "Attempting quorum disconnection"); + err = quorum_finalize(q_handle); + if (err != CS_OK) { + cl_log(LOG_ERR, "Cannot finalize QUORUM service: %d\n", err); + q_handle = 0; + } +#endif + /* Attempt to reconnect, the watchdog will take the node down if the problem isn't transient */ sbd_membership_connect(); }