From dec54ea4a1f03e60d4db3017ce482349d22f8553 Mon Sep 17 00:00:00 2001 From: Ben Anson Date: Thu, 17 Jun 2021 14:01:33 +0100 Subject: [PATCH 1/2] CA-354414 perform best effort Pool.eject cleanups This fixes parallel Pool.ejects - if we try to eject host B, but host C is not contactable, we do not want to block the eject. The price we pay for parallel ejects is that we might not clean up host B's certificates on host C. Signed-off-by: Ben Anson --- ocaml/xapi/message_forwarding.ml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index b82b1a745e0..76768a3f694 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -782,12 +782,20 @@ functor do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Pool.eject rpc session_id host ) ; - (* call eject on all other slaves first *) + (* perform cleanup on remaining pool members + * this must be best effort - once an eject has begun we cannot rollback *) other |> List.iter (fun h -> - do_op_on ~local_fn ~__context ~host:h (fun session_id rpc -> - Client.Pool.eject rpc session_id host - ) + try + do_op_on ~local_fn ~__context ~host:h (fun session_id rpc -> + Client.Pool.eject rpc session_id host + ) + with e -> + D.warn + "Pool.eject: while ejecting host=%s, we failed to clean up \ + on host=%s. ignoring error: %s" + (Ref.short_string_of host) (Ref.short_string_of h) + (Printexc.to_string e) ) ; (* finally clean up on master *) do_op_on ~local_fn ~__context ~host:master (fun session_id rpc -> From 88af3b2b031783842503cc79b7d6d363608b9f23 Mon Sep 17 00:00:00 2001 From: Ben Anson Date: Fri, 18 Jun 2021 12:29:35 +0100 Subject: [PATCH 2/2] REQ-403 cert_distrib lock This prevents concurrent cert distribs interfering with each other. Pool.join caveat: during a join, the joiner modifies /etc/stunnel itself. This cannot be locked by the master, so some race conditions still remain. Additionally, the joiner goes offline for a portion of the join, so could lose out on certificate updates during this time, especially if multiple pool joins are happening at the same time. Signed-off-by: Ben Anson --- ocaml/xapi/cert_distrib.ml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ocaml/xapi/cert_distrib.ml b/ocaml/xapi/cert_distrib.ml index 955b2b50d22..697483a01c0 100644 --- a/ocaml/xapi/cert_distrib.ml +++ b/ocaml/xapi/cert_distrib.ml @@ -345,6 +345,25 @@ let collect_pool_certs ~__context ~rpc ~session_id ~map ~from_hosts = map cert ) +let _distrib_m = Mutex.create () + +(* where possible, the master host should control the certificate + * distributions. this allows us to coordinate multiple parties that are + * trying to modify /etc/stunnel at the same time with [lock]! + * + * we apply this lock to all top level distribution calls, with the exception of + * the pool join functions that execute on the joiner. + *) +let lock (f : unit -> 'a) : 'a = + D.debug "cert_distrib.ml: locking..." ; + Mutex.lock _distrib_m ; + Fun.protect + ~finally:(fun () -> + D.debug "cert_distrib.ml: unlocking..." ; + Mutex.unlock _distrib_m + ) + f + let exchange_certificates_among_all_members ~__context = (* here we coordinate the certificate distribution. from a high level we do the following: @@ -363,6 +382,7 @@ let exchange_certificates_among_all_members ~__context = we do not guarantee 'atomicity', so if regenerating the bundle on one host fails, then state across the pool will most likely become inconsistent, and manual intervention may be required *) + lock @@ fun () -> let all_hosts = Xapi_pool_helpers.get_master_slaves_list ~__context in Helpers.call_api_functions ~__context @@ fun rpc session_id -> let certs = @@ -396,6 +416,7 @@ let import_joiner ~__context ~uuid ~certificate ~to_hosts = (* This function is called on the pool that is incorporating a new host *) let exchange_certificates_with_joiner ~__context ~uuid ~certificate = + lock @@ fun () -> let all_hosts = Db.Host.get_all ~__context in import_joiner ~__context ~uuid ~certificate ~to_hosts:all_hosts ; Helpers.call_api_functions ~__context @@ fun rpc session_id -> @@ -414,6 +435,7 @@ let collect_ca_certs ~__context ~names = (* This function is called on the pool that is incorporating a new host *) let exchange_ca_certificates_with_joiner ~__context ~import ~export = + lock @@ fun () -> let all_hosts = Db.Host.get_all ~__context in let appliance_certs = List.map WireProtocol.certificate_file_of_pair import in