diff --git a/ocaml/xapi/cert_distrib.ml b/ocaml/xapi/cert_distrib.ml index 955b2b50d22..697483a01c0 100644 --- a/ocaml/xapi/cert_distrib.ml +++ b/ocaml/xapi/cert_distrib.ml @@ -345,6 +345,25 @@ let collect_pool_certs ~__context ~rpc ~session_id ~map ~from_hosts = map cert ) +let _distrib_m = Mutex.create () + +(* where possible, the master host should control the certificate + * distributions. this allows us to coordinate multiple parties that are + * trying to modify /etc/stunnel at the same time with [lock]! + * + * we apply this lock to all top level distribution calls, with the exception of + * the pool join functions that execute on the joiner. + *) +let lock (f : unit -> 'a) : 'a = + D.debug "cert_distrib.ml: locking..." ; + Mutex.lock _distrib_m ; + Fun.protect + ~finally:(fun () -> + D.debug "cert_distrib.ml: unlocking..." ; + Mutex.unlock _distrib_m + ) + f + let exchange_certificates_among_all_members ~__context = (* here we coordinate the certificate distribution. from a high level we do the following: @@ -363,6 +382,7 @@ let exchange_certificates_among_all_members ~__context = we do not guarantee 'atomicity', so if regenerating the bundle on one host fails, then state across the pool will most likely become inconsistent, and manual intervention may be required *) + lock @@ fun () -> let all_hosts = Xapi_pool_helpers.get_master_slaves_list ~__context in Helpers.call_api_functions ~__context @@ fun rpc session_id -> let certs = @@ -396,6 +416,7 @@ let import_joiner ~__context ~uuid ~certificate ~to_hosts = (* This function is called on the pool that is incorporating a new host *) let exchange_certificates_with_joiner ~__context ~uuid ~certificate = + lock @@ fun () -> let all_hosts = Db.Host.get_all ~__context in import_joiner ~__context ~uuid ~certificate ~to_hosts:all_hosts ; Helpers.call_api_functions ~__context @@ fun rpc session_id -> @@ -414,6 +435,7 @@ let collect_ca_certs ~__context ~names = (* This function is called on the pool that is incorporating a new host *) let exchange_ca_certificates_with_joiner ~__context ~import ~export = + lock @@ fun () -> let all_hosts = Db.Host.get_all ~__context in let appliance_certs = List.map WireProtocol.certificate_file_of_pair import in diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index b82b1a745e0..76768a3f694 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -782,12 +782,20 @@ functor do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Pool.eject rpc session_id host ) ; - (* call eject on all other slaves first *) + (* perform cleanup on remaining pool members + * this must be best effort - once an eject has begun we cannot rollback *) other |> List.iter (fun h -> - do_op_on ~local_fn ~__context ~host:h (fun session_id rpc -> - Client.Pool.eject rpc session_id host - ) + try + do_op_on ~local_fn ~__context ~host:h (fun session_id rpc -> + Client.Pool.eject rpc session_id host + ) + with e -> + D.warn + "Pool.eject: while ejecting host=%s, we failed to clean up \ + on host=%s. ignoring error: %s" + (Ref.short_string_of host) (Ref.short_string_of h) + (Printexc.to_string e) ) ; (* finally clean up on master *) do_op_on ~local_fn ~__context ~host:master (fun session_id rpc ->