12 changes: 6 additions & 6 deletions ocaml/xapi/create_storage.ml
@@ -19,7 +19,7 @@ open Client
module D=Debug.Debugger(struct let name="xapi" end)
open D

let plug_all_pbds __context rpc session_id =
let plug_all_pbds __context =
(* Explicitly resynchronise local PBD state *)
let my_pbds = Helpers.get_my_pbds __context in
Storage_access.resynchronise_pbds ~__context ~pbds:(List.map fst my_pbds);
@@ -31,22 +31,22 @@ let plug_all_pbds __context rpc session_id =
try
if pbd_record.API.pBD_currently_attached
then debug "Not replugging PBD %s: already plugged in" (Ref.string_of self)
else Client.PBD.plug ~rpc ~session_id ~self
else Xapi_pbd.plug ~__context ~self
with e ->
result := false;
error "Could not plug in pbd '%s': %s" (Db.PBD.get_uuid ~__context ~self) (Printexc.to_string e))
my_pbds;
!result

let plug_unplugged_pbds __context rpc session_id =
let plug_unplugged_pbds __context =
let my_pbds = Helpers.get_my_pbds __context in
List.iter
(fun (self, pbd_record) ->
try
if pbd_record.API.pBD_currently_attached
then debug "Not replugging PBD %s: already plugged in" (Ref.string_of self)
else Client.PBD.plug ~rpc ~session_id ~self
with e -> debug "Could not plug in pbd '%s': %s" (Client.PBD.get_uuid rpc session_id self) (Printexc.to_string e))
else Xapi_pbd.plug ~__context ~self
with e -> debug "Could not plug in pbd '%s': %s" (Db.PBD.get_uuid ~__context ~self) (Printexc.to_string e))
my_pbds

(* Create a PBD which connects this host to the SR, if one doesn't already exist *)
@@ -107,7 +107,7 @@ let create_storage (me: API.ref_host) rpc session_id __context : unit =
else
debug "Skipping creation of PBDs for shared SRs";

let all_pbds_ok = plug_all_pbds __context rpc session_id in
let all_pbds_ok = plug_all_pbds __context in
if not(all_pbds_ok) then begin
let obj_uuid = Helpers.get_localhost_uuid () in
Xapi_alert.add ~name:Api_messages.pbd_plug_failed_on_server_start ~priority:1L ~cls:`Host ~obj_uuid ~body:"";
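
The create_storage.ml hunks above replace calls made through the generated API client with direct calls into the in-process implementation, which is why the rpc and session_id parameters disappear from plug_all_pbds and plug_unplugged_pbds. A minimal sketch of the two call patterns, purely illustrative (the wrapper names plug_via_client and plug_in_process are invented here; Helpers, Client and Xapi_pbd are the xapi modules named in the diff):

(* Old pattern: obtain an rpc function and a session via Helpers.call_api_functions,
   then go through the generated Client module, i.e. a call back through the
   XenAPI client layer. *)
let plug_via_client ~__context ~self =
  Helpers.call_api_functions ~__context
    (fun rpc session_id -> Client.PBD.plug ~rpc ~session_id ~self)

(* New pattern: call the implementation module directly in the same process,
   so only the database context needs to be threaded through. *)
let plug_in_process ~__context ~self =
  Xapi_pbd.plug ~__context ~self
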
4 changes: 2 additions & 2 deletions ocaml/xapi/xapi.ml
@@ -987,8 +987,8 @@ let server_init() =
debug "Waiting forever for the management interface to gain an IP address";
let ip = wait_for_management_ip_address () in
debug "Management interface got IP address: %s; attempting to re-plug any unplugged PBDs" ip;
Helpers.call_api_functions ~__context (fun rpc session_id ->
Create_storage.plug_unplugged_pbds __context rpc session_id)
Helpers.call_api_functions ~__context (fun rpc session_id ->
Create_storage.plug_unplugged_pbds __context)
)
in

4 changes: 2 additions & 2 deletions ocaml/xapi/xapi_ha.ml
@@ -521,7 +521,7 @@ module Monitor = struct
let now = Unix.gettimeofday () in
let plan_too_old = now -. !last_plan_time > Xapi_globs.ha_monitor_plan_timer in
if plan_too_old || !plan_out_of_date then begin
let changed = Xapi_ha_vm_failover.update_pool_status ~__context in
let changed = Xapi_ha_vm_failover.update_pool_status ~__context ~live_set:liveset_refs () in

(* Extremely bad: something managed to break our careful plan *)
if changed && not !plan_out_of_date then error "Overcommit protection failed to prevent a change which invalidated our failover plan";
@@ -1461,7 +1461,7 @@ let enable __context heartbeat_srs configuration =
(* Update the Pool's planning configuration (ha_overcommitted, ha_plan_exists_for) *)
(* Start by assuming there is no ha_plan_for: this can be revised upwards later *)
Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:0L;
let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context in
let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context () in

let generation = Uuid.string_of_uuid (Uuid.make_uuid ()) in

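The two xapi_ha.ml call sites above illustrate how the reworked update_pool_status is meant to be used: the HA monitor passes the live set it has already computed from the heartbeat daemon (~live_set:liveset_refs), while enable omits it and lets the function compute one itself via get_live_set. An illustrative wrapper, with the name refresh_ha_pool_status invented for this sketch and ~__context assumed to come from the caller:

(* Illustrative only: update_pool_status returns true if the pool's HA
   planning state changed, which is why callers bind or ignore a bool. *)
let refresh_ha_pool_status ~__context ?live_set () : bool =
  match live_set with
  | Some refs ->
      (* e.g. the HA monitor, which already holds the liveset refs *)
      Xapi_ha_vm_failover.update_pool_status ~__context ~live_set:refs ()
  | None ->
      (* e.g. Xapi_ha.enable, where update_pool_status falls back to
         get_live_set internally *)
      Xapi_ha_vm_failover.update_pool_status ~__context ()
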
143 changes: 78 additions & 65 deletions ocaml/xapi/xapi_ha_vm_failover.ml
@@ -93,13 +93,20 @@ let host_of_non_agile_vm ~__context all_hosts_and_snapshots_sorted (vm, snapshot
warn "No host could support protected xHA VM: %s (%s)" (Helpers.short_string_of_ref vm) (snapshot.API.vM_name_label);
[]

let get_live_set ~__context =
let all_hosts = Db.Host.get_all_records ~__context in
let live_hosts = List.filter (fun (rf,r) -> r.API.host_enabled
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false))
all_hosts in
List.map (fun (rf,_) -> rf) live_hosts

(** Given the current number of host failures to consider (only useful for passing to the binpacker to influence its
choice of heuristic), return an instantaneous VM restart plan which includes all protected offline VMs, and a
planning configuration corresponding to the state of the world after the starts are complete, for use in further
planning.
Returns: (VM restart plan, new planning configuration, true if some protected non-agile VMs exist)
*)
let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration_change) num_failures =
let compute_restart_plan ~__context ~all_protected_vms ~live_set ?(change=no_configuration_change) num_failures =
(* This function must be deterministic: for the same set of hosts and set of VMs it must produce the same output.
We rely partially on the binpacker enforcing its own ordering over hosts and vms, so it's not critical for us
to sort the result of Db.*.get_all calls generally. However the handling of non-agile VMs needs special care. *)
@@ -155,7 +162,8 @@ let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration
true
&& r.API.host_enabled
&& not (List.mem rf change.hosts_to_disable)
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false) in
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false)
&& (List.mem rf live_set) in
let live_hosts_and_snapshots, dead_hosts_and_snapshots = List.partition is_alive all_hosts_and_snapshots in

let live_hosts = List.map fst live_hosts_and_snapshots (* and dead_hosts = List.map fst dead_hosts_and_snapshots *) in
@@ -273,10 +281,11 @@ type result =
breaking the plan.
*)

let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuration_change) n =
let plan_for_n_failures ~__context ~all_protected_vms ?live_set ?(change = no_configuration_change) n =
let live_set = match live_set with None -> get_live_set ~__context | Some s -> s in
try
(* 'changes' are applied by the compute_restart_plan function *)
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~change n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set ~change n in

(* Could some VMs not be started? If so we're overcommitted before we started. *)
if vms_not_restarted <> [] then begin
@@ -299,26 +308,26 @@ let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuratio
error "Unexpected error in HA VM failover planning function: %s" (ExnHelper.string_of_exn e);
No_plan_exists

let compute_max_host_failures_to_tolerate ~__context ?protected_vms () =
let compute_max_host_failures_to_tolerate ~__context ?live_set ?protected_vms () =
let protected_vms = match protected_vms with
| None -> all_protected_vms ~__context
| Some vms -> vms in
let nhosts = List.length (Db.Host.get_all ~__context) in
(* We assume that if not(plan_exists(n)) then \forall.x>n not(plan_exists(n))
although even if we screw this up it's not a disaster because all we need is a
safe approximation (so ultimately "0" will do but we'd prefer higher) *)
Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)
Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms ?live_set (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)

(* Make sure the pool is marked as overcommitted and the appropriate alert is generated. Return
true if something changed, false otherwise *)
let mark_pool_as_overcommitted ~__context =
let mark_pool_as_overcommitted ~__context ~live_set =
let pool = Helpers.get_pool ~__context in

let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
let planned_for = Db.Pool.get_ha_plan_exists_for ~__context ~self:pool in
let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in

let max_failures = compute_max_host_failures_to_tolerate ~__context () in
let max_failures = compute_max_host_failures_to_tolerate ~__context ~live_set () in
if planned_for <> max_failures then begin
Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:(min to_tolerate max_failures);
if max_failures < planned_for
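
As an aside to the compute_max_host_failures_to_tolerate hunk above: it still leans on Helpers.bisect plus the stated monotonicity assumption (if no plan exists for n failures, none exists for any larger n) to find the largest tolerable failure count between 0 and the number of hosts. A standalone sketch of that kind of search, illustrative only and not the actual Helpers.bisect implementation:

(* Find the largest n in [lo, hi] for which plan_exists n holds, assuming
   plan_exists lo holds and plan_exists is monotone (true up to some
   threshold, false for everything beyond it). *)
let bisect_largest (plan_exists : int64 -> bool) (lo : int64) (hi : int64) : int64 =
  let rec go lo hi =
    if Int64.compare lo hi >= 0 then lo
    else
      (* round the midpoint up so the interval always shrinks *)
      let mid = Int64.div (Int64.add (Int64.add lo hi) 1L) 2L in
      if plan_exists mid then go mid hi else go lo (Int64.sub mid 1L)
  in
  go lo hi

(* For example, with 16 hosts and a plan that survives up to 3 failures:
   bisect_largest (fun n -> Int64.compare n 3L <= 0) 0L 16L evaluates to 3L. *)
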
@@ -345,7 +354,10 @@ let mark_pool_as_overcommitted ~__context =
planned_for <> max_failures || (not overcommitted)

(* Update the pool's HA fields *)
let update_pool_status ~__context =
let update_pool_status ~__context ?live_set () =
let live_set = match live_set with
| None -> get_live_set ~__context
| Some s -> s in
let pool = Helpers.get_pool ~__context in
let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in
@@ -357,7 +369,7 @@ let update_pool_status ~__context =
i.e. have we become overcommitted (in the sense of having too few resources to satisfy
demand) or not?
*)
match plan_for_n_failures ~__context ~all_protected_vms (Int64.to_int to_tolerate) with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set (Int64.to_int to_tolerate) with
| Plan_exists_for_all_VMs ->
debug "HA failover plan exists for all protected VMs";
Db.Pool.set_ha_overcommitted ~__context ~self:pool ~value:false;
Expand All @@ -367,29 +379,30 @@ let update_pool_status ~__context =
overcommitted || (planned_for <> to_tolerate)
| Plan_exists_excluding_non_agile_VMs ->
debug "HA failover plan exists for all protected VMs, excluding some non-agile VMs";
mark_pool_as_overcommitted ~__context; (* might define this as false later *)
mark_pool_as_overcommitted ~__context ~live_set; (* might define this as false later *)
| No_plan_exists ->
debug "No HA failover plan exists";
mark_pool_as_overcommitted ~__context
mark_pool_as_overcommitted ~__context ~live_set

let assert_configuration_change_preserves_ha_plan ~__context c =
debug "assert_configuration_change_preserves_ha_plan c = %s" (string_of_configuration_change ~__context c);

(* Only block the operation if a plan exists now but would evaporate with the proposed changes.
This prevents us blocking all operations should be suddenly become overcommitted eg through
multiple host failures *)
let live_set = get_live_set ~__context in
let pool = Helpers.get_pool ~__context in
if Db.Pool.get_ha_enabled ~__context ~self:pool && not(Db.Pool.get_ha_allow_overcommit ~__context ~self:pool) then begin
let to_tolerate = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in
let all_protected_vms = all_protected_vms ~__context in

match plan_for_n_failures ~__context ~all_protected_vms to_tolerate with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set to_tolerate with
| Plan_exists_excluding_non_agile_VMs
| No_plan_exists ->
debug "assert_configuration_change_preserves_ha_plan: no plan currently exists; cannot get worse"
| Plan_exists_for_all_VMs -> begin
(* Does the plan break? *)
match plan_for_n_failures ~__context ~all_protected_vms ~change:c to_tolerate with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set ~change:c to_tolerate with
| Plan_exists_for_all_VMs ->
debug "assert_configuration_change_preserves_ha_plan: plan exists after change"
| Plan_exists_excluding_non_agile_VMs
@@ -430,56 +443,56 @@ let restart_auto_run_vms ~__context live_set n =
an accurate way to determine 'failed' VMs but it will suffice for our 'best-effort'
category. *)
let reset_vms = ref [] in
List.iter
(fun h ->
if not (List.mem h live_set) then
begin
let hostname = Db.Host.get_hostname ~__context ~self:h in
debug "Setting host %s to dead" hostname;
(* Sample this before calling any hook scripts *)
let resident_on_vms = Db.Host.get_resident_VMs ~__context ~self:h in
reset_vms := resident_on_vms @ !reset_vms;

(* ensure live=false *)
begin
try
let h_metrics = Db.Host.get_metrics ~__context ~self:h in
let current = Db.Host_metrics.get_live ~__context ~self:h_metrics in
if current then begin
(* Fire off a ha_host_failed message if the host hasn't just shut itself down *)
let shutting_down = Threadext.Mutex.execute Xapi_globs.hosts_which_are_shutting_down_m (fun () -> !Xapi_globs.hosts_which_are_shutting_down) in
if not (List.exists (fun x -> x=h) shutting_down) then begin
let obj_uuid = Db.Host.get_uuid ~__context ~self:h in
let host_name = Db.Host.get_name_label ~__context ~self:h in
Xapi_alert.add ~name:Api_messages.ha_host_failed ~priority:Api_messages.ha_host_failed_priority ~cls:`Host ~obj_uuid
~body:(Printf.sprintf "Server '%s' has failed" host_name);
end;
(* Call external host failed hook (allows a third-party to use power-fencing if desired) *)
Xapi_hooks.host_pre_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
Db.Host_metrics.set_live ~__context ~self:h_metrics ~value:false; (* since slave is fenced, it will not set this to true again itself *)
Xapi_host_helpers.update_allowed_operations ~__context ~self:h;
Xapi_hooks.host_post_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
end
with _ ->
(* if exn assume h_metrics doesn't exist, then "live" is defined to be false implicitly, so do nothing *)
()
end;
debug "Setting all VMs running or paused on %s to Halted" hostname;
(* ensure all vms resident_on this host running or paused have their powerstates reset *)

List.iter
(fun vm ->
let vm_powerstate = Db.VM.get_power_state ~__context ~self:vm in
if (vm_powerstate=`Running || vm_powerstate=`Paused) then
Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted
)
resident_on_vms
end
)
hosts;
let dead_hosts = ref [] in
List.iter (fun h ->
if not (List.mem h live_set) then begin
let hostname = Db.Host.get_hostname ~__context ~self:h in
debug "Setting host %s to dead" hostname;
(* Sample this before calling any hook scripts *)
let resident_on_vms = List.filter
(fun vm -> not (Db.VM.get_is_control_domain ~__context ~self:vm))
(Db.Host.get_resident_VMs ~__context ~self:h) in
reset_vms := resident_on_vms @ !reset_vms;

(* ensure live=false *)
begin
try
let h_metrics = Db.Host.get_metrics ~__context ~self:h in
let current = Db.Host_metrics.get_live ~__context ~self:h_metrics in
if current then begin
(* Fire off a ha_host_failed message if the host hasn't just shut itself down *)
let shutting_down = Threadext.Mutex.execute Xapi_globs.hosts_which_are_shutting_down_m (fun () -> !Xapi_globs.hosts_which_are_shutting_down) in
if not (List.exists (fun x -> x=h) shutting_down) then begin
let obj_uuid = Db.Host.get_uuid ~__context ~self:h in
let host_name = Db.Host.get_name_label ~__context ~self:h in
Xapi_alert.add ~msg:Api_messages.ha_host_failed ~cls:`Host ~obj_uuid
~body:(Printf.sprintf "Server '%s' has failed" host_name);
end;
(* Call external host failed hook (allows a third-party to use power-fencing if desired) *)
Xapi_hooks.host_pre_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
Db.Host_metrics.set_live ~__context ~self:h_metrics ~value:false; (* since slave is fenced, it will not set this to true again itself *)
Xapi_host_helpers.update_allowed_operations ~__context ~self:h;
dead_hosts := h :: !dead_hosts;
end
with _ ->
() (* if exn assume h_metrics doesn't exist, then "live" is defined to be false implicitly, so do nothing *)
end
end) hosts;

debug "Setting all VMs running or paused to Halted";
(* ensure all vms resident_on this host running or paused have their powerstates reset *)
List.iter (fun vm ->
let vm_powerstate = Db.VM.get_power_state ~__context ~self:vm in
if (vm_powerstate=`Running || vm_powerstate=`Paused) then
Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted)
!reset_vms;
(* host_post_declare_dead may take a long time if the SR is locked *)
dead_hosts := List.rev !dead_hosts;
List.iter (fun h -> Xapi_hooks.host_post_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced)
!dead_hosts;

(* If something has changed then we'd better refresh the pool status *)
if !reset_vms <> [] then ignore(update_pool_status ~__context);
if !reset_vms <> [] then ignore(update_pool_status ~__context ~live_set ());

(* At this point failed protected agile VMs are Halted, not resident_on anywhere *)

@@ -497,7 +510,7 @@ let restart_auto_run_vms ~__context live_set n =
protection.
For the best-effort VMs we call the script
when we have reset some VMs to halted (no guarantee there is enough resource but better safe than sorry) *)
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist =
if true
&& Xapi_hooks.pool_pre_ha_vm_restart_hook_exists ()
@@ -511,7 +524,7 @@ let restart_auto_run_vms ~__context live_set n =
error "pool-pre-ha-vm-restart-hook failed: %s: continuing anyway" (ExnHelper.string_of_exn e)
end;
debug "Recomputing restart plan to take into account new state of the world after running the script";
compute_restart_plan ~__context ~all_protected_vms n
compute_restart_plan ~__context ~all_protected_vms ~live_set n
end else plan, config, vms_not_restarted, non_agile_protected_vms_exist (* nothing needs recomputing *)
in

@@ -566,7 +579,7 @@ let restart_auto_run_vms ~__context live_set n =
| Api_errors.Server_error(code, params) when code = Api_errors.ha_operation_would_break_failover_plan ->
(* This should never happen since the planning code would always allow the restart of a protected VM... *)
error "Caught exception HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN: setting pool as overcommitted and retrying";
ignore_bool(mark_pool_as_overcommitted ~__context);
ignore_bool(mark_pool_as_overcommitted ~__context ~live_set);
begin
try
go ();