
Commit 0b59a7f

Jerome Maloberti authored and simonjbeaumont committed
CA-111637: Use the HA live_set when computing a restart plan for VMs.
Previously, after a host failure under HA, the function Xapi_ha_vm_failover.compute_restart_plan (which chooses the hosts that should restart the failed VMs) would pick any host that was live and enabled. In some cases a host may be live but not in the HA live_set, for example if it returned to the pool before the HA recovery process finished. This situation is bad: new VMs would be started on the live host, which may later be marked as dead once the HA recovery process finishes. This commit adds the live_set parameter to compute_restart_plan and, by transitivity, to every function that needs it. Some callers run during HA recovery, where the live_set is available; others run at startup, in which case the live_set is built from all live and enabled hosts.

Signed-off-by: Jerome Maloberti <[email protected]>
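The heart of the fix is that a restart candidate must be live, enabled, and a member of the HA live_set. Below is a minimal standalone sketch of that predicate; the host type and names are illustrative, not xapi's actual API, which reads host_enabled and Host_metrics.live from its database.

(* Illustrative host record, not xapi's. *)
type host = { name : string; enabled : bool; live : bool }

(* A host may only receive restarted VMs if it is also in the live_set:
   being live and enabled is no longer sufficient. *)
let restart_candidates ~live_set hosts =
  List.filter (fun h -> h.enabled && h.live && List.mem h.name live_set) hosts

let () =
  let hosts =
    [ { name = "host1"; enabled = true; live = true }
    ; { name = "host2"; enabled = true; live = true } (* rejoined mid-recovery *)
    ] in
  (* host2 is excluded: live again, but not yet in the HA live_set *)
  restart_candidates ~live_set:["host1"] hosts
  |> List.iter (fun h -> print_endline h.name)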
1 parent: c635440

File tree: 5 files changed, +38 -25 lines

ocaml/xapi/xapi_ha.ml

Lines changed: 2 additions & 2 deletions
@@ -521,7 +521,7 @@ module Monitor = struct
 	let now = Unix.gettimeofday () in
 	let plan_too_old = now -. !last_plan_time > Xapi_globs.ha_monitor_plan_timer in
 	if plan_too_old || !plan_out_of_date then begin
-		let changed = Xapi_ha_vm_failover.update_pool_status ~__context in
+		let changed = Xapi_ha_vm_failover.update_pool_status ~__context ~live_set:liveset_refs () in

 		(* Extremely bad: something managed to break our careful plan *)
 		if changed && not !plan_out_of_date then error "Overcommit protection failed to prevent a change which invalidated our failover plan";
@@ -1461,7 +1461,7 @@ let enable __context heartbeat_srs configuration =
 	(* Update the Pool's planning configuration (ha_overcommitted, ha_plan_exists_for) *)
 	(* Start by assuming there is no ha_plan_for: this can be revised upwards later *)
 	Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:0L;
-	let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context in
+	let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context () in

 	let generation = Uuid.string_of_uuid (Uuid.make_uuid ()) in

ocaml/xapi/xapi_ha_vm_failover.ml

Lines changed: 31 additions & 18 deletions
@@ -93,13 +93,20 @@ let host_of_non_agile_vm ~__context all_hosts_and_snapshots_sorted (vm, snapshot
 	warn "No host could support protected xHA VM: %s (%s)" (Helpers.short_string_of_ref vm) (snapshot.API.vM_name_label);
 	[]

+let get_live_set ~__context =
+	let all_hosts = Db.Host.get_all_records ~__context in
+	let live_hosts = List.filter (fun (rf,r) -> r.API.host_enabled
+		&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false))
+		all_hosts in
+	List.map (fun (rf,_) -> rf) live_hosts
+
 (** Given the current number of host failures to consider (only useful for passing to the binpacker to influence its
     choice of heuristic), return an instantaneous VM restart plan which includes all protected offline VMs, and a
     planning configuration corresponding to the state of the world after the starts are complete, for use in further
     planning.
     Returns: (VM restart plan, new planning configuration, true if some protected non-agile VMs exist)
 *)
-let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration_change) num_failures =
+let compute_restart_plan ~__context ~all_protected_vms ~live_set ?(change=no_configuration_change) num_failures =
 	(* This function must be deterministic: for the same set of hosts and set of VMs it must produce the same output.
 	   We rely partially on the binpacker enforcing its own ordering over hosts and vms, so it's not critical for us
 	   to sort the result of Db.*.get_all calls generally. However the handling of non-agile VMs needs special care. *)
@@ -155,7 +162,8 @@ let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration
 	true
 	&& r.API.host_enabled
 	&& not (List.mem rf change.hosts_to_disable)
-	&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false) in
+	&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false)
+	&& (List.mem rf live_set) in
 	let live_hosts_and_snapshots, dead_hosts_and_snapshots = List.partition is_alive all_hosts_and_snapshots in

 	let live_hosts = List.map fst live_hosts_and_snapshots (* and dead_hosts = List.map fst dead_hosts_and_snapshots *) in
@@ -273,10 +281,11 @@ type result =
 	   breaking the plan.
 	*)

-let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuration_change) n =
+let plan_for_n_failures ~__context ~all_protected_vms ?live_set ?(change = no_configuration_change) n =
+	let live_set = match live_set with None -> get_live_set ~__context | Some s -> s in
 	try
 		(* 'changes' are applied by the compute_restart_plan function *)
-		let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~change n in
+		let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set ~change n in

 		(* Could some VMs not be started? If so we're overcommitted before we started. *)
 		if vms_not_restarted <> [] then begin
@@ -299,26 +308,26 @@ let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuratio
 		error "Unexpected error in HA VM failover planning function: %s" (ExnHelper.string_of_exn e);
 		No_plan_exists

-let compute_max_host_failures_to_tolerate ~__context ?protected_vms () =
+let compute_max_host_failures_to_tolerate ~__context ?live_set ?protected_vms () =
 	let protected_vms = match protected_vms with
 		| None -> all_protected_vms ~__context
 		| Some vms -> vms in
 	let nhosts = List.length (Db.Host.get_all ~__context) in
 	(* We assume that if not(plan_exists(n)) then \forall.x>n not(plan_exists(n))
 	   although even if we screw this up it's not a disaster because all we need is a
 	   safe approximation (so ultimately "0" will do but we'd prefer higher) *)
-	Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)
+	Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms ?live_set (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)

 (* Make sure the pool is marked as overcommitted and the appropriate alert is generated. Return
    true if something changed, false otherwise *)
-let mark_pool_as_overcommitted ~__context =
+let mark_pool_as_overcommitted ~__context ~live_set =
 	let pool = Helpers.get_pool ~__context in

 	let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
 	let planned_for = Db.Pool.get_ha_plan_exists_for ~__context ~self:pool in
 	let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in

-	let max_failures = compute_max_host_failures_to_tolerate ~__context () in
+	let max_failures = compute_max_host_failures_to_tolerate ~__context ~live_set () in
 	if planned_for <> max_failures then begin
 		Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:(min to_tolerate max_failures);
 		if max_failures < planned_for
@@ -345,7 +354,10 @@ let mark_pool_as_overcommitted ~__context =
 	planned_for <> max_failures || (not overcommitted)

(* Update the pool's HA fields *)
-let update_pool_status ~__context =
+let update_pool_status ~__context ?live_set () =
+	let live_set = match live_set with
+		| None -> get_live_set ~__context
+		| Some s -> s in
 	let pool = Helpers.get_pool ~__context in
 	let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
 	let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in
@@ -357,7 +369,7 @@ let update_pool_status ~__context =
 	   i.e. have we become overcommitted (in the sense of having too few resources to satisfy
 	   demand) or not?
 	*)
-	match plan_for_n_failures ~__context ~all_protected_vms (Int64.to_int to_tolerate) with
+	match plan_for_n_failures ~__context ~all_protected_vms ~live_set (Int64.to_int to_tolerate) with
 	| Plan_exists_for_all_VMs ->
 		debug "HA failover plan exists for all protected VMs";
 		Db.Pool.set_ha_overcommitted ~__context ~self:pool ~value:false;
@@ -367,29 +379,30 @@ let update_pool_status ~__context =
 		overcommitted || (planned_for <> to_tolerate)
 	| Plan_exists_excluding_non_agile_VMs ->
 		debug "HA failover plan exists for all protected VMs, excluding some non-agile VMs";
-		mark_pool_as_overcommitted ~__context; (* might define this as false later *)
+		mark_pool_as_overcommitted ~__context ~live_set; (* might define this as false later *)
 	| No_plan_exists ->
 		debug "No HA failover plan exists";
-		mark_pool_as_overcommitted ~__context
+		mark_pool_as_overcommitted ~__context ~live_set

 let assert_configuration_change_preserves_ha_plan ~__context c =
 	debug "assert_configuration_change_preserves_ha_plan c = %s" (string_of_configuration_change ~__context c);

 	(* Only block the operation if a plan exists now but would evaporate with the proposed changes.
 	   This prevents us blocking all operations should we suddenly become overcommitted eg through
 	   multiple host failures *)
+	let live_set = get_live_set ~__context in
 	let pool = Helpers.get_pool ~__context in
 	if Db.Pool.get_ha_enabled ~__context ~self:pool && not(Db.Pool.get_ha_allow_overcommit ~__context ~self:pool) then begin
 		let to_tolerate = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in
 		let all_protected_vms = all_protected_vms ~__context in

-		match plan_for_n_failures ~__context ~all_protected_vms to_tolerate with
+		match plan_for_n_failures ~__context ~all_protected_vms ~live_set to_tolerate with
 		| Plan_exists_excluding_non_agile_VMs
 		| No_plan_exists ->
 			debug "assert_configuration_change_preserves_ha_plan: no plan currently exists; cannot get worse"
 		| Plan_exists_for_all_VMs -> begin
 			(* Does the plan break? *)
-			match plan_for_n_failures ~__context ~all_protected_vms ~change:c to_tolerate with
+			match plan_for_n_failures ~__context ~all_protected_vms ~live_set ~change:c to_tolerate with
 			| Plan_exists_for_all_VMs ->
 				debug "assert_configuration_change_preserves_ha_plan: plan exists after change"
 			| Plan_exists_excluding_non_agile_VMs
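The admission-control logic above reduces to a simple rule: veto a configuration change only when a failover plan exists today and would disappear under the proposed change. A toy sketch of that decision, with booleans standing in for the planner's result type:

(* Veto only when we would lose an existing plan; if no plan exists now,
   the change cannot make matters worse. *)
let change_is_allowed ~plan_exists_now ~plan_exists_after =
  (not plan_exists_now) || plan_exists_after

let () =
  assert (change_is_allowed ~plan_exists_now:false ~plan_exists_after:false);
  assert (change_is_allowed ~plan_exists_now:true ~plan_exists_after:true);
  assert (not (change_is_allowed ~plan_exists_now:true ~plan_exists_after:false))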
@@ -479,7 +492,7 @@ let restart_auto_run_vms ~__context live_set n =
 		hosts;

 	(* If something has changed then we'd better refresh the pool status *)
-	if !reset_vms <> [] then ignore(update_pool_status ~__context);
+	if !reset_vms <> [] then ignore(update_pool_status ~__context ~live_set ());

 	(* At this point failed protected agile VMs are Halted, not resident_on anywhere *)

@@ -497,7 +510,7 @@ let restart_auto_run_vms ~__context live_set n =
 	   protection.
 	   For the best-effort VMs we call the script
 	   when we have reset some VMs to halted (no guarantee there is enough resource but better safe than sorry) *)
-	let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms n in
+	let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set n in
 	let plan, config, vms_not_restarted, non_agile_protected_vms_exist =
 		if true
 		&& Xapi_hooks.pool_pre_ha_vm_restart_hook_exists ()
@@ -511,7 +524,7 @@ let restart_auto_run_vms ~__context live_set n =
 				error "pool-pre-ha-vm-restart-hook failed: %s: continuing anyway" (ExnHelper.string_of_exn e)
 			end;
 			debug "Recomputing restart plan to take into account new state of the world after running the script";
-			compute_restart_plan ~__context ~all_protected_vms n
+			compute_restart_plan ~__context ~all_protected_vms ~live_set n
 		end else plan, config, vms_not_restarted, non_agile_protected_vms_exist (* nothing needs recomputing *)
 	in

@@ -566,7 +579,7 @@ let restart_auto_run_vms ~__context live_set n =
 	| Api_errors.Server_error(code, params) when code = Api_errors.ha_operation_would_break_failover_plan ->
 		(* This should never happen since the planning code would always allow the restart of a protected VM... *)
 		error "Caught exception HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN: setting pool as overcommitted and retrying";
-		ignore_bool(mark_pool_as_overcommitted ~__context);
+		ignore_bool(mark_pool_as_overcommitted ~__context ~live_set);
 		begin
 			try
 				go ();

ocaml/xapi/xapi_ha_vm_failover.mli

Lines changed: 3 additions & 3 deletions
@@ -42,13 +42,13 @@ type configuration_change = {
 val no_configuration_change : configuration_change

 (** Update the Pool.ha_* fields with the current planning status *)
-val update_pool_status : __context:Context.t -> bool
+val update_pool_status : __context:Context.t -> ?live_set:API.ref_host list -> unit -> bool

 (** Consider all possible failures of 'n' hosts *)
-val plan_for_n_failures : __context:Context.t -> all_protected_vms:((API.ref_VM * API.vM_t) list) -> ?change:configuration_change -> int -> result
+val plan_for_n_failures : __context:Context.t -> all_protected_vms:((API.ref_VM * API.vM_t) list) -> ?live_set:API.ref_host list -> ?change:configuration_change -> int -> result

 (** Compute the maximum plan size we can currently find *)
-val compute_max_host_failures_to_tolerate : __context:Context.t -> ?protected_vms:((API.ref_VM * API.vM_t) list) -> unit -> int64
+val compute_max_host_failures_to_tolerate : __context:Context.t -> ?live_set:API.ref_host list -> ?protected_vms:((API.ref_VM * API.vM_t) list) -> unit -> int64

 (** HA admission control functions: aim is to block operations which would make us become overcommitted: *)
 val assert_vm_placement_preserves_ha_plan : __context:Context.t -> ?leaving:(API.ref_host * (API.ref_VM * API.vM_t)) list -> ?arriving:(API.ref_host * (API.ref_VM * API.vM_t)) list -> unit -> unit
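Note the trailing unit in the new update_pool_status signature: in OCaml an optional argument is only defaulted ("erased") once a later positional argument is applied, which is why the callers in xapi_pool.ml and xapi_vm.ml below now end their calls with (). A minimal illustration of the rule, unrelated to xapi's actual code:

(* Without the final (), a call that omits ?name could never trigger the
   default; the trailing unit gives the compiler a point at which to erase it. *)
let greet ?(name = "world") () = "hello, " ^ name

let () =
  print_endline (greet ());            (* default: "hello, world" *)
  print_endline (greet ~name:"HA" ())  (* explicit: "hello, HA" *)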

ocaml/xapi/xapi_pool.ml

Lines changed: 1 addition & 1 deletion
@@ -1168,7 +1168,7 @@ let set_ha_host_failures_to_tolerate ~__context ~self ~value =
 	if Db.Pool.get_ha_plan_exists_for ~__context ~self:pool > 0L
 	then Xapi_ha_vm_failover.assert_nfailures_change_preserves_ha_plan ~__context (Int64.to_int value);
 	Db.Pool.set_ha_host_failures_to_tolerate ~__context ~self ~value;
-	let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context in ()
+	let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context () in ()

 let ha_schedule_plan_recomputation ~__context =
 	Xapi_ha.Monitor.plan_out_of_date := true

ocaml/xapi/xapi_vm.ml

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ let set_ha_restart_priority ~__context ~self ~value =
 		Xapi_ha_vm_failover.assert_new_vm_preserves_ha_plan ~__context self;
 		let pool = Helpers.get_pool ~__context in
 		if Db.Pool.get_ha_enabled ~__context ~self:pool then
-			let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context in ()
+			let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context () in ()
 	end;

 	if current <> value then begin
