@@ -93,13 +93,20 @@ let host_of_non_agile_vm ~__context all_hosts_and_snapshots_sorted (vm, snapshot
   warn "No host could support protected xHA VM: %s (%s)" (Helpers.short_string_of_ref vm) (snapshot.API.vM_name_label);
   []
 
+let get_live_set ~__context =
+  let all_hosts = Db.Host.get_all_records ~__context in
+  let live_hosts = List.filter (fun (rf, r) -> r.API.host_enabled
+      && (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false))
+    all_hosts in
+  List.map (fun (rf, _) -> rf) live_hosts
+
 (** Given the current number of host failures to consider (only useful for passing to the binpacker to influence its
     choice of heuristic), return an instantaneous VM restart plan which includes all protected offline VMs, and a
     planning configuration corresponding to the state of the world after the starts are complete, for use in further
     planning.
     Returns: (VM restart plan, new planning configuration, VMs not restarted, true if some protected non-agile VMs exist)
 *)
-let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration_change) num_failures =
+let compute_restart_plan ~__context ~all_protected_vms ~live_set ?(change=no_configuration_change) num_failures =
   (* This function must be deterministic: for the same set of hosts and set of VMs it must produce the same output.
      We rely partially on the binpacker enforcing its own ordering over hosts and vms, so it's not critical for us
      to sort the result of Db.*.get_all calls generally. However the handling of non-agile VMs needs special care. *)
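
The new get_live_set helper centralises one definition of host liveness: enabled, with Host_metrics reporting live. A minimal usage sketch (hypothetical caller code, not part of the patch), showing the intended pattern of computing the set once and threading it through the planner so every decision shares the same view of which hosts are alive:

    let live_set = get_live_set ~__context in
    (* every subsequent planning call sees the same snapshot of liveness *)
    let plan, _config, _not_restarted, _non_agile =
      compute_restart_plan ~__context ~all_protected_vms ~live_set 1 in
    debug "planned %d restarts across %d live hosts"
      (List.length plan) (List.length live_set)
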
@@ -155,7 +162,8 @@ let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration
     true
     && r.API.host_enabled
     && not (List.mem rf change.hosts_to_disable)
-    && (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false) in
+    && (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false)
+    && (List.mem rf live_set) in
   let live_hosts_and_snapshots, dead_hosts_and_snapshots = List.partition is_alive all_hosts_and_snapshots in
 
   let live_hosts = List.map fst live_hosts_and_snapshots (* and dead_hosts = List.map fst dead_hosts_and_snapshots *) in
@@ -273,10 +281,11 @@ type result =
    breaking the plan.
 *)
 
-let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuration_change) n =
+let plan_for_n_failures ~__context ~all_protected_vms ?live_set ?(change = no_configuration_change) n =
+  let live_set = match live_set with None -> get_live_set ~__context | Some s -> s in
   try
     (* 'changes' are applied by the compute_restart_plan function *)
-    let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~change n in
+    let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set ~change n in
 
     (* Could some VMs not be started? If so we're overcommitted before we started. *)
     if vms_not_restarted <> [] then begin
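
plan_for_n_failures defaults live_set with an explicit match rather than an eager helper; this matters because the default involves database reads. A sketch of the distinction (Option.value is the OCaml standard library function; its availability under this codebase's compiler is an assumption):

    (* Eager: get_live_set queries the DB even when the caller already
       supplied a set, because ~default is evaluated unconditionally. *)
    let live_set = Option.value ~default:(get_live_set ~__context) live_set in

    (* As written in the patch: the DB is consulted only when needed. *)
    let live_set = match live_set with None -> get_live_set ~__context | Some s -> s in
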
@@ -299,26 +308,26 @@ let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuratio
     error "Unexpected error in HA VM failover planning function: %s" (ExnHelper.string_of_exn e);
     No_plan_exists
 
-let compute_max_host_failures_to_tolerate ~__context ?protected_vms () =
+let compute_max_host_failures_to_tolerate ~__context ?live_set ?protected_vms () =
   let protected_vms = match protected_vms with
     | None -> all_protected_vms ~__context
     | Some vms -> vms in
   let nhosts = List.length (Db.Host.get_all ~__context) in
   (* We assume that if not(plan_exists(n)) then \forall x>n. not(plan_exists(x)),
      although even if we screw this up it's not a disaster because all we need is a
      safe approximation (so ultimately "0" will do but we'd prefer higher) *)
-  Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)
+  Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms ?live_set (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)
 
 (* Make sure the pool is marked as overcommitted and the appropriate alert is generated. Return
    true if something changed, false otherwise *)
-let mark_pool_as_overcommitted ~__context =
+let mark_pool_as_overcommitted ~__context ~live_set =
   let pool = Helpers.get_pool ~__context in
 
   let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
   let planned_for = Db.Pool.get_ha_plan_exists_for ~__context ~self:pool in
   let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in
 
-  let max_failures = compute_max_host_failures_to_tolerate ~__context () in
+  let max_failures = compute_max_host_failures_to_tolerate ~__context ~live_set () in
   if planned_for <> max_failures then begin
     Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:(min to_tolerate max_failures);
     if max_failures < planned_for
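
The Helpers.bisect call above leans on the monotonicity assumption spelled out in the comment: once no plan exists for n failures, none exists for any larger n. Under that assumption a binary search over [0, nhosts] finds the largest tolerable failure count in O(log nhosts) planner runs. A minimal sketch of such a bisect (the real Helpers.bisect may differ in signature and edge cases):

    (* Largest n in [lo, hi] with f n = true, assuming f is true on a
       prefix and false afterwards (monotone). *)
    let bisect f lo hi =
      if not (f lo) then lo            (* safe floor: the comment notes "0 will do" *)
      else if f hi then hi
      else
        let rec go lo hi =             (* invariant: f lo && not (f hi) *)
          if Int64.sub hi lo <= 1L then lo
          else
            let mid = Int64.div (Int64.add lo hi) 2L in
            if f mid then go mid hi else go lo mid
        in
        go lo hi
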
@@ -345,7 +354,10 @@ let mark_pool_as_overcommitted ~__context =
   planned_for <> max_failures || (not overcommitted)
 
 (* Update the pool's HA fields *)
-let update_pool_status ~__context =
+let update_pool_status ~__context ?live_set () =
+  let live_set = match live_set with
+    | None -> get_live_set ~__context
+    | Some s -> s in
   let pool = Helpers.get_pool ~__context in
   let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
   let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in
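
update_pool_status acquires a trailing () alongside ?live_set because in OCaml a function whose remaining parameters are all optional or labelled needs one final positional argument; without it an application is never "completed" and the optional defaults never fire. A toy illustration (greet is a hypothetical name, not from the patch):

    let greet ?(name = "world") () = print_endline ("hello " ^ name)
    let () = greet ()                  (* default applies *)
    let () = greet ~name:"xapi" ()     (* caller override *)
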
@@ -357,7 +369,7 @@ let update_pool_status ~__context =
      i.e. have we become overcommitted (in the sense of having too few resources to satisfy
      demand) or not?
   *)
-  match plan_for_n_failures ~__context ~all_protected_vms (Int64.to_int to_tolerate) with
+  match plan_for_n_failures ~__context ~all_protected_vms ~live_set (Int64.to_int to_tolerate) with
   | Plan_exists_for_all_VMs ->
     debug "HA failover plan exists for all protected VMs";
     Db.Pool.set_ha_overcommitted ~__context ~self:pool ~value:false;
@@ -367,29 +379,30 @@ let update_pool_status ~__context =
     overcommitted || (planned_for <> to_tolerate)
   | Plan_exists_excluding_non_agile_VMs ->
     debug "HA failover plan exists for all protected VMs, excluding some non-agile VMs";
-    mark_pool_as_overcommitted ~__context; (* might define this as false later *)
+    mark_pool_as_overcommitted ~__context ~live_set; (* might define this as false later *)
   | No_plan_exists ->
     debug "No HA failover plan exists";
-    mark_pool_as_overcommitted ~__context
+    mark_pool_as_overcommitted ~__context ~live_set
 
 let assert_configuration_change_preserves_ha_plan ~__context c =
   debug "assert_configuration_change_preserves_ha_plan c = %s" (string_of_configuration_change ~__context c);
 
   (* Only block the operation if a plan exists now but would evaporate with the proposed changes.
      This prevents us blocking all operations should we suddenly become overcommitted eg through
      multiple host failures *)
+  let live_set = get_live_set ~__context in
   let pool = Helpers.get_pool ~__context in
   if Db.Pool.get_ha_enabled ~__context ~self:pool && not (Db.Pool.get_ha_allow_overcommit ~__context ~self:pool) then begin
     let to_tolerate = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in
     let all_protected_vms = all_protected_vms ~__context in
 
-    match plan_for_n_failures ~__context ~all_protected_vms to_tolerate with
+    match plan_for_n_failures ~__context ~all_protected_vms ~live_set to_tolerate with
     | Plan_exists_excluding_non_agile_VMs
     | No_plan_exists ->
       debug "assert_configuration_change_preserves_ha_plan: no plan currently exists; cannot get worse"
     | Plan_exists_for_all_VMs -> begin
       (* Does the plan break? *)
-      match plan_for_n_failures ~__context ~all_protected_vms ~change:c to_tolerate with
+      match plan_for_n_failures ~__context ~all_protected_vms ~live_set ~change:c to_tolerate with
       | Plan_exists_for_all_VMs ->
         debug "assert_configuration_change_preserves_ha_plan: plan exists after change"
       | Plan_exists_excluding_non_agile_VMs
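
The guard above runs the planner twice, once against the current world and once with the proposed change applied, and only blocks the transition from a full plan to anything weaker. The decision reduces to a predicate over the result type; a standalone sketch (not code from the patch):

    (* true iff the proposed change would destroy a currently-valid plan *)
    let change_breaks_plan ~before ~after =
      match before, after with
      | Plan_exists_for_all_VMs,
        (Plan_exists_excluding_non_agile_VMs | No_plan_exists) -> true
      | _, _ -> false
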
@@ -479,7 +492,7 @@ let restart_auto_run_vms ~__context live_set n =
     hosts;
 
   (* If something has changed then we'd better refresh the pool status *)
-  if !reset_vms <> [] then ignore(update_pool_status ~__context);
+  if !reset_vms <> [] then ignore(update_pool_status ~__context ~live_set ());
 
   (* At this point failed protected agile VMs are Halted, not resident_on anywhere *)
 
@@ -497,7 +510,7 @@ let restart_auto_run_vms ~__context live_set n =
      protection.
      For the best-effort VMs we call the script
      when we have reset some VMs to halted (no guarantee there is enough resource but better safe than sorry) *)
-  let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms n in
+  let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set n in
   let plan, config, vms_not_restarted, non_agile_protected_vms_exist =
     if true
     && Xapi_hooks.pool_pre_ha_vm_restart_hook_exists ()
@@ -511,7 +524,7 @@ let restart_auto_run_vms ~__context live_set n =
         error "pool-pre-ha-vm-restart-hook failed: %s: continuing anyway" (ExnHelper.string_of_exn e)
       end;
       debug "Recomputing restart plan to take into account new state of the world after running the script";
-      compute_restart_plan ~__context ~all_protected_vms n
+      compute_restart_plan ~__context ~all_protected_vms ~live_set n
     end else plan, config, vms_not_restarted, non_agile_protected_vms_exist (* nothing needs recomputing *)
   in
 
@@ -566,7 +579,7 @@ let restart_auto_run_vms ~__context live_set n =
   | Api_errors.Server_error (code, params) when code = Api_errors.ha_operation_would_break_failover_plan ->
     (* This should never happen since the planning code would always allow the restart of a protected VM... *)
     error "Caught exception HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN: setting pool as overcommitted and retrying";
-    ignore_bool(mark_pool_as_overcommitted ~__context ~live_set);
     begin
       try
         go ();
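
restart_auto_run_vms already receives live_set from the HA layer (it is a parameter of the function, per the hunk headers), so after this patch every planning call inside it shares that heartbeat-derived view rather than re-reading host metrics from the database. Hypothetical call sites showing both styles now available:

    (* HA daemon path: an explicit live set from the heartbeat layer *)
    ignore (update_pool_status ~__context ~live_set ());

    (* elsewhere, e.g. API-triggered recomputation: fall back to the DB view *)
    ignore (update_pool_status ~__context ());
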