12 changes: 6 additions & 6 deletions ocaml/xapi/create_storage.ml
@@ -19,7 +19,7 @@ open Client
module D=Debug.Debugger(struct let name="xapi" end)
open D

let plug_all_pbds __context rpc session_id =
let plug_all_pbds __context =
(* Explicitly resynchronise local PBD state *)
let my_pbds = Helpers.get_my_pbds __context in
Storage_access.resynchronise_pbds ~__context ~pbds:(List.map fst my_pbds);
@@ -31,22 +31,22 @@ let plug_all_pbds __context rpc session_id =
try
if pbd_record.API.pBD_currently_attached
then debug "Not replugging PBD %s: already plugged in" (Ref.string_of self)
else Client.PBD.plug ~rpc ~session_id ~self
else Xapi_pbd.plug ~__context ~self
with e ->
result := false;
error "Could not plug in pbd '%s': %s" (Db.PBD.get_uuid ~__context ~self) (Printexc.to_string e))
my_pbds;
!result

let plug_unplugged_pbds __context rpc session_id =
let plug_unplugged_pbds __context =
let my_pbds = Helpers.get_my_pbds __context in
List.iter
(fun (self, pbd_record) ->
try
if pbd_record.API.pBD_currently_attached
then debug "Not replugging PBD %s: already plugged in" (Ref.string_of self)
else Client.PBD.plug ~rpc ~session_id ~self
with e -> debug "Could not plug in pbd '%s': %s" (Client.PBD.get_uuid rpc session_id self) (Printexc.to_string e))
else Xapi_pbd.plug ~__context ~self
with e -> debug "Could not plug in pbd '%s': %s" (Db.PBD.get_uuid ~__context ~self) (Printexc.to_string e))
my_pbds

(* Create a PBD which connects this host to the SR, if one doesn't already exist *)
@@ -107,7 +107,7 @@ let create_storage (me: API.ref_host) rpc session_id __context : unit =
else
debug "Skipping creation of PBDs for shared SRs";

let all_pbds_ok = plug_all_pbds __context rpc session_id in
let all_pbds_ok = plug_all_pbds __context in
if not(all_pbds_ok) then begin
let obj_uuid = Helpers.get_localhost_uuid () in
Xapi_alert.add ~name:Api_messages.pbd_plug_failed_on_server_start ~priority:1L ~cls:`Host ~obj_uuid ~body:"";
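
The create_storage.ml hunks above replace calls made through the generated API client with direct calls into the in-process implementation, which is why the rpc and session_id parameters disappear from plug_all_pbds and plug_unplugged_pbds. A minimal sketch of the two call patterns, purely illustrative (the wrapper names plug_via_client and plug_in_process are invented here; Helpers, Client and Xapi_pbd are the xapi modules named in the diff):

(* Old pattern: obtain an rpc function and a session via Helpers.call_api_functions,
   then go through the generated Client module, i.e. a call back through the
   XenAPI client layer. *)
let plug_via_client ~__context ~self =
  Helpers.call_api_functions ~__context
    (fun rpc session_id -> Client.PBD.plug ~rpc ~session_id ~self)

(* New pattern: call the implementation module directly in the same process,
   so only the database context needs to be threaded through. *)
let plug_in_process ~__context ~self =
  Xapi_pbd.plug ~__context ~self
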
4 changes: 2 additions & 2 deletions ocaml/xapi/xapi.ml
@@ -987,8 +987,8 @@ let server_init() =
debug "Waiting forever for the management interface to gain an IP address";
let ip = wait_for_management_ip_address () in
debug "Management interface got IP address: %s; attempting to re-plug any unplugged PBDs" ip;
Helpers.call_api_functions ~__context (fun rpc session_id ->
Create_storage.plug_unplugged_pbds __context rpc session_id)
Helpers.call_api_functions ~__context (fun rpc session_id ->
Create_storage.plug_unplugged_pbds __context)
)
in

4 changes: 2 additions & 2 deletions ocaml/xapi/xapi_ha.ml
@@ -521,7 +521,7 @@ module Monitor = struct
let now = Unix.gettimeofday () in
let plan_too_old = now -. !last_plan_time > Xapi_globs.ha_monitor_plan_timer in
if plan_too_old || !plan_out_of_date then begin
let changed = Xapi_ha_vm_failover.update_pool_status ~__context in
let changed = Xapi_ha_vm_failover.update_pool_status ~__context ~live_set:liveset_refs () in

(* Extremely bad: something managed to break our careful plan *)
if changed && not !plan_out_of_date then error "Overcommit protection failed to prevent a change which invalidated our failover plan";
@@ -1461,7 +1461,7 @@ let enable __context heartbeat_srs configuration =
(* Update the Pool's planning configuration (ha_overcommitted, ha_plan_exists_for) *)
(* Start by assuming there is no ha_plan_for: this can be revised upwards later *)
Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:0L;
let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context in
let (_: bool) = Xapi_ha_vm_failover.update_pool_status ~__context () in

let generation = Uuid.string_of_uuid (Uuid.make_uuid ()) in

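The two xapi_ha.ml call sites above illustrate how the reworked update_pool_status is meant to be used: the HA monitor passes the live set it has already computed from the heartbeat daemon (~live_set:liveset_refs), while enable omits it and lets the function compute one itself via get_live_set. An illustrative wrapper, with the name refresh_ha_pool_status invented for this sketch and ~__context assumed to come from the caller:

(* Illustrative only: update_pool_status returns true if the pool's HA
   planning state changed, which is why callers bind or ignore a bool. *)
let refresh_ha_pool_status ~__context ?live_set () : bool =
  match live_set with
  | Some refs ->
      (* e.g. the HA monitor, which already holds the liveset refs *)
      Xapi_ha_vm_failover.update_pool_status ~__context ~live_set:refs ()
  | None ->
      (* e.g. Xapi_ha.enable, where update_pool_status falls back to
         get_live_set internally *)
      Xapi_ha_vm_failover.update_pool_status ~__context ()
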
143 changes: 78 additions & 65 deletions ocaml/xapi/xapi_ha_vm_failover.ml
@@ -93,13 +93,20 @@ let host_of_non_agile_vm ~__context all_hosts_and_snapshots_sorted (vm, snapshot
warn "No host could support protected xHA VM: %s (%s)" (Helpers.short_string_of_ref vm) (snapshot.API.vM_name_label);
[]

let get_live_set ~__context =
let all_hosts = Db.Host.get_all_records ~__context in
let live_hosts = List.filter (fun (rf,r) -> r.API.host_enabled
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false))
all_hosts in
List.map (fun (rf,_) -> rf) live_hosts

(** Given the current number of host failures to consider (only useful for passing to the binpacker to influence its
choice of heuristic), return an instantaneous VM restart plan which includes all protected offline VMs, and a
planning configuration corresponding to the state of the world after the starts are complete, for use in further
planning.
Returns: (VM restart plan, new planning configuration, true if some protected non-agile VMs exist)
*)
let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration_change) num_failures =
let compute_restart_plan ~__context ~all_protected_vms ~live_set ?(change=no_configuration_change) num_failures =
(* This function must be deterministic: for the same set of hosts and set of VMs it must produce the same output.
We rely partially on the binpacker enforcing its own ordering over hosts and vms, so it's not critical for us
to sort the result of Db.*.get_all calls generally. However the handling of non-agile VMs needs special care. *)
@@ -155,7 +162,8 @@ let compute_restart_plan ~__context ~all_protected_vms ?(change=no_configuration
true
&& r.API.host_enabled
&& not (List.mem rf change.hosts_to_disable)
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false) in
&& (try Db.Host_metrics.get_live ~__context ~self:r.API.host_metrics with _ -> false)
&& (List.mem rf live_set) in
let live_hosts_and_snapshots, dead_hosts_and_snapshots = List.partition is_alive all_hosts_and_snapshots in

let live_hosts = List.map fst live_hosts_and_snapshots (* and dead_hosts = List.map fst dead_hosts_and_snapshots *) in
@@ -273,10 +281,11 @@ type result =
breaking the plan.
*)

let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuration_change) n =
let plan_for_n_failures ~__context ~all_protected_vms ?live_set ?(change = no_configuration_change) n =
let live_set = match live_set with None -> get_live_set ~__context | Some s -> s in
try
(* 'changes' are applied by the compute_restart_plan function *)
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~change n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set ~change n in

(* Could some VMs not be started? If so we're overcommitted before we started. *)
if vms_not_restarted <> [] then begin
@@ -299,26 +308,26 @@ let plan_for_n_failures ~__context ~all_protected_vms ?(change = no_configuratio
error "Unexpected error in HA VM failover planning function: %s" (ExnHelper.string_of_exn e);
No_plan_exists

let compute_max_host_failures_to_tolerate ~__context ?protected_vms () =
let compute_max_host_failures_to_tolerate ~__context ?live_set ?protected_vms () =
let protected_vms = match protected_vms with
| None -> all_protected_vms ~__context
| Some vms -> vms in
let nhosts = List.length (Db.Host.get_all ~__context) in
(* We assume that if not(plan_exists(n)) then \forall.x>n not(plan_exists(n))
although even if we screw this up it's not a disaster because all we need is a
safe approximation (so ultimately "0" will do but we'd prefer higher) *)
Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)
Helpers.bisect (fun n -> plan_for_n_failures ~__context ~all_protected_vms:protected_vms ?live_set (Int64.to_int n) = Plan_exists_for_all_VMs) 0L (Int64.of_int nhosts)

(* Make sure the pool is marked as overcommitted and the appropriate alert is generated. Return
true if something changed, false otherwise *)
let mark_pool_as_overcommitted ~__context =
let mark_pool_as_overcommitted ~__context ~live_set =
let pool = Helpers.get_pool ~__context in

let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
let planned_for = Db.Pool.get_ha_plan_exists_for ~__context ~self:pool in
let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in

let max_failures = compute_max_host_failures_to_tolerate ~__context () in
let max_failures = compute_max_host_failures_to_tolerate ~__context ~live_set () in
if planned_for <> max_failures then begin
Db.Pool.set_ha_plan_exists_for ~__context ~self:pool ~value:(min to_tolerate max_failures);
if max_failures < planned_for
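
As an aside to the compute_max_host_failures_to_tolerate hunk above: it still leans on Helpers.bisect plus the stated monotonicity assumption (if no plan exists for n failures, none exists for any larger n) to find the largest tolerable failure count between 0 and the number of hosts. A standalone sketch of that kind of search, illustrative only and not the actual Helpers.bisect implementation:

(* Find the largest n in [lo, hi] for which plan_exists n holds, assuming
   plan_exists lo holds and plan_exists is monotone (true up to some
   threshold, false for everything beyond it). *)
let bisect_largest (plan_exists : int64 -> bool) (lo : int64) (hi : int64) : int64 =
  let rec go lo hi =
    if Int64.compare lo hi >= 0 then lo
    else
      (* round the midpoint up so the interval always shrinks *)
      let mid = Int64.div (Int64.add (Int64.add lo hi) 1L) 2L in
      if plan_exists mid then go mid hi else go lo (Int64.sub mid 1L)
  in
  go lo hi

(* For example, with 16 hosts and a plan that survives up to 3 failures:
   bisect_largest (fun n -> Int64.compare n 3L <= 0) 0L 16L evaluates to 3L. *)
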
@@ -345,7 +354,10 @@ let mark_pool_as_overcommitted ~__context =
planned_for <> max_failures || (not overcommitted)

(* Update the pool's HA fields *)
let update_pool_status ~__context =
let update_pool_status ~__context ?live_set () =
let live_set = match live_set with
| None -> get_live_set ~__context
| Some s -> s in
let pool = Helpers.get_pool ~__context in
let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in
let to_tolerate = Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool in
@@ -357,7 +369,7 @@ let update_pool_status ~__context =
i.e. have we become overcommitted (in the sense of having too few resources to satisfy
demand) or not?
*)
match plan_for_n_failures ~__context ~all_protected_vms (Int64.to_int to_tolerate) with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set (Int64.to_int to_tolerate) with
| Plan_exists_for_all_VMs ->
debug "HA failover plan exists for all protected VMs";
Db.Pool.set_ha_overcommitted ~__context ~self:pool ~value:false;
Expand All @@ -367,29 +379,30 @@ let update_pool_status ~__context =
overcommitted || (planned_for <> to_tolerate)
| Plan_exists_excluding_non_agile_VMs ->
debug "HA failover plan exists for all protected VMs, excluding some non-agile VMs";
mark_pool_as_overcommitted ~__context; (* might define this as false later *)
mark_pool_as_overcommitted ~__context ~live_set; (* might define this as false later *)
| No_plan_exists ->
debug "No HA failover plan exists";
mark_pool_as_overcommitted ~__context
mark_pool_as_overcommitted ~__context ~live_set

let assert_configuration_change_preserves_ha_plan ~__context c =
debug "assert_configuration_change_preserves_ha_plan c = %s" (string_of_configuration_change ~__context c);

(* Only block the operation if a plan exists now but would evaporate with the proposed changes.
This prevents us blocking all operations should be suddenly become overcommitted eg through
multiple host failures *)
let live_set = get_live_set ~__context in
let pool = Helpers.get_pool ~__context in
if Db.Pool.get_ha_enabled ~__context ~self:pool && not(Db.Pool.get_ha_allow_overcommit ~__context ~self:pool) then begin
let to_tolerate = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in
let all_protected_vms = all_protected_vms ~__context in

match plan_for_n_failures ~__context ~all_protected_vms to_tolerate with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set to_tolerate with
| Plan_exists_excluding_non_agile_VMs
| No_plan_exists ->
debug "assert_configuration_change_preserves_ha_plan: no plan currently exists; cannot get worse"
| Plan_exists_for_all_VMs -> begin
(* Does the plan break? *)
match plan_for_n_failures ~__context ~all_protected_vms ~change:c to_tolerate with
match plan_for_n_failures ~__context ~all_protected_vms ~live_set ~change:c to_tolerate with
| Plan_exists_for_all_VMs ->
debug "assert_configuration_change_preserves_ha_plan: plan exists after change"
| Plan_exists_excluding_non_agile_VMs
@@ -430,56 +443,56 @@ let restart_auto_run_vms ~__context live_set n =
an accurate way to determine 'failed' VMs but it will suffice for our 'best-effort'
category. *)
let reset_vms = ref [] in
List.iter
(fun h ->
if not (List.mem h live_set) then
begin
let hostname = Db.Host.get_hostname ~__context ~self:h in
debug "Setting host %s to dead" hostname;
(* Sample this before calling any hook scripts *)
let resident_on_vms = Db.Host.get_resident_VMs ~__context ~self:h in
reset_vms := resident_on_vms @ !reset_vms;

(* ensure live=false *)
begin
try
let h_metrics = Db.Host.get_metrics ~__context ~self:h in
let current = Db.Host_metrics.get_live ~__context ~self:h_metrics in
if current then begin
(* Fire off a ha_host_failed message if the host hasn't just shut itself down *)
let shutting_down = Threadext.Mutex.execute Xapi_globs.hosts_which_are_shutting_down_m (fun () -> !Xapi_globs.hosts_which_are_shutting_down) in
if not (List.exists (fun x -> x=h) shutting_down) then begin
let obj_uuid = Db.Host.get_uuid ~__context ~self:h in
let host_name = Db.Host.get_name_label ~__context ~self:h in
Xapi_alert.add ~name:Api_messages.ha_host_failed ~priority:Api_messages.ha_host_failed_priority ~cls:`Host ~obj_uuid
~body:(Printf.sprintf "Server '%s' has failed" host_name);
end;
(* Call external host failed hook (allows a third-party to use power-fencing if desired) *)
Xapi_hooks.host_pre_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
Db.Host_metrics.set_live ~__context ~self:h_metrics ~value:false; (* since slave is fenced, it will not set this to true again itself *)
Xapi_host_helpers.update_allowed_operations ~__context ~self:h;
Xapi_hooks.host_post_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
end
with _ ->
(* if exn assume h_metrics doesn't exist, then "live" is defined to be false implicitly, so do nothing *)
()
end;
debug "Setting all VMs running or paused on %s to Halted" hostname;
(* ensure all vms resident_on this host running or paused have their powerstates reset *)

List.iter
(fun vm ->
let vm_powerstate = Db.VM.get_power_state ~__context ~self:vm in
if (vm_powerstate=`Running || vm_powerstate=`Paused) then
Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted
)
resident_on_vms
end
)
hosts;
let dead_hosts = ref [] in
List.iter (fun h ->
if not (List.mem h live_set) then begin
let hostname = Db.Host.get_hostname ~__context ~self:h in
debug "Setting host %s to dead" hostname;
(* Sample this before calling any hook scripts *)
let resident_on_vms = List.filter
(fun vm -> not (Db.VM.get_is_control_domain ~__context ~self:vm))
(Db.Host.get_resident_VMs ~__context ~self:h) in
reset_vms := resident_on_vms @ !reset_vms;

(* ensure live=false *)
begin
try
let h_metrics = Db.Host.get_metrics ~__context ~self:h in
let current = Db.Host_metrics.get_live ~__context ~self:h_metrics in
if current then begin
(* Fire off a ha_host_failed message if the host hasn't just shut itself down *)
let shutting_down = Threadext.Mutex.execute Xapi_globs.hosts_which_are_shutting_down_m (fun () -> !Xapi_globs.hosts_which_are_shutting_down) in
if not (List.exists (fun x -> x=h) shutting_down) then begin
let obj_uuid = Db.Host.get_uuid ~__context ~self:h in
let host_name = Db.Host.get_name_label ~__context ~self:h in
Xapi_alert.add ~msg:Api_messages.ha_host_failed ~cls:`Host ~obj_uuid
~body:(Printf.sprintf "Server '%s' has failed" host_name);
end;
(* Call external host failed hook (allows a third-party to use power-fencing if desired) *)
Xapi_hooks.host_pre_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced;
Db.Host_metrics.set_live ~__context ~self:h_metrics ~value:false; (* since slave is fenced, it will not set this to true again itself *)
Xapi_host_helpers.update_allowed_operations ~__context ~self:h;
dead_hosts := h :: !dead_hosts;
end
with _ ->
() (* if exn assume h_metrics doesn't exist, then "live" is defined to be false implicitly, so do nothing *)
end
end) hosts;

debug "Setting all VMs running or paused to Halted";
(* ensure all vms resident_on this host running or paused have their powerstates reset *)
List.iter (fun vm ->
let vm_powerstate = Db.VM.get_power_state ~__context ~self:vm in
if (vm_powerstate=`Running || vm_powerstate=`Paused) then
Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted)
!reset_vms;
(* host_post_declare_dead may take a long time if the SR is locked *)
dead_hosts := List.rev !dead_hosts;
List.iter (fun h -> Xapi_hooks.host_post_declare_dead ~__context ~host:h ~reason:Xapi_hooks.reason__fenced)
!dead_hosts;

(* If something has changed then we'd better refresh the pool status *)
if !reset_vms <> [] then ignore(update_pool_status ~__context);
if !reset_vms <> [] then ignore(update_pool_status ~__context ~live_set ());

(* At this point failed protected agile VMs are Halted, not resident_on anywhere *)

@@ -497,7 +510,7 @@ let restart_auto_run_vms ~__context live_set n =
protection.
For the best-effort VMs we call the script
when we have reset some VMs to halted (no guarantee there is enough resource but better safe than sorry) *)
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist = compute_restart_plan ~__context ~all_protected_vms ~live_set n in
let plan, config, vms_not_restarted, non_agile_protected_vms_exist =
if true
&& Xapi_hooks.pool_pre_ha_vm_restart_hook_exists ()
@@ -511,7 +524,7 @@ let restart_auto_run_vms ~__context live_set n =
error "pool-pre-ha-vm-restart-hook failed: %s: continuing anyway" (ExnHelper.string_of_exn e)
end;
debug "Recomputing restart plan to take into account new state of the world after running the script";
compute_restart_plan ~__context ~all_protected_vms n
compute_restart_plan ~__context ~all_protected_vms ~live_set n
end else plan, config, vms_not_restarted, non_agile_protected_vms_exist (* nothing needs recomputing *)
in

@@ -566,7 +579,7 @@ let restart_auto_run_vms ~__context live_set n =
| Api_errors.Server_error(code, params) when code = Api_errors.ha_operation_would_break_failover_plan ->
(* This should never happen since the planning code would always allow the restart of a protected VM... *)
error "Caught exception HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN: setting pool as overcommitted and retrying";
ignore_bool(mark_pool_as_overcommitted ~__context);
ignore_bool(mark_pool_as_overcommitted ~__context ~live_set);
begin
try
go ();