@@ -225,69 +225,16 @@ let update_vms ~xal ~__context =
225225 warn " Caught error resynchronising PCIs: %s" (ExnHelper. string_of_exn e);
226226 ) () in
227227
228- (* We call a domain "managed" if we have some kind of vm record for
229- it [albeit an inconsistent one]; we call a domain "unmanaged" if
230- we have no record of it at all *)
231-
232228 (* Deal with a VM whose resident-on field indicates it should be running here, but for which no domain exists here: log it, clear resident_on and scheduled_to_be_resident_on (set both to Ref.null), then call set_db_shutdown on it *)
233229 let vm_in_db_for_me_but_no_domain_on_me vm =
234230 debug " domain marked as running on me in db, but no active domain: %s" (uuid_from_vmref vm);
235231 Db.VM. set_resident_on ~__context ~self: vm ~value: Ref. null;
236232 Db.VM. set_scheduled_to_be_resident_on ~__context ~self: vm ~value: Ref. null;
237233 set_db_shutdown vm in
238-
239- (* Process a "managed domain" that's active here, syncing devices and registering monitoring events *)
240- let managed_domain_running dinfo =
241- let vmref,vmrec = vmrefrec_of_dinfo dinfo in
242- (* If this domain isn't marked as running on me in the database then make it so... *)
243- if not (List. mem vmref my_running_vm_refs_according_to_db) then
244- begin
245- debug " domain running on me, but corresponding db record doesn't have resident_on=me && powerstate=running: %s" (uuid_from_vmref vmref);
246- Db.VM. set_resident_on ~__context ~self: vmref ~value: this_host;
247- end ;
248- (* CA-13878: if we've restarted xapi in the middle of starting or rebooting a VM, restart
249- the VM again under the assumption that the devices haven't been attached or the memory
250- image is not built.
251- We detect the starting/rebooting VM by the fact that it is paused and has used no CPU time
252- and the power-state is not marked as Paused (this distinguishes between a VM which
253- has been started paused and left alone for a long time and a VM which is being started
254- or rebooted, which would always have the power state set to Halted or Running).
255- We start it again by setting the domain's state to shutdown with reason reboot (the event
256- thread will do the hard work for us). *)
257- if dinfo.Xc. paused && not (dinfo.Xc. shutdown) && dinfo.Xc. cpu_time = 0L &&
258- (vmrec.API. vM_power_state <> `Paused ) then begin
259- warn " domain id %d uuid %s is paused but not in the database as paused; assuming it's broken; rebooting"
260- dinfo.Xc. domid (uuid_from_vmref vmref);
261- (* Mark the domain as shutdown(reboot), the power state as running and inject
262- a fake event into the event system. This should provoke the event thread into
263- restarting the VM *)
264- Xc. domain_shutdown xc dinfo.Xc. domid Xc. Reboot ;
265- set_db_state_and_domid vmref `Running dinfo.Xc. domid;
266- Events. callback_release xal dinfo.Xc. domid (Uuid. string_of_uuid (Uuid. uuid_of_int_array dinfo.Xc. handle))
267- end else begin
268- let domain_is_shutdown =
269- try
270- let dinfo'= Xc. domain_getinfo xc dinfo.Xc. domid in dinfo'.Xc. shutdown
271- with _ -> true in (* any failure to re-query the domain is treated as already shut down *)
272- if not domain_is_shutdown then
273- (* Reset the power state, this also clears VBD operations etc *)
274- let state = if dinfo.Xc. paused then `Paused else `Running in
275- set_db_state_and_domid vmref state dinfo.Xc. domid;
276- end ;
277- (* Now sync devices; this and the steps below run after either branch above *)
278- debug " syncing devices and registering vm for monitoring: %s" (uuid_from_dinfo dinfo);
279- let uuid = Uuid. uuid_of_int_array dinfo.Xc. handle in
280- sync_devices dinfo;
281- (* Update the VM's guest metrics since: (i) while we were offline we may
282- have missed an update; and (ii) if the tools .iso has been updated then
283- we wish to re-evaluate whether we believe the VMs have up-to-date
284- tools *)
285-
286- Events. guest_agent_update xal dinfo.Xc. domid (uuid_from_dinfo dinfo);
287- (* Now register with monitoring thread *)
288-
289- Monitor_rrds. load_rrd ~__context (Uuid. to_string uuid) false
290- in
234+
235+ (* We call a domain "managed" if we have some kind of vm record for
236+ it [albeit an inconsistent one]; we call a domain "unmanaged" if
237+ we have no record of it at all *)
291238
292239 (* Process a managed domain that exists here, but is in the shutdown state *)
293240 let managed_domain_shutdown dinfo =
@@ -303,7 +250,64 @@ let update_vms ~xal ~__context =
303250 with e ->
304251 warn " Ignoring exception during domain.destroy: %s" (Printexc. to_string e)
305252 end in
306-
253+
254+ (* Process a "managed domain" that's active here. A paused, not-shutdown, zero-cpu-time domain whose db power state is not Paused is either handed to managed_domain_shutdown (vm not in my_running_vm_refs_according_to_db) or rebooted (vm is in that list); any other domain gets resident_on set to this host if needed, then has its devices synced and monitoring registered *)
255+ let managed_domain_running dinfo =
256+ let vmref,vmrec = vmrefrec_of_dinfo dinfo in
257+ let db_resident_on_me = List. mem vmref my_running_vm_refs_according_to_db in
258+ (* CA-13878: if we've restarted xapi in the middle of starting or rebooting a VM, restart
259+ the VM again under the assumption that the devices haven't been attached or the memory
260+ image is not built.
261+ We detect the starting/rebooting VM by the fact that it is paused and has used no CPU time
262+ and the power-state is not marked as Paused (this distinguishes between a VM which
263+ has been started paused and left alone for a long time and a VM which is being started
264+ or rebooted, which would always have the power state set to Halted or Running).
265+ We start it again by setting the domain's state to shutdown with reason reboot (the event
266+ thread will do the hard work for us). *)
267+ if dinfo.Xc. paused && not (dinfo.Xc. shutdown) && dinfo.Xc. cpu_time = 0L &&
268+ (vmrec.API. vM_power_state <> `Paused ) then begin
269+ (* If this domain isn't marked as running on me in the database, do not claim it: assume it is the leftover of a failed migration and pass it to managed_domain_shutdown *)
270+ if not db_resident_on_me then
271+ begin
272+ debug " domain %s running on me, but corresponding db record doesn't have resident_on=me" (uuid_from_vmref vmref);
273+ debug " Domain is paused, not shutdown and has 0 cpu time. Assuming this was a failed migration" ;
274+ managed_domain_shutdown dinfo;
275+ end else begin
276+ warn " domain id %d uuid %s is paused but not in the database as paused; assuming it's broken; rebooting" dinfo.Xc. domid (uuid_from_vmref vmref);
277+ (* Mark the domain as shutdown(reboot), the power state as running and inject
278+ a fake event into the event system. This should provoke the event thread into
279+ restarting the VM *)
280+ Xc. domain_shutdown xc dinfo.Xc. domid Xc. Reboot ;
281+ set_db_state_and_domid vmref `Running dinfo.Xc. domid;
282+ Events. callback_release xal dinfo.Xc. domid (Uuid. string_of_uuid (Uuid. uuid_of_int_array dinfo.Xc. handle))
283+ end
284+ end else begin
285+ if not db_resident_on_me then begin
286+ debug " domain %s running on me, but corresponding db record doesn't have resident_on=me" (uuid_from_vmref vmref);
287+ Db.VM. set_resident_on ~__context ~self: vmref ~value: this_host;
288+ end ;
289+ let domain_is_shutdown =
290+ try
291+ let dinfo'= Xc. domain_getinfo xc dinfo.Xc. domid in dinfo'.Xc. shutdown
292+ with _ -> true in (* any failure to re-query the domain is treated as already shut down *)
293+ if not domain_is_shutdown then
294+ (* Reset the power state, this also clears VBD operations etc *)
295+ let state = if dinfo.Xc. paused then `Paused else `Running in
296+ set_db_state_and_domid vmref state dinfo.Xc. domid;
297+ (* Now sync devices. NOTE(review): there is no begin/end after the if above, so the let-in body extends to the closing end; the device sync, guest agent update and rrd load below are all skipped when domain_is_shutdown is true. Confirm that this is intended *)
298+ debug " syncing devices and registering vm for monitoring: %s" (uuid_from_dinfo dinfo);
299+ let uuid = Uuid. uuid_of_int_array dinfo.Xc. handle in
300+ sync_devices dinfo;
301+ (* Update the VM's guest metrics since: (i) while we were offline we may
302+ have missed an update; and (ii) if the tools .iso has been updated then
303+ we wish to re-evaluate whether we believe the VMs have up-to-date
304+ tools *)
305+ Events. guest_agent_update xal dinfo.Xc. domid (uuid_from_dinfo dinfo);
306+ (* Now register with monitoring thread *)
307+ Monitor_rrds. load_rrd ~__context (Uuid. to_string uuid) false
308+ end
309+ in
310+
307311 (* Process an "unmanaged domain" that's running here *)
308312 let unmanaged_domain_running dinfo =
309313 debug " killing umanaged domain: %s" (uuid_from_dinfo dinfo);
0 commit comments