Skip to content

Commit 498e2b9

Browse files
Jon Ludlam authored and zli committed
CA-97364: dbsync destroy migrated remnants
Signed-off-by: Jon Ludlam <[email protected]>
1 parent dfd6011 commit 498e2b9

File tree

1 file changed

+62
-58
lines changed

1 file changed

+62
-58
lines changed

ocaml/xapi/dbsync_slave.ml

Lines changed: 62 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -225,69 +225,16 @@ let update_vms ~xal ~__context =
225225
warn "Caught error resynchronising PCIs: %s" (ExnHelper.string_of_exn e);
226226
) () in
227227

228-
(* We call a domain "managed" if we have some kind of vm record for
229-
it [albeit an inconsistent one]; we call a domain "unmanaged" if
230-
we have no record of it at all *)
231-
232228
(* Deal with a VM whose resident-on field indicates it should be running here, but no domain exists here:
log the VM's uuid, clear both resident_on and scheduled_to_be_resident_on (set to Ref.null) in the
database, then hand the VM to [set_db_shutdown] (defined outside this view; presumably it records
the VM as shut down - confirm against its definition). *)
233229
let vm_in_db_for_me_but_no_domain_on_me vm =
234230
debug "domain marked as running on me in db, but no active domain: %s" (uuid_from_vmref vm);
235231
Db.VM.set_resident_on ~__context ~self:vm ~value:Ref.null;
236232
Db.VM.set_scheduled_to_be_resident_on ~__context ~self:vm ~value:Ref.null;
237233
set_db_shutdown vm in
238-
239-
(* Process a "managed domain" that's active here, syncing devices and registering monitoring events *)
240-
let managed_domain_running dinfo =
241-
let vmref,vmrec = vmrefrec_of_dinfo dinfo in
242-
(* If this domain isn't marked as running on me in the database then make it so... *)
243-
if not (List.mem vmref my_running_vm_refs_according_to_db) then
244-
begin
245-
debug "domain running on me, but corresponding db record doesn't have resident_on=me && powerstate=running: %s" (uuid_from_vmref vmref);
246-
Db.VM.set_resident_on ~__context ~self:vmref ~value:this_host;
247-
end;
248-
(* CA-13878: if we've restarted xapi in the middle of starting or rebooting a VM, restart
249-
the VM again under the assumption that the devices haven't been attached or the memory
250-
image is not built.
251-
We detect the starting/rebooting VM by the fact that it is paused and has used no CPU time
252-
and the power-state is not marked as Paused (this distinguishes between a VM which
253-
has been started paused and left alone for a long time and a VM which is being started
254-
or rebooted, which would always have the power state to Halted or Running)
255-
We start it again by setting the domain's state to shutdown with reason reboot (the event
256-
thread will do the hard work for us). *)
257-
if dinfo.Xc.paused && not(dinfo.Xc.shutdown) && dinfo.Xc.cpu_time = 0L &&
258-
(vmrec.API.vM_power_state <> `Paused) then begin
259-
warn "domain id %d uuid %s is paused but not in the database as paused; assuming it's broken; rebooting"
260-
dinfo.Xc.domid (uuid_from_vmref vmref);
261-
(* Mark the domain as shutdown(reboot), the power state as running and inject
262-
a fake event into the event system. This should provoke the event thread into
263-
restarting the VM *)
264-
Xc.domain_shutdown xc dinfo.Xc.domid Xc.Reboot;
265-
set_db_state_and_domid vmref `Running dinfo.Xc.domid;
266-
Events.callback_release xal dinfo.Xc.domid (Uuid.string_of_uuid (Uuid.uuid_of_int_array dinfo.Xc.handle))
267-
end else begin
268-
let domain_is_shutdown =
269-
try
270-
let dinfo'= Xc.domain_getinfo xc dinfo.Xc.domid in dinfo'.Xc.shutdown
271-
with _ -> true in
272-
if not domain_is_shutdown then
273-
(* Reset the power state, this also clears VBD operations etc *)
274-
let state = if dinfo.Xc.paused then `Paused else `Running in
275-
set_db_state_and_domid vmref state dinfo.Xc.domid;
276-
end;
277-
(* Now sync devices *)
278-
debug "syncing devices and registering vm for monitoring: %s" (uuid_from_dinfo dinfo);
279-
let uuid = Uuid.uuid_of_int_array dinfo.Xc.handle in
280-
sync_devices dinfo;
281-
(* Update the VM's guest metrics since: (i) while we were offline we may
282-
have missed an update; and (ii) if the tools .iso has been updated then
283-
we wish to re-evaluate whether we believe the VMs have up-to-date
284-
tools *)
285-
286-
Events.guest_agent_update xal dinfo.Xc.domid (uuid_from_dinfo dinfo);
287-
(* Now register with monitoring thread *)
288-
289-
Monitor_rrds.load_rrd ~__context (Uuid.to_string uuid) false
290-
in
234+
235+
(* We call a domain "managed" if we have some kind of vm record for
236+
it [albeit an inconsistent one]; we call a domain "unmanaged" if
237+
we have no record of it at all *)
291238

292239
(* Process a managed domain that exists here, but is in the shutdown state *)
293240
let managed_domain_shutdown dinfo =
@@ -303,7 +250,64 @@ let update_vms ~xal ~__context =
303250
with e ->
304251
warn "Ignoring exception during domain.destroy: %s" (Printexc.to_string e)
305252
end in
306-
253+
254+
(* Process a "managed domain" that's active here, syncing devices and registering monitoring events.
Two cases are distinguished:
- The domain is paused, not shutdown, has used no CPU time and the db power state is not `Paused:
if the db does not list the VM as running here it is assumed to be the remnant of a failed
migration and is passed to [managed_domain_shutdown]; otherwise a reboot is requested and a
release event is injected so that the event thread restarts the VM.
- Otherwise: resident_on is set to this host if the db did not already say so, the power state
is refreshed, devices are synced, guest metrics are updated and the RRD is loaded. *)
255+
let managed_domain_running dinfo =
256+
let vmref,vmrec = vmrefrec_of_dinfo dinfo in
257+
(* True iff the database already lists this VM among the VMs running on this host *)
let db_resident_on_me = List.mem vmref my_running_vm_refs_according_to_db in
258+
(* CA-13878: if we've restarted xapi in the middle of starting or rebooting a VM, restart
259+
the VM again under the assumption that the devices haven't been attached or the memory
260+
image is not built.
261+
We detect the starting/rebooting VM by the fact that it is paused and has used no CPU time
262+
and the power-state is not marked as Paused (this distinguishes between a VM which
263+
has been started paused and left alone for a long time and a VM which is being started
264+
or rebooted, which would always have the power state to Halted or Running)
265+
We start it again by setting the domain's state to shutdown with reason reboot (the event
266+
thread will do the hard work for us). *)
267+
if dinfo.Xc.paused && not(dinfo.Xc.shutdown) && dinfo.Xc.cpu_time = 0L &&
268+
(vmrec.API.vM_power_state <> `Paused) then begin
269+
(* If this domain isn't marked as running on me in the database, do NOT claim it: a paused domain with no CPU time that the db does not place here is assumed to be left over from a failed migration, so it is handed to [managed_domain_shutdown] instead *)
270+
if not db_resident_on_me then
271+
begin
272+
debug "domain %s running on me, but corresponding db record doesn't have resident_on=me" (uuid_from_vmref vmref);
273+
debug "Domain is paused, not shutdown and has 0 cpu time. Assuming this was a failed migration";
274+
managed_domain_shutdown dinfo;
275+
end else begin
276+
warn "domain id %d uuid %s is paused but not in the database as paused; assuming it's broken; rebooting" dinfo.Xc.domid (uuid_from_vmref vmref);
277+
(* Mark the domain as shutdown(reboot), the power state as running and inject
278+
a fake event into the event system. This should provoke the event thread into
279+
restarting the VM *)
280+
Xc.domain_shutdown xc dinfo.Xc.domid Xc.Reboot;
281+
set_db_state_and_domid vmref `Running dinfo.Xc.domid;
282+
Events.callback_release xal dinfo.Xc.domid (Uuid.string_of_uuid (Uuid.uuid_of_int_array dinfo.Xc.handle))
283+
end
284+
end else begin
285+
(* Ordinary running (or genuinely paused) domain: if the database doesn't place it on this host, make it so *)
if not db_resident_on_me then begin
286+
debug "domain %s running on me, but corresponding db record doesn't have resident_on=me" (uuid_from_vmref vmref);
287+
Db.VM.set_resident_on ~__context ~self:vmref ~value:this_host;
288+
end;
289+
let domain_is_shutdown =
290+
(* Re-query the domain; any exception from the query is treated as "domain is shutdown" *)
try
291+
let dinfo'= Xc.domain_getinfo xc dinfo.Xc.domid in dinfo'.Xc.shutdown
292+
with _ -> true in
293+
if not domain_is_shutdown then
294+
(* Reset the power state, this also clears VBD operations etc *)
(* NOTE(review): the [let ... in] below is not wrapped in begin/end, and a let body extends as far
right as possible. As written, every statement from here down to the closing [end] (device sync,
guest agent update, RRD load) belongs to this [then] branch and is skipped when domain_is_shutdown
is true. Confirm this is intended; if only the power-state reset should be conditional, wrap the
let expression in begin ... end. *)
295+
let state = if dinfo.Xc.paused then `Paused else `Running in
296+
set_db_state_and_domid vmref state dinfo.Xc.domid;
297+
(* Now sync devices *)
298+
debug "syncing devices and registering vm for monitoring: %s" (uuid_from_dinfo dinfo);
299+
let uuid = Uuid.uuid_of_int_array dinfo.Xc.handle in
300+
sync_devices dinfo;
301+
(* Update the VM's guest metrics since: (i) while we were offline we may
302+
have missed an update; and (ii) if the tools .iso has been updated then
303+
we wish to re-evaluate whether we believe the VMs have up-to-date
304+
tools *)
305+
Events.guest_agent_update xal dinfo.Xc.domid (uuid_from_dinfo dinfo);
306+
(* Now register with monitoring thread *)
307+
Monitor_rrds.load_rrd ~__context (Uuid.to_string uuid) false
308+
end
309+
in
310+
307311
(* Process an "unmanaged domain" that's running here *)
308312
let unmanaged_domain_running dinfo =
309313
debug "killing umanaged domain: %s" (uuid_from_dinfo dinfo);

0 commit comments

Comments
 (0)