Skip to content

Commit be796c8

Browse files
author
David Scott
committed
[block]: remove the old blktap1 disk pausing infrastructure
When the upgrade to blktap2 happened, we never had a chance to remove the unused code. This change: * simplifies the udev block handling, removing dependencies on blktap2.py * simplifies the migration code, which nolonger needs to worry about blktap1 backends being paused * removes complicated synchronisation logic needed by the old pause/unpause API * removes a very slow test from quicktest (the VBD pause/unpause one) * unifies PV and HVM codepaths in checkpoint, for increased reliability (HVM guests and PV guests both use the slow resume) Note in the blktap2 universe, the SM backend operations (eg vdi_snapshot) manipulate the tapdisks directly (via tap-ctl invocations). This complexity was only needed because blktap1 lacked such a direct interface. Signed-off-by: David Scott <[email protected]>
1 parent 0bb53c6 commit be796c8

File tree

12 files changed

+76
-469
lines changed

12 files changed

+76
-469
lines changed

ocaml/xapi/quicktest.ml

Lines changed: 0 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -265,75 +265,6 @@ let compare_snapshots session_id test one two =
265265
compare_vms session_id test x y in
266266
List.iter2 compare_all one_s two_s
267267

268-
(* CA-24232 serialising VBD.pause VBD.unpause *)
269-
let vbd_pause_unpause_test session_id vm =
270-
let vm = Client.VM.clone !rpc session_id vm "vbd-pause-unpause-test" in
271-
let test = make_test "VBD.pause and VBD.unpause" 1 in
272-
start test;
273-
finally
274-
(fun () ->
275-
let vbds = Client.VM.get_VBDs !rpc session_id vm in
276-
(* CA-24275 *)
277-
debug test "VBD.pause should fail for offline VMs";
278-
let vbd = List.hd vbds in
279-
begin
280-
try
281-
ignore(Client.VBD.pause !rpc session_id vbd);
282-
failed test "VBD.pause should not have succeeded";
283-
with
284-
| (Api_errors.Server_error(code, params)) when code = Api_errors.vm_bad_power_state -> ()
285-
| e ->
286-
failed test (Printf.sprintf "Unexpected exception from VBD.pause: %s" (Printexc.to_string e))
287-
end;
288-
debug test "VBD.unpause should fail for offline VMs";
289-
begin
290-
try
291-
ignore(Client.VBD.unpause !rpc session_id vbd "");
292-
failed test "VBD.unpause should not have succeeded";
293-
with
294-
| (Api_errors.Server_error(code, params)) when code = Api_errors.vm_bad_power_state -> ()
295-
| e ->
296-
failed test (Printf.sprintf "Unexpected exception from VBD.pause: %s" (Printexc.to_string e))
297-
end;
298-
debug test "Starting VM";
299-
Client.VM.start !rpc session_id vm false false;
300-
debug test "A solitary unpause should succeed";
301-
Client.VBD.unpause !rpc session_id vbd "";
302-
debug test "100 pause/unpause cycles should succeed";
303-
for i = 0 to 100 do
304-
let token = Client.VBD.pause !rpc session_id vbd in
305-
Client.VBD.unpause !rpc session_id vbd token
306-
done;
307-
debug test "force-shutdown should still work even if a device is paused";
308-
debug test "pausing device";
309-
let token = Client.VBD.pause !rpc session_id vbd in
310-
debug test "performing hard shutdown";
311-
let (_: unit) = Client.VM.hard_shutdown !rpc session_id vm in
312-
begin
313-
try
314-
ignore(Client.VBD.unpause !rpc session_id vbd "");
315-
failed test "VBD.unpause should not have succeeded";
316-
with
317-
| (Api_errors.Server_error(code, params)) when code = Api_errors.vm_bad_power_state -> ()
318-
| e ->
319-
failed test (Printf.sprintf "Unexpected exception from VBD.pause: %s" (Printexc.to_string e))
320-
end;
321-
debug test "starting VM again";
322-
try
323-
Client.VM.start !rpc session_id vm false false;
324-
Client.VBD.unpause !rpc session_id vbd token;
325-
with
326-
| Api_errors.Server_error(code, params) as e ->
327-
debug test (Printf.sprintf "Api_error %s [ %s ]" code (String.concat "; " params));
328-
raise e
329-
| e ->
330-
debug test (Printf.sprintf "Exception: %s" (Printexc.to_string e));
331-
raise e
332-
) (fun () ->
333-
Client.VM.hard_shutdown !rpc session_id vm;
334-
vm_uninstall test session_id vm);
335-
success test
336-
337268
let read_sys path = Stringext.String.strip Stringext.String.isspace (Unixext.string_of_file path)
338269

339270
let verify_network_connectivity session_id test vm =
@@ -629,7 +560,6 @@ let vm_powercycle_test s vm =
629560
()
630561
| _ -> ()
631562
end;
632-
vbd_pause_unpause_test s vm;
633563
powercycle_test s vm;
634564
success test
635565

ocaml/xapi/sm.ml

Lines changed: 0 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -255,82 +255,3 @@ let sr_content_type ~__context ~sr =
255255
Hashtbl.replace sr_content_type_cache sr ty;
256256
ty)
257257

258-
(*****************************************************************************)
259-
(* Storage-related utility functions *)
260-
261-
open Client
262-
263-
(** Call a function 'f' after pausing all VBDs which are currently_attached to a particular VDI.
264-
Take care to unpause everything on the way out of the function.
265-
Notes on how the locking is supposed to work:
266-
1. The whole disk should be locked against concurrent hotplug attempts
267-
2. A disk might be spontaneously unplugged between the point where we example 'currently_attached'
268-
and the point where we call VBD.pause -- it is safe to skip over these.
269-
3. In Orlando concurrent VM.snapshot calls are allowed (as part of the snapshot-with-quiesce work)
270-
we must pause VBDs in order of device number to avoid a deadlock where 'VM.get_VBDs' returns the
271-
devices in a different order in a parallel call
272-
*)
273-
let with_all_vbds_paused ~__context ~vdis f =
274-
(* We need to keep track of the VBDs we've paused so we can go back and unpause them *)
275-
let paused_so_far = ref [] in
276-
let vbds = List.concat (List.map (fun vdi -> Db.VDI.get_VBDs ~__context ~self:vdi) vdis) in
277-
let vbds = List.filter
278-
(fun self -> Db.VBD.get_currently_attached ~__context ~self
279-
&& Db.VM.get_power_state ~__context ~self:(Db.VBD.get_VM ~__context ~self) <> `Suspended)
280-
vbds in
281-
(* CA-24232: prevent concurrent snapshots of the same VM leading to deadlock since the database
282-
doesn't guarantee to return records in the same order. *)
283-
let vbds = List.sort (fun a b ->
284-
let a_device = Db.VBD.get_device ~__context ~self:a
285-
and b_device = Db.VBD.get_device ~__context ~self:b in
286-
compare a_device b_device) vbds in
287-
Helpers.call_api_functions ~__context
288-
(fun rpc session_id ->
289-
finally
290-
(fun () ->
291-
(* Attempt to pause all the VBDs *)
292-
List.iter
293-
(fun vbd ->
294-
(* NB it is possible for a disk to spontaneously unplug itself:
295-
we are safe to skip these VBDs. *)
296-
(* NB it is possible for a VM to suddenly migrate; if so we retry *)
297-
let finished = ref false in
298-
while not !finished do
299-
try
300-
let token = Client.VBD.pause rpc session_id vbd in
301-
paused_so_far := (vbd, token) :: !paused_so_far;
302-
finished := true
303-
with
304-
| Api_errors.Server_error(code, _) when code = Api_errors.device_not_attached ->
305-
warn "Not pausing VBD %s because it appears to have spontaneously unplugged" (Ref.string_of vbd);
306-
finished := true
307-
| Api_errors.Server_error(code, _) when code = Api_errors.vm_bad_power_state ->
308-
warn "Not pausing VBD %s because the VM has shutdown" (Ref.string_of vbd);
309-
finished := true
310-
| Api_errors.Server_error(code, [ cls; reference ]) when code = Api_errors.handle_invalid && reference = Ref.string_of vbd ->
311-
warn "Not pausing VBD %s because it has been deleted" reference;
312-
finished := true
313-
| Api_errors.Server_error(code, _) when code = Api_errors.vm_not_resident_here ->
314-
warn "Pausing VBD %s temporarily failed because VM has just migrated; retrying" (Ref.string_of vbd);
315-
(* !finished = false *)
316-
| e ->
317-
error "Error pausing VBD %s: %s" (Ref.string_of vbd) (ExnHelper.string_of_exn e);
318-
raise e (* let this propagate *)
319-
done
320-
) vbds;
321-
(* Now all the VBDs are paused we can call the main function *)
322-
f ()
323-
)
324-
(fun () ->
325-
(* I would have used Helpers.log_exn_continue but I wanted to log the errors as warnings
326-
and not as only debug messages *)
327-
List.iter
328-
(fun (vbd, token) ->
329-
try Client.VBD.unpause rpc session_id vbd token
330-
with e ->
331-
warn "Failed to unpause VBD %s: %s" (Ref.string_of vbd) (ExnHelper.string_of_exn e))
332-
!paused_so_far
333-
)
334-
)
335-
336-

ocaml/xapi/vmops.ml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -471,15 +471,6 @@ let destroy_domain ?(preserve_xs_vm=false) ?(unplug_frontends=true) ?(clear_curr
471471
(fun () ->
472472
Db.VBD.set_currently_attached ~__context ~self:vbd ~value:false
473473
) ()
474-
) all_vbds;
475-
476-
(* We unpause every VBD which allows any pending VBD.pause thread to unblock, acquire the VM lock in turn and check the state *)
477-
List.iter
478-
(fun vbd ->
479-
Helpers.log_exn_continue (Printf.sprintf "Vmops.destroy_domain: pre-emptively unpausing VBD: %s" (Ref.string_of vbd))
480-
(fun () ->
481-
Xapi_vbd.clean_up_on_domain_destroy vbd (* effect is to unblock threads, not actually unpause *)
482-
) ();
483474
) all_vbds
484475

485476
(* Destroy a VM's domain and all runtime state (domid etc).

ocaml/xapi/xapi_vbd.ml

Lines changed: 7 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -293,147 +293,11 @@ let refresh ~__context ~vbd ~vdi =
293293

294294
open Threadext
295295
open Pervasiveext
296+
open Fun
296297

297-
(* VBD.pause and VBD.unpause are used for two purposes:
298-
1. to stop IO and give blkback/blktap a kick during disk online resize/ disk snapshot
299-
2. to stop IO while the background coalescing daemon moves metadata around
300-
As explained in CA-24232, there is an issue where both purposes of call can happen simulataneously
301-
and so we need to perform some serialisation otherwise the first 'unpause' will prematurely reactivate
302-
the device.
303-
304-
We assume that all client threads issue 'pause' then 'unpause' calls in sequence, never in parallel.
305-
We assume that clients *rarely* fail between 'pause' and 'unpause' but in this extreme case, the
306-
VM can always be force-shutdown (ie the VM is 'unlocked' while a disk is paused)
307-
308-
We let one pause request through at a time and set vbd_pause_state.in_progress to true. Other
309-
pause threads wait until in_progress is reset to false by device_is_unpaused.
310-
We count the number of threads running VBD.pause in order to remove the vbd_pause_state from
311-
the hashtable when the unpause happens.
312-
*)
313-
314-
(* One of these is stored per actively-paused VBD *)
315-
type vbd_pause_state = {
316-
m: Mutex.t;
317-
c: Condition.t;
318-
mutable in_progress: bool; (* true if a VBD.pause thread is performing the operation *) (* protected by 'm' *)
319-
320-
mutable blocked_threads: int; (* counts the number of threads blocked in pause *) (* protected by 'paused_vbds_m' *)
321-
}
322-
323-
(* Table of vbd_pause_state records *)
324-
let paused_vbds = Hashtbl.create 10
325-
let paused_vbds_m = Mutex.create ()
326-
327-
(* Either the pause just failed or an unpause has succeeded: wake up any blocked threads or cleanup if none are waiting *)
328-
let device_is_unpaused self state =
329-
(* If no threads are waiting in the queue to perform fresh pauses then we can delete the vbd_pause_state record.
330-
If some thread(s) are waiting then we signal them. *)
331-
Mutex.execute paused_vbds_m
332-
(fun () ->
333-
debug "device_is_unpaused blocked_threads=%d" state.blocked_threads;
334-
if state.blocked_threads = 0
335-
then (Hashtbl.remove paused_vbds self; debug "Removed hashtbl entry for %s" (Ref.string_of self))
336-
else (Mutex.execute state.m (fun () -> state.in_progress <- false); Condition.signal state.c))
337-
338-
let pause_common ~__context ~vm ~self =
339-
340-
(* XXX: these checks may be redundant since the message forwarding layer checks this stuff *)
341-
let localhost = Helpers.get_localhost ~__context in
342-
343-
(* Since we blocked not holding the per-VM mutex, we need to double-check that the VM is still present *)
344-
if not(Helpers.is_running ~__context ~self:vm)
345-
then raise (Api_errors.Server_error(Api_errors.vm_bad_power_state,
346-
[ Ref.string_of vm;
347-
Record_util.power_to_string `Running;
348-
Record_util.power_to_string (Db.VM.get_power_state ~__context ~self:vm) ]));
349-
350-
if not(Db.VBD.get_currently_attached ~__context ~self)
351-
then raise (Api_errors.Server_error(Api_errors.device_not_attached, [ Ref.string_of self ]));
352-
353-
(* Make sure VM has not migrated *)
354-
if Db.VM.get_resident_on ~__context ~self:vm <> localhost
355-
then raise (Api_errors.Server_error(Api_errors.vm_not_resident_here, [ Ref.string_of vm; Ref.string_of localhost ]));
356-
357-
let device = Xen_helpers.device_of_vbd ~__context ~self in
358-
try
359-
with_xs (fun xs -> Device.Vbd.pause ~xs device)
360-
with Device.Device_shutdown | Device.Device_not_found ->
361-
raise (Api_errors.Server_error(Api_errors.device_not_attached, [ Ref.string_of self ]))
362-
363-
let pause ~__context ~self : string =
364-
let vm = Db.VBD.get_VM ~__context ~self in
365-
366-
(* 1. Find the current vbd pause state record or make a new one *)
367-
let state = Mutex.execute paused_vbds_m
368-
(fun () ->
369-
let state =
370-
if Hashtbl.mem paused_vbds self
371-
then Hashtbl.find paused_vbds self
372-
else
373-
let state = { m = Mutex.create (); c = Condition.create (); in_progress = false; blocked_threads = 0 } in
374-
Hashtbl.replace paused_vbds self state;
375-
state in
376-
state.blocked_threads <- state.blocked_threads + 1;
377-
state
378-
) in
379-
(* Multiple VBD.pause threads may end up here. Note the vbd_pause_state record will linger until
380-
the final unpause *)
381-
try
382-
finally
383-
(fun () ->
384-
(* Only one VBD.pause thread is allowed to acquire the 'in_progress' flag/lock *)
385-
Mutex.execute state.m
386-
(fun () ->
387-
while state.in_progress do Condition.wait state.c state.m done;
388-
state.in_progress <- true);
389-
(* One lucky VBD.pause thread will get here at a time *)
390-
pause_common ~__context ~vm ~self
391-
)
392-
(* Decrement the 'blocked_threads' refcount *)
393-
(fun () -> Mutex.execute paused_vbds_m (fun () -> state.blocked_threads <- state.blocked_threads - 1));
394-
(* This thread returns leaving the 'in_progress' flag true -- this is the lock. *)
395-
396-
with e ->
397-
(* Something bad happened when we were trying to pause so unwind and cleanup *)
398-
error "Unexpected error during VBD.pause: %s" (ExnHelper.string_of_exn e);
399-
device_is_unpaused self state;
400-
raise e
401-
402-
let state_of_vbd self =
403-
Mutex.execute paused_vbds_m
404-
(fun () ->
405-
if Hashtbl.mem paused_vbds self
406-
then Some (Hashtbl.find paused_vbds self)
407-
else None)
408-
409-
let unpause ~__context ~self ~token =
410-
(* Find the vbd_pause_state record which must exist if the client has called VBD.pause beforehand
411-
UNLESS someone restarted xapi *)
412-
let state = state_of_vbd self in
413-
414-
(* Note we don't check that the client has waited for the VBD.pause to return: we trust the client not
415-
to be stupid anyway by the nature of this API *)
416-
let device = Xen_helpers.device_of_vbd ~__context ~self in
417-
try
418-
with_xs (fun xs -> Device.Vbd.unpause ~xs device token);
419-
Opt.iter (device_is_unpaused self) state
420-
with
421-
| Device.Device_not_paused ->
422-
debug "Ignoring Device_not_paused exception";
423-
Opt.iter (device_is_unpaused self) state
424-
| Device.Device_not_found ->
425-
debug "Ignoring Device_not_found exception";
426-
Opt.iter (device_is_unpaused self) state
427-
| Device.Pause_token_mismatch ->
428-
warn "Unpause left device paused because supplied token did not match"
429-
430-
let flush ~__context self =
431-
debug "Flushing vbd %s" (Ref.string_of self);
432-
let token = pause ~__context ~self in
433-
unpause ~__context ~self ~token
434-
435-
(** Called on domain destroy to signal any blocked pause threads to re-evaluate the state of the world
436-
and give up *)
437-
let clean_up_on_domain_destroy self =
438-
let state = state_of_vbd self in
439-
Opt.iter (device_is_unpaused self) state
298+
let pause ~__context ~self =
299+
let vdi = Db.VBD.get_VDI ~__context ~self in
300+
let sr = Db.VDI.get_SR ~__context ~self:vdi |> Ref.string_of in
301+
raise (Api_errors.Server_error(Api_errors.sr_operation_not_supported, [ sr ]))
302+
303+
let unpause = pause

ocaml/xapi/xapi_vdi.ml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -418,12 +418,9 @@ let resize_online ~__context ~vdi ~size =
418418
Sm.assert_pbd_is_plugged ~__context ~sr:(Db.VDI.get_SR ~__context ~self:vdi);
419419
Xapi_vdi_helpers.assert_managed ~__context ~vdi;
420420

421-
(* Need to carefully pause and unpause all active VBDs *)
422-
let vdi_info = Sm.with_all_vbds_paused ~__context ~vdis:[vdi]
423-
(fun () ->
424-
Sm.call_sm_vdi_functions ~__context ~vdi
421+
let vdi_info = Sm.call_sm_vdi_functions ~__context ~vdi
425422
(fun srconf srtype sr ->
426-
Sm.vdi_resize_online srconf srtype sr vdi size)
423+
Sm.vdi_resize_online srconf srtype sr vdi size
427424
) in
428425
after_resize ~__context ~vdi ~size vdi_info
429426

0 commit comments

Comments
 (0)