Skip to content

Commit a619065

Browse files
mg12ctxlindig
authored and committed
CA-273775: improve error handling in VM_receive_memory
Originally, once vm-migrate reached synchronisation point 1, there was only one thread to kill on the receiving end in case VM_create or VM_restore failed (the main memory-receiving thread, which is created in receive_memory). After vgpu-migration support was added, once vm-migrate reaches synchronisation point 1, two threads will have been created on the receiving host: 1) receive_memory: the original one, receiving the main memory; 2) receive_vgpu: a new one receiving the vgpu memory, created when the sending host accessed the url /migrate-vgpu/ on the receiving host. If some exception occurs in the receive_memory thread on the receiving host while executing VM_create or VM_restore, only the main memory thread will be killed automatically, and the receive_vgpu thread will leak (it will never end). This patch kills the receive_vgpu thread if an exception occurs in the receive_memory thread while the receive_vgpu thread is active, preventing this leak. It uses the `finally` clause to execute VM_create and VM_restore, and then instructs the receive_vgpu thread to terminate regardless of whether VM_create/VM_restore succeeded or raised an exception. Signed-off-by: Marcus Granado <[email protected]>
1 parent b4d86f2 commit a619065

File tree

1 file changed

+27
-22
lines changed

1 file changed

+27
-22
lines changed

lib/xenops_server.ml

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,29 +1680,34 @@ and perform_exn ?subtask ?result (op: operation) (t: Xenops_task.task_handle) :
16801680

16811681
(* set up the destination domain *)
16821682
debug "VM.receive_memory creating domain and restoring VIFs";
1683-
(try
1684-
perform_atomics (
1685-
simplify [VM_create (id, Some memory_limit);] @
1686-
(* Perform as many operations as possible on the destination domain before pausing the original domain *)
1687-
(atomics_of_operation (VM_restore_vifs id))
1688-
) t;
1689-
Handshake.send s Handshake.Success
1690-
with e ->
1691-
Handshake.send s (Handshake.Error (Printexc.to_string e));
1692-
raise e
1683+
1684+
finally (fun ()->
1685+
(try
1686+
perform_atomics (
1687+
simplify [VM_create (id, Some memory_limit);] @
1688+
(* Perform as many operations as possible on the destination domain before pausing the original domain *)
1689+
(atomics_of_operation (VM_restore_vifs id))
1690+
) t;
1691+
Handshake.send s Handshake.Success
1692+
with e ->
1693+
Handshake.send s (Handshake.Error (Printexc.to_string e));
1694+
raise e
1695+
);
1696+
debug "VM.receive_memory: Synchronisation point 1";
1697+
1698+
debug "VM.receive_memory restoring VM";
1699+
(* Check if there is a separate vGPU data channel *)
1700+
let vgpu_info = Stdext.Opt.of_exception (fun () -> Hashtbl.find vgpu_receiver_sync id) in
1701+
perform_atomics (
1702+
List.map (fun vgpu_id -> VGPU_start (vgpu_id, true)) (VGPU_DB.ids id) @ [
1703+
VM_restore (id, FD s, Opt.map (fun x -> FD x.vgpu_fd) vgpu_info);
1704+
]) t;
1705+
debug "VM.receive_memory restore complete";
1706+
) (fun ()->
1707+
(* Tell the vGPU receive thread that we're done, so that it can clean up vgpu_receiver_sync id and terminate *)
1708+
let vgpu_info = Stdext.Opt.of_exception (fun () -> Hashtbl.find vgpu_receiver_sync id) in
1709+
Opt.iter (fun x -> Event.send x.vgpu_channel () |> Event.sync) vgpu_info;
16931710
);
1694-
debug "VM.receive_memory: Synchronisation point 1";
1695-
1696-
debug "VM.receive_memory restoring VM";
1697-
(* Check if there is a separate vGPU data channel *)
1698-
let vgpu_info = Stdext.Opt.of_exception (fun () -> Hashtbl.find vgpu_receiver_sync id) in
1699-
perform_atomics (
1700-
List.map (fun vgpu_id -> VGPU_start (vgpu_id, true)) (VGPU_DB.ids id) @ [
1701-
VM_restore (id, FD s, Opt.map (fun x -> FD x.vgpu_fd) vgpu_info);
1702-
]) t;
1703-
debug "VM.receive_memory restore complete";
1704-
(* Tell the vGPU receive thread that we're done *)
1705-
Opt.iter (fun x -> Event.send x.vgpu_channel () |> Event.sync) vgpu_info;
17061711
debug "VM.receive_memory: Synchronisation point 2";
17071712

17081713
begin try

0 commit comments

Comments
 (0)