Skip to content

Commit bb9cd58

Browse files
mg12ctxlindig
authored and committed
CP-26145: fail vgpu-migration from pre-Jura to Jura and later hosts
The vgpu-migration protocol had to be changed to prevent a race condition, which causes vgpu-migration from pre-Jura to Jura hosts to block forever. The only way to recover is to restart xenopsd. This patch interrupts the vgpu-migration if it detects a vgpu-migration operation between these incompatible hosts, preventing the block from occurring. The vgpu-migration is interrupted before the VM state is saved -- therefore, the VM on the sending host continues to run unaffected. It works for vgpu-migration during both RPU and cross-pool migration. The interruption occurs by xenopsd raising an Internal_error exception in the receive_vgpu thread and sending a Handshake.Error back to the main migration thread of the sending host, which will clean up the main migration threads in both the sender and receiver hosts. The distinction between pre-Jura and Jura/later hosts is made by sending a new cookie 'vgpu_migration' between the sender and receiver. This patch sends this cookie with an empty value, but in the future this value could contain either a version number or a list of features that the receiver could parse to verify whether it agrees to receive the migrating VM. Signed-off-by: Marcus Granado <[email protected]>
1 parent 1849b89 commit bb9cd58

File tree

1 file changed

+14
-1
lines changed

1 file changed

+14
-1
lines changed

lib/xenops_server.ml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ let query _ _ _ = {
4040
features = [];
4141
instance_id = instance_id;
4242
}
43+
(* Name of the cookie attached (with an empty value) to the "/migrate-vgpu/"
   request by the sending side. The receiving side checks for its presence
   with List.mem_assoc; when it is absent, a Handshake.Error is sent back and
   Internal_error is raised, so the migration fails instead of proceeding. *)
let cookie_vgpu_migration = "vgpu_migration"
4344

4445
(* Mutable cell holding the optional backend; it starts out as None and is
   dereferenced by get_backend.
   NOTE(review): where and when it gets assigned is not visible in this
   excerpt -- confirm against the rest of the file. *)
let backend = ref None
4546
let get_backend () = match !backend with
@@ -1657,7 +1658,7 @@ and perform_exn ?subtask ?result (op: operation) (t: Xenops_task.task_handle) :
16571658
| [vgpu_id] ->
16581659
let vgpu_url = make_url "/migrate-vgpu/" (VGPU_DB.string_of_id vgpu_id) in
16591660
Open_uri.with_open_uri vgpu_url (fun vgpu_fd ->
1660-
do_request vgpu_fd [] vgpu_url;
1661+
do_request vgpu_fd [cookie_vgpu_migration, ""] vgpu_url;
16611662
Handshake.recv_success vgpu_fd;
16621663
debug "VM.migrate: Synchronisation point 1-vgpu";
16631664
Handshake.send ~verbose:true mem_fd Handshake.Success;
@@ -2363,6 +2364,18 @@ module VM = struct
23632364
let vm_id = VGPU_DB.vm_of vgpu_id in
23642365
match context.transferred_fd with
23652366
| Some transferred_fd ->
2367+
2368+
(* prevent vgpu-migration from pre-Jura to Jura and later *)
2369+
if not (List.mem_assoc cookie_vgpu_migration cookies) then
2370+
begin
2371+
(* only Jura and later hosts send this cookie; fail the migration from pre-Jura hosts *)
2372+
let msg = Printf.sprintf "VM.migrate: version of sending host incompatible with receiving host: no cookie %s" cookie_vgpu_migration in
2373+
Xenops_migrate.(Handshake.send ~verbose:true transferred_fd (Handshake.Error msg));
2374+
debug "VM.receive_vgpu: Synchronisation point 1-vgpu ERR %s" msg;
2375+
raise (Internal_error msg)
2376+
end
2377+
;
2378+
23662379
debug "VM.receive_vgpu: passed fd %d" (Obj.magic transferred_fd);
23672380
(* Store away the fd for VM_receive_memory/restore to use *)
23682381
let info = {

0 commit comments

Comments
 (0)