Skip to content

Commit 2172ed0

Browse files
committed
Merge vgpu-migration into master
2 parents 9f0f994 + 626f8e9 commit 2172ed0

36 files changed

+1069
-442
lines changed

ocaml/database/db_names.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ let is_a_snapshot = "is_a_snapshot"
3333
let is_control_domain = "is_control_domain"
3434
let platform = "platform"
3535
let other_config = "other_config"
36+
let metrics = "metrics"
3637
let guest_metrics = "guest_metrics"
3738
let parent = "parent"
3839
let snapshot_of = "snapshot_of"

ocaml/idl/datamodel.ml

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,10 @@ let _ =
676676
~doc:"VGPU type is not one of the PGPU's supported types." ();
677677
error Api_errors.vgpu_type_not_compatible_with_running_type ["pgpu"; "type"; "running_type"]
678678
~doc:"VGPU type is not compatible with one or more of the VGPU types currently running on this PGPU" ();
679+
error Api_errors.vgpu_destination_incompatible ["reason"; "vgpu"; "host"]
680+
~doc:"The VGPU is not compatible with any PGPU in the destination." ();
681+
error Api_errors.nvidia_tools_error ["host"]
682+
~doc:"Nvidia tools error. Please ensure that the latest Nvidia tools are installed" ();
679683

680684
error Api_errors.openvswitch_not_active []
681685
~doc:"This operation needs the OpenVSwitch networking backend to be enabled on all hosts in the pool." ();
@@ -1683,7 +1687,7 @@ let rrd_cf_type = Enum ("rrd_cf_type",
16831687
let vm_get_boot_record = call
16841688
~name:"get_boot_record"
16851689
~in_oss_since:None
1686-
~in_product_since:rel_rio
1690+
~lifecycle:[Published, rel_rio, ""; Deprecated, rel_inverness, "Use the current VM record/fields instead"]
16871691
~doc:"Returns a record describing the VM's dynamic state, initialised when the VM boots and updated to reflect runtime configuration changes e.g. CPU hotplug"
16881692
~result:(Record _vm, "A record describing the VM")
16891693
~params:[Ref _vm, "self", "The VM whose boot-time state to return"]
@@ -2509,12 +2513,15 @@ let vm_migrate_send = call
25092513
~name: "migrate_send"
25102514
~in_product_since:rel_tampa
25112515
~doc: "Migrate the VM to another host. This can only be called when the specified VM is in the Running state."
2512-
~params:[Ref _vm, "vm", "The VM";
2513-
Map(String,String), "dest", "The result of a Host.migrate_receive call.";
2514-
Bool, "live", "Live migration";
2515-
Map (Ref _vdi, Ref _sr), "vdi_map", "Map of source VDI to destination SR";
2516-
Map (Ref _vif, Ref _network), "vif_map", "Map of source VIF to destination network";
2517-
Map (String, String), "options", "Other parameters"]
2516+
~versioned_params:
2517+
[{param_type=Ref _vm; param_name="vm"; param_doc="The VM"; param_release=tampa_release; param_default=None};
2518+
{param_type=Map(String,String); param_name="dest"; param_doc="The result of a Host.migrate_receive call."; param_release=tampa_release; param_default=None};
2519+
{param_type=Bool; param_name="live"; param_doc="Live migration"; param_release=tampa_release; param_default=None};
2520+
{param_type=Map (Ref _vdi, Ref _sr); param_name="vdi_map"; param_doc="Map of source VDI to destination SR"; param_release=tampa_release; param_default=None};
2521+
{param_type=Map (Ref _vif, Ref _network); param_name="vif_map"; param_doc="Map of source VIF to destination network"; param_release=tampa_release; param_default=None};
2522+
{param_type=Map (String, String); param_name="options"; param_doc="Other parameters"; param_release=tampa_release; param_default=None};
2523+
{param_type=Map (Ref _vgpu, Ref _gpu_group); param_name="vgpu_map"; param_doc="Map of source vGPU to destination GPU group"; param_release=inverness_release; param_default=Some (VMap [])}
2524+
]
25182525
~result:(Ref _vm, "The reference of the newly created VM in the destination pool")
25192526
~errs:[Api_errors.vm_bad_power_state; Api_errors.license_restriction]
25202527
~allowed_roles:_R_VM_POWER_ADMIN
@@ -2524,15 +2531,33 @@ let vm_assert_can_migrate = call
25242531
(* Datamodel declaration of VM.assert_can_migrate. The versioned parameter list
   matches the one declared for VM.migrate_send: "vgpu_map" is tagged with
   inverness_release and defaults to an empty map, while every other parameter
   is tagged with tampa_release and has no default.
   The "dest" token is the value returned by Host.migrate_receive. *)
~name:"assert_can_migrate"
25252532
~in_product_since:rel_tampa
25262533
~doc:"Assert whether a VM can be migrated to the specified destination."
2534+
~versioned_params:
2535+
[{param_type=Ref _vm; param_name="vm"; param_doc="The VM"; param_release=tampa_release; param_default=None};
2536+
{param_type=Map(String,String); param_name="dest"; param_doc="The result of a Host.migrate_receive call."; param_release=tampa_release; param_default=None};
2537+
{param_type=Bool; param_name="live"; param_doc="Live migration"; param_release=tampa_release; param_default=None};
2538+
{param_type=Map (Ref _vdi, Ref _sr); param_name="vdi_map"; param_doc="Map of source VDI to destination SR"; param_release=tampa_release; param_default=None};
2539+
{param_type=Map (Ref _vif, Ref _network); param_name="vif_map"; param_doc="Map of source VIF to destination network"; param_release=tampa_release; param_default=None};
2540+
{param_type=Map (String, String); param_name="options"; param_doc="Other parameters"; param_release=tampa_release; param_default=None};
2541+
{param_type=Map (Ref _vgpu, Ref _gpu_group); param_name="vgpu_map"; param_doc="Map of source vGPU to destination GPU group"; param_release=inverness_release; param_default=Some (VMap [])}
2542+
]
2543+
~allowed_roles:_R_VM_POWER_ADMIN
2544+
~errs:[Api_errors.license_restriction]
2545+
()
2546+
2547+
(* Datamodel declaration of VM.assert_can_migrate_sender, a call that is hidden
   from the generated docs and carries the VM.assert_can_migrate checks that
   must run on the sending host.
   NOTE(review): "vgpu_map" comes before "options" here, whereas
   VM.assert_can_migrate and VM.migrate_send declare it after "options".
   The "dest" token is the value returned by Host.migrate_receive. *)
let vm_assert_can_migrate_sender = call
2548+
~name:"assert_can_migrate_sender"
2549+
~lifecycle:[]
2550+
~doc:"Assertions for VM.assert_can_migrate that must be done on the sending host."
25272551
~params:[
25282552
Ref _vm, "vm", "The VM";
25292553
Map(String,String), "dest", "The result of a Host.migrate_receive call.";
25302554
Bool, "live", "Live migration";
25312555
Map (Ref _vdi, Ref _sr), "vdi_map", "Map of source VDI to destination SR";
25322556
Map (Ref _vif, Ref _network), "vif_map", "Map of source VIF to destination network";
2557+
Map (Ref _vgpu, Ref _gpu_group), "vgpu_map", "Map of source vGPU to destination GPU group";
25332558
Map (String, String), "options", "Other parameters" ]
25342559
~allowed_roles:_R_VM_POWER_ADMIN
2535-
~errs:[Api_errors.license_restriction]
2560+
~hide_from_docs:true
25362561
()
25372562

25382563
let vm_s3_suspend = call
@@ -4999,6 +5024,19 @@ let host_mxgpu_vf_setup = call
49995024
~allowed_roles:_R_VM_OP
50005025
()
50015026

5027+
(* Datamodel declaration of Host.allocate_resources_for_vm. It is published in
   rel_inverness, hidden from the generated documentation, and restricted to
   the _R_VM_OP role set. It takes the host, the VM, and a flag saying whether
   the call is part of a live migration. *)
let host_allocate_resources_for_vm = call
5028+
~name:"allocate_resources_for_vm"
5029+
~lifecycle:[Published, rel_inverness, ""]
5030+
~doc:"Reserves the resources for a VM by setting the 'scheduled_to_be_resident_on' fields"
5031+
~params:[
5032+
Ref _host, "self", "The host";
5033+
Ref _vm, "vm", "The VM";
5034+
Bool, "live", "Is this part of a live migration?"
5035+
]
5036+
~hide_from_docs:true
5037+
~allowed_roles:_R_VM_OP
5038+
()
5039+
50025040
(** Hosts *)
50035041
let host =
50045042
create_obj ~in_db:true ~in_product_since:rel_rio ~in_oss_since:oss_since_303 ~internal_deprecated_since:None ~persist:PersistEverything ~gen_constructor_destructor:false ~name:_host ~descr:"A physical host" ~gen_events:true
@@ -5090,6 +5128,7 @@ let host =
50905128
host_set_ssl_legacy;
50915129
host_apply_guest_agent_config;
50925130
host_mxgpu_vf_setup;
5131+
host_allocate_resources_for_vm;
50935132
]
50945133
~contents:
50955134
([ uid _host;
@@ -8023,6 +8062,7 @@ let vm =
80238062
vm_maximise_memory;
80248063
vm_migrate_send;
80258064
vm_assert_can_migrate;
8065+
vm_assert_can_migrate_sender;
80268066
vm_get_boot_record;
80278067
vm_get_data_sources; vm_record_data_source; vm_query_data_source; vm_forget_data_source_archives;
80288068
assert_operation_valid vm_operations _vm _self;
@@ -9375,6 +9415,7 @@ let pgpu =
93759415
field ~qualifier:DynamicRO ~ty:(Map (Ref _vgpu_type, Int)) ~lifecycle:[Published, rel_vgpu_productisation, ""] ~default_value:(Some (VMap [])) "supported_VGPU_max_capacities" "A map relating each VGPU type supported on this GPU to the maximum number of VGPUs of that type which can run simultaneously on this GPU";
93769416
field ~qualifier:DynamicRO ~ty:(pgpu_dom0_access) ~lifecycle:[Published, rel_cream, ""] ~default_value:(Some (VEnum "enabled")) "dom0_access" "The accessibility of this device from dom0";
93779417
field ~qualifier:DynamicRO ~ty:Bool ~lifecycle:[Published, rel_cream, ""] ~default_value:(Some (VBool false)) "is_system_display_device" "Is this device the system display device";
9418+
field ~qualifier:DynamicRO ~ty:(Map (String,String)) ~lifecycle:[Published, rel_inverness, ""] ~default_value:(Some (VMap [])) "compatibility_metadata" "PGPU metadata to determine whether a VGPU can migrate between two PGPUs";
93789419
]
93799420
()
93809421

@@ -9529,7 +9570,7 @@ let vgpu =
95299570
field ~qualifier:RW ~ty:(Map (String,String)) ~lifecycle:[Published, rel_boston, ""] "other_config" "Additional configuration" ~default_value:(Some (VMap []));
95309571
field ~qualifier:DynamicRO ~ty:(Ref _vgpu_type) ~lifecycle:[Published, rel_vgpu_tech_preview, ""] "type" "Preset type for this VGPU" ~default_value:(Some (VRef null_ref));
95319572
field ~qualifier:DynamicRO ~ty:(Ref _pgpu) ~lifecycle:[Published, rel_vgpu_tech_preview, ""] "resident_on" "The PGPU on which this VGPU is running" ~default_value:(Some (VRef null_ref));
9532-
field ~qualifier:DynamicRO ~ty:(Ref _pgpu) ~lifecycle:[Published, rel_dundee, ""] ~internal_only:true "scheduled_to_be_resident_on" "The PGPU on which this VGPU is scheduled to run" ~default_value:(Some (VRef null_ref));
9573+
field ~qualifier:DynamicRO ~ty:(Ref _pgpu) ~lifecycle:[Published, rel_dundee, ""] "scheduled_to_be_resident_on" "The PGPU on which this VGPU is scheduled to run" ~default_value:(Some (VRef null_ref));
95339574
]
95349575
()
95359576

ocaml/xapi-consts/api_errors.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,8 @@ let pgpu_insufficient_capacity_for_vgpu = "PGPU_INSUFFICIENT_CAPACITY_FOR_VGPU"
288288
let vgpu_type_not_enabled = "VGPU_TYPE_NOT_ENABLED"
289289
let vgpu_type_not_supported = "VGPU_TYPE_NOT_SUPPORTED"
290290
let vgpu_type_not_compatible_with_running_type = "VGPU_TYPE_NOT_COMPATIBLE_WITH_RUNNING_TYPE"
291+
let vgpu_destination_incompatible = "VGPU_DESTINATION_INCOMPATIBLE"
292+
let nvidia_tools_error = "NVIDIA_TOOLS_ERROR"
291293

292294
let import_error_generic = "IMPORT_ERROR"
293295
let import_error_premature_eof = "IMPORT_ERROR_PREMATURE_EOF"

ocaml/xapi-consts/constants.ml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,6 @@ let storage_migrate_vif_map_key = "maps_to"
130130

131131
(* Abstract size value for tracking PGPU utilisation. *)
132132
let pgpu_default_size = Int64.mul 1024L 1024L
133+
134+
(* Key used to specify the mapping of a vGPU to a GPU group on the remote machine.
   Stored in VGPU.other_config. Has the same string value as
   storage_migrate_vif_map_key. *)
135+
let storage_migrate_vgpu_map_key = "maps_to"

ocaml/xapi/cli_operations.ml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2755,7 +2755,7 @@ let vm_retrieve_wlb_recommendations printer rpc session_id params =
27552755
failwith ("Parameter '"^name^"' is not a field of the VM class. Failed to select VM for operation.")
27562756

27572757
let vm_migrate_sxm_params = ["remote-master"; "remote-username"; "vif"; "remote-password";
2758-
"remote-network"; "vdi"]
2758+
"remote-network"; "vdi"; "vgpu"]
27592759

27602760
let vm_migrate printer rpc session_id params =
27612761
(* Hack to match host-uuid and host-name for backwards compatibility *)
@@ -2825,6 +2825,11 @@ let vm_migrate printer rpc session_id params =
28252825
let vdi = Client.VDI.get_by_uuid rpc session_id vdi_uuid in
28262826
let sr = Client.SR.get_by_uuid remote_rpc remote_session sr_uuid in
28272827
vdi,sr) (read_map_params "vdi" params) in
2828+
2829+
let vgpu_map = List.map (fun (vgpu_uuid,gpu_group_uuid) ->
2830+
let vgpu = Client.VGPU.get_by_uuid rpc session_id vgpu_uuid in
2831+
let gpu_group = Client.GPU_group.get_by_uuid remote_rpc remote_session gpu_group_uuid in
2832+
vgpu,gpu_group) (read_map_params "vgpu" params) in
28282833

28292834
let default_sr =
28302835
try let pools = Client.Pool.get_all remote_rpc remote_session in
@@ -2864,7 +2869,7 @@ let vm_migrate printer rpc session_id params =
28642869
vdi_map ;
28652870
let token = Client.Host.migrate_receive remote_rpc remote_session host network options in
28662871
let new_vm =
2867-
do_vm_op ~include_control_vms:false ~include_template_vms:true printer rpc session_id (fun vm -> Client.VM.migrate_send rpc session_id (vm.getref ()) token true vdi_map vif_map options)
2872+
do_vm_op ~include_control_vms:false ~include_template_vms:true printer rpc session_id (fun vm -> Client.VM.migrate_send rpc session_id (vm.getref ()) token true vdi_map vif_map options vgpu_map)
28682873
params (["host"; "host-uuid"; "host-name"; "live"; "force"; "copy"] @ vm_migrate_sxm_params) |> List.hd in
28692874
if get_bool_param params "copy" then
28702875
printer (Cli_printer.PList [Client.VM.get_uuid remote_rpc remote_session new_vm])

ocaml/xapi/create_misc.ml

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,14 @@ let (+++) = Int64.add
173173
(** 1. The domain zero record. *)
174174
(** 2. The domain zero console record. *)
175175
(** 3. The domain zero guest metrics record. *)
176-
(** 4. The domain zero shadow record. *)
177176
(** This function makes sure there is exactly one record of each type. *)
178177
(** It updates existing records if they are found, or else creates new *)
179178
(** records for any records that are missing. *)
180179
let rec ensure_domain_zero_records ~__context ~host (host_info: host_info) : unit =
181180
maybe_upgrade_domain_zero_record ~__context ~host host_info;
182181
let domain_zero_ref = ensure_domain_zero_record ~__context host_info in
183182
ensure_domain_zero_console_record ~__context ~domain_zero_ref;
184-
ensure_domain_zero_guest_metrics_record ~__context ~domain_zero_ref host_info;
185-
ensure_domain_zero_shadow_record ~__context ~domain_zero_ref
183+
ensure_domain_zero_guest_metrics_record ~__context ~domain_zero_ref host_info
186184

187185
and maybe_upgrade_domain_zero_record ~__context ~host (host_info: host_info) =
188186
try
@@ -229,11 +227,6 @@ and ensure_domain_zero_guest_metrics_record ~__context ~domain_zero_ref (host_in
229227
Db.VM.set_metrics ~__context ~self:domain_zero_ref ~value:metrics_ref
230228
end
231229

232-
and ensure_domain_zero_shadow_record ~__context ~domain_zero_ref : unit =
233-
(* Always create a new shadow record. *)
234-
let domain_zero_record = Db.VM.get_record ~__context ~self:domain_zero_ref in
235-
Helpers.set_boot_record ~__context ~self:domain_zero_ref domain_zero_record
236-
237230
and create_domain_zero_record ~__context ~domain_zero_ref (host_info: host_info) : unit =
238231
(* Determine domain 0 memory constraints. *)
239232
let memory = create_domain_zero_memory_constraints host_info in
@@ -246,8 +239,24 @@ and create_domain_zero_record ~__context ~domain_zero_ref (host_info: host_info)
246239
let uuid = host_info.dom0_uuid in
247240
(* FIXME: Assume dom0 has 1 vCPU per Host_cpu for now *)
248241
let vcpus = calculate_domain_zero_vcpu_count ~__context in
249-
let metrics = Ref.make () in
250-
(* Now create the database record. *)
242+
let metrics = Ref.make () and metrics_uuid = Uuid.to_string (Uuid.make_uuid ()) in
243+
let vCPUs_utilisation = [(0L, 0.)] in
244+
(* Now create the database records. *)
245+
Db.VM_metrics.create ~__context ~ref:metrics ~uuid:metrics_uuid
246+
~memory_actual:0L ~vCPUs_number:0L
247+
~vCPUs_utilisation
248+
~vCPUs_CPU:[]
249+
~vCPUs_params:[]
250+
~vCPUs_flags:[]
251+
~state:[]
252+
~start_time:Date.never
253+
~install_time:Date.never
254+
~last_updated:Date.never
255+
~other_config:[]
256+
~hvm:false
257+
~nested_virt:false
258+
~nomigrate:false
259+
;
251260
Db.VM.create ~__context ~ref:domain_zero_ref
252261
~name_label:("Control domain on host: " ^ host_info.hostname) ~uuid
253262
~name_description:"The domain which manages physical devices and manages other domains"

ocaml/xapi/helpers.ml

Lines changed: 8 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -458,54 +458,6 @@ let rolling_upgrade_in_progress ~__context =
458458
with _ ->
459459
false
460460

461-
let parse_boot_record ~string:lbr =
462-
match Xmlrpc_sexpr.sexpr_str_to_xmlrpc lbr with
463-
| None -> API.Legacy.From.vM_t "ret_val" (Xml.parse_string lbr)
464-
| Some xml -> API.Legacy.From.vM_t "ret_val" xml
465-
466-
(** Fetch the configuration the VM was booted with *)
467-
let get_boot_record_of_record ~__context ~string:lbr ~uuid:current_vm_uuid =
468-
try
469-
parse_boot_record lbr
470-
with e ->
471-
(* warn "Warning: exception '%s' parsing last booted record (%s) - returning current record instead" lbr (ExnHelper.string_of_exn e); *)
472-
Db.VM.get_record ~__context ~self:(Db.VM.get_by_uuid ~__context ~uuid:current_vm_uuid)
473-
474-
let get_boot_record ~__context ~self =
475-
let r = Db.VM.get_record_internal ~__context ~self in
476-
let lbr = get_boot_record_of_record ~__context ~string:r.Db_actions.vM_last_booted_record ~uuid:r.Db_actions.vM_uuid in
477-
(* CA-31903: we now use an unhealthy mix of fields from the boot_records and the live VM.
478-
In particular the VM is currently using dynamic_min and max from the live VM -- not the boot-time settings. *)
479-
{ lbr with
480-
API.vM_memory_target = 0L;
481-
API.vM_memory_dynamic_min = r.Db_actions.vM_memory_dynamic_min;
482-
API.vM_memory_dynamic_max = r.Db_actions.vM_memory_dynamic_max;
483-
}
484-
485-
486-
let set_boot_record ~__context ~self newbootrec =
487-
(* blank last_booted_record field in newbootrec, so we don't just keep encapsulating
488-
old last_boot_records in new snapshots! *)
489-
let newbootrec = {newbootrec with API.vM_last_booted_record=""; API.vM_bios_strings=[]} in
490-
if rolling_upgrade_in_progress ~__context then
491-
begin
492-
(* during a rolling upgrade, there might be slaves in the pool
493-
who have not yet been upgraded to understand sexprs, so
494-
let's still talk using the legacy xmlrpc format.
495-
*)
496-
let xml = Xml.to_string (API.Legacy.To.vM_t newbootrec) in
497-
Db.VM.set_last_booted_record ~__context ~self ~value:xml
498-
end
499-
else
500-
begin
501-
(* if it's not a rolling upgrade, then we know everyone
502-
else in the pool will understand s-expressions.
503-
*)
504-
let sexpr = Xmlrpc_sexpr.xmlrpc_to_sexpr_str (API.Legacy.To.vM_t newbootrec) in
505-
Db.VM.set_last_booted_record ~__context ~self ~value:sexpr
506-
end;
507-
()
508-
509461
(** Inspect the current configuration of a VM and return a boot_method type *)
510462
let boot_method_of_vm ~__context ~vm =
511463
if vm.API.vM_HVM_boot_policy <> "" then begin
@@ -545,20 +497,18 @@ let boot_method_of_vm ~__context ~vm =
545497

546498
(** Returns true if the supplied VM configuration is HVM.
547499
NB that just because a VM's current configuration looks like HVM doesn't imply it
548-
actually booted that way; you must check the boot_record to be sure *)
549-
let is_hvm (x: API.vM_t) = x.API.vM_HVM_boot_policy <> ""
500+
actually booted that way; you must check the VM_metrics to be sure *)
501+
let will_boot_hvm_from_record (x: API.vM_t) = x.API.vM_HVM_boot_policy <> ""
550502

551503
(** True when the VM's HVM_boot_policy field in the database is non-empty. *)
let will_boot_hvm ~__context ~self = Db.VM.get_HVM_boot_policy ~__context ~self <> ""
552504

553505
let has_booted_hvm ~__context ~self =
554-
let boot_record = get_boot_record ~__context ~self in
555-
boot_record.API.vM_HVM_boot_policy <> ""
556-
557-
let has_booted_hvm_of_record ~__context r =
558-
let boot_record =
559-
get_boot_record_of_record ~__context
560-
~string:r.Db_actions.vM_last_booted_record ~uuid:r.Db_actions.vM_uuid in
561-
boot_record.API.vM_HVM_boot_policy <> ""
506+
Db.VM_metrics.get_hvm ~__context ~self:(Db.VM.get_metrics ~__context ~self)
507+
508+
(** Reports whether the VM is HVM, choosing the source of truth from its power state:
    for a paused, running or suspended VM the answer comes from [has_booted_hvm];
    for a halted VM, or any other state, it comes from [will_boot_hvm]. *)
let is_hvm ~__context ~self =
509+
match Db.VM.get_power_state ~__context ~self with
510+
| `Paused | `Running | `Suspended -> has_booted_hvm ~__context ~self
511+
(* The wildcard already covers `Halted; it is spelled out to document the intended case. *)
| `Halted | _ -> will_boot_hvm ~__context ~self
562512

563513
(** True when the VM's domid field in the database is not -1. *)
let is_running ~__context ~self = Db.VM.get_domid ~__context ~self <> -1L
564514

0 commit comments

Comments
 (0)