|
12 | 12 | * GNU Lesser General Public License for more details. |
13 | 13 | *) |
14 | 14 |
|
| 15 | +open Rpc |
| 16 | +open Idl |
| 17 | + |
15 | 18 | let service_name = "gpumon" |
16 | 19 | let queue_name = Xcp_service.common_prefix ^ service_name |
17 | 20 | let xml_path = "/var/xapi/" ^ service_name |
18 | 21 |
|
| 22 | +(** Uninterpreted string associated with the operation *) |
19 | 23 | type debug_info = string |
| 24 | +[@@deriving rpcty] |
| 25 | + |
| 26 | +(** Domain ID of VM *) |
20 | 27 | type domid = int |
| 28 | +[@@deriving rpcty] |
| 29 | + |
| 30 | +(** Reason for incompatibility *) |
| 31 | +type incompatibility_reason = |
| 32 | + | Host_driver |
| 33 | + | Guest_driver |
| 34 | + | GPU |
| 35 | + | Other |
| 36 | +[@@deriving rpcty] |
21 | 37 |
|
22 | | -type incompatibility_reason = Host_driver | Guest_driver | GPU | Other |
23 | | -type compatibility = Compatible | Incompatible of incompatibility_reason list |
| 38 | +(** Compatibility between virtual and physical GPU *) |
| 39 | +type compatibility = |
| 40 | + | Compatible |
| 41 | + | Incompatible of incompatibility_reason list |
| 42 | +[@@deriving rpcty] |
24 | 43 |
|
| 44 | +(** PCI identifier of physical GPU *) |
25 | 45 | type pgpu_address = string |
| 46 | +[@@deriving rpcty] |
| 47 | + |
| 48 | +(** Metadata of Nvidia physical GPU *) |
26 | 49 | type nvidia_pgpu_metadata = string |
| 50 | +[@@deriving rpcty] |
| 51 | + |
| 52 | +(** Metadata of Nvidia virtual GPU *) |
27 | 53 | type nvidia_vgpu_metadata = string |
| 54 | +[@@deriving rpcty] |
| 55 | + |
| 56 | +(** List of Nvidia virtual GPU metadata records *) |
| 57 | +type nvidia_vgpu_metadata_list = nvidia_vgpu_metadata list |
| 58 | +[@@deriving rpcty] |
| 59 | + |
| 60 | + |
| 61 | +(** Error wrapper *) |
| 62 | +type gpu_errors = |
| 63 | + | NvmlInterfaceNotAvailable |
| 64 | + (** Exception raised when gpumon is unable to load the nvml nvidia library *) |
| 65 | + | NvmlFailure of string |
| 66 | + (** Exception raised by the C bindings to the nvml nvidia library *) |
| 67 | + | Gpumon_failure |
| 68 | + (** Default exception raised upon daemon failure *) |
| 69 | +[@@default Gpumon_failure] |
| 70 | +[@@deriving rpcty] |
| 71 | + |
| 72 | +exception Gpumon_error of gpu_errors |
| 73 | + |
| 74 | +(** Error handler *) |
| 75 | +module GpuErrors = Error.Make(struct |
| 76 | + type t = gpu_errors |
| 77 | + let t = gpu_errors |
| 78 | + end) |
| 79 | +let gpu_err = GpuErrors.error |
| 80 | + |
| 81 | +(** Functor to autogenerate API calls *) |
| 82 | +module RPC_API(R : RPC) = struct |
| 83 | + open R |
| 84 | + |
| 85 | + let param = Param.mk |
28 | 86 |
|
29 | | -(** Exception raised when gpumon is unable to load the nvml nvidia library *) |
30 | | -exception NvmlInterfaceNotAvailable |
31 | | -(** Exception raised by the c bindings to the nvml nvidia library*) |
32 | | -exception NvmlFailure of string |
| 87 | + let description = |
| 88 | + Interface.{ name = "Gpumon" |
| 89 | + ; namespace = None |
| 90 | + ; description = |
| 91 | + [ "This interface is used by Xapi and Gpumon to monitor " |
| 92 | + ; "physical and virtual GPUs."] |
| 93 | + ; version=(1,0,0) |
| 94 | + } |
33 | 95 |
|
| 96 | + let implementation = implement description |
34 | 97 |
|
35 | | -module Nvidia = struct |
36 | 98 | (** Compatibility checking interface for Nvidia vGPUs *) |
| 99 | + module Nvidia = struct |
| 100 | + |
| 101 | + (** Common API call parameters *) |
| 102 | + |
| 103 | + let debug_info_p = param ~description: |
| 104 | + ["Uninterpreted string used for debugging."] |
| 105 | + debug_info |
| 106 | + |
| 107 | + let domid_p = param ~description: |
| 108 | + ["Domain ID of the VM in which the vGPU(s) is running."] |
| 109 | + domid |
| 110 | + |
| 111 | + let pgpu_address_p = param ~description: |
| 112 | + ["PCI bus ID of the pGPU in which the VM is currently running" |
| 113 | + ;"in the form `domain:bus:device.function` PCI identifier."] |
| 114 | + pgpu_address |
| 115 | + |
| 116 | + let nvidia_pgpu_metadata_p = param ~description: |
| 117 | + ["Metadata of Nvidia physical GPU."] |
| 118 | + nvidia_pgpu_metadata |
| 119 | + |
| 120 | + let nvidia_vgpu_metadata_p = param ~description: |
| 121 | + ["Metadata of Nvidia virtual GPU."] |
| 122 | + nvidia_vgpu_metadata |
| 123 | + |
| 124 | + let nvidia_vgpu_metadata_list_p = param ~description: |
| 125 | + ["Metadata list of Nvidia virtual GPU."] |
| 126 | + nvidia_vgpu_metadata_list |
| 127 | + |
| 128 | + let compatibility_p = param ~description: |
| 129 | + [ "Value indicating whether two or more GPUs are compatible with each other." ] |
| 130 | + compatibility |
| 131 | + |
| 132 | + let get_pgpu_metadata = |
| 133 | + declare "get_pgpu_metadata" |
| 134 | + [ "Gets the metadata for a pGPU, given its address (PCI bus ID)." ] |
| 135 | + (debug_info_p |
| 136 | + @-> pgpu_address_p |
| 137 | + @-> returning nvidia_pgpu_metadata_p gpu_err |
| 138 | + ) |
| 139 | + |
| 140 | + let get_pgpu_vm_compatibility = |
| 141 | + declare "get_pgpu_vm_compatibility" |
| 142 | + [ "Checks compatibility between a VM's vGPU(s) and another pGPU." ] |
| 143 | + (debug_info_p |
| 144 | + @-> pgpu_address_p |
| 145 | + @-> domid_p |
| 146 | + @-> nvidia_pgpu_metadata_p |
| 147 | + @-> returning compatibility_p gpu_err |
| 148 | + ) |
| 149 | + |
| 150 | + let get_vgpu_metadata = |
| 151 | + declare "get_vgpu_metadata" |
| 152 | + [ "Obtains metadata for all vGPUs running in a domain." ] |
| 153 | + ( debug_info_p |
| 154 | + @-> domid_p |
| 155 | + @-> pgpu_address_p |
| 156 | + @-> returning nvidia_vgpu_metadata_list_p gpu_err |
| 157 | + ) |
37 | 158 |
|
38 | | - (** Get the metadata for a pGPU, given its address (PCI bus ID). *) |
39 | | - external get_pgpu_metadata: debug_info -> pgpu_address -> nvidia_pgpu_metadata = "" |
40 | | - |
41 | | - (** Check compatibility between a VM's vGPU(s) and another pGPU. |
42 | | - * pgpu_address = PCI bus ID of the pGPU in which the VM is currently running |
43 | | - * in the form `domain:bus:device.function` PCI identifier. |
44 | | - * domid = domain ID of the VM in which the vGPU(s) is running. |
45 | | - * pgpu_metadata = metadata of the pGPU to check compatibility for. *) |
46 | | - external get_pgpu_vm_compatibility: debug_info -> pgpu_address -> domid -> nvidia_pgpu_metadata -> compatibility = "" |
47 | | - |
48 | | - (** Obtain meta data for all vGPUs running in a domain. The |
49 | | - * [pgpu_address] is a PCI identifier of the form |
50 | | - * domain:bus:device.function |
51 | | - *) |
52 | | - external get_vgpu_metadata |
53 | | - : debug_info |
54 | | - -> domid |
55 | | - -> pgpu_address |
56 | | - -> nvidia_vgpu_metadata list |
57 | | - = "" |
58 | | - |
59 | | - (** Check compatibility between a pGPU (on a host) and a list of vGPUs |
60 | | - * (assigned to a VM). The use case is VM.suspend/VM.resume: before |
61 | | - * VM.resume [nvidia_vgpu_metadata] of the suspended VM is checked |
62 | | - * against the [nvidia_pgpu_metadata] on the host where the VM is |
63 | | - * resumed. A VM may use several vGPUs. |
64 | | - *) |
65 | | - external get_pgpu_vgpu_compatibility |
66 | | - : debug_info |
67 | | - -> nvidia_pgpu_metadata |
68 | | - -> nvidia_vgpu_metadata list |
69 | | - -> compatibility |
70 | | - = "" |
| 159 | + let get_pgpu_vgpu_compatibility = |
| 160 | + declare "get_pgpu_vgpu_compatibility" |
| 161 | + [ "Checks compatibility between a pGPU (on a host) and a list of vGPUs " |
| 162 | + ; "(assigned to a VM). Note: A VM may use several vGPUs." |
| 163 | + ; "The use case is VM.suspend/VM.resume:" |
| 164 | + ; "before VM.resume [nvidia_vgpu_metadata] of the suspended VM is " |
| 165 | + ; "checked against the [nvidia_pgpu_metadata] on the host where the VM " |
| 166 | + ; "is resumed." ] |
| 167 | + ( debug_info_p |
| 168 | + @-> nvidia_pgpu_metadata_p |
| 169 | + @-> nvidia_vgpu_metadata_list_p |
| 170 | + @-> returning compatibility_p gpu_err) |
| 171 | + end |
71 | 172 | end |
0 commit comments