Skip to content

Commit bda1536

Browse files
authored
Merge pull request #196 from minishrink/gpu_ppx
CP-26717: Port Gpumon to PPX-based RPCs
2 parents 4e173bf + 8a007fa commit bda1536

File tree

4 files changed

+191
-73
lines changed

4 files changed

+191
-73
lines changed

gpumon/gpumon_cli.ml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
(* Gpumon CLI *)
3+
4+
module Cmds = Gpumon_interface.RPC_API(Cmdlinergen.Gen ())
5+
6+
let version_str description =
7+
let maj,min,mic = description.Idl.Interface.version in
8+
Printf.sprintf "%d.%d.%d" maj min mic
9+
10+
let default_cmd =
11+
let doc = String.concat "" [
12+
"A CLI for the GPU monitoring API. This allows scripting of the gpumon daemon ";
13+
"for testing and debugging. This tool is not intended to be used as an ";
14+
"end user tool"] in
15+
Cmdliner.Term.(ret (const (fun _ -> `Help (`Pager, None)) $ const ())),
16+
Cmdliner.Term.info "gpumon_cli" ~version:(version_str Cmds.description) ~doc
17+
18+
let cli () =
19+
let rpc = Gpumon_client.rpc in
20+
Cmdliner.Term.eval_choice default_cmd (List.map (fun t -> t rpc) (Cmds.implementation ()))
21+
22+
let _ = cli ()

gpumon/gpumon_client.ml

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,14 @@
1212
* GNU Lesser General Public License for more details.
1313
*)
1414

15-
open Gpumon_interface
16-
open Xcp_client
17-
18-
let xml_url () = "file:" ^ xml_path
19-
20-
module Client = Gpumon_interface.Client(struct
21-
let rpc call =
22-
if !use_switch
23-
then json_switch_rpc queue_name call
24-
else xml_http_rpc
25-
~srcstr:(get_user_agent ())
26-
~dststr:"gpumon"
27-
xml_url
28-
call
29-
end)
15+
let xml_url () = "file:" ^ Gpumon_interface.xml_path
3016

17+
let rpc call =
18+
if !Xcp_client.use_switch
19+
then Xcp_client.json_switch_rpc Gpumon_interface.queue_name call
20+
else Xcp_client.xml_http_rpc
21+
~srcstr:(Xcp_client.get_user_agent ())
22+
~dststr:"gpumon"
23+
xml_url
24+
call
25+
module Client = Gpumon_interface.RPC_API(Idl.GenClientExnRpc(struct let rpc=rpc end))

gpumon/gpumon_interface.ml

Lines changed: 141 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,60 +12,161 @@
1212
* GNU Lesser General Public License for more details.
1313
*)
1414

15+
open Rpc
16+
open Idl
17+
1518
let service_name = "gpumon"
1619
let queue_name = Xcp_service.common_prefix ^ service_name
1720
let xml_path = "/var/xapi/" ^ service_name
1821

22+
(** Uninterpreted string associated with the operation *)
1923
type debug_info = string
24+
[@@deriving rpcty]
25+
26+
(* Domain ID of VM *)
2027
type domid = int
28+
[@@deriving rpcty]
29+
30+
(** Reason for incompatibility *)
31+
type incompatibility_reason =
32+
| Host_driver
33+
| Guest_driver
34+
| GPU
35+
| Other
36+
[@@deriving rpcty]
2137

22-
type incompatibility_reason = Host_driver | Guest_driver | GPU | Other
23-
type compatibility = Compatible | Incompatible of incompatibility_reason list
38+
(** Compatibility between virtual and physical GPU *)
39+
type compatibility =
40+
| Compatible
41+
| Incompatible of incompatibility_reason list
42+
[@@deriving rpcty]
2443

44+
(** PCI identifier of physical GPU *)
2545
type pgpu_address = string
46+
[@@deriving rpcty]
47+
48+
(** Metadata of Nvidia physical GPU *)
2649
type nvidia_pgpu_metadata = string
50+
[@@deriving rpcty]
51+
52+
(** Metadata of Nvidia virtual GPU *)
2753
type nvidia_vgpu_metadata = string
54+
[@@deriving rpcty]
55+
56+
(** List of Nvidia virtual GPU metadata records *)
57+
type nvidia_vgpu_metadata_list = nvidia_vgpu_metadata list
58+
[@@deriving rpcty]
59+
60+
61+
(** Error wrapper *)
62+
type gpu_errors =
63+
| NvmlInterfaceNotAvailable
64+
(** Exception raised when gpumon is unable to load the nvml nvidia library *)
65+
| NvmlFailure of string
66+
(** Exception raised by the c bindings to the nvml nvidia library*)
67+
| Gpumon_failure
68+
(** Default exception raised upon daemon failure *)
69+
[@@default Gpumon_failure]
70+
[@@deriving rpcty]
71+
72+
exception Gpumon_error of gpu_errors
73+
74+
(** Error handler *)
75+
module GpuErrors = Error.Make(struct
76+
type t = gpu_errors
77+
let t = gpu_errors
78+
end)
79+
let gpu_err = GpuErrors.error
80+
81+
(** Functor to autogenerate API calls *)
82+
module RPC_API(R : RPC) = struct
83+
open R
84+
85+
let param = Param.mk
2886

29-
(** Exception raised when gpumon is unable to load the nvml nvidia library *)
30-
exception NvmlInterfaceNotAvailable
31-
(** Exception raised by the c bindings to the nvml nvidia library*)
32-
exception NvmlFailure of string
87+
let description =
88+
Interface.{ name = "Gpumon"
89+
; namespace = None
90+
; description =
91+
[ "This interface is used by Xapi and Gpumon to monitor "
92+
; "physical and virtual GPUs."]
93+
; version=(1,0,0)
94+
}
3395

96+
let implementation = implement description
3497

35-
module Nvidia = struct
3698
(** Compatibility checking interface for Nvidia vGPUs *)
99+
module Nvidia = struct
100+
101+
(** common API call parameters *)
102+
103+
let debug_info_p = param ~description:
104+
["Uninterpreted string used for debugging."]
105+
debug_info
106+
107+
let domid_p = param ~description:
108+
["Domain ID of the VM in which the vGPU(s) is running."]
109+
domid
110+
111+
let pgpu_address_p = param ~description:
112+
["PCI bus ID of the pGPU in which the VM is currently running"
113+
;"in the form `domain:bus:device.function` PCI identifier."]
114+
pgpu_address
115+
116+
let nvidia_pgpu_metadata_p = param ~description:
117+
["Metadata of Nvidia physical GPU."]
118+
nvidia_pgpu_metadata
119+
120+
let nvidia_vgpu_metadata_p = param ~description:
121+
["Metadata of Nvidia virtual GPU."]
122+
nvidia_vgpu_metadata
123+
124+
let nvidia_vgpu_metadata_list_p = param ~description:
125+
["Metadata list of Nvidia virtual GPU."]
126+
nvidia_vgpu_metadata_list
127+
128+
let compatibility_p = param ~description:
129+
[ "Value indicating whether two or more GPUs are compatible with each other." ]
130+
compatibility
131+
132+
let get_pgpu_metadata =
133+
declare "get_pgpu_metadata"
134+
[ "Gets the metadata for a pGPU, given its address (PCI bus ID)." ]
135+
(debug_info_p
136+
@-> pgpu_address_p
137+
@-> returning nvidia_pgpu_metadata_p gpu_err
138+
)
139+
140+
let get_pgpu_vm_compatibility =
141+
declare "get_pgpu_vm_compatibility"
142+
[ "Checks compatibility between a VM's vGPU(s) and another pGPU." ]
143+
(debug_info_p
144+
@-> pgpu_address_p
145+
@-> domid_p
146+
@-> nvidia_pgpu_metadata_p
147+
@-> returning compatibility_p gpu_err
148+
)
149+
150+
let get_vgpu_metadata =
151+
declare "get_vgpu_metadata"
152+
[ "Obtains metadata for all vGPUs running in a domain." ]
153+
( debug_info_p
154+
@-> domid_p
155+
@-> pgpu_address_p
156+
@-> returning nvidia_vgpu_metadata_list_p gpu_err
157+
)
37158

38-
(** Get the metadata for a pGPU, given its address (PCI bus ID). *)
39-
external get_pgpu_metadata: debug_info -> pgpu_address -> nvidia_pgpu_metadata = ""
40-
41-
(** Check compatibility between a VM's vGPU(s) and another pGPU.
42-
* pgpu_address = PCI bus ID of the pGPU in which the VM is currently running
43-
* in the form `domain:bus:device.function` PCI identifier.
44-
* domid = domain ID of the VM in which the vGPU(s) is running.
45-
* pgpu_metadata = metadata of the pGPU to check compatibility for. *)
46-
external get_pgpu_vm_compatibility: debug_info -> pgpu_address -> domid -> nvidia_pgpu_metadata -> compatibility = ""
47-
48-
(** Obtain meta data for all vGPUs running in a domain. The
49-
* [pgpu_address] is a PCI identifier of the form
50-
* domain:bus:device.function
51-
*)
52-
external get_vgpu_metadata
53-
: debug_info
54-
-> domid
55-
-> pgpu_address
56-
-> nvidia_vgpu_metadata list
57-
= ""
58-
59-
(** Check compatibility between a pGPU (on a host) and a list of vGPUs
60-
* (assigned to a VM). The use case is VM.suspend/VM.resume: before
61-
* VM.resume [nvidia_vgpu_metadata] of the suspended VM is checked
62-
* against the [nvidia_pgpu_metadata] on the host where the VM is
63-
* resumed. A VM may use several vGPUs.
64-
*)
65-
external get_pgpu_vgpu_compatibility
66-
: debug_info
67-
-> nvidia_pgpu_metadata
68-
-> nvidia_vgpu_metadata list
69-
-> compatibility
70-
= ""
159+
let get_pgpu_vgpu_compatibility =
160+
declare "get_pgpu_vgpu_compatibility"
161+
[ "Checks compatibility between a pGPU (on a host) and a list of vGPUs "
162+
; "(assigned to a VM). Note: A VM may use several vGPUs."
163+
; "The use case is VM.suspend/VM.resume:"
164+
; "before VM.resume [nvidia_vgpu_metadata] of the suspended VM is "
165+
; "checked against the [nvidia_pgpu_metadata] on the host where the VM "
166+
; "is resumed." ]
167+
( debug_info_p
168+
@-> nvidia_pgpu_metadata_p
169+
@-> nvidia_vgpu_metadata_list_p
170+
@-> returning compatibility_p gpu_err)
171+
end
71172
end

gpumon/jbuild

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,35 @@ let coverage_rewriter =
2222
else
2323
""
2424

25-
let rewriters_camlp4 = ["rpclib.idl -syntax camlp4o"]
26-
let rewriters_ppx = ["ppx_deriving_rpc"; "ppx_sexp_conv"]
25+
let rewriters = ["ppx_deriving_rpc"]
2726

2827
let () = Printf.ksprintf Jbuild_plugin.V1.send {|
2928
(jbuild_version 1)
3029

31-
(library
32-
((name xapi_gpumon_interface)
33-
(public_name xcp.gpumon.interface)
34-
(modules (gpumon_interface))
35-
(flags (:standard -w -39 %s))
36-
(libraries
37-
(rpclib
38-
threads
39-
xcp))
40-
(wrapped false)
41-
%s))
42-
4330
(library
4431
((name xapi_gpumon)
4532
(public_name xcp.gpumon)
46-
(modules (:standard \ gpumon_interface))
4733
(flags (:standard -w -39-33 %s))
34+
(modules (:standard \ gpumon_cli ))
4835
(libraries
4936
(rpclib
5037
threads
51-
xcp
52-
xapi_gpumon_interface))
38+
xcp))
5339
(wrapped false)
5440
%s))
5541

56-
|} (flags rewriters_camlp4) coverage_rewriter (flags rewriters_ppx) coverage_rewriter
42+
(executable
43+
((name gpumon_cli)
44+
(modules (gpumon_cli))
45+
(libraries
46+
(cmdliner
47+
rpclib.cmdliner
48+
rpclib.markdown
49+
xcp.gpumon))))
50+
51+
(alias
52+
((name runtest)
53+
(deps (gpumon_cli.exe))
54+
(action (run ${<}))))
55+
56+
|} (flags rewriters) coverage_rewriter

0 commit comments

Comments
 (0)