Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
add rdzv_backend parameter to DDPJobDefinition
  • Loading branch information
MichaelClifford committed Apr 11, 2023
commit af6fe022fb1328476df2638a3b0c17308f877a1c
4 changes: 4 additions & 0 deletions src/codeflare_sdk/job/jobs.py
Original file line number Diff line number Diff line change
@@ -61,6 +61,7 @@ def __init__(
max_retries: int = 0,
mounts: Optional[List[str]] = None,
rdzv_port: int = 29500,
rdzv_backend: str = "c10d",
scheduler_args: Optional[Dict[str, str]] = None,
image: Optional[str] = None,
):
@@ -81,6 +82,7 @@ def __init__(
self.max_retries = max_retries
self.mounts: List[str] = mounts if mounts is not None else []
self.rdzv_port = rdzv_port
self.rdzv_backend = rdzv_backend
self.scheduler_args: Dict[str, str] = (
scheduler_args if scheduler_args is not None else dict()
)
@@ -104,6 +106,7 @@ def _dry_run(self, cluster: "Cluster"):
env=self.env,
max_retries=self.max_retries,
rdzv_port=self.rdzv_port,
rdzv_backend=self.rdzv_backend,
mounts=self.mounts,
),
scheduler=cluster.torchx_scheduler,
@@ -142,6 +145,7 @@ def _dry_run_no_cluster(self):
env=self.env, # should this still exist?
max_retries=self.max_retries,
rdzv_port=self.rdzv_port, # should this still exist?
rdzv_backend=self.rdzv_backend,
mounts=self.mounts,
image=self.image
if self.image is not None
Expand Down