Skip to content

Commit 0201892

Browse files
julienmancusokylehh
authored andcommitted
fix: improve sglang multinode handling in operator (#3151)
Signed-off-by: Julien Mancuso <[email protected]> Signed-off-by: Kyle H <[email protected]>
1 parent 1247d72 commit 0201892

File tree

8 files changed

+416
-76
lines changed

8 files changed

+416
-76
lines changed

deploy/cloud/operator/internal/dynamo/backend_sglang.go

Lines changed: 72 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@ const (
1515

1616
type SGLangBackend struct{}
1717

18+
// isPythonCommand checks if the command is a Python interpreter
19+
func isPythonCommand(cmd string) bool {
20+
if cmd == "python" || cmd == "python3" {
21+
return true
22+
}
23+
// Match python with version numbers like python3.11, python2.7, etc.
24+
matched, _ := regexp.MatchString(`^python\d+(\.\d+)*$`, cmd)
25+
return matched
26+
}
27+
1828
func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentOverridesSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
1929
// For single node, nothing to do
2030
if numberOfNodes <= 1 {
@@ -29,32 +39,83 @@ func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNod
2939
}
3040

3141
// Generate the flags to add
32-
flags := b.getMultinodeFlags(numberOfNodes, role, serviceName, multinodeDeployer)
42+
flags, needsShell := b.getMultinodeFlags(numberOfNodes, role, serviceName, multinodeDeployer)
3343
if flags == "" {
3444
return
3545
}
3646

37-
// Flatten all args into a single command and inject flags
38-
if len(container.Args) > 0 {
39-
fullCommand := strings.Join(container.Args, " ")
40-
modifiedCommand := b.injectFlagsIntoPythonCommand(fullCommand, flags)
41-
container.Args = []string{modifiedCommand}
47+
/*
48+
* Flag Injection Strategy for Multinode SGLang Deployments
49+
*
50+
* This code handles the injection of distributed training flags (--dist-init-addr, --nnodes, --node-rank)
51+
* into container commands for multinode SGLang deployments. The complexity arises from supporting multiple
52+
* container command patterns and ensuring proper environment variable interpretation.
53+
*
54+
* Two main scenarios are handled:
55+
*
56+
* 1. Direct Python Command (e.g., Command: ["python3"], Args: ["-m", "sglang", "..."])
57+
* - If shell interpretation is needed (for env vars): Wrap in "sh -c" with exec
58+
* - If no shell needed: Simply append flags to the Args array
59+
*
60+
* 2. Non-Python Command (e.g., Command: ["sh"], Args: ["-c", "python3 -m sglang ..."])
61+
* - Use regex-based injection to find embedded Python+SGLang commands within args
62+
* - Insert flags after the Python command but before any shell operators (|, &, ;)
63+
*
64+
* The needsShell flag indicates when environment variables require shell interpretation
65+
*/
66+
if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
67+
// Direct python command case
68+
if needsShell {
69+
// Transform to shell wrapper for env var interpretation
70+
fullCommand := strings.Join(container.Command, " ")
71+
originalArgs := strings.Join(container.Args, " ")
72+
var shellCommand string
73+
if len(container.Args) > 0 {
74+
// Use exec to ensure PID 1 is given to the python command
75+
shellCommand = fmt.Sprintf("exec %s %s %s", fullCommand, originalArgs, flags)
76+
} else {
77+
// Use exec to ensure PID 1 is given to the python command
78+
shellCommand = fmt.Sprintf("exec %s %s", fullCommand, flags)
79+
}
80+
container.Command = []string{"sh", "-c"}
81+
container.Args = []string{shellCommand}
82+
} else {
83+
// Simple append to args
84+
flagsSlice := strings.Fields(flags)
85+
container.Args = append(container.Args, flagsSlice...)
86+
}
87+
} else {
88+
// Non-python command case - try injection on each arg individually
89+
for i, arg := range container.Args {
90+
modifiedArg := b.injectFlagsIntoPythonCommand(arg, flags)
91+
if modifiedArg != arg { // flags were successfully injected
92+
container.Args[i] = modifiedArg
93+
break // stop after first successful injection
94+
}
95+
}
4296
}
4397
}
4498

4599
func (b *SGLangBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentOverridesSpec, serviceName string) {
46100
// do nothing
47101
}
48102

49-
// getMultinodeFlags returns the multinode flags as a single string
50-
func (b *SGLangBackend) getMultinodeFlags(numberOfNodes int32, role Role, serviceName string, multinodeDeployer MultinodeDeployer) string {
103+
// getMultinodeFlags returns the multinode flags and whether shell interpretation is needed
104+
func (b *SGLangBackend) getMultinodeFlags(numberOfNodes int32, role Role, serviceName string, multinodeDeployer MultinodeDeployer) (string, bool) {
51105
distInitAddr := fmt.Sprintf("%s:%s", multinodeDeployer.GetLeaderHostname(serviceName), SglangPort)
52-
nodeRank := multinodeDeployer.GetNodeRank()
53-
// Determine node-rank
106+
107+
var nodeRank string
108+
var needsShell bool
109+
54110
if role == RoleLeader {
55111
nodeRank = "0"
112+
needsShell = false
113+
} else {
114+
nodeRank, needsShell = multinodeDeployer.GetNodeRank()
56115
}
57-
return fmt.Sprintf("--dist-init-addr %s --nnodes %d --node-rank %s", distInitAddr, numberOfNodes, nodeRank)
116+
117+
flags := fmt.Sprintf("--dist-init-addr %s --nnodes %d --node-rank %s", distInitAddr, numberOfNodes, nodeRank)
118+
return flags, needsShell
58119
}
59120

60121
// injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them

0 commit comments

Comments
 (0)