update knobs

ai-dynamo · tedzhouhk · Aug 5, 2025 · Jul 30, 2025 · Jul 31, 2025 · Jul 31, 2025
commit 97cd70c1b33207062114877a9d3c2ac2af78d72a
diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -190,7 +190,7 @@ spec:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3 2>&1 | tee /tmp/vllm.log"
     VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
@@ -240,4 +240,4 @@ spec:
             - /bin/sh
             - -c
           args:
-            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log
+            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3 2>&1 | tee /tmp/vllm.log
@@ -30,10 +30,10 @@
 
 async def graceful_shutdown(runtime):
     """
-    By calling `runtime.shutdown()`, the endpoints will immediately be unavailable.
-    However, in-flight requests will still be processed until they are finished.
-    After all in-flight requests are finished, the `serve_endpoint` functions will return
-    and the engine will be shutdown by Python's garbage collector.
+    Shut down the dynamo distributed runtime.
+    The endpoints will be immediately invalidated so no new requests will be accepted.
+    For endpoints served with graceful_shutdown=True, the serving function will wait until all in-flight requests are finished.
+    For endpoints served with graceful_shutdown=False, the serving function will return immediately.
     """
     logging.info("Received shutdown signal, shutting down DistributedRuntime")
     runtime.shutdown()
@@ -196,7 +196,7 @@ async def init(runtime: DistributedRuntime, config: Config):
     try:
         await asyncio.gather(
             # for decode, we want to transfer the in-flight requests to other decode engines,
-            # because graceful shutting down can take a long time for long OSLs
+            # because waiting for them to finish can take a long time for long OSLs
             generate_endpoint.serve_endpoint(handler.generate, graceful_shutdown=False),
             clear_endpoint.serve_endpoint(handler.clear_kv_blocks),
         )