From d9d82806472819614df18256fc3effe96594a3aa Mon Sep 17 00:00:00 2001
From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Date: Mon, 8 Sep 2025 19:53:55 -0700
Subject: [PATCH] feat: update benchmarking and deploy utils (#2933)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
---
 benchmarks/README.md                          |  5 +
 benchmarks/benchmark.sh                       |  4 +-
 .../profiler/deploy/profile_sla_job.yaml      | 11 +--
 benchmarks/profiler/utils/__init__.py         |  2 +
 benchmarks/utils/benchmark.py                 |  6 +-
 .../sglang/deploy/disagg_planner.yaml         |  4 +-
 .../backends/vllm/deploy/disagg_planner.yaml  |  4 +-
 deploy/utils/README.md                        | 21 +++-
 deploy/utils/download_pvc_results.py          | 21 +++-
 deploy/utils/inject_manifest.py               | 46 +++++++--
 deploy/utils/kubernetes.py                    | 97 ++++++++++++++-----
 deploy/utils/manifests/pvc-access-pod.yaml    |  2 +-
 deploy/utils/setup_k8s_namespace.sh           |  4 +-
 docs/benchmarks/benchmarking.md               | 23 ++++-
 docs/benchmarks/pre_deployment_profiling.md   | 31 ++++--
 15 files changed, 209 insertions(+), 72 deletions(-)
 create mode 100644 benchmarks/profiler/utils/__init__.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 3c310ffa4a..42bc317f54 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -61,6 +61,11 @@ The benchmarking framework supports:
 - Customizable concurrency levels (configurable via CONCURRENCIES env var), sequence lengths, and models
 - Automated performance plot generation with custom labels
 
+**Sequential GPU Usage:**
+- Models are deployed and benchmarked **sequentially**, not in parallel
+- Each deployment gets exclusive access to all available GPUs during its benchmark run
+- Ensures accurate performance measurements and fair comparison across configurations
+
 **Supported Backends:**
 - DynamoGraphDeployments
 - External HTTP endpoints (for comparison with non-Dynamo backends)
diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh
index 797545c517..5b86280f09 100755
--- a/benchmarks/benchmark.sh
+++ b/benchmarks/benchmark.sh
@@ -11,7 +11,7 @@ DYNAMO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 
 # Configuration - all set via command line arguments
 NAMESPACE=""
-MODEL="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+MODEL="Qwen/Qwen3-0.6B"
 ISL=2000
 STD=10
 OSL=256
@@ -46,7 +46,7 @@ REQUIRED:
 
 OPTIONS:
     -h, --help                    Show this help message
-    -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)
+    -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B)
                                   NOTE: This must match the model configured in your deployment manifests and the model deployed in any endpoints.
     -i, --isl LENGTH              Input sequence length (default: $ISL)
     -s, --std STDDEV              Input sequence standard deviation (default: $STD)
diff --git a/benchmarks/profiler/deploy/profile_sla_job.yaml b/benchmarks/profiler/deploy/profile_sla_job.yaml
index 14be68c7b2..f0d39f0bc3 100644
--- a/benchmarks/profiler/deploy/profile_sla_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_job.yaml
@@ -29,9 +29,9 @@ spec:
         command: ["python", "-m", "benchmarks.profiler.profile_sla"]
         args:
           - --config
-          - /workspace/configs/disagg.yaml
+          - /data/configs/disagg.yaml
           - --output-dir
-          - /workspace/profiling_results
+          - /data/profiling_results
           - --namespace
           - ${NAMESPACE}
           - --backend
@@ -50,15 +50,10 @@ spec:
           - "20"
         volumeMounts:
           - name: output-volume
-            mountPath: /workspace/profiling_results
-          - name: configs
-            mountPath: /workspace/configs
+            mountPath: /data
       restartPolicy: Never
       volumes:
         - name: output-volume
           persistentVolumeClaim:
             claimName: dynamo-pvc
-        - name: configs
-          persistentVolumeClaim:
-            claimName: dynamo-pvc
   backoffLimit: 0
diff --git a/benchmarks/profiler/utils/__init__.py b/benchmarks/profiler/utils/__init__.py
new file mode 100644
index 0000000000..1a8431c3e3
--- /dev/null
+++ b/benchmarks/profiler/utils/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
index 236757049a..c007f0a211 100755
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -54,17 +54,17 @@ def main() -> int:
         help="Input in format <label>=<manifest_path_or_endpoint>. Can be specified multiple times for comparisons.",
     )
     parser.add_argument("--namespace", required=True, help="Kubernetes namespace")
-    parser.add_argument("--isl", type=int, default=200, help="Input sequence length")
+    parser.add_argument("--isl", type=int, default=2000, help="Input sequence length")
     parser.add_argument(
         "--std",
         type=int,
         default=10,
         help="Input sequence standard deviation",
     )
-    parser.add_argument("--osl", type=int, default=200, help="Output sequence length")
+    parser.add_argument("--osl", type=int, default=256, help="Output sequence length")
     parser.add_argument(
         "--model",
-        default="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        default="Qwen/Qwen3-0.6B",
         help="Model name",
     )
     parser.add_argument(
diff --git a/components/backends/sglang/deploy/disagg_planner.yaml b/components/backends/sglang/deploy/disagg_planner.yaml
index 2e55dc37e9..dd32131363 100644
--- a/components/backends/sglang/deploy/disagg_planner.yaml
+++ b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -48,7 +48,7 @@ spec:
       pvc:
         create: false
         name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run
-        mountPoint: /workspace/profiling_results
+        mountPoint: /data/profiling_results
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
@@ -62,7 +62,7 @@ spec:
               --environment=kubernetes
               --backend=sglang
               --adjustment-interval=60
-              --profile-results-dir=/workspace/profiling_results
+              --profile-results-dir=/data/profiling_results
     Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
       dynamoNamespace: dynamo
       componentType: frontend
diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 50e2640ca2..6394021719 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -48,7 +48,7 @@ spec:
       pvc:
         create: false
         name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run
-        mountPoint: /workspace/profiling_results
+        mountPoint: /data/profiling_results
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
@@ -65,7 +65,7 @@ spec:
               --environment=kubernetes
               --backend=vllm
               --adjustment-interval=60
-              --profile-results-dir=/workspace/profiling_results
+              --profile-results-dir=/data/profiling_results
               --prometheus-port=9085
     Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
       dynamoNamespace: vllm-disagg-planner
diff --git a/deploy/utils/README.md b/deploy/utils/README.md
index 5f72eb6260..8c753e0138 100644
--- a/deploy/utils/README.md
+++ b/deploy/utils/README.md
@@ -88,23 +88,36 @@ These scripts interact with the Persistent Volume Claim (PVC) that stores config
 
 ```bash
 # The profiling job reads your DGD config from the PVC
-python3 deploy/utils/inject_manifest.py \
+# IMPORTANT: The --dest path inside the PVC must start with /data/ for security reasons
+python3 -m deploy.utils.inject_manifest \
   --namespace $NAMESPACE \
   --src ./my-disagg.yaml \
-  --dest /configs/disagg.yaml
+  --dest /data/configs/disagg.yaml
 ```
 
 **Download benchmark/profiling results:**
 
 ```bash
 # After benchmarking or profiling completes, download results
-python3 deploy/utils/download_pvc_results.py \
+python3 -m deploy.utils.download_pvc_results \
   --namespace $NAMESPACE \
   --output-dir ./pvc_files \
-  --folder /results \
+  --folder /data/results \
   --no-config   # optional: skip *.yaml/*.yml in the download
 ```
 
+#### Path Requirements
+
+**Important**: The PVC is mounted at `/data` in the access pod for security reasons. Every path inside the PVC (`--dest` for `inject_manifest`, `--folder` for `download_pvc_results`) must start with `/data/`.
+
+**Common path patterns:**
+- `/data/configs/` - Configuration files (DGD manifests)
+- `/data/results/` - Benchmark results
+- `/data/profiling_results/` - Profiling data
+- `/data/benchmarking/` - Benchmarking artifacts
+
+**User-friendly error messages**: If you forget the `/data/` prefix, the script will show a helpful error message with the correct path and example commands.
+
 #### Next Steps
 
 For complete benchmarking workflows:
diff --git a/deploy/utils/download_pvc_results.py b/deploy/utils/download_pvc_results.py
index aa266b6a22..7c9ef1ee5b 100755
--- a/deploy/utils/download_pvc_results.py
+++ b/deploy/utils/download_pvc_results.py
@@ -23,7 +23,7 @@
 
 Usage:
     python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> \
-        --folder </absolute/folder/in/pvc> [--no-config]
+        --folder /data/<folder/in/pvc> [--no-config]
 """
 
 import argparse
@@ -36,7 +36,7 @@
     from deploy.utils.kubernetes import (
         check_kubectl_access,
         cleanup_access_pod,
-        deploy_access_pod,
+        ensure_clean_access_pod,
         run_command,
     )
 except ModuleNotFoundError:
@@ -46,7 +46,7 @@
     from deploy.utils.kubernetes import (
         check_kubectl_access,
         cleanup_access_pod,
-        deploy_access_pod,
+        ensure_clean_access_pod,
         run_command,
     )
 
@@ -182,11 +182,22 @@ def main():
     parser.add_argument(
         "--folder",
         required=True,
-        help="Absolute folder path in the PVC to download, e.g. /profiling_results or /benchmarking_results",
+        help="Absolute folder path in the PVC to download, must start with /data/, e.g. /data/profiling_results or /data/benchmarking_results",
     )
 
     args = parser.parse_args()
 
+    # Validate folder path starts with /data/
+    if not args.folder.startswith("/data/"):
+        print("❌ Error: Folder path must start with '/data/'")
+        print(f"   Provided: {args.folder}")
+        print("   Quick Fix: Add '/data/' prefix to your path")
+        print("   Examples:")
+        print("     /profiling_results → /data/profiling_results")
+        print("     /benchmarking_results → /data/benchmarking_results")
+        print("     /configs → /data/configs")
+        sys.exit(1)
+
     print("📥 PVC Results Download")
     print("=" * 40)
 
@@ -194,7 +205,7 @@ def main():
     check_kubectl_access(args.namespace)
 
     # Deploy access pod
-    pod_name = deploy_access_pod(args.namespace)
+    pod_name = ensure_clean_access_pod(args.namespace)
     try:
         # List and download files
         files = list_pvc_contents(args.namespace, pod_name, args.folder, args.no_config)
diff --git a/deploy/utils/inject_manifest.py b/deploy/utils/inject_manifest.py
index a785377343..52973abc16 100755
--- a/deploy/utils/inject_manifest.py
+++ b/deploy/utils/inject_manifest.py
@@ -21,12 +21,15 @@
 Copies any Kubernetes manifest file into the PVC for later use by jobs.
 Both the source manifest path and destination path in the PVC are required.
 
+IMPORTANT: The PVC is mounted at /data in the access pod for security reasons.
+All destination paths must start with '/data/'.
+
 Usage:
     python3 inject_manifest.py --namespace <namespace> --src <local_manifest.yaml> --dest <absolute_path_in_pvc>
 
 Examples:
-    python3 inject_manifest.py --namespace <ns> --src ./my-disagg.yaml --dest /configs/disagg.yaml
-    python3 inject_manifest.py --namespace <ns> --src ./my-agg.yaml    --dest /configs/agg.yaml
+    python3 inject_manifest.py --namespace <ns> --src ./disagg.yaml --dest /data/configs/disagg.yaml
+    python3 inject_manifest.py --namespace <ns> --src ./my-data.yaml    --dest /data/custom/path/data.yaml
 """
 
 import argparse
@@ -37,7 +40,7 @@
     PVC_ACCESS_POD_NAME,
     check_kubectl_access,
     cleanup_access_pod,
-    deploy_access_pod,
+    ensure_clean_access_pod,
     run_command,
 )
 
@@ -100,16 +103,39 @@ def main():
     parser.add_argument(
         "--dest",
         required=True,
-        help="Absolute target path in PVC (e.g., /profiling_results/agg.yaml)",
+        help="Absolute target path in PVC (must start with /data/, e.g., /data/configs/agg.yaml)",
     )
 
     args = parser.parse_args()
 
-    # Validate target_path to prevent directory traversal
-    if not args.dest.startswith("/"):
-        print(
-            "ERROR: Target path must be an absolute path inside the PVC (start with '/')."
-        )
+    # Validate target_path to prevent directory traversal and ensure it's within PVC
+    if not args.dest.startswith("/data/"):
+        print("=" * 60)
+        print("❌ ERROR: Invalid target path")
+        print("=" * 60)
+        print("The PVC is mounted at /data in the access pod.")
+        print("All paths must start with '/data/' for security reasons.")
+        print("")
+        print("💡 QUICK FIX:")
+        if args.dest.startswith("/"):
+            # Suggest the fix
+            suggested_path = f"/data{args.dest}"
+            print(f"  Change: {args.dest}")
+            print(f"  To:     {suggested_path}")
+            print("")
+            print("📝 Example commands:")
+            print("  python3 -m deploy.utils.inject_manifest \\")
+            print(f"    --namespace {args.namespace} \\")
+            print(f"    --src {args.src} \\")
+            print(f"    --dest {suggested_path}")
+        else:
+            print(f"  Use: /data/{args.dest.lstrip('/')}")
+        print("")
+        print("🔍 Common patterns:")
+        print("  /configs/file.yaml     → /data/configs/file.yaml")
+        print("  /results/data.yaml     → /data/results/data.yaml")
+        print("  /profiling_results/... → /data/profiling_results/...")
+        print("=" * 60)
         sys.exit(1)
 
     if ".." in args.dest:
@@ -123,7 +149,7 @@ def main():
     check_kubectl_access(args.namespace)
 
     # Deploy access pod
-    deploy_access_pod(args.namespace)
+    ensure_clean_access_pod(args.namespace)
     try:
         # Copy manifest
         copy_manifest(args.namespace, args.src, args.dest)
diff --git a/deploy/utils/kubernetes.py b/deploy/utils/kubernetes.py
index b38b6c1b7b..f9b980c550 100644
--- a/deploy/utils/kubernetes.py
+++ b/deploy/utils/kubernetes.py
@@ -22,7 +22,7 @@
 
 
 def run_command(
-    cmd: List[str], capture_output: bool = True
+    cmd: List[str], capture_output: bool = True, exit_on_error: bool = True
 ) -> subprocess.CompletedProcess:
     """Run a command and handle errors."""
     try:
@@ -37,7 +37,10 @@ def run_command(
             print(f"STDOUT: {e.stdout}")
         if e.stderr:
             print(f"STDERR: {e.stderr}")
-        sys.exit(1)
+        if exit_on_error:
+            sys.exit(1)
+        else:
+            raise
 
 
 def check_kubectl_access(namespace: str) -> None:
@@ -47,6 +50,55 @@ def check_kubectl_access(namespace: str) -> None:
     print("✓ kubectl access confirmed")
 
 
+def ensure_clean_access_pod(namespace: str) -> str:
+    """Ensure a clean PVC access pod deployment by deleting any existing pod first."""
+
+    # Check if pod exists and delete it if it does
+    try:
+        result = subprocess.run(
+            [
+                "kubectl",
+                "get",
+                "pod",
+                PVC_ACCESS_POD_NAME,
+                "-n",
+                namespace,
+                "-o",
+                "jsonpath={.metadata.name}",
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.returncode == 0 and result.stdout.strip() == PVC_ACCESS_POD_NAME:
+            print(f"Found existing access pod '{PVC_ACCESS_POD_NAME}', deleting it...")
+            run_command(
+                [
+                    "kubectl",
+                    "delete",
+                    "pod",
+                    PVC_ACCESS_POD_NAME,
+                    "-n",
+                    namespace,
+                    "--ignore-not-found",
+                ],
+                capture_output=False,
+                exit_on_error=False,
+            )
+            print("✓ Existing access pod deleted")
+    except Exception:
+        pass  # Best-effort: ignore lookup/delete failures and let deploy_access_pod() proceed
+
+    try:
+        return deploy_access_pod(namespace)
+    except Exception as e:
+        print(f"Deployment failed: {e}")
+        print(
+            "Pod left running for debugging. Use 'kubectl delete pod pvc-access-pod -n <namespace>' to clean up manually."
+        )
+        raise
+
+
 def deploy_access_pod(namespace: str) -> str:
     """Deploy the PVC access pod and return pod name."""
 
@@ -67,25 +119,19 @@ def deploy_access_pod(namespace: str) -> str:
             text=True,
             check=False,
         )
-
         if result.returncode == 0 and result.stdout.strip() == "Running":
             print(f"✓ Access pod '{PVC_ACCESS_POD_NAME}' already running")
             return PVC_ACCESS_POD_NAME
     except Exception:
-        # Pod doesn't exist or isn't running
-        pass
+        pass  # Pod doesn't exist or isn't running
 
     print(f"Deploying access pod '{PVC_ACCESS_POD_NAME}' in namespace '{namespace}'...")
 
-    # Get the directory where this script is located
-    script_dir = Path(__file__).parent
-    pod_yaml_path = script_dir / "manifests" / "pvc-access-pod.yaml"
-
+    pod_yaml_path = Path(__file__).parent / "manifests" / "pvc-access-pod.yaml"
     if not pod_yaml_path.exists():
         print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
         sys.exit(1)
 
-    # Deploy the pod
     run_command(
         ["kubectl", "apply", "-f", str(pod_yaml_path), "-n", namespace],
         capture_output=False,
@@ -103,6 +149,7 @@ def deploy_access_pod(namespace: str) -> str:
             "--timeout=60s",
         ],
         capture_output=False,
+        exit_on_error=False,
     )
     print("✓ Access pod is ready")
     return PVC_ACCESS_POD_NAME
@@ -110,16 +157,20 @@ def deploy_access_pod(namespace: str) -> str:
 
 def cleanup_access_pod(namespace: str) -> None:
     print("Cleaning up access pod...")
-    run_command(
-        [
-            "kubectl",
-            "delete",
-            "pod",
-            PVC_ACCESS_POD_NAME,
-            "-n",
-            namespace,
-            "--ignore-not-found",
-        ],
-        capture_output=False,
-    )
-    print("✓ Access pod deleted")
+    try:
+        run_command(
+            [
+                "kubectl",
+                "delete",
+                "pod",
+                PVC_ACCESS_POD_NAME,
+                "-n",
+                namespace,
+                "--ignore-not-found",
+            ],
+            capture_output=False,
+            exit_on_error=False,
+        )
+        print("✓ Access pod deleted")
+    except Exception as e:
+        print(f"Warning: Failed to clean up access pod: {e}")
diff --git a/deploy/utils/manifests/pvc-access-pod.yaml b/deploy/utils/manifests/pvc-access-pod.yaml
index 0e4b5f75b1..986b8c8b24 100644
--- a/deploy/utils/manifests/pvc-access-pod.yaml
+++ b/deploy/utils/manifests/pvc-access-pod.yaml
@@ -26,7 +26,7 @@ spec:
         - ALL
     volumeMounts:
     - name: profiling-storage
-      mountPath: /profiling_results
+      mountPath: /data
     resources:
       requests:
         memory: "128Mi"
diff --git a/deploy/utils/setup_k8s_namespace.sh b/deploy/utils/setup_k8s_namespace.sh
index 469aee0287..5703fd96b3 100755
--- a/deploy/utils/setup_k8s_namespace.sh
+++ b/deploy/utils/setup_k8s_namespace.sh
@@ -60,7 +60,7 @@ Sets up Kubernetes namespace for Dynamo (one-time per namespace):
       * Installs/updates the operator Helm release using that image
       * If credentials (DOCKER_USERNAME/DOCKER_PASSWORD) are provided, creates/updates docker-imagepullsecret
       * If credentials are not provided, prompts interactively to create the pull secret
-  - Otherwise installs the operator using default image: nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0
+  - Otherwise installs the operator using default image: nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.1
 
 Environment variables:
   NAMESPACE         Target Kubernetes namespace (default: default)
@@ -157,7 +157,7 @@ if [[ -n "$DOCKER_SERVER" && -n "$IMAGE_TAG" ]]; then
   fi
 else
   # Use default published image when custom not provided
-  DEFAULT_OPERATOR_IMAGE="nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0"
+  DEFAULT_OPERATOR_IMAGE="nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.1"
   if ! command -v helm &>/dev/null; then warn "helm not found; skipping helm install"; else
     pushd "$REPO_ROOT/deploy/cloud/helm/platform" >/dev/null
     helm dep build
diff --git a/docs/benchmarks/benchmarking.md b/docs/benchmarks/benchmarking.md
index d0f314f4b7..e2f4624f39 100644
--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -33,7 +33,7 @@ The framework is a wrapper around `genai-perf` that:
 
 **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
 
-**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The actual model loaded is determined by your deployment manifests. Only one model can be benchmarked at a time across all inputs to ensure fair comparison. The default `--model` value in the benchmarking script is `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`, but it must match the model in the manifest(s) and the model deployed at the endpoint(s).
+**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The actual model loaded is determined by your deployment manifests. Only one model can be benchmarked at a time across all inputs to ensure fair comparison. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model in the manifest(s) and the model deployed at the endpoint(s).
 
 ## Prerequisites
 
@@ -103,7 +103,7 @@ REQUIRED:
 
 OPTIONS:
   -h, --help                    Show help message and examples
-  -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)
+  -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B)
                                 NOTE: This must match the model configured in your deployment manifests and endpoints
   -i, --isl LENGTH              Input sequence length (default: 2000)
   -s, --std STDDEV              Input sequence standard deviation (default: 10)
@@ -130,6 +130,23 @@ The script automatically:
 4. **Generates** comparison plots using your custom labels in `./benchmarks/results/plots/`
 5. **Cleans up** deployments when complete
 
+### GPU Resource Usage
+
+**Important**: Models are deployed and benchmarked **sequentially**, not in parallel. This means:
+
+- **One deployment at a time**: Each DynamoGraphDeployment is deployed, benchmarked, and cleaned up before the next one starts
+- **Full GPU access**: Each deployment gets exclusive access to all available GPUs during its benchmark run
+- **Resource isolation**: No resource conflicts between different deployment configurations
+- **Fair comparison**: Each configuration is tested under identical resource conditions
+
+This sequential approach ensures:
+- **Accurate performance measurements** without interference between deployments
+- **Consistent resource allocation** for fair comparison across different configurations
+- **Simplified resource management** without complex GPU scheduling
+- **Reliable cleanup** between benchmark runs
+
+If you need to benchmark multiple configurations simultaneously, consider using separate Kubernetes namespaces or running benchmarks on different clusters.
+
 ### Results Clearing Behavior
 
 **Important**: The benchmark script automatically clears the output directory before each run to ensure clean, reproducible results. This means:
@@ -155,7 +172,7 @@ For direct control over the benchmark workflow:
 ```bash
 # Endpoint benchmarking
 python3 -u -m benchmarks.utils.benchmark \
-   --endpoint "http://your-endpoint:8000" \
+   --input trtllm=http://your-endpoint:8000 \
    --namespace $NAMESPACE \
    --isl 2000 \
    --std 10 \
diff --git a/docs/benchmarks/pre_deployment_profiling.md b/docs/benchmarks/pre_deployment_profiling.md
index 5385451c47..20e07eb5f1 100644
--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -24,6 +24,21 @@ We assume there is no piggy-backed prefill requests in the decode engine. Even i
 
 The script will first detect the number of available GPUs on the current nodes (multi-node engine not supported yet). Then, it will profile the prefill and decode performance with different TP sizes. For prefill, since there is no in-flight batching (assume isl is long enough to saturate the GPU), the script directly measures the TTFT for a request with given isl without kv-reusing. For decode, since the ITL (or iteration time) is relevant with how many requests are in-flight, the script will measure the ITL under different number of in-flight requests. The range of the number of in-flight requests is from 1 to the maximum number of requests that the kv cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script will enable kv-reuse and warm up the engine by issuing the same prompts before measuring the ITL. Since the kv cache is sufficient for all the requests, it can hold the kv cache of the pre-computed prompts and skip the prefill phase when measuring the ITL.
 
+### GPU Resource Usage
+
+**Important**: Profiling tests different tensor parallelism (TP) configurations **sequentially**, not in parallel. This means:
+
+- **One TP configuration at a time**: Each tensor parallelism size (TP1, TP2, TP4, TP8, etc.) is tested individually
+- **Full GPU access**: Each TP configuration gets exclusive access to all available GPUs during its profiling run
+- **Resource isolation**: No interference between different TP configurations during testing
+- **Accurate measurements**: Each configuration is profiled under identical resource conditions
+
+This sequential approach ensures:
+- **Precise performance profiling** without resource conflicts
+- **Consistent GPU allocation** for fair comparison across TP sizes
+- **Reliable cleanup** between different TP configuration tests
+- **Accurate SLA compliance verification** for each configuration
+
 After the profiling finishes, two plots will be generated in the `output-dir`. For example, here are the profiling results for `examples/llm/configs/disagg.yaml`:
 
 ![Prefill Performance](../../docs/images/h100_prefill_performance.png)
@@ -90,7 +105,7 @@ Use the injector utility to place your DGD manifest into the PVC. The profiling
 python3 deploy/utils/inject_manifest.py \
   --namespace $NAMESPACE \
   --src components/backends/vllm/deploy/disagg.yaml \
-  --dest /configs/disagg.yaml
+  --dest /data/configs/disagg.yaml
 
 # Set the docker image for the profiling job; any docker image that contains your script.
 export DOCKER_IMAGE=nvcr.io/nvidia/dynamo:latest-vllm
@@ -112,15 +127,17 @@ Use the default pre-built image and inject custom configurations via PVC:
 2. **Inject your custom disagg configuration:**
    ```bash
    # Use default disagg.yaml config
-   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src components/backends/vllm/deploy/disagg.yaml --dest /configs/disagg.yaml
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src components/backends/vllm/deploy/disagg.yaml --dest /data/configs/disagg.yaml
 
    # Or use a custom disagg config file
-   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /configs/disagg.yaml
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /data/configs/disagg.yaml
 
    # Or specify a custom target path in the PVC
-   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /profiling_results/my-disagg.yaml
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /data/profiling_results/my-disagg.yaml
    ```
 
+   > **Note**: The `--dest` path must start with `/data/` (the PVC mount point in the access pod) for security reasons. If you forget this prefix, the script will show a helpful error message with the correct path.
+
 3. **Set the config path for the profiling job:**
    ```bash
    export DGD_CONFIG_FILE=/workspace/profiling_results/disagg.yaml # or your custom path
@@ -176,10 +193,10 @@ To download the results:
 
 ```bash
 # Download to directory
-python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results
+python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results
 
 # Download without any of the auto-created config.yaml files used in profiling
-python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results --no-config
+python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results --no-config
 ```
 
 The script will:
@@ -191,7 +208,7 @@ The script will:
 
 The profiling results directory contains the following structure:
 ```
-/workspace/profiling_results/
+/data/profiling_results/
 ├── prefill_performance.png                    # Main prefill performance plot
 ├── decode_performance.png                     # Main decode performance plot
 ├── prefill_tp1/                               # Individual TP profiling directories