diff --git a/benchmarks/profiler/deploy/pvc-access-pod.yaml b/benchmarks/profiler/deploy/pvc-access-pod.yaml new file mode 100644 index 00000000000..5b2062d7926 --- /dev/null +++ b/benchmarks/profiler/deploy/pvc-access-pod.yaml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: pvc-access-pod + labels: + app: pvc-access +spec: + activeDeadlineSeconds: 300 # Kubernetes stops the pod and marks it Failed 5 minutes after it starts; the Pod object itself is not deleted + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + containers: + - name: ubuntu + image: ubuntu:22.04 + command: ["/bin/bash"] + args: ["-c", "sleep 290"] # Sleep for slightly less than deadline - tools can be installed via kubectl exec if needed + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false + capabilities: + drop: + - ALL + volumeMounts: + - name: profiling-storage + mountPath: /profiling_results + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: profiling-storage + persistentVolumeClaim: + claimName: profiling-pvc + restartPolicy: Never diff --git a/benchmarks/profiler/download_pvc_results.py b/benchmarks/profiler/download_pvc_results.py new file mode 100755 index 00000000000..6adf2f836b8 --- /dev/null +++ b/benchmarks/profiler/download_pvc_results.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Mount point of the profiling PVC inside the access pod (see pvc-access-pod.yaml).
_PVC_MOUNT_PREFIX = "/profiling_results/"


def _relative_result_path(file_path: str) -> str:
    """Return *file_path* with the leading PVC mount prefix removed.

    Only a leading prefix is stripped; a later occurrence of the same text
    inside the path is left untouched.
    """
    return file_path.removeprefix(_PVC_MOUNT_PREFIX)


def _has_unsafe_components(rel_path: str) -> bool:
    """Return True if *rel_path* is absolute or contains a ``..`` component."""
    return rel_path.startswith("/") or ".." in rel_path.split("/")


def _kubectl_cp_from_pod(
    namespace: str, pod_name: str, remote_path: str, local_file: Path
) -> None:
    """Copy one file out of the pod with ``kubectl cp``.

    Raises:
        subprocess.CalledProcessError: if ``kubectl cp`` exits non-zero. The
            caller decides whether that is fatal; ``run_command`` is not used
            here because it terminates the whole process on failure.
    """
    subprocess.run(
        [
            "kubectl",
            "cp",
            f"{namespace}/{pod_name}:{remote_path}",
            str(local_file),
        ],
        capture_output=True,
        text=True,
        check=True,
    )


def list_pvc_contents(
    namespace: str, pod_name: str, skip_config: bool = False
) -> List[str]:
    """List the result files stored in the PVC.

    Runs ``find`` inside the access pod and returns the absolute in-pod paths
    of every regular file matching ``*.png`` or ``*.npz`` and, unless
    *skip_config* is true, ``*.yaml`` / ``*.yml``.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod that mounts the PVC.
        skip_config: When true, YAML configuration files are not listed.

    Returns:
        The matching paths, one per entry, with blank lines removed.

    Note:
        ``run_command`` prints the error and exits the process with status 1
        if the listing fails, so this function only returns on success.
    """
    print("Scanning PVC contents...")

    patterns = ["*.png", "*.npz"]
    if not skip_config:
        patterns.extend(["*.yaml", "*.yml"])

    # The -name tests are grouped in parentheses so that "-type f" applies to
    # all of them. Without grouping, find's implicit AND binds tighter than
    # -o and only the first pattern would be restricted to regular files.
    name_tests: List[str] = []
    for pattern in patterns:
        if name_tests:
            name_tests.append("-o")
        name_tests.extend(["-name", pattern])

    find_cmd = [
        "kubectl",
        "exec",
        pod_name,
        "-n",
        namespace,
        "--",
        "find",
        "/profiling_results",
        "-type",
        "f",
        "(",
        *name_tests,
        ")",
    ]

    result = run_command(find_cmd, capture_output=True)

    files = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    config_note = " (excluding config files)" if skip_config else ""
    print(f"Found {len(files)} relevant files to download{config_note}")
    return files


def download_files(
    namespace: str, pod_name: str, files: List[str], output_dir: Path
) -> None:
    """Download *files* from the PVC into *output_dir*, keeping the layout.

    Each path is made relative to the PVC mount point and recreated under
    *output_dir*. Paths that would escape *output_dir* are skipped. A failed
    copy is reported and counted, and the remaining files are still attempted.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod that mounts the PVC.
        files: Absolute in-pod paths, as returned by ``list_pvc_contents``.
        output_dir: Local destination directory; created if missing.
    """
    if not files:
        print("No files to download")
        return

    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {len(files)} files to {output_dir}")

    # Resolved once; every destination is checked against it below.
    output_root = output_dir.resolve()
    downloaded = 0
    failed = 0

    for file_path in files:
        rel_path = _relative_result_path(file_path)

        if _has_unsafe_components(rel_path):
            print(f" WARNING: Skipping potentially unsafe path: {file_path}")
            failed += 1
            continue

        local_file = output_dir / rel_path

        # Second line of defence (e.g. symlinks inside output_dir).
        if not local_file.resolve().is_relative_to(output_root):
            print(f" WARNING: Skipping file outside output directory: {file_path}")
            failed += 1
            continue

        local_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            _kubectl_cp_from_pod(namespace, pod_name, file_path, local_file)
        except subprocess.CalledProcessError as e:
            detail = (e.stderr or "").strip() or e
            print(f" WARNING: Failed to download {file_path}: {detail}")
            failed += 1
            continue

        downloaded += 1
        if downloaded % 5 == 0:  # Progress update every 5 files
            print(f" Downloaded {downloaded}/{len(files)} files...")

    print(f"✓ Download completed: {downloaded} successful, {failed} failed")


def download_summary_files(
    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
) -> None:
    """Download the well-known top-level summary files, if they exist.

    Tries ``prefill_performance.png`` and ``decode_performance.png`` and,
    unless *skip_config* is true, ``disagg.yaml``. Files absent from the PVC
    are skipped silently; a failed copy is reported and the rest continue.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod that mounts the PVC.
        output_dir: Local destination directory.
        skip_config: When true, ``disagg.yaml`` is not downloaded.
    """
    summary_files = [
        "/profiling_results/prefill_performance.png",
        "/profiling_results/decode_performance.png",
    ]
    if not skip_config:
        # In case it was injected
        summary_files.append("/profiling_results/disagg.yaml")

    print("Downloading summary files...")

    for file_path in summary_files:
        file_name = file_path.split("/")[-1]
        try:
            # check=False: a non-zero exit just means the file is not there.
            probe = subprocess.run(
                [
                    "kubectl",
                    "exec",
                    pod_name,
                    "-n",
                    namespace,
                    "--",
                    "test",
                    "-f",
                    file_path,
                ],
                capture_output=True,
                text=True,
                check=False,
            )
            if probe.returncode != 0:
                continue

            rel_path = _relative_result_path(file_path)

            if _has_unsafe_components(rel_path):
                print(f" ⚠️ Skipped {file_name}: potentially unsafe path")
                continue

            local_file = output_dir / rel_path

            if not local_file.resolve().is_relative_to(output_dir.resolve()):
                print(f" ⚠️ Skipped {file_name}: outside output directory")
                continue

            local_file.parent.mkdir(parents=True, exist_ok=True)
            _kubectl_cp_from_pod(namespace, pod_name, file_path, local_file)
            print(f" ✓ {rel_path}")

        except (subprocess.CalledProcessError, OSError) as e:
            # Best effort: report and move on to the next summary file.
            print(f" ⚠️ Skipped {file_name}: {e}")


def cleanup_access_pod(namespace: str, pod_name: str) -> None:
    """Print cleanup guidance for the access pod; nothing is deleted here.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod.
    """
    # NOTE(review): the manifest sets activeDeadlineSeconds: 300, which stops
    # the pod rather than deleting the Pod object - confirm the wording below.
    print(f"ℹ️ Access pod '{pod_name}' will auto-delete in 5 minutes")
    print(f" To delete immediately: kubectl delete pod {pod_name} -n {namespace}")


def generate_readme(output_dir: Path, file_count: int) -> None:
    """Write ``README.md`` into *output_dir* describing the downloaded files.

    The directory is created if it does not exist yet, which is the case when
    no files were found and ``download_files`` returned early.

    Args:
        output_dir: Directory that receives ``README.md``.
        file_count: Number of files reported in the README summary line.
    """
    readme_content = f"""# Profiling Results

Downloaded {file_count} files from profiling PVC.

## File Structure

### Performance Plots
- `prefill_performance.png` - Main prefill performance across TP sizes
- `decode_performance.png` - Main decode performance across TP sizes

### Interpolation Data
- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
- `selected_decode_interpolation/raw_data.npz` - Decode performance data
- `selected_decode_interpolation/*.png` - Decode interpolation plots

### Configuration Files
- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling

### Individual TP Results
- `prefill_tp*/` - Individual tensor parallelism profiling results
- `decode_tp*/` - Individual tensor parallelism profiling results

## Loading Data

To load the .npz data files in Python:

```python
import numpy as np

# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))

# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```

Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}
"""

    output_dir.mkdir(parents=True, exist_ok=True)
    readme_path = output_dir / "README.md"
    # Explicit encoding so the result does not depend on the locale.
    readme_path.write_text(readme_content, encoding="utf-8")

    print("📝 Generated README.md with download summary")


def main():
    """Parse the command line and download the profiling results."""
    parser = argparse.ArgumentParser(
        description="Download profiling results from PVC to local directory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "--namespace",
        "-n",
        required=True,
        help="Kubernetes namespace containing the profiling PVC",
    )

    parser.add_argument(
        "--output-dir",
        "-o",
        type=Path,
        required=True,
        help="Local directory to download results to",
    )

    parser.add_argument(
        "--no-config",
        action="store_true",
        help="Skip downloading configuration files (*.yaml, *.yml)",
    )

    args = parser.parse_args()

    print("📥 PVC Results Download")
    print("=" * 40)

    # Validate inputs
    check_kubectl_access(args.namespace)

    # Deploy access pod
    pod_name = deploy_access_pod(args.namespace)

    # List and download files
    files = list_pvc_contents(args.namespace, pod_name, args.no_config)
    download_files(args.namespace, pod_name, files, args.output_dir)

    # Download additional summary files
    download_summary_files(args.namespace, pod_name, args.output_dir, args.no_config)

    # Generate README
    generate_readme(args.output_dir, len(files))

    # Cleanup info
    cleanup_access_pod(args.namespace, pod_name)

    print("\n✅ Download completed!")
    print(f"📁 Results available at: {args.output_dir.absolute()}")
    print("📄 See README.md for file descriptions")


if __name__ == "__main__":
    main()
def copy_disagg_config(
    namespace: str,
    disagg_config_path: Path,
    target_path: str,
    pod_name: str = "pvc-access-pod",
) -> None:
    """Copy the disagg config file into the PVC via the access pod.

    The file is copied with ``kubectl cp`` and then listed with ``ls -la``
    inside the pod to confirm that it arrived.

    Args:
        namespace: Kubernetes namespace of the access pod.
        disagg_config_path: Local path of the config file to upload.
        target_path: Destination path inside the pod.
        pod_name: Name of the access pod; defaults to the name used by the
            access-pod manifest.

    Exits the process with status 1 if *disagg_config_path* is not an
    existing regular file. ``run_command`` likewise exits on a failed
    ``kubectl`` call.
    """
    # is_file() rather than exists(): a directory must not be accepted here.
    if not disagg_config_path.is_file():
        print(f"ERROR: Disagg config file not found: {disagg_config_path}")
        sys.exit(1)

    print(f"Copying {disagg_config_path} to {target_path} in PVC...")

    # Copy file to pod
    run_command(
        [
            "kubectl",
            "cp",
            str(disagg_config_path),
            f"{namespace}/{pod_name}:{target_path}",
        ],
        capture_output=False,
    )

    # Verify the file was copied
    result = run_command(
        ["kubectl", "exec", pod_name, "-n", namespace, "--", "ls", "-la", target_path],
        capture_output=True,
    )

    print("✓ Disagg config successfully copied to PVC")
    print(f"File details: {result.stdout.strip()}")


def cleanup_access_pod(
    namespace: str, keep_pod: bool = True, pod_name: str = "pvc-access-pod"
) -> None:
    """Delete the access pod, or print how to reach and delete it.

    Args:
        namespace: Kubernetes namespace of the access pod.
        keep_pod: When true (default) the pod is left alone and usage hints
            are printed; when false the pod is deleted.
        pod_name: Name of the access pod.
    """
    if keep_pod:
        print(f"ℹ️ Access pod '{pod_name}' left running for future use")
        print(f" To access PVC: kubectl exec -it {pod_name} -n {namespace} -- /bin/bash")
        print(f" To delete pod: kubectl delete pod {pod_name} -n {namespace}")
        return

    print("Cleaning up access pod...")
    run_command(
        [
            "kubectl",
            "delete",
            "pod",
            pod_name,
            "-n",
            namespace,
            "--ignore-not-found",
        ],
        capture_output=False,
    )
    print("✓ Access pod deleted")


def main():
    """Parse the command line and inject the disagg config into the PVC."""
    parser = argparse.ArgumentParser(
        description="Inject disagg config into profiling PVC",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "--namespace",
        "-n",
        required=True,
        help="Kubernetes namespace containing the profiling PVC",
    )

    parser.add_argument(
        "--disagg-config",
        type=Path,
        default=Path("components/backends/vllm/deploy/disagg.yaml"),
        help="Path to disagg config file (default: components/backends/vllm/deploy/disagg.yaml)",
    )

    parser.add_argument(
        "--target-path",
        default="/profiling_results/disagg.yaml",
        help="Target path in PVC (default: /profiling_results/disagg.yaml)",
    )

    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Delete the access pod after copying (default: keep running)",
    )

    args = parser.parse_args()

    # Validate target_path to prevent directory traversal
    if not args.target_path.startswith("/profiling_results/"):
        print("ERROR: Target path must be within /profiling_results/")
        sys.exit(1)

    if ".." in args.target_path:
        print("ERROR: Target path cannot contain '..'")
        sys.exit(1)

    # Check the local file before touching the cluster, so a typo in
    # --disagg-config does not leave a freshly deployed pod behind.
    if not args.disagg_config.is_file():
        print(f"ERROR: Disagg config file not found: {args.disagg_config}")
        sys.exit(1)

    print("🚀 Disagg Config Injection")
    print("=" * 40)

    # Validate inputs
    check_kubectl_access(args.namespace)

    # Deploy access pod and use the name it reports for every later step.
    pod_name = deploy_access_pod(args.namespace)

    # Copy disagg config
    copy_disagg_config(
        args.namespace, args.disagg_config, args.target_path, pod_name=pod_name
    )

    # Cleanup
    cleanup_access_pod(args.namespace, keep_pod=not args.cleanup, pod_name=pod_name)

    print("\n✅ Disagg config injection completed!")
    print(f"📁 Config available at: {args.target_path}")
    print(f"🔧 Set DGD_CONFIG_FILE={args.target_path} in your profiler job")


if __name__ == "__main__":
    main()
def run_command(
    cmd: List[str], capture_output: bool = True
) -> subprocess.CompletedProcess:
    """Run *cmd* and return its result, exiting the process on failure.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` without a
            shell.
        capture_output: When true, stdout and stderr are captured as text and
            available on the returned object; when false they go to the
            terminal.

    Returns:
        The ``CompletedProcess`` of a command that exited with status 0.

    On a non-zero exit status, or if the executable cannot be started, an
    error is printed and the process exits with status 1.
    """
    try:
        return subprocess.run(
            cmd, capture_output=capture_output, text=True, check=True
        )
    except OSError as e:
        # E.g. kubectl is not installed or not executable.
        print(f"ERROR: Could not run command: {' '.join(cmd)}")
        print(f"Reason: {e}")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed: {' '.join(cmd)}")
        print(f"Exit code: {e.returncode}")
        if e.stdout:
            print(f"STDOUT: {e.stdout}")
        if e.stderr:
            print(f"STDERR: {e.stderr}")
        sys.exit(1)


def check_kubectl_access(namespace: str) -> None:
    """Check that kubectl can list pods in *namespace*.

    Exits the process (via ``run_command``) if the check fails.
    """
    print(f"Checking kubectl access to namespace '{namespace}'...")
    run_command(["kubectl", "get", "pods", "-n", namespace], capture_output=True)
    print("✓ kubectl access confirmed")


def _get_pod_phase(namespace: str, pod_name: str) -> str:
    """Return the pod's ``.status.phase``, or ``""`` if it cannot be read.

    An empty string covers both "pod does not exist" (kubectl exits non-zero)
    and "kubectl could not be started".
    """
    try:
        result = subprocess.run(
            [
                "kubectl",
                "get",
                "pod",
                pod_name,
                "-n",
                namespace,
                "-o",
                "jsonpath={.status.phase}",
            ],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        return ""

    if result.returncode != 0:
        return ""
    return result.stdout.strip()


def _find_pod_manifest() -> Path:
    """Locate ``deploy/pvc-access-pod.yaml`` relative to this module.

    The manifest lives in the ``deploy`` directory next to the ``utils``
    package, i.e. one level above this file's directory. A ``deploy``
    directory directly beside this file is accepted as a fallback. If neither
    exists, the primary location is returned so the caller can report it.
    """
    module_dir = Path(__file__).resolve().parent
    candidates = [
        module_dir.parent / "deploy" / "pvc-access-pod.yaml",
        module_dir / "deploy" / "pvc-access-pod.yaml",
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return candidates[0]


def deploy_access_pod(namespace: str) -> str:
    """Ensure the PVC access pod is running and return its name.

    If the pod is already in phase ``Running`` it is reused. A pod left in a
    terminal phase (``Succeeded`` or ``Failed``, e.g. after its active
    deadline expired) is deleted first, because ``kubectl apply`` does not
    restart a finished pod. The manifest is then applied and the pod is
    polled once per second for up to 60 seconds.

    Args:
        namespace: Kubernetes namespace to deploy into.

    Returns:
        The name of the running access pod.

    Exits the process with status 1 if the manifest is missing, a kubectl
    command fails, or the pod does not reach ``Running`` in time.
    """
    pod_name = "pvc-access-pod"

    phase = _get_pod_phase(namespace, pod_name)
    if phase == "Running":
        print(f"✓ Access pod '{pod_name}' already running")
        return pod_name

    if phase in ("Succeeded", "Failed"):
        print(f"Removing finished access pod '{pod_name}' (phase: {phase})...")
        run_command(
            [
                "kubectl",
                "delete",
                "pod",
                pod_name,
                "-n",
                namespace,
                "--ignore-not-found",
            ],
            capture_output=False,
        )

    print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")

    pod_yaml_path = _find_pod_manifest()
    if not pod_yaml_path.exists():
        print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
        sys.exit(1)

    # Deploy the pod
    run_command(
        ["kubectl", "apply", "-f", str(pod_yaml_path), "-n", namespace],
        capture_output=False,
    )

    print("Waiting for pod to be ready...")

    # Wait for pod to be ready (up to 60 seconds)
    for i in range(60):
        if _get_pod_phase(namespace, pod_name) == "Running":
            print("✓ Access pod is ready")
            return pod_name

        time.sleep(1)
        if i % 10 == 0:
            print(f" Still waiting... ({i+1}s)")

    print("ERROR: Access pod failed to become ready within 60 seconds")
    sys.exit(1)
**Set the config path for the profiling job:** + ```bash + export DGD_CONFIG_FILE=/profiling_results/disagg.yaml # or your custom path + ``` + +This approach allows you to: +- Customize DGD configurations without rebuilding container images +- Test different model configurations easily +- Version control your DGD configs alongside your code + +> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues. + +> **Note**: The default location in the PVC is `/profiling_results/disagg.yaml`. If you don't inject a config, the profiler will fall back to the built-in config at `/workspace/components/backends/vllm/deploy/disagg.yaml`. + +**Option B: Build custom image (only if you need code changes)** + +Only needed if you require custom code modifications beyond configuration changes: ```bash # in the project's root folder ./container/build.sh --framework VLLM # Tag and push to your container registry -export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamoimage -# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE -# Modify this yaml to profile different models -export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file +export DOCKER_IMAGE= +export DGD_CONFIG_FILE= # path to your disagg.yaml file within the DOCKER_IMAGE ``` -Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed. - **Step 2: Set SLA target** Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. @@ -149,20 +185,20 @@ After the profiling job completes successfully, the results are stored in the pe The profiling results are stored in a PVC named `profiling-pvc`. To access the results: -1. **Create a temporary pod to access the PVC:** +1. 
**Deploy the PVC access pod (if not already running):** ```bash - kubectl run temp-access --image=alpine:latest --restart=Never \ - --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \ - -n $NAMESPACE + kubectl apply -f benchmarks/profiler/deploy/pvc-access-pod.yaml -n $NAMESPACE ``` -2. **Inside the temporary pod, navigate to the results directory:** +2. **Access the PVC through the pod:** ```bash - kubectl exec -it temp-access -n $NAMESPACE -- sh - cd /workspace/profiling_results + kubectl exec -it pvc-access-pod -n $NAMESPACE -- /bin/bash + cd /profiling_results ls -la ``` +> **Note**: The same `pvc-access-pod` is used for both injecting disagg configs and accessing results. If you used the `inject_disagg_config.py` script earlier, the pod may already be running. The pod is stopped 5 minutes after it starts (`activeDeadlineSeconds: 300`), whether or not it is in use. 
+ #### File Structure The profiling results directory contains the following structure: @@ -185,7 +221,33 @@ The profiling results directory contains the following structure: #### Downloading Results Locally -To download the profiling results to your local machine: +You can download the profiling results using the automated download script or manually: + +**Option 1: Automated Download (Recommended)** + +Use the provided download script to automatically fetch all relevant files: + +```bash +# Download to ./results directory +python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results + +# Download to specific directory +python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir /path/to/my/results + +# Download without any of the auto-created config.yaml files used in profiling +python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --no-config +``` + +The script will: +- Deploy a temporary access pod (stopped automatically 5 minutes after it starts) +- Scan for relevant files (*.png, *.npz and, unless `--no-config` is given, *.yaml / *.yml) +- Download all files maintaining directory structure +- Generate a README.md with file descriptions +- Print the command for deleting the access pod (the script does not delete it itself) + +**Option 2: Manual Download** + +To download the profiling results manually: 1. 
**Download performance plots and data files:** ```bash @@ -193,21 +255,21 @@ To download the profiling results to your local machine: mkdir -p ./profiling_results # Copy main performance plots - kubectl cp temp-access:/workspace/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE - kubectl cp temp-access:/workspace/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE + kubectl cp pvc-access-pod:/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE + kubectl cp pvc-access-pod:/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE # Copy interpolation directories (includes additional plots and data) - kubectl cp temp-access:/workspace/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r - kubectl cp temp-access:/workspace/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r + kubectl cp pvc-access-pod:/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r + kubectl cp pvc-access-pod:/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r ``` 2. **Alternative: Tar and download entire results directory:** ```bash - # Inside the temporary pod, create a tar archive - tar -czf /workspace/profiling_results/profiling_results.tar.gz -C /workspace/profiling_results . + # Inside the access pod, create a tar archive + tar -czf /profiling_results/profiling_results.tar.gz -C /profiling_results . 
# Download the archive to your local machine - kubectl cp temp-access:/workspace/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE + kubectl cp pvc-access-pod:/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE # Extract locally tar -xzf profiling_results.tar.gz -C ./profiling_results/ @@ -244,15 +306,18 @@ print("Decode data keys:", list(decode_data.keys())) #### Cleaning Up -Once you've downloaded your results, clean up the temporary pod: +The access pod is stopped automatically 5 minutes after it starts, but the stopped pod remains listed until you delete it: + ```bash -# Exit the temporary pod (if still inside) +# Exit the access pod (if still inside) exit -# The pod should auto-delete due to --rm flag, but if needed: -kubectl delete pod temp-access -n $NAMESPACE +# Delete the access pod (it stops on its own after 5 minutes, but is not removed) +kubectl delete pod pvc-access-pod -n $NAMESPACE ``` +> **Note**: The access pod has `activeDeadlineSeconds: 300`, so Kubernetes stops it 5 minutes after it starts to prevent resource waste. + ### Troubleshooting #### Image Pull Authentication Errors