diff --git a/benchmarks/incluster/README.md b/benchmarks/incluster/README.md
new file mode 120000
index 00000000000..2509b53553b
--- /dev/null
+++ b/benchmarks/incluster/README.md
@@ -0,0 +1 @@
+../../docs/benchmarks/benchmarking.md#server-side-benchmarking-in-cluster
\ No newline at end of file
diff --git a/benchmarks/incluster/benchmark_job.yaml b/benchmarks/incluster/benchmark_job.yaml
new file mode 100644
index 00000000000..3bcab9f85da
--- /dev/null
+++ b/benchmarks/incluster/benchmark_job.yaml
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: dynamo-benchmark
+spec:
+  template:
+    spec:
+      serviceAccountName: dynamo-sa
+      imagePullSecrets:
+      - name: docker-imagepullsecret
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        fsGroup: 1000
+      containers:
+      - name: benchmark-runner
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          readOnlyRootFilesystem: true
+          runAsNonRoot: true
+        resources:
+          requests:
+            cpu: "4"
+            memory: "8Gi"
+          limits:
+            cpu: "8"
+            memory: "16Gi"
+        env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+        command: ["python3", "-m", "benchmarks.utils.benchmark"]
+        args:
+          - --model
+          - "Qwen/Qwen3-0.6B"
+          - --isl
+          - "2000"
+          - --std
+          - "10"
+          - --osl
+          - "256"
+          - --output-dir
+          - /data/results
+          - --input
+          - "qwen-vllm-agg=vllm-agg-frontend:8000"
+          # add more copies of lines 58-59 for each additional service if you want to benchmark multiple services
+          # - --input
+          # - "name=service-url:port"
+        volumeMounts:
+          - name: data-volume
+            mountPath: /data
+      restartPolicy: Never
+      volumes:
+        - name: data-volume
+          persistentVolumeClaim:
+            claimName: dynamo-pvc
+  backoffLimit: 0
+  ttlSecondsAfterFinished: 3600  # Clean up job after 1 hour
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
index 944c2d42155..840660c5e51 100755
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -7,53 +7,59 @@
 import re
 import sys
 from typing import Dict, Tuple
+from urllib.parse import urlsplit
 
-from benchmarks.utils.workflow import run_benchmark_workflow
+from benchmarks.utils.workflow import has_http_scheme, run_benchmark_workflow
+from deploy.utils.kubernetes import is_running_in_cluster
 
 
 def validate_inputs(inputs: Dict[str, str]) -> None:
-    """Validate that all inputs are HTTP endpoints"""
+    """Validate that all inputs are HTTP endpoints or internal service URLs when running in cluster"""
     for label, value in inputs.items():
-        if not value.lower().startswith(("http://", "https://")):
-            raise ValueError(
-                f"Input '{label}' must be an HTTP endpoint (starting with http:// or https://). Got: {value}"
-            )
+        v = value.strip()
+        if is_running_in_cluster():
+            # Allow HTTP(S) or internal service URLs like host[:port][/path]
+            if has_http_scheme(v):
+                pass
+            else:
+                parts = urlsplit(f"//{v}")
+                host_ok = bool(parts.hostname)
+                port_ok = parts.port is None or (1 <= parts.port <= 65535)
+                if not (host_ok and port_ok):
+                    raise ValueError(
+                        f"Input '{label}' must be HTTP(S) or internal service URL. Got: {value}"
+                    )
+        else:
+            if not has_http_scheme(v):
+                raise ValueError(f"Input '{label}' must be HTTP endpoint. Got: {value}")
 
         # Validate reserved labels
         if label.lower() == "plots":
-            raise ValueError(
-                "Label 'plots' is reserved and cannot be used. Please choose a different label."
-            )
+            raise ValueError("Label 'plots' is reserved")
 
 
 def parse_input(input_str: str) -> Tuple[str, str]:
     """Parse input string in format key=value with additional validation"""
     if "=" not in input_str:
-        raise ValueError(
-            f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
-        )
+        raise ValueError(f"Invalid input format: {input_str}")
 
     parts = input_str.split("=", 1)  # Split on first '=' only
     if len(parts) != 2:
-        raise ValueError(
-            f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
-        )
+        raise ValueError(f"Invalid input format: {input_str}")
 
     label, value = parts
 
     if not label.strip():
-        raise ValueError("Label cannot be empty")
+        raise ValueError("Empty label")
     if not value.strip():
-        raise ValueError("Value cannot be empty")
+        raise ValueError("Empty value")
 
     label = label.strip()
     value = value.strip()
 
     # Validate label characters
     if not re.match(r"^[a-zA-Z0-9_-]+$", label):
-        raise ValueError(
-            f"Label must contain only letters, numbers, hyphens, and underscores. Invalid label: {label}"
-        )
+        raise ValueError(f"Invalid label: {label}")
 
     return label, value
 
@@ -114,7 +120,7 @@ def main() -> int:
             )
             print()
 
-        # Validate that all inputs are HTTP endpoints
+        # Validate that inputs are HTTP endpoints or in-cluster service URLs
         validate_inputs(parsed_inputs)
 
     except ValueError as e:
diff --git a/benchmarks/utils/workflow.py b/benchmarks/utils/workflow.py
index 69020102719..83eb3a6e353 100644
--- a/benchmarks/utils/workflow.py
+++ b/benchmarks/utils/workflow.py
@@ -6,6 +6,21 @@
 
 from benchmarks.utils.genai import run_concurrency_sweep
 from benchmarks.utils.plot import generate_plots
+from deploy.utils.kubernetes import is_running_in_cluster
+
+
+def has_http_scheme(url: str) -> bool:
+    """Check if URL has HTTP or HTTPS scheme."""
+    return url.lower().startswith(("http://", "https://"))
+
+
+def normalize_service_url(endpoint: str) -> str:
+    e = endpoint.strip()
+    if has_http_scheme(e):
+        return e
+    if is_running_in_cluster():
+        return f"http://{e}"
+    return e  # Outside cluster, validation will have ensured scheme is present
 
 
 def print_concurrency_start(
@@ -30,7 +45,10 @@ def run_endpoint_benchmark(
     output_dir: Path,
 ) -> None:
     """Run benchmark for an existing endpoint with custom label"""
-    print(f"🚀 Starting benchmark of endpoint '{label}': {endpoint}")
+    # Normalize endpoint to a usable URL (handles in-cluster scheme-less inputs)
+    service_url = normalize_service_url(endpoint)
+
+    print(f"🚀 Starting benchmark of endpoint '{label}': {service_url}")
     print(f"📁 Results will be saved to: {output_dir / label}")
     print_concurrency_start(label, model, isl, osl, std)
 
@@ -38,7 +56,7 @@ def run_endpoint_benchmark(
     (output_dir / label).mkdir(parents=True, exist_ok=True)
 
     run_concurrency_sweep(
-        service_url=endpoint,
+        service_url=service_url,
         model_name=model,
         isl=isl,
         osl=osl,
@@ -73,7 +91,7 @@ def run_benchmark_workflow(
     model: str = "Qwen/Qwen3-0.6B",
     output_dir: str = "benchmarks/results",
 ) -> None:
-    """Main benchmark workflow orchestrator for HTTP endpoints only"""
+    """Main benchmark workflow orchestrator for HTTP endpoints (and in-cluster internal service URLs)"""
     output_dir_path = Path(output_dir)
     output_dir_path.mkdir(parents=True, exist_ok=True)
 
diff --git a/deploy/utils/kubernetes.py b/deploy/utils/kubernetes.py
index f9b980c550a..f91b1ea5e49 100644
--- a/deploy/utils/kubernetes.py
+++ b/deploy/utils/kubernetes.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import subprocess
 import sys
 from pathlib import Path
@@ -20,6 +21,14 @@
 
 PVC_ACCESS_POD_NAME = "pvc-access-pod"
 
+K8S_SA_TOKEN = Path("/var/run/secrets/kubernetes.io/serviceaccount/token")
+
+
+def is_running_in_cluster() -> bool:
+    """Return True if running inside a Kubernetes cluster."""
+    # Prefer well-known env var; fall back to SA token presence
+    return bool(os.environ.get("KUBERNETES_SERVICE_HOST")) or K8S_SA_TOKEN.exists()
+
 
 def run_command(
     cmd: List[str], capture_output: bool = True, exit_on_error: bool = True
diff --git a/docs/benchmarks/benchmarking.md b/docs/benchmarks/benchmarking.md
index 3f92ff5ddfe..34af76aa3d5 100644
--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -19,6 +19,46 @@ This benchmarking framework lets you compare performance across any combination
 - **DynamoGraphDeployments**
 - **External HTTP endpoints** (existing services deployed following standard documentation from vLLM, llm-d, AIBrix, etc.)
 
+## Choosing Your Benchmarking Approach
+
+Dynamo provides two benchmarking approaches to suit different use cases: **client-side** and **server-side**. Client-side refers to running benchmarks on your local machine and connecting to Kubernetes deployments via port-forwarding, while server-side refers to running benchmarks directly within the Kubernetes cluster using internal service URLs. Which method to use depends on your use case.
+
+**TLDR:**
+Need high performance/load testing? Server-side.
+Just quick testing/comparison? Client-side.
+
+### Use Client-Side Benchmarking When:
+- You want to quickly test deployments
+- You want immediate access to results on your local machine
+- You're comparing external services or deployments (not necessarily just Dynamo deployments)
+- You need to run benchmarks from your laptop/workstation
+
+→ **[Go to Client-Side Benchmarking (Local)](#client-side-benchmarking-local)**
+
+### Use Server-Side Benchmarking When:
+- You have a development environment with kubectl access
+- You're doing performance validation with high load/speed requirements
+- You're experiencing timeouts or performance issues with client-side benchmarking
+- You want optimal network performance (no port-forwarding overhead)
+- You're running automated CI/CD pipelines
+- You need isolated execution environments
+- You're doing resource-intensive benchmarking
+- You want persistent result storage in the cluster
+
+→ **[Go to Server-Side Benchmarking (In-Cluster)](#server-side-benchmarking-in-cluster)**
+
+### Quick Comparison
+
+| Feature | Client-Side | Server-Side |
+|---------|-------------|-------------|
+| **Location** | Your local machine | Kubernetes cluster |
+| **Network** | Port-forwarding required | Direct service DNS |
+| **Setup** | Quick and simple | Requires cluster resources |
+| **Performance** | Limited by local resources, may timeout under high load | Optimal cluster performance, handles high load |
+| **Isolation** | Shared environment | Isolated job execution |
+| **Results** | Local filesystem | Persistent volumes |
+| **Best for** | Light load | High load |
+
 ## What This Tool Does
 
 The framework is a Python-based wrapper around `genai-perf` that:
@@ -26,34 +66,35 @@ The framework is a Python-based wrapper around `genai-perf` that:
 - Runs concurrency sweeps across configurable load levels
 - Generates comparison plots with your custom labels
 - Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
-- Runs locally and connects to your Kubernetes deployments/endpoints
 - Provides direct Python script execution for maximum flexibility
 
 **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
 
 **Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
 
-## Prerequisites
+---
 
-1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed.
+# Client-Side Benchmarking (Local)
 
-2. **Ubuntu 24.04** - GenAI-Perf requires Ubuntu 24.04 or higher to work properly. If you are on Ubuntu 22.04 or lower, you will need to build perf_analyzer [from source](https://github.com/triton-inference-server/perf_analyzer/blob/main/docs/install.md#build-from-source).
+Client-side benchmarking runs on your local machine and connects to Kubernetes deployments via port-forwarding.
 
-3. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster.
+## Prerequisites
+
+1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed.
 
-4. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be:
+2. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be:
    - DynamoGraphDeployments exposed via HTTP endpoints
    - External services (vLLM, llm-d, AIBrix, etc.)
    - Any HTTP endpoint serving HuggingFace-compatible models
 
-5. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
+3. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
    ```bash
    pip install -r deploy/utils/requirements.txt
    ```
 
 ## User Workflow
 
-Follow these steps to benchmark Dynamo deployments:
+Follow these steps to benchmark Dynamo deployments using client-side benchmarking:
 
 ### Step 1: Establish Kubernetes Cluster and Install Dynamo
 Set up your Kubernetes cluster with NVIDIA GPUs and install the Dynamo Cloud platform. First follow the [installation guide](/docs/guides/dynamo_deploy/installation_guide.md) to install Dynamo Cloud, then use [deploy/utils/README](../../deploy/utils/README.md) to set up benchmarking resources.
@@ -206,9 +247,9 @@ Raw data is organized by deployment/benchmark type and concurrency level:
 
 **For Any Benchmarking (uses your custom labels):**
 ```text
-benchmarks/results/
-├── plots/                       # Performance visualization plots
-│   ├── SUMMARY.txt             # Human-readable benchmark summary
+results/                          # Client-side: ./benchmarks/results/ or custom dir
+├── plots/                        # Server-side: /data/results/
+│   ├── SUMMARY.txt              # Performance visualization plots
 │   ├── p50_inter_token_latency_vs_concurrency.png
 │   ├── avg_inter_token_latency_vs_concurrency.png
 │   ├── request_throughput_vs_concurrency.png
@@ -228,7 +269,7 @@ benchmarks/results/
 
 **Example with actual labels:**
 ```text
-benchmarks/results/
+results/
 ├── plots/
 ├── experiment-a/                  # --input experiment-a=http://localhost:8000
 ├── experiment-b/                  # --input experiment-b=http://localhost:8001
@@ -240,6 +281,205 @@ Each concurrency directory contains:
 - **`profile_export.json`** - Raw GenAI-Perf results
 - **`inputs.json`** - Generated test inputs
 
+---
+
+# Server-Side Benchmarking (In-Cluster)
+
+Server-side benchmarking runs directly within the Kubernetes cluster, eliminating the need for port forwarding and providing better resource utilization.
+
+## What Server-Side Benchmarking Does
+
+The server-side benchmarking solution:
+- Runs benchmarks directly within the Kubernetes cluster using internal service URLs
+- Uses Kubernetes service DNS for direct communication (no port forwarding required)
+- Leverages the existing benchmarking infrastructure (`benchmarks.utils.benchmark`)
+- Stores results persistently using `dynamo-pvc`
+- Provides isolated execution environment with configurable resources
+- Handles high load/speed requirements without timeout issues
+- **Note**: Each benchmark job runs within a single Kubernetes namespace, but can benchmark services across multiple namespaces using the full DNS format `svc_name.namespace.svc.cluster.local`
+
+## Prerequisites
+
+1. **Kubernetes cluster** with NVIDIA GPUs and Dynamo namespace setup (see [Dynamo Cloud/Platform docs](../guides/dynamo_deploy/README.md))
+2. **Storage and service account** PersistentVolumeClaim and service account configured with appropriate permissions (see [deploy/utils README](../../deploy/utils/README.md))
+3. **Docker image** containing the Dynamo benchmarking tools
+
+## Quick Start
+
+### Step 1: Deploy Your DynamoGraphDeployment
+Deploy your DynamoGraphDeployment using the [deployment documentation](../../components/backends/). Ensure it has a frontend service exposed.
+
+### Step 2: Deploy and Run Benchmark Job
+
+
+```bash
+export NAMESPACE=benchmarking
+
+# Deploy the benchmark job with default settings
+kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
+
+# Monitor the job, wait for it to complete
+kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
+```
+
+#### Customize the job configuration
+
+To customize the benchmark parameters, edit the `benchmarks/incluster/benchmark_job.yaml` file and modify:
+
+- **Model name**: Change `"Qwen/Qwen3-0.6B"` in the args section
+- **Experiment name and service URL**: Change `"qwen-vllm-agg=vllm-agg-frontend:8000"` so the service URL matches your deployed service
+- **Docker image**: Change the image field if needed
+
+Then deploy:
+```bash
+kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
+```
+
+### Step 3: Retrieve Results
+```bash
+# Download results from PVC (recommended)
+python3 -m deploy.utils.download_pvc_results \
+  --namespace $NAMESPACE \
+  --output-dir ./benchmarks/results/${INPUT_NAME} \
+  --folder /data/results/${INPUT_NAME} \
+  --no-config
+```
+
+## Cross-Namespace Service Access
+
+Server-side benchmarking can benchmark services across multiple namespaces from a single job using Kubernetes DNS. When referencing services in other namespaces, use the full DNS format:
+
+```bash
+# Access service in same namespace
+SERVICE_URL=vllm-agg-frontend:8000
+
+# Access service in different namespace
+SERVICE_URL=vllm-agg-frontend.production.svc.cluster.local:8000
+```
+
+**DNS Format**: `<service-name>.<namespace>.svc.cluster.local:port`
+
+This allows you to:
+- Benchmark multiple services across different namespaces in a single job
+- Compare services running in different environments (dev, staging, production)
+- Test cross-namespace integrations without port-forwarding
+- Run comprehensive cross-namespace performance comparisons
+
+## Configuration
+
+The benchmark job is configured directly in the YAML file.
+
+### Default Configuration
+
+- **Model**: `Qwen/Qwen3-0.6B`
+- **Service**: `qwen-vllm-agg=vllm-agg-frontend:8000`
+- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0`
+
+### Customizing the Job
+
+To customize the benchmark, edit `benchmarks/incluster/benchmark_job.yaml`:
+
+1. **Change the model**: Update the `--model` argument
+2. **Change the experiment name and/or service URL**: Update the `--input` argument (use `svc_name.namespace.svc.cluster.local:port` for cross-namespace access)
+3. **Add multiple services**: Uncomment and add more `--input` lines
+4. **Change Docker image**: Update the image field if needed
+
+### Example: Multi-Namespace Benchmarking
+
+To benchmark services across multiple namespaces, modify the `--input` arguments:
+
+```yaml
+args:
+  - --model
+  - "Qwen/Qwen3-0.6B"
+  - --isl
+  - "2000"
+  - --std
+  - "10"
+  - --osl
+  - "256"
+  - --output-dir
+  - /data/results
+  - --input
+  - "prod-vllm=vllm-agg-frontend.production.svc.cluster.local:8000"
+  - --input
+  - "staging-vllm=vllm-agg-frontend.staging.svc.cluster.local:8000"
+  - --input
+  - "dev-vllm=vllm-agg-frontend.development.svc.cluster.local:8000"
+```
+
+## Understanding Your Results
+
+Results are stored in `/data/results` and follow the same structure as client-side benchmarking:
+
+```text
+/data/results/
+├── plots/                           # Performance visualization plots
+│   ├── SUMMARY.txt                  # Human-readable benchmark summary
+│   ├── p50_inter_token_latency_vs_concurrency.png
+│   ├── avg_inter_token_latency_vs_concurrency.png
+│   ├── request_throughput_vs_concurrency.png
+│   ├── efficiency_tok_s_gpu_vs_user.png
+│   └── avg_time_to_first_token_vs_concurrency.png
+└── dsr1/                           # Results for dsr1 input
+    ├── c1/                          # Concurrency level 1
+    │   └── profile_export_genai_perf.json
+    ├── c2/                          # Concurrency level 2
+    └── ...                          # Other concurrency levels
+```
+
+## Monitoring and Debugging
+
+### Check Job Status
+```bash
+kubectl describe job dynamo-benchmark -n $NAMESPACE
+```
+
+### View Logs
+```bash
+# Follow logs in real-time
+kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
+```
+
+### Debug Failed Jobs
+```bash
+# Check pod status
+kubectl get pods -n $NAMESPACE -l job-name=dynamo-benchmark
+
+# Describe failed pod
+kubectl describe pod <pod-name> -n $NAMESPACE
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Service not found**: Ensure your DynamoGraphDeployment frontend service is running
+2. **Service account permissions**: Verify `dynamo-sa` has necessary RBAC permissions
+3. **PVC access**: Check that `dynamo-pvc` is properly configured and accessible
+4. **Image pull issues**: Ensure the Docker image is accessible from the cluster
+5. **Resource constraints**: Adjust resource limits if the job is being evicted
+
+### Debug Commands
+
+```bash
+# Check PVC status
+kubectl get pvc dynamo-pvc -n $NAMESPACE
+
+# Verify service account
+kubectl get sa dynamo-sa -n $NAMESPACE
+
+# Check service endpoints
+kubectl get svc -n $NAMESPACE
+
+# Verify your service exists and has endpoints
+SVC_NAME="${SERVICE_URL%%:*}"
+kubectl get svc "$SVC_NAME" -n "$NAMESPACE"
+kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"
+```
+
+---
+
 ## Customize Benchmarking Behavior
 
 The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior: