From 9fdf5ef863ed2628f781a36a65597f5c944e9e14 Mon Sep 17 00:00:00 2001 From: Oviya Seeniraj Date: Sat, 1 Nov 2025 17:47:13 -0700 Subject: [PATCH 1/6] feat(fault-injection): Add GPU fault injector agent - Agent runs as DaemonSet on GPU nodes - gpu_xid_injector.py: Injects XID errors into kernel logs - agent.py: HTTP server that receives injection requests - Dockerfile and requirements for deployment Enables API service to trigger XID injection on specific nodes. Signed-off-by: Oviya Seeniraj --- .../agents/gpu-fault-injector/Dockerfile | 49 ++++ .../agents/gpu-fault-injector/agent.py | 213 ++++++++++++++++++ .../gpu-fault-injector/gpu_xid_injector.py | 153 +++++++++++++ .../gpu-fault-injector/requirements.txt | 7 + 4 files changed, 422 insertions(+) create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile new file mode 100644 index 0000000000..9245224b2b --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# GPU Fault Injector Agent - XID 79 Injection via nsenter+kmsg +# Runs as privileged DaemonSet on GPU nodes to inject XID errors +# +# NOTE: GPU nodes are AMD64/x86_64 architecture +# Build with: docker buildx build --platform linux/amd64 --load -t <image>:<tag> .
+ +FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04 + +# Install system dependencies (nsenter, nvidia-smi, journalctl) +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + curl \ + util-linux \ + systemd \ + kmod \ + pciutils \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +COPY requirements.txt /tmp/ +RUN pip3 install --no-cache-dir -r /tmp/requirements.txt + +# Create working directory +WORKDIR /app + +# Copy agent code +COPY agent.py /app/ +COPY gpu_xid_injector.py /app/ + +# Create log directory +RUN mkdir -p /var/log/gpu-fault-injector + +# Set environment +ENV PYTHONUNBUFFERED=1 + +# Expose port +EXPOSE 8083 + +# Health check +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD curl -f http://localhost:8083/health || exit 1 + +# Run agent +CMD ["python3", "agent.py"] diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py new file mode 100644 index 0000000000..b7ffd8cb0a --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py @@ -0,0 +1,213 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +""" +GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes. + +This agent provides privileged access for XID error injection: +- XID 79 injection via nsenter+kmsg (writes to host's /dev/kmsg) +- Triggers NVSentinel syslog-health-monitor detection +- Initiates complete fault tolerance workflow + +All GPU errors are simulated by injecting appropriate XID codes to syslog. +NVSentinel detects the XID and handles cordon/drain/restart/uncordon automatically. 
+""" + +import logging +import os +import subprocess +from datetime import datetime, timezone +from typing import Any, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Import kernel-level XID injector (for XID 79 via nsenter+kmsg) +try: + from gpu_xid_injector import GPUXIDInjectorKernel + + KERNEL_XID_AVAILABLE = True +except ImportError: + logger.warning("Kernel-level XID injector not available") + KERNEL_XID_AVAILABLE = False + GPUXIDInjectorKernel = None + + +# ============================================================================ +# Models and Enums +# ============================================================================ + + +class XIDInjectRequest(BaseModel): + """Request model for XID error injection via nsenter+kmsg""" + + fault_id: str + xid_type: int + gpu_id: int = 0 + duration: Optional[int] = None + + +# ============================================================================ +# GPU Fault Injector +# ============================================================================ + + +class GPUFaultInjector: + """GPU fault injection operations with DCGM integration. 
XID 79 via nsenter+kmsg.""" + + def __init__(self): + self.active_faults: dict[str, dict[str, Any]] = {} + self.node_name = os.getenv("NODE_NAME", "unknown") + self.dcgm_available = self._check_dcgm() + self.gpu_count = self._get_gpu_count() + + # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg) + self.kernel_xid_injector = None + self.kernel_xid_available = False + if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel: + try: + self.kernel_xid_injector = GPUXIDInjectorKernel() + self.kernel_xid_available = self.kernel_xid_injector.privileged + logger.info( + f"Kernel-level XID injector initialized (privileged: {self.kernel_xid_available})" + ) + except Exception as e: + logger.warning(f"Kernel XID injector not available: {e}") + + logger.info(f"GPU Fault Injector initialized on node: {self.node_name}") + logger.info(f"DCGM available: {self.dcgm_available}") + logger.info(f"GPU count: {self.gpu_count}") + logger.info(f"XID 79 injection (nsenter+kmsg): {self.kernel_xid_available}") + + def _check_dcgm(self) -> bool: + """Check if DCGM is available""" + try: + result = subprocess.run( + ["dcgmi", "discovery", "-l"], capture_output=True, text=True, timeout=5 + ) + return result.returncode == 0 + except Exception as e: + logger.warning(f"DCGM not available: {e}") + return False + + def _get_gpu_count(self) -> int: + """Get number of GPUs on this node""" + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0: + return int(result.stdout.strip().split("\n")[0]) + return 0 + except Exception as e: + logger.error(f"Failed to get GPU count: {e}") + return 0 + + def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]: + """Run shell command with timeout""" + try: + result = subprocess.run( + command, capture_output=True, text=True, timeout=timeout + ) + success = result.returncode == 0 + output = result.stdout if 
success else result.stderr + return success, output.strip() + except subprocess.TimeoutExpired: + return False, "Command timed out" + except Exception as e: + return False, str(e) + + +# ============================================================================ +# FastAPI Application +# ============================================================================ + +app = FastAPI(title="GPU Fault Injector Agent", version="1.0.0") +injector = GPUFaultInjector() + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return { + "status": "healthy", + "node": injector.node_name, + "gpu_count": injector.gpu_count, + "dcgm_available": injector.dcgm_available, + "active_faults": len(injector.active_faults), + } + + +@app.post("/inject-xid") +async def inject_xid(request: XIDInjectRequest): + """Inject XID 79 error via nsenter+kmsg (triggers NVSentinel detection)""" + logger.info( + f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}" + ) + + # Only XID 79 is supported via nsenter+kmsg + if request.xid_type != 79: + raise HTTPException( + status_code=400, + detail=f"XID {request.xid_type} not supported. Only XID 79 is implemented via nsenter+kmsg injection.", + ) + + if not injector.kernel_xid_available or not injector.kernel_xid_injector: + raise HTTPException( + status_code=503, + detail="Kernel-level XID injector not available. 
XID 79 requires privileged access to syslog/kmsg.", + ) + + success, message = injector.kernel_xid_injector.inject_xid_79_gpu_fell_off_bus( + gpu_id=request.gpu_id + ) + + if not success: + raise HTTPException(status_code=500, detail=message) + + # Track the fault + injector.active_faults[request.fault_id] = { + "type": f"xid_{request.xid_type}", + "gpu_id": request.gpu_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + return { + "status": "injected", + "node": injector.node_name, + "fault_id": request.fault_id, + "xid_type": request.xid_type, + "gpu_id": request.gpu_id, + "message": message, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + +@app.get("/faults") +async def list_active_faults(): + """List active faults on this node""" + return { + "node": injector.node_name, + "active_faults": list(injector.active_faults.keys()), + "count": len(injector.active_faults), + } + + +if __name__ == "__main__": + uvicorn.run( + app, + host="0.0.0.0", + port=8083, + log_level="info", + ) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py new file mode 100644 index 0000000000..01ecd829f6 --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +""" +GPU XID 79 Error Injector via nsenter+kmsg. + +Injects fake XID 79 messages to host's /dev/kmsg to trigger NVSentinel detection. +Uses nsenter to enter host namespaces and write kernel messages that NVSentinel +syslog-health-monitor can detect naturally. 
+ +Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection +""" + +import logging +import os +import subprocess +from typing import Tuple + +logger = logging.getLogger(__name__) + + +class GPUXIDInjectorKernel: + """XID 79 injector via nsenter+kmsg (triggers NVSentinel detection)""" + + def __init__(self): + self.node_name = os.getenv("NODE_NAME", "unknown") + self.privileged = self._check_privileged() + + logger.info(f"XID 79 Injector initialized on {self.node_name}") + logger.info(f"Privileged: {self.privileged}") + logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow") + + def _check_privileged(self) -> bool: + """Check if we have privileged access (required for nsenter)""" + return os.geteuid() == 0 + + def _normalize_pci_address(self, pci_addr: str) -> str: + """ + Normalize PCI address from nvidia-smi format to kernel sysfs format. + + nvidia-smi returns: 00000001:00:00.0 (8-digit domain) + kernel expects: 0001:00:00.0 (4-digit domain) + + Azure VMs use extended PCI addresses, but the kernel shortens them. + """ + parts = pci_addr.split(":") + if len(parts) >= 3: + # Keep only last 4 digits of domain + domain = parts[0][-4:] if len(parts[0]) > 4 else parts[0] + normalized = f"{domain}:{parts[1]}:{parts[2]}" + logger.debug(f"Normalized PCI address: {pci_addr} -> {normalized}") + return normalized + return pci_addr + + def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]: + """ + Inject XID 79 (GPU Fell Off Bus) via nsenter+kmsg. 
+ + Writes XID message to host's /dev/kmsg → NVSentinel detects → Full FT workflow + + Returns: (success, message) + """ + logger.info(f"Injecting XID 79 for GPU {gpu_id}") + + if not self.privileged: + return ( + False, + "XID 79 injection requires privileged mode (nsenter needs root)", + ) + + success, msg = self._inject_fake_xid_to_kmsg(gpu_id, 79) + + if success: + logger.info(f"XID 79 injected successfully: {msg}") + return True, msg + else: + logger.error(f"XID 79 injection failed: {msg}") + return False, msg + + def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]: + """ + Inject fake XID message to host's /dev/kmsg via nsenter. + + Uses nsenter to enter all host namespaces (PID 1) and write to /dev/kmsg. + Creates real kernel messages with proper metadata that NVSentinel can detect. + + Message format: "NVRM: NVRM: Xid (PCI:address): xid, message" + Duplicate "NVRM:" needed because /dev/kmsg splits on first colon. + """ + try: + # Get PCI address for the GPU + pci_result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=pci.bus_id", + "--format=csv,noheader", + "-i", + str(gpu_id), + ], + capture_output=True, + text=True, + timeout=10, + ) + + if pci_result.returncode != 0: + return ( + False, + f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}", + ) + + pci_addr_full = pci_result.stdout.strip() + pci_addr = self._normalize_pci_address(pci_addr_full) + + # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing) + xid_message = ( + f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, GPU has fallen off the bus." 
+ ) + + # Write to host's /dev/kmsg via nsenter + kmsg_message = f"<3>{xid_message}" # <3> = kernel error priority + nsenter_cmd = [ + "nsenter", + "--target", + "1", # Target host PID 1 (init) + "--mount", # Enter mount namespace (for /dev/kmsg access) + "--uts", # Enter UTS namespace (hostname) + "--ipc", # Enter IPC namespace + "--pid", # Enter PID namespace (appear as host process) + "--", + "sh", + "-c", + f"echo '{kmsg_message}' > /dev/kmsg", + ] + + nsenter_result = subprocess.run( + nsenter_cmd, capture_output=True, text=True, timeout=5 + ) + + if nsenter_result.returncode != 0: + return ( + False, + f"Failed to write to host /dev/kmsg: {nsenter_result.stderr}", + ) + + return ( + True, + f"XID {xid} injected for GPU {gpu_id} (PCI: {pci_addr}) → NVSentinel", + ) + + except Exception as e: + logger.error(f"XID injection failed: {type(e).__name__}: {e}") + return False, f"Failed to inject XID: {e}" diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt new file mode 100644 index 0000000000..9335445931 --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt @@ -0,0 +1,7 @@ +fastapi==0.109.0 +httpx==0.26.0 +kubernetes==28.1.0 +pydantic==2.5.3 +python-multipart==0.0.6 +pyyaml==6.0.1 +uvicorn[standard]==0.27.0 From 9d823bb416c967e240ebae739100c6daf41a7d43 Mon Sep 17 00:00:00 2001 From: Oviya Seeniraj Date: Mon, 3 Nov 2025 12:42:33 -0800 Subject: [PATCH 2/6] fixed copyright and mypy issues Signed-off-by: Oviya Seeniraj --- .../agents/gpu-fault-injector/agent.py | 8 ++++---- .../agents/gpu-fault-injector/requirements.txt | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py 
b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py index b7ffd8cb0a..8f3dce7f6d 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py @@ -18,7 +18,7 @@ import os import subprocess from datetime import datetime, timezone -from typing import Any, Optional +from typing import Any, Optional, Type import uvicorn from fastapi import FastAPI, HTTPException @@ -31,14 +31,14 @@ logger = logging.getLogger(__name__) # Import kernel-level XID injector (for XID 79 via nsenter+kmsg) +GPUXIDInjectorKernel: Optional[Type[Any]] = None try: - from gpu_xid_injector import GPUXIDInjectorKernel + from gpu_xid_injector import GPUXIDInjectorKernel # type: ignore[assignment] KERNEL_XID_AVAILABLE = True except ImportError: logger.warning("Kernel-level XID injector not available") KERNEL_XID_AVAILABLE = False - GPUXIDInjectorKernel = None # ============================================================================ @@ -72,7 +72,7 @@ def __init__(self): # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg) self.kernel_xid_injector = None self.kernel_xid_available = False - if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel: + if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None: try: self.kernel_xid_injector = GPUXIDInjectorKernel() self.kernel_xid_available = self.kernel_xid_injector.privileged diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt index 9335445931..1c0100e308 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# fastapi==0.109.0 httpx==0.26.0 kubernetes==28.1.0 From 01aa7686862f4d3115fb2c8bab51f6620f898523 Mon Sep 17 00:00:00 2001 From: Oviya Seeniraj Date: Tue, 4 Nov 2025 10:43:44 -0800 Subject: [PATCH 3/6] Updated requirements.txt to prevent security vulnerabilities Signed-off-by: Oviya Seeniraj --- .../agents/gpu-fault-injector/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt index 1c0100e308..7e2fd24a9a 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt @@ -2,10 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 # -fastapi==0.109.0 +fastapi==0.111.0 httpx==0.26.0 kubernetes==28.1.0 pydantic==2.5.3 -python-multipart==0.0.6 +python-multipart==0.0.20 pyyaml==6.0.1 uvicorn[standard]==0.27.0 From e9885e8d96fdbdc0b867d8169ba5f21825dd7eeb Mon Sep 17 00:00:00 2001 From: Oviya Seeniraj Date: Mon, 24 Nov 2025 17:25:57 -0800 Subject: [PATCH 4/6] support all XIDs in kmsg injector not just 79 + add predefined messages for all the DCGM/NVSentinel monitored XIDs Signed-off-by: Oviya Seeniraj --- .../agents/gpu-fault-injector/agent.py | 62 +++++- .../gpu-fault-injector/gpu_xid_injector.py | 203 ++++++++++++++++-- 2 files changed, 237 insertions(+), 28 deletions(-) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py index 8f3dce7f6d..53643e94d2 100644 --- 
a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py @@ -6,12 +6,22 @@ GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes. This agent provides privileged access for XID error injection: -- XID 79 injection via nsenter+kmsg (writes to host's /dev/kmsg) +- XID injection via nsenter+kmsg (writes to host's /dev/kmsg) - Triggers NVSentinel syslog-health-monitor detection - Initiates complete fault tolerance workflow -All GPU errors are simulated by injecting appropriate XID codes to syslog. -NVSentinel detects the XID and handles cordon/drain/restart/uncordon automatically. +Accepts ANY XID error code for testing flexibility. +Pre-defined messages for all DCGM/NVSentinel monitored XIDs: +- Devastating: 79, 74, 48, 94, 95, 119, 120, 140 +- Memory: 31, 32, 43, 63, 64 +- PCIe: 38, 39, 42 +- Thermal: 60, 61, 62 +- Power: 54, 56, 57 +- Graphics: 13, 45, 69 + +Unknown XIDs use generic error message format. +NVSentinel detects XIDs and handles actions based on its own rules. +See gpu_xid_injector.py for complete XID descriptions. """ import logging @@ -61,7 +71,12 @@ class XIDInjectRequest(BaseModel): class GPUFaultInjector: - """GPU fault injection operations with DCGM integration. XID 79 via nsenter+kmsg.""" + """ + GPU fault injection operations with DCGM integration. + + Supports ANY XID injection via nsenter+kmsg (27+ pre-defined messages). + Accepts any XID value (1-1000) for comprehensive fault tolerance testing. + """ def __init__(self): self.active_faults: dict[str, dict[str, Any]] = {} @@ -151,26 +166,51 @@ async def health_check(): @app.post("/inject-xid") async def inject_xid(request: XIDInjectRequest): - """Inject XID 79 error via nsenter+kmsg (triggers NVSentinel detection)""" + """ + Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection). 
+ + Accepts any XID error code (1-1000) for maximum testing flexibility. + + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: + + Devastating (always FAIL): + - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors + - 119/120: GSP errors | 140: ECC unrecovered + + Subsystem (may WARN/escalate): + - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement) + - PCIe: 38, 39, 42 (bus, fabric, replay rate) + - Thermal: 60, 61, 62 (temperature limits) + - Power: 54, 56, 57 (power/clock state) + - Graphics: 13, 45, 69 (SM exceptions) + + Unknown XIDs use generic error message - NVSentinel will parse and handle + based on its own XID database. + """ logger.info( f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}" ) - # Only XID 79 is supported via nsenter+kmsg - if request.xid_type != 79: + # Validate XID type is a reasonable integer (basic sanity check) + if not isinstance(request.xid_type, int) or request.xid_type < 1 or request.xid_type > 1000: raise HTTPException( status_code=400, - detail=f"XID {request.xid_type} not supported. Only XID 79 is implemented via nsenter+kmsg injection.", + detail=( + f"Invalid XID type: {request.xid_type}. " + f"XID must be an integer between 1-1000. " + f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)." + ), ) if not injector.kernel_xid_available or not injector.kernel_xid_injector: raise HTTPException( status_code=503, - detail="Kernel-level XID injector not available. XID 79 requires privileged access to syslog/kmsg.", + detail=f"Kernel-level XID injector not available. 
XID {request.xid_type} requires privileged access to syslog/kmsg.", ) - success, message = injector.kernel_xid_injector.inject_xid_79_gpu_fell_off_bus( - gpu_id=request.gpu_id + # Use the generic inject_xid method which supports multiple XID types + success, message = injector.kernel_xid_injector.inject_xid( + xid_type=request.xid_type, gpu_id=request.gpu_id ) if not success: diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py index 01ecd829f6..70e8f99444 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py @@ -3,33 +3,156 @@ # SPDX-License-Identifier: Apache-2.0 # """ -GPU XID 79 Error Injector via nsenter+kmsg. +GPU XID Error Injector via nsenter+kmsg. -Injects fake XID 79 messages to host's /dev/kmsg to trigger NVSentinel detection. +Injects fake XID messages to host's /dev/kmsg to trigger NVSentinel detection. Uses nsenter to enter host namespaces and write kernel messages that NVSentinel syslog-health-monitor can detect naturally. Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection + +Supported XIDs: +=============== +This injector accepts ANY XID error code (1-255+) for maximum testing flexibility. + +Pre-defined Messages for All DCGM/NVSentinel Monitored XIDs: +------------------------------------------------------------- +Based on DCGM health monitoring subsystems and NVSentinel detection rules. 
+ +DEVASTATING XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored): +- 79: GPU fell off bus (most critical - node-level action) +- 74: NVLink uncorrectable error (multi-GPU communication failure) +- 48: Double-bit ECC error (severe memory error) +- 94: Contained ECC error (less severe memory error) +- 95: Uncontained error (very severe, GPU reset required) +- 119: GSP RPC Timeout (GPU Service Processor communication) +- 120: GSP Error (GPU Service Processor internal error) +- 140: ECC unrecovered error (persistent memory issue) + +SUBSYSTEM XIDs (DCGM_HEALTH_RESULT_WARN - may escalate): + +Memory Subsystem (DCGM_HEALTH_WATCH_MEM): +- 31: MMU Error +- 32: PBDMA Error +- 43: Reset Channel Verification Error +- 63: Pending Page Retirements +- 64: Row Remap Failure + +PCIe Subsystem (DCGM_HEALTH_WATCH_PCIE): +- 38: PCIe Bus Error +- 39: PCIe Fabric Error +- 42: PCIe Replay Rate exceeded + +Thermal Subsystem (DCGM_HEALTH_WATCH_THERMAL): +- 60: Clocks Event: Thermal limit exceeded +- 61: EDPP Power Brake: Thermal limit +- 62: Thermal Violations detected + +Power Subsystem (DCGM_HEALTH_WATCH_POWER): +- 54: Power state change event +- 56: Clock change event +- 57: Clocks Event: Power limit exceeded + +Graphics/Common XIDs: +- 13: Graphics Engine Exception +- 45: Preemptive Cleanup (due to previous errors) +- 69: Graphics Exception: Class Error + +Unknown XIDs: +------------- +Any XID not in XID_MESSAGES dict will use a generic error message format. +NVSentinel will parse and handle based on its own XID database and rules. + +Note: XIDs 43, 48, 74, 94, 95 are already supported via CUDA interception +(cuda_intercept.c LD_PRELOAD). kmsg injection adds complementary syslog-based +detection path for NVSentinel's syslog-health-monitor. 
""" import logging import os import subprocess -from typing import Tuple +from typing import Dict, Tuple logger = logging.getLogger(__name__) +# XID error code to descriptive message mapping +# Based on DCGM XID database and NVSentinel monitoring rules +# Source: DCGM/modules/health/DcgmHealthWatch.cpp BuildXidMappings() +XID_MESSAGES: Dict[int, str] = { + # Devastating XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored) + 79: "GPU has fallen off the bus", + 48: "DBE (Double Bit Error) ECC Error", + 74: "NVLink: Uncorrectable error", + 94: "Contained ECC error", + 95: "Uncontained error - GPU requires reset", + 119: "GSP RPC Timeout", + 120: "GSP Error", + 140: "ECC unrecovered error", + + # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM) + 31: "MMU Error", + 32: "PBDMA Error", + 43: "Reset Channel Verification Error", + 63: "Pending Page Retirements", + 64: "Row Remap Failure", + + # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE) + 38: "PCIe Bus Error", + 39: "PCIe Fabric Error", + 42: "PCIe Replay Rate exceeded", + # 74 already defined above (can be PCIe or NVLink context) + + # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL) + 60: "Clocks Event: Thermal limit exceeded", + 61: "EDPP Power Brake: Thermal limit", + 62: "Thermal Violations detected", + # 63 can be thermal or memory context ("Thermal diode detects short") + + # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER) + 54: "Power state change event", + 56: "Clock change event", + 57: "Clocks Event: Power limit exceeded", + + # Common Graphics XIDs (often seen in test environments) + 13: "Graphics Engine Exception", + 31: "GPU stopped responding", # Can be both MMU or timeout context + 45: "Preemptive Cleanup, due to previous errors", + 69: "Graphics Exception: Class Error", +} class GPUXIDInjectorKernel: - """XID 79 injector via nsenter+kmsg (triggers NVSentinel detection)""" + """ + XID injector via nsenter+kmsg (triggers NVSentinel detection). + + Accepts ANY XID error code for maximum flexibility in testing. 
+ Pre-defined messages exist for common critical XIDs, but any XID value + can be injected - NVSentinel will parse and handle based on its own rules. + + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: + + Devastating XIDs (always trigger FAIL): + - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors + - 119/120: GSP errors, 140: ECC unrecovered + + Subsystem XIDs (trigger WARN, may escalate): + - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors + - PCIe (38, 39, 42): Bus, fabric, replay rate errors + - Thermal (60, 61, 62, 63): Temperature limit violations + - Power (54, 56, 57): Power/clock state changes + - Graphics (13, 45, 69): SM exceptions, preemptive cleanup + + Unknown XIDs use a generic error message format. + """ def __init__(self): self.node_name = os.getenv("NODE_NAME", "unknown") self.privileged = self._check_privileged() - logger.info(f"XID 79 Injector initialized on {self.node_name}") + logger.info(f"XID Injector initialized on {self.node_name}") logger.info(f"Privileged: {self.privileged}") + logger.info(f"Known XIDs with specific messages: {sorted(XID_MESSAGES.keys())}") logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow") + logger.info("Note: Accepts ANY XID value - unknown XIDs use generic message") def _check_privileged(self) -> bool: """Check if we have privileged access (required for nsenter)""" @@ -53,31 +176,59 @@ def _normalize_pci_address(self, pci_addr: str) -> str: return normalized return pci_addr - def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]: + def inject_xid(self, xid_type: int, gpu_id: int = 0) -> Tuple[bool, str]: """ - Inject XID 79 (GPU Fell Off Bus) via nsenter+kmsg. + Inject ANY XID error code via nsenter+kmsg. + + This method accepts any integer XID value for maximum testing flexibility. + Pre-defined messages exist for well-known XIDs (79, 74, 48, etc.), but + any XID can be injected. 
Unknown XIDs use a generic error message. - Writes XID message to host's /dev/kmsg → NVSentinel detects → Full FT workflow + Args: + xid_type: XID error code (any integer, commonly 1-255) + gpu_id: GPU device ID (default: 0) - Returns: (success, message) + Returns: + Tuple of (success: bool, message: str) """ - logger.info(f"Injecting XID 79 for GPU {gpu_id}") + logger.info(f"Injecting XID {xid_type} for GPU {gpu_id}") if not self.privileged: return ( False, - "XID 79 injection requires privileged mode (nsenter needs root)", + f"XID {xid_type} injection requires privileged mode (nsenter needs root)", ) - success, msg = self._inject_fake_xid_to_kmsg(gpu_id, 79) + success, msg = self._inject_fake_xid_to_kmsg(gpu_id, xid_type) if success: - logger.info(f"XID 79 injected successfully: {msg}") + logger.info(f"XID {xid_type} injected successfully: {msg}") return True, msg else: - logger.error(f"XID 79 injection failed: {msg}") + logger.error(f"XID {xid_type} injection failed: {msg}") return False, msg + # Convenience methods for specific XIDs (backward compatibility) + def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]: + """Inject XID 79 (GPU Fell Off Bus) - most critical hardware failure.""" + return self.inject_xid(79, gpu_id) + + def inject_xid_74_nvlink_error(self, gpu_id: int = 0) -> Tuple[bool, str]: + """Inject XID 74 (NVLink error) - multi-GPU communication failure.""" + return self.inject_xid(74, gpu_id) + + def inject_xid_48_ecc_dbe(self, gpu_id: int = 0) -> Tuple[bool, str]: + """Inject XID 48 (Double-bit ECC error) - severe memory error.""" + return self.inject_xid(48, gpu_id) + + def inject_xid_94_ecc_contained(self, gpu_id: int = 0) -> Tuple[bool, str]: + """Inject XID 94 (Contained ECC error) - less severe memory error.""" + return self.inject_xid(94, gpu_id) + + def inject_xid_95_uncontained(self, gpu_id: int = 0) -> Tuple[bool, str]: + """Inject XID 95 (Uncontained error) - very severe, GPU reset required.""" + return 
self.inject_xid(95, gpu_id) + def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]: """ Inject fake XID message to host's /dev/kmsg via nsenter. @@ -87,6 +238,18 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]: Message format: "NVRM: NVRM: Xid (PCI:address): xid, message" Duplicate "NVRM:" needed because /dev/kmsg splits on first colon. + + Args: + gpu_id: GPU device ID (from nvidia-smi) + xid: XID error code (any integer; known codes map to XID_MESSAGES) + + Returns: + Tuple of (success: bool, message: str) + + Note: Known XIDs use their pre-defined text from XID_MESSAGES; any + other XID falls back to a generic error message. To add a specific + message for another XID, add an entry to XID_MESSAGES; no new public + method is needed because inject_xid() accepts any XID. """ try: # Get PCI address for the GPU @@ -112,11 +275,17 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]: pci_addr_full = pci_result.stdout.strip() pci_addr = self._normalize_pci_address(pci_addr_full) - # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing) - xid_message = ( - f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, GPU has fallen off the bus."
+ # Get appropriate error message for this XID type + # If XID is known, use specific message; otherwise use generic format + error_msg = XID_MESSAGES.get( + xid, f"Graphics Exception: XID {xid} occurred on GPU" ) + # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing) + # Format matches NVSentinel pattern: NVRM: Xid (PCI:addr): code, description + xid_message = f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, {error_msg}" + logger.debug(f"Formatted XID message: {xid_message}") + # Write to host's /dev/kmsg via nsenter kmsg_message = f"<3>{xid_message}" # <3> = kernel error priority nsenter_cmd = [ From 9af21424219e8da535057bf56eb074e76f0a6450 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Wed, 26 Nov 2025 16:30:35 -0500 Subject: [PATCH 5/6] Update tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile Signed-off-by: Harrison Saturley-Hall --- .../agents/gpu-fault-injector/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile index 9245224b2b..78c689fb5a 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile @@ -11,7 +11,7 @@ FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04 # Install system dependencies (nsenter, nvidia-smi, journalctl) -RUN apt-get update && apt-get install -y \ +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \ python3 \ python3-pip \ curl \ From 97afb3f179d9f9b80d03166abb36587a4b310e52 Mon Sep 17 00:00:00 2001 From: Harrison King Saturley-Hall Date: Wed, 26 Nov 2025 16:32:42 -0500 Subject: [PATCH 6/6] fix: precommit formatting Signed-off-by: Harrison King Saturley-Hall --- 
.../agents/gpu-fault-injector/agent.py | 18 +++++++++++------- .../gpu-fault-injector/gpu_xid_injector.py | 16 ++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py index 53643e94d2..98831d3cb6 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py @@ -73,7 +73,7 @@ class XIDInjectRequest(BaseModel): class GPUFaultInjector: """ GPU fault injection operations with DCGM integration. - + Supports ANY XID injection via nsenter+kmsg (27+ pre-defined messages). Accepts any XID value (1-1000) for comprehensive fault tolerance testing. """ @@ -168,22 +168,22 @@ async def health_check(): async def inject_xid(request: XIDInjectRequest): """ Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection). - + Accepts any XID error code (1-1000) for maximum testing flexibility. - + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: - + Devastating (always FAIL): - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors - 119/120: GSP errors | 140: ECC unrecovered - + Subsystem (may WARN/escalate): - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement) - PCIe: 38, 39, 42 (bus, fabric, replay rate) - Thermal: 60, 61, 62 (temperature limits) - Power: 54, 56, 57 (power/clock state) - Graphics: 13, 45, 69 (SM exceptions) - + Unknown XIDs use generic error message - NVSentinel will parse and handle based on its own XID database. 
""" @@ -192,7 +192,11 @@ async def inject_xid(request: XIDInjectRequest): ) # Validate XID type is a reasonable integer (basic sanity check) - if not isinstance(request.xid_type, int) or request.xid_type < 1 or request.xid_type > 1000: + if ( + not isinstance(request.xid_type, int) + or request.xid_type < 1 + or request.xid_type > 1000 + ): raise HTTPException( status_code=400, detail=( diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py index 70e8f99444..26c1d4521e 100644 --- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py @@ -88,31 +88,26 @@ 119: "GSP RPC Timeout", 120: "GSP Error", 140: "ECC unrecovered error", - # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM) 31: "MMU Error", 32: "PBDMA Error", 43: "Reset Channel Verification Error", 63: "Pending Page Retirements", 64: "Row Remap Failure", - # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE) 38: "PCIe Bus Error", 39: "PCIe Fabric Error", 42: "PCIe Replay Rate exceeded", # 74 already defined above (can be PCIe or NVLink context) - # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL) 60: "Clocks Event: Thermal limit exceeded", 61: "EDPP Power Brake: Thermal limit", 62: "Thermal Violations detected", # 63 can be thermal or memory context ("Thermal diode detects short") - # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER) 54: "Power state change event", 56: "Clock change event", 57: "Clocks Event: Power limit exceeded", - # Common Graphics XIDs (often seen in test environments) 13: "Graphics Engine Exception", 31: "GPU stopped responding", # Can be both MMU or timeout context @@ -120,27 +115,28 @@ 69: "Graphics Exception: Class Error", } + class GPUXIDInjectorKernel: """ XID injector via nsenter+kmsg 
(triggers NVSentinel detection). - + Accepts ANY XID error code for maximum flexibility in testing. Pre-defined messages exist for common critical XIDs, but any XID value can be injected - NVSentinel will parse and handle based on its own rules. - + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: - + Devastating XIDs (always trigger FAIL): - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors - 119/120: GSP errors, 140: ECC unrecovered - + Subsystem XIDs (trigger WARN, may escalate): - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors - PCIe (38, 39, 42): Bus, fabric, replay rate errors - Thermal (60, 61, 62, 63): Temperature limit violations - Power (54, 56, 57): Power/clock state changes - Graphics (13, 45, 69): SM exceptions, preemptive cleanup - + Unknown XIDs use a generic error message format. """