diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
new file mode 100644
index 0000000000..78c689fb5a
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# GPU Fault Injector Agent - XID 79 Injection via nsenter+kmsg
+# Runs as privileged DaemonSet on GPU nodes to inject XID errors
+#
+# NOTE: GPU nodes are AMD64/x86_64 architecture
+# Build with: docker buildx build --platform linux/amd64 --load -t <image>:<tag> .
+
+FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04
+
+# Install system dependencies (nsenter from util-linux, journalctl from systemd, lspci from pciutils)
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    curl \
+    util-linux \
+    systemd \
+    kmod \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+COPY requirements.txt /tmp/
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+
+# Create working directory
+WORKDIR /app
+
+# Copy agent code
+COPY agent.py /app/
+COPY gpu_xid_injector.py /app/
+
+# Create log directory
+RUN mkdir -p /var/log/gpu-fault-injector
+
+# Set environment
+ENV PYTHONUNBUFFERED=1
+
+# Expose port
+EXPOSE 8083
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:8083/health || exit 1
+
+# Run agent
+CMD ["python3", "agent.py"]
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
new file mode 100644
index 0000000000..98831d3cb6
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
@@ -0,0 +1,257 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
+
+This agent provides privileged access for XID error injection:
+- XID injection via nsenter+kmsg (writes to host's /dev/kmsg)
+- Triggers NVSentinel syslog-health-monitor detection
+- Initiates complete fault tolerance workflow
+
+Accepts ANY XID error code for testing flexibility.
+Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+- Devastating: 79, 74, 48, 94, 95, 119, 120, 140
+- Memory: 31, 32, 43, 63, 64
+- PCIe: 38, 39, 42
+- Thermal: 60, 61, 62
+- Power: 54, 56, 57
+- Graphics: 13, 45, 69
+
+Unknown XIDs use generic error message format.
+NVSentinel detects XIDs and handles actions based on its own rules.
+See gpu_xid_injector.py for complete XID descriptions.
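+
+Example request (illustrative; the node address is a placeholder and httpx,
+pinned in this agent's requirements.txt, is just one possible client):
+
+    import httpx
+
+    resp = httpx.post(
+        "http://<agent-node>:8083/inject-xid",
+        json={"fault_id": "test-xid-79", "xid_type": 79, "gpu_id": 0},
+    )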
+""" + +import logging +import os +import subprocess +from datetime import datetime, timezone +from typing import Any, Optional, Type + +import uvicorn +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Import kernel-level XID injector (for XID 79 via nsenter+kmsg) +GPUXIDInjectorKernel: Optional[Type[Any]] = None +try: + from gpu_xid_injector import GPUXIDInjectorKernel # type: ignore[assignment] + + KERNEL_XID_AVAILABLE = True +except ImportError: + logger.warning("Kernel-level XID injector not available") + KERNEL_XID_AVAILABLE = False + + +# ============================================================================ +# Models and Enums +# ============================================================================ + + +class XIDInjectRequest(BaseModel): + """Request model for XID error injection via nsenter+kmsg""" + + fault_id: str + xid_type: int + gpu_id: int = 0 + duration: Optional[int] = None + + +# ============================================================================ +# GPU Fault Injector +# ============================================================================ + + +class GPUFaultInjector: + """ + GPU fault injection operations with DCGM integration. + + Supports ANY XID injection via nsenter+kmsg (27+ pre-defined messages). + Accepts any XID value (1-1000) for comprehensive fault tolerance testing. + """ + + def __init__(self): + self.active_faults: dict[str, dict[str, Any]] = {} + self.node_name = os.getenv("NODE_NAME", "unknown") + self.dcgm_available = self._check_dcgm() + self.gpu_count = self._get_gpu_count() + + # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg) + self.kernel_xid_injector = None + self.kernel_xid_available = False + if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None: + try: + self.kernel_xid_injector = GPUXIDInjectorKernel() + self.kernel_xid_available = self.kernel_xid_injector.privileged + logger.info( + f"Kernel-level XID injector initialized (privileged: {self.kernel_xid_available})" + ) + except Exception as e: + logger.warning(f"Kernel XID injector not available: {e}") + + logger.info(f"GPU Fault Injector initialized on node: {self.node_name}") + logger.info(f"DCGM available: {self.dcgm_available}") + logger.info(f"GPU count: {self.gpu_count}") + logger.info(f"XID 79 injection (nsenter+kmsg): {self.kernel_xid_available}") + + def _check_dcgm(self) -> bool: + """Check if DCGM is available""" + try: + result = subprocess.run( + ["dcgmi", "discovery", "-l"], capture_output=True, text=True, timeout=5 + ) + return result.returncode == 0 + except Exception as e: + logger.warning(f"DCGM not available: {e}") + return False + + def _get_gpu_count(self) -> int: + """Get number of GPUs on this node""" + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0: + return int(result.stdout.strip().split("\n")[0]) + return 0 + except Exception as e: + logger.error(f"Failed to get GPU count: {e}") + return 0 + + def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]: + """Run shell command with timeout""" + try: + result = subprocess.run( + command, capture_output=True, text=True, timeout=timeout + ) + success = result.returncode == 0 + output = result.stdout if success else 
result.stderr + return success, output.strip() + except subprocess.TimeoutExpired: + return False, "Command timed out" + except Exception as e: + return False, str(e) + + +# ============================================================================ +# FastAPI Application +# ============================================================================ + +app = FastAPI(title="GPU Fault Injector Agent", version="1.0.0") +injector = GPUFaultInjector() + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return { + "status": "healthy", + "node": injector.node_name, + "gpu_count": injector.gpu_count, + "dcgm_available": injector.dcgm_available, + "active_faults": len(injector.active_faults), + } + + +@app.post("/inject-xid") +async def inject_xid(request: XIDInjectRequest): + """ + Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection). + + Accepts any XID error code (1-1000) for maximum testing flexibility. + + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: + + Devastating (always FAIL): + - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors + - 119/120: GSP errors | 140: ECC unrecovered + + Subsystem (may WARN/escalate): + - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement) + - PCIe: 38, 39, 42 (bus, fabric, replay rate) + - Thermal: 60, 61, 62 (temperature limits) + - Power: 54, 56, 57 (power/clock state) + - Graphics: 13, 45, 69 (SM exceptions) + + Unknown XIDs use generic error message - NVSentinel will parse and handle + based on its own XID database. + """ + logger.info( + f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}" + ) + + # Validate XID type is a reasonable integer (basic sanity check) + if ( + not isinstance(request.xid_type, int) + or request.xid_type < 1 + or request.xid_type > 1000 + ): + raise HTTPException( + status_code=400, + detail=( + f"Invalid XID type: {request.xid_type}. " + f"XID must be an integer between 1-1000. " + f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)." + ), + ) + + if not injector.kernel_xid_available or not injector.kernel_xid_injector: + raise HTTPException( + status_code=503, + detail=f"Kernel-level XID injector not available. 
XID {request.xid_type} requires privileged access to syslog/kmsg.", + ) + + # Use the generic inject_xid method which supports multiple XID types + success, message = injector.kernel_xid_injector.inject_xid( + xid_type=request.xid_type, gpu_id=request.gpu_id + ) + + if not success: + raise HTTPException(status_code=500, detail=message) + + # Track the fault + injector.active_faults[request.fault_id] = { + "type": f"xid_{request.xid_type}", + "gpu_id": request.gpu_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + return { + "status": "injected", + "node": injector.node_name, + "fault_id": request.fault_id, + "xid_type": request.xid_type, + "gpu_id": request.gpu_id, + "message": message, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + +@app.get("/faults") +async def list_active_faults(): + """List active faults on this node""" + return { + "node": injector.node_name, + "active_faults": list(injector.active_faults.keys()), + "count": len(injector.active_faults), + } + + +if __name__ == "__main__": + uvicorn.run( + app, + host="0.0.0.0", + port=8083, + log_level="info", + ) diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py new file mode 100644 index 0000000000..26c1d4521e --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py @@ -0,0 +1,318 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +""" +GPU XID Error Injector via nsenter+kmsg. + +Injects fake XID messages to host's /dev/kmsg to trigger NVSentinel detection. +Uses nsenter to enter host namespaces and write kernel messages that NVSentinel +syslog-health-monitor can detect naturally. + +Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection + +Supported XIDs: +=============== +This injector accepts ANY XID error code (1-255+) for maximum testing flexibility. + +Pre-defined Messages for All DCGM/NVSentinel Monitored XIDs: +------------------------------------------------------------- +Based on DCGM health monitoring subsystems and NVSentinel detection rules. 
+
+DEVASTATING XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored):
+- 79: GPU fell off bus (most critical - node-level action)
+- 74: NVLink uncorrectable error (multi-GPU communication failure)
+- 48: Double-bit ECC error (severe memory error)
+- 94: Contained ECC error (less severe memory error)
+- 95: Uncontained error (very severe, GPU reset required)
+- 119: GSP RPC Timeout (GPU Service Processor communication)
+- 120: GSP Error (GPU Service Processor internal error)
+- 140: ECC unrecovered error (persistent memory issue)
+
+SUBSYSTEM XIDs (DCGM_HEALTH_RESULT_WARN - may escalate):
+
+Memory Subsystem (DCGM_HEALTH_WATCH_MEM):
+- 31: MMU Error
+- 32: PBDMA Error
+- 43: Reset Channel Verification Error
+- 63: Pending Page Retirements
+- 64: Row Remap Failure
+
+PCIe Subsystem (DCGM_HEALTH_WATCH_PCIE):
+- 38: PCIe Bus Error
+- 39: PCIe Fabric Error
+- 42: PCIe Replay Rate exceeded
+
+Thermal Subsystem (DCGM_HEALTH_WATCH_THERMAL):
+- 60: Clocks Event: Thermal limit exceeded
+- 61: EDPP Power Brake: Thermal limit
+- 62: Thermal Violations detected
+
+Power Subsystem (DCGM_HEALTH_WATCH_POWER):
+- 54: Power state change event
+- 56: Clock change event
+- 57: Clocks Event: Power limit exceeded
+
+Graphics/Common XIDs:
+- 13: Graphics Engine Exception
+- 45: Preemptive Cleanup (due to previous errors)
+- 69: Graphics Exception: Class Error
+
+Unknown XIDs:
+-------------
+Any XID not in the XID_MESSAGES dict will use a generic error message format.
+NVSentinel will parse and handle based on its own XID database and rules.
+
+Note: XIDs 43, 48, 74, 94, 95 are already supported via CUDA interception
+(cuda_intercept.c LD_PRELOAD). kmsg injection adds a complementary syslog-based
+detection path for NVSentinel's syslog-health-monitor.
+"""
+
+import logging
+import os
+import subprocess
+from typing import Dict, Tuple
+
+logger = logging.getLogger(__name__)
+
+# XID error code to descriptive message mapping
+# Based on DCGM XID database and NVSentinel monitoring rules
+# Source: DCGM/modules/health/DcgmHealthWatch.cpp BuildXidMappings()
+XID_MESSAGES: Dict[int, str] = {
+    # Devastating XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored)
+    79: "GPU has fallen off the bus",
+    48: "DBE (Double Bit Error) ECC Error",
+    74: "NVLink: Uncorrectable error",
+    94: "Contained ECC error",
+    95: "Uncontained error - GPU requires reset",
+    119: "GSP RPC Timeout",
+    120: "GSP Error",
+    140: "ECC unrecovered error",
+    # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM)
+    31: "MMU Error",  # can also appear as "GPU stopped responding" (timeout context)
+    32: "PBDMA Error",
+    43: "Reset Channel Verification Error",
+    63: "Pending Page Retirements",
+    64: "Row Remap Failure",
+    # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE)
+    38: "PCIe Bus Error",
+    39: "PCIe Fabric Error",
+    42: "PCIe Replay Rate exceeded",
+    # 74 already defined above (can be PCIe or NVLink context)
+    # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL)
+    60: "Clocks Event: Thermal limit exceeded",
+    61: "EDPP Power Brake: Thermal limit",
+    62: "Thermal Violations detected",
+    # 63 can be thermal or memory context ("Thermal diode detects short")
+    # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER)
+    54: "Power state change event",
+    56: "Clock change event",
+    57: "Clocks Event: Power limit exceeded",
+    # Common Graphics XIDs (often seen in test environments)
+    13: "Graphics Engine Exception",
+    # 31 ("GPU stopped responding") already defined above under memory (MMU Error)
+    45: "Preemptive Cleanup, due to previous errors",
+    69: "Graphics Exception: Class Error",
+}
+
+
+class GPUXIDInjectorKernel:
+    """
+    XID injector via nsenter+kmsg
(triggers NVSentinel detection). + + Accepts ANY XID error code for maximum flexibility in testing. + Pre-defined messages exist for common critical XIDs, but any XID value + can be injected - NVSentinel will parse and handle based on its own rules. + + Pre-defined messages for all DCGM/NVSentinel monitored XIDs: + + Devastating XIDs (always trigger FAIL): + - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors + - 119/120: GSP errors, 140: ECC unrecovered + + Subsystem XIDs (trigger WARN, may escalate): + - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors + - PCIe (38, 39, 42): Bus, fabric, replay rate errors + - Thermal (60, 61, 62, 63): Temperature limit violations + - Power (54, 56, 57): Power/clock state changes + - Graphics (13, 45, 69): SM exceptions, preemptive cleanup + + Unknown XIDs use a generic error message format. + """ + + def __init__(self): + self.node_name = os.getenv("NODE_NAME", "unknown") + self.privileged = self._check_privileged() + + logger.info(f"XID Injector initialized on {self.node_name}") + logger.info(f"Privileged: {self.privileged}") + logger.info(f"Known XIDs with specific messages: {sorted(XID_MESSAGES.keys())}") + logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow") + logger.info("Note: Accepts ANY XID value - unknown XIDs use generic message") + + def _check_privileged(self) -> bool: + """Check if we have privileged access (required for nsenter)""" + return os.geteuid() == 0 + + def _normalize_pci_address(self, pci_addr: str) -> str: + """ + Normalize PCI address from nvidia-smi format to kernel sysfs format. + + nvidia-smi returns: 00000001:00:00.0 (8-digit domain) + kernel expects: 0001:00:00.0 (4-digit domain) + + Azure VMs use extended PCI addresses, but the kernel shortens them. + """ + parts = pci_addr.split(":") + if len(parts) >= 3: + # Keep only last 4 digits of domain + domain = parts[0][-4:] if len(parts[0]) > 4 else parts[0] + normalized = f"{domain}:{parts[1]}:{parts[2]}" + logger.debug(f"Normalized PCI address: {pci_addr} -> {normalized}") + return normalized + return pci_addr + + def inject_xid(self, xid_type: int, gpu_id: int = 0) -> Tuple[bool, str]: + """ + Inject ANY XID error code via nsenter+kmsg. + + This method accepts any integer XID value for maximum testing flexibility. + Pre-defined messages exist for well-known XIDs (79, 74, 48, etc.), but + any XID can be injected. Unknown XIDs use a generic error message. 
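+
+        Example (illustrative; assumes this agent's container runs privileged,
+        since nsenter and /dev/kmsg access require root):
+
+            injector = GPUXIDInjectorKernel()
+            ok, msg = injector.inject_xid(79, gpu_id=0)  # XID 79: GPU fell off bus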
+
+        Args:
+            xid_type: XID error code (any integer, commonly 1-255)
+            gpu_id: GPU device ID (default: 0)
+
+        Returns:
+            Tuple of (success: bool, message: str)
+        """
+        logger.info(f"Injecting XID {xid_type} for GPU {gpu_id}")
+
+        if not self.privileged:
+            return (
+                False,
+                f"XID {xid_type} injection requires privileged mode (nsenter needs root)",
+            )
+
+        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, xid_type)
+
+        if success:
+            logger.info(f"XID {xid_type} injected successfully: {msg}")
+            return True, msg
+        else:
+            logger.error(f"XID {xid_type} injection failed: {msg}")
+            return False, msg
+
+    # Convenience methods for specific XIDs (backward compatibility)
+    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 79 (GPU Fell Off Bus) - most critical hardware failure."""
+        return self.inject_xid(79, gpu_id)
+
+    def inject_xid_74_nvlink_error(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 74 (NVLink error) - multi-GPU communication failure."""
+        return self.inject_xid(74, gpu_id)
+
+    def inject_xid_48_ecc_dbe(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 48 (Double-bit ECC error) - severe memory error."""
+        return self.inject_xid(48, gpu_id)
+
+    def inject_xid_94_ecc_contained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 94 (Contained ECC error) - less severe memory error."""
+        return self.inject_xid(94, gpu_id)
+
+    def inject_xid_95_uncontained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 95 (Uncontained error) - very severe, GPU reset required."""
+        return self.inject_xid(95, gpu_id)
+
+    def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
+        """
+        Inject fake XID message to host's /dev/kmsg via nsenter.
+
+        Uses nsenter to enter the host's mount, UTS, IPC, and PID namespaces
+        (target PID 1) and write to /dev/kmsg, creating kernel messages with
+        proper metadata that NVSentinel can detect.
+
+        Message format: "NVRM: NVRM: Xid (PCI:address): xid, message"
+        Duplicate "NVRM:" needed because /dev/kmsg splits on first colon.
+
+        Args:
+            gpu_id: GPU device ID (from nvidia-smi)
+            xid: XID error code (any value forwarded by inject_xid())
+
+        Returns:
+            Tuple of (success: bool, message: str)
+
+        Note: inject_xid() routes every request through this method. XIDs with
+        an entry in XID_MESSAGES get a specific description; unknown XIDs fall
+        back to a generic error message format.
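+
+        Example of an injected line for XID 79 (the PCI address shown is
+        illustrative), as it would appear in the host kernel log:
+
+            NVRM: NVRM: Xid (PCI:0001:00:00.0): 79, GPU has fallen off the bus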
+ """ + try: + # Get PCI address for the GPU + pci_result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=pci.bus_id", + "--format=csv,noheader", + "-i", + str(gpu_id), + ], + capture_output=True, + text=True, + timeout=10, + ) + + if pci_result.returncode != 0: + return ( + False, + f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}", + ) + + pci_addr_full = pci_result.stdout.strip() + pci_addr = self._normalize_pci_address(pci_addr_full) + + # Get appropriate error message for this XID type + # If XID is known, use specific message; otherwise use generic format + error_msg = XID_MESSAGES.get( + xid, f"Graphics Exception: XID {xid} occurred on GPU" + ) + + # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing) + # Format matches NVSentinel pattern: NVRM: Xid (PCI:addr): code, description + xid_message = f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, {error_msg}" + logger.debug(f"Formatted XID message: {xid_message}") + + # Write to host's /dev/kmsg via nsenter + kmsg_message = f"<3>{xid_message}" # <3> = kernel error priority + nsenter_cmd = [ + "nsenter", + "--target", + "1", # Target host PID 1 (init) + "--mount", # Enter mount namespace (for /dev/kmsg access) + "--uts", # Enter UTS namespace (hostname) + "--ipc", # Enter IPC namespace + "--pid", # Enter PID namespace (appear as host process) + "--", + "sh", + "-c", + f"echo '{kmsg_message}' > /dev/kmsg", + ] + + nsenter_result = subprocess.run( + nsenter_cmd, capture_output=True, text=True, timeout=5 + ) + + if nsenter_result.returncode != 0: + return ( + False, + f"Failed to write to host /dev/kmsg: {nsenter_result.stderr}", + ) + + return ( + True, + f"XID {xid} injected for GPU {gpu_id} (PCI: {pci_addr}) → NVSentinel", + ) + + except Exception as e: + logger.error(f"XID injection failed: {type(e).__name__}: {e}") + return False, f"Failed to inject XID: {e}" diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt new file mode 100644 index 0000000000..7e2fd24a9a --- /dev/null +++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +fastapi==0.111.0 +httpx==0.26.0 +kubernetes==28.1.0 +pydantic==2.5.3 +python-multipart==0.0.20 +pyyaml==6.0.1 +uvicorn[standard]==0.27.0