From 9fdf5ef863ed2628f781a36a65597f5c944e9e14 Mon Sep 17 00:00:00 2001
From: Oviya Seeniraj <oseeniraj@nvidia.com>
Date: Sat, 1 Nov 2025 17:47:13 -0700
Subject: [PATCH 1/6] feat(fault-injection): Add GPU fault injector agent

- Agent runs as DaemonSet on GPU nodes
- gpu_xid_injector.py: Injects XID errors into kernel logs
- agent.py: HTTP server that receives injection requests
- Dockerfile and requirements for deployment

Enables API service to trigger XID injection on specific nodes.

Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com>
---
 .../agents/gpu-fault-injector/Dockerfile      |  49 ++++
 .../agents/gpu-fault-injector/agent.py        | 213 ++++++++++++++++++
 .../gpu-fault-injector/gpu_xid_injector.py    | 153 +++++++++++++
 .../gpu-fault-injector/requirements.txt       |   7 +
 4 files changed, 422 insertions(+)
 create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
 create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
 create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
 create mode 100644 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
new file mode 100644
index 0000000000..9245224b2b
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# GPU Fault Injector Agent - XID 79 Injection via nsenter+kmsg
+# Runs as privileged DaemonSet on GPU nodes to inject XID errors
+#
+# NOTE: GPU nodes are AMD64/x86_64 architecture
+# Build with: docker buildx build --platform linux/amd64 --load -t <image> .
+
+FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04
+
+# Install system dependencies (nsenter, nvidia-smi, journalctl)
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    curl \
+    util-linux \
+    systemd \
+    kmod \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+COPY requirements.txt /tmp/
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+
+# Create working directory
+WORKDIR /app
+
+# Copy agent code
+COPY agent.py /app/
+COPY gpu_xid_injector.py /app/
+
+# Create log directory
+RUN mkdir -p /var/log/gpu-fault-injector
+
+# Set environment
+ENV PYTHONUNBUFFERED=1
+
+# Expose port
+EXPOSE 8083
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:8083/health || exit 1
+
+# Run agent
+CMD ["python3", "agent.py"]
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
new file mode 100644
index 0000000000..b7ffd8cb0a
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
@@ -0,0 +1,213 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
+
+This agent provides privileged access for XID error injection:
+- XID 79 injection via nsenter+kmsg (writes to host's /dev/kmsg)
+- Triggers NVSentinel syslog-health-monitor detection
+- Initiates complete fault tolerance workflow
+
+All GPU errors are simulated by injecting appropriate XID codes to syslog.
+NVSentinel detects the XID and handles cordon/drain/restart/uncordon automatically.
+"""
+
+import logging
+import os
+import subprocess
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Import kernel-level XID injector (for XID 79 via nsenter+kmsg)
+try:
+    from gpu_xid_injector import GPUXIDInjectorKernel
+
+    KERNEL_XID_AVAILABLE = True
+except ImportError:
+    logger.warning("Kernel-level XID injector not available")
+    KERNEL_XID_AVAILABLE = False
+    GPUXIDInjectorKernel = None
+
+
+# ============================================================================
+# Models and Enums
+# ============================================================================
+
+
+class XIDInjectRequest(BaseModel):
+    """Request model for XID error injection via nsenter+kmsg"""
+
+    fault_id: str
+    xid_type: int
+    gpu_id: int = 0
+    duration: Optional[int] = None
+
+
+# ============================================================================
+# GPU Fault Injector
+# ============================================================================
+
+
+class GPUFaultInjector:
+    """GPU fault injection operations with DCGM integration. XID 79 via nsenter+kmsg."""
+
+    def __init__(self):
+        self.active_faults: dict[str, dict[str, Any]] = {}
+        self.node_name = os.getenv("NODE_NAME", "unknown")
+        self.dcgm_available = self._check_dcgm()
+        self.gpu_count = self._get_gpu_count()
+
+        # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg)
+        self.kernel_xid_injector = None
+        self.kernel_xid_available = False
+        if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel:
+            try:
+                self.kernel_xid_injector = GPUXIDInjectorKernel()
+                self.kernel_xid_available = self.kernel_xid_injector.privileged
+                logger.info(
+                    f"Kernel-level XID injector initialized (privileged: {self.kernel_xid_available})"
+                )
+            except Exception as e:
+                logger.warning(f"Kernel XID injector not available: {e}")
+
+        logger.info(f"GPU Fault Injector initialized on node: {self.node_name}")
+        logger.info(f"DCGM available: {self.dcgm_available}")
+        logger.info(f"GPU count: {self.gpu_count}")
+        logger.info(f"XID 79 injection (nsenter+kmsg): {self.kernel_xid_available}")
+
+    def _check_dcgm(self) -> bool:
+        """Check if DCGM is available"""
+        try:
+            result = subprocess.run(
+                ["dcgmi", "discovery", "-l"], capture_output=True, text=True, timeout=5
+            )
+            return result.returncode == 0
+        except Exception as e:
+            logger.warning(f"DCGM not available: {e}")
+            return False
+
+    def _get_gpu_count(self) -> int:
+        """Get number of GPUs on this node"""
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                return int(result.stdout.strip().split("\n")[0])
+            return 0
+        except Exception as e:
+            logger.error(f"Failed to get GPU count: {e}")
+            return 0
+
+    def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]:
+        """Run shell command with timeout"""
+        try:
+            result = subprocess.run(
+                command, capture_output=True, text=True, timeout=timeout
+            )
+            success = result.returncode == 0
+            output = result.stdout if success else result.stderr
+            return success, output.strip()
+        except subprocess.TimeoutExpired:
+            return False, "Command timed out"
+        except Exception as e:
+            return False, str(e)
+
+
+# ============================================================================
+# FastAPI Application
+# ============================================================================
+
+app = FastAPI(title="GPU Fault Injector Agent", version="1.0.0")
+injector = GPUFaultInjector()
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "node": injector.node_name,
+        "gpu_count": injector.gpu_count,
+        "dcgm_available": injector.dcgm_available,
+        "active_faults": len(injector.active_faults),
+    }
+
+
+@app.post("/inject-xid")
+async def inject_xid(request: XIDInjectRequest):
+    """Inject XID 79 error via nsenter+kmsg (triggers NVSentinel detection)"""
+    logger.info(
+        f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}"
+    )
+
+    # Only XID 79 is supported via nsenter+kmsg
+    if request.xid_type != 79:
+        raise HTTPException(
+            status_code=400,
+            detail=f"XID {request.xid_type} not supported. Only XID 79 is implemented via nsenter+kmsg injection.",
+        )
+
+    if not injector.kernel_xid_available or not injector.kernel_xid_injector:
+        raise HTTPException(
+            status_code=503,
+            detail="Kernel-level XID injector not available. XID 79 requires privileged access to syslog/kmsg.",
+        )
+
+    success, message = injector.kernel_xid_injector.inject_xid_79_gpu_fell_off_bus(
+        gpu_id=request.gpu_id
+    )
+
+    if not success:
+        raise HTTPException(status_code=500, detail=message)
+
+    # Track the fault
+    injector.active_faults[request.fault_id] = {
+        "type": f"xid_{request.xid_type}",
+        "gpu_id": request.gpu_id,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+
+    return {
+        "status": "injected",
+        "node": injector.node_name,
+        "fault_id": request.fault_id,
+        "xid_type": request.xid_type,
+        "gpu_id": request.gpu_id,
+        "message": message,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+
+
+@app.get("/faults")
+async def list_active_faults():
+    """List active faults on this node"""
+    return {
+        "node": injector.node_name,
+        "active_faults": list(injector.active_faults.keys()),
+        "count": len(injector.active_faults),
+    }
+
+
+if __name__ == "__main__":
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8083,
+        log_level="info",
+    )
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
new file mode 100644
index 0000000000..01ecd829f6
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
@@ -0,0 +1,153 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+GPU XID 79 Error Injector via nsenter+kmsg.
+
+Injects fake XID 79 messages to host's /dev/kmsg to trigger NVSentinel detection.
+Uses nsenter to enter host namespaces and write kernel messages that NVSentinel
+syslog-health-monitor can detect naturally.
+
+Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection
+"""
+
+import logging
+import os
+import subprocess
+from typing import Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class GPUXIDInjectorKernel:
+    """XID 79 injector via nsenter+kmsg (triggers NVSentinel detection)"""
+
+    def __init__(self):
+        self.node_name = os.getenv("NODE_NAME", "unknown")
+        self.privileged = self._check_privileged()
+
+        logger.info(f"XID 79 Injector initialized on {self.node_name}")
+        logger.info(f"Privileged: {self.privileged}")
+        logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow")
+
+    def _check_privileged(self) -> bool:
+        """Check if we have privileged access (required for nsenter)"""
+        return os.geteuid() == 0
+
+    def _normalize_pci_address(self, pci_addr: str) -> str:
+        """
+        Normalize PCI address from nvidia-smi format to kernel sysfs format.
+
+        nvidia-smi returns: 00000001:00:00.0 (8-digit domain)
+        kernel expects:     0001:00:00.0     (4-digit domain)
+
+        Azure VMs use extended PCI addresses, but the kernel shortens them.
+        """
+        parts = pci_addr.split(":")
+        if len(parts) >= 3:
+            # Keep only last 4 digits of domain
+            domain = parts[0][-4:] if len(parts[0]) > 4 else parts[0]
+            normalized = f"{domain}:{parts[1]}:{parts[2]}"
+            logger.debug(f"Normalized PCI address: {pci_addr} -> {normalized}")
+            return normalized
+        return pci_addr
+
+    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """
+        Inject XID 79 (GPU Fell Off Bus) via nsenter+kmsg.
+
+        Writes XID message to host's /dev/kmsg → NVSentinel detects → Full FT workflow
+
+        Returns: (success, message)
+        """
+        logger.info(f"Injecting XID 79 for GPU {gpu_id}")
+
+        if not self.privileged:
+            return (
+                False,
+                "XID 79 injection requires privileged mode (nsenter needs root)",
+            )
+
+        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, 79)
+
+        if success:
+            logger.info(f"XID 79 injected successfully: {msg}")
+            return True, msg
+        else:
+            logger.error(f"XID 79 injection failed: {msg}")
+            return False, msg
+
+    def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
+        """
+        Inject fake XID message to host's /dev/kmsg via nsenter.
+
+        Uses nsenter to enter all host namespaces (PID 1) and write to /dev/kmsg.
+        Creates real kernel messages with proper metadata that NVSentinel can detect.
+
+        Message format: "NVRM: NVRM: Xid (PCI:address): xid, message"
+        Duplicate "NVRM:" needed because /dev/kmsg splits on first colon.
+        """
+        try:
+            # Get PCI address for the GPU
+            pci_result = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=pci.bus_id",
+                    "--format=csv,noheader",
+                    "-i",
+                    str(gpu_id),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+
+            if pci_result.returncode != 0:
+                return (
+                    False,
+                    f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}",
+                )
+
+            pci_addr_full = pci_result.stdout.strip()
+            pci_addr = self._normalize_pci_address(pci_addr_full)
+
+            # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing)
+            xid_message = (
+                f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, GPU has fallen off the bus."
+            )
+
+            # Write to host's /dev/kmsg via nsenter
+            kmsg_message = f"<3>{xid_message}"  # <3> = kernel error priority
+            nsenter_cmd = [
+                "nsenter",
+                "--target",
+                "1",  # Target host PID 1 (init)
+                "--mount",  # Enter mount namespace (for /dev/kmsg access)
+                "--uts",  # Enter UTS namespace (hostname)
+                "--ipc",  # Enter IPC namespace
+                "--pid",  # Enter PID namespace (appear as host process)
+                "--",
+                "sh",
+                "-c",
+                f"echo '{kmsg_message}' > /dev/kmsg",
+            ]
+
+            nsenter_result = subprocess.run(
+                nsenter_cmd, capture_output=True, text=True, timeout=5
+            )
+
+            if nsenter_result.returncode != 0:
+                return (
+                    False,
+                    f"Failed to write to host /dev/kmsg: {nsenter_result.stderr}",
+                )
+
+            return (
+                True,
+                f"XID {xid} injected for GPU {gpu_id} (PCI: {pci_addr}) → NVSentinel",
+            )
+
+        except Exception as e:
+            logger.error(f"XID injection failed: {type(e).__name__}: {e}")
+            return False, f"Failed to inject XID: {e}"
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
new file mode 100644
index 0000000000..9335445931
--- /dev/null
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.109.0
+httpx==0.26.0
+kubernetes==28.1.0
+pydantic==2.5.3
+python-multipart==0.0.6
+pyyaml==6.0.1
+uvicorn[standard]==0.27.0

From 9d823bb416c967e240ebae739100c6daf41a7d43 Mon Sep 17 00:00:00 2001
From: Oviya Seeniraj <oseeniraj@nvidia.com>
Date: Mon, 3 Nov 2025 12:42:33 -0800
Subject: [PATCH 2/6] fixed copyright and mypy issues

Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com>
---
 .../agents/gpu-fault-injector/agent.py                    | 8 ++++----
 .../agents/gpu-fault-injector/requirements.txt            | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
index b7ffd8cb0a..8f3dce7f6d 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
@@ -18,7 +18,7 @@
 import os
 import subprocess
 from datetime import datetime, timezone
-from typing import Any, Optional
+from typing import Any, Optional, Type
 
 import uvicorn
 from fastapi import FastAPI, HTTPException
@@ -31,14 +31,14 @@
 logger = logging.getLogger(__name__)
 
 # Import kernel-level XID injector (for XID 79 via nsenter+kmsg)
+GPUXIDInjectorKernel: Optional[Type[Any]] = None
 try:
-    from gpu_xid_injector import GPUXIDInjectorKernel
+    from gpu_xid_injector import GPUXIDInjectorKernel  # type: ignore[assignment]
 
     KERNEL_XID_AVAILABLE = True
 except ImportError:
     logger.warning("Kernel-level XID injector not available")
     KERNEL_XID_AVAILABLE = False
-    GPUXIDInjectorKernel = None
 
 
 # ============================================================================
@@ -72,7 +72,7 @@ def __init__(self):
         # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg)
         self.kernel_xid_injector = None
         self.kernel_xid_available = False
-        if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel:
+        if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None:
             try:
                 self.kernel_xid_injector = GPUXIDInjectorKernel()
                 self.kernel_xid_available = self.kernel_xid_injector.privileged
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
index 9335445931..1c0100e308 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
 fastapi==0.109.0
 httpx==0.26.0
 kubernetes==28.1.0

From 01aa7686862f4d3115fb2c8bab51f6620f898523 Mon Sep 17 00:00:00 2001
From: Oviya Seeniraj <oseeniraj@nvidia.com>
Date: Tue, 4 Nov 2025 10:43:44 -0800
Subject: [PATCH 3/6] Updated requirements.txt to prevent security
 vulnerabilities

Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com>
---
 .../agents/gpu-fault-injector/requirements.txt                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
index 1c0100e308..7e2fd24a9a 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
@@ -2,10 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-fastapi==0.109.0
+fastapi==0.111.0
 httpx==0.26.0
 kubernetes==28.1.0
 pydantic==2.5.3
-python-multipart==0.0.6
+python-multipart==0.0.20
 pyyaml==6.0.1
 uvicorn[standard]==0.27.0

From e9885e8d96fdbdc0b867d8169ba5f21825dd7eeb Mon Sep 17 00:00:00 2001
From: Oviya Seeniraj <oseeniraj@nvidia.com>
Date: Mon, 24 Nov 2025 17:25:57 -0800
Subject: [PATCH 4/6] support all XIDs in kmsg injector not just 79 + add
 predefined messages for all the DCGM/NVSentinel monitored XIDs

Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com>
---
 .../agents/gpu-fault-injector/agent.py        |  62 +++++-
 .../gpu-fault-injector/gpu_xid_injector.py    | 203 ++++++++++++++++--
 2 files changed, 237 insertions(+), 28 deletions(-)

diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
index 8f3dce7f6d..53643e94d2 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
@@ -6,12 +6,22 @@
 GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
 
 This agent provides privileged access for XID error injection:
-- XID 79 injection via nsenter+kmsg (writes to host's /dev/kmsg)
+- XID injection via nsenter+kmsg (writes to host's /dev/kmsg)
 - Triggers NVSentinel syslog-health-monitor detection
 - Initiates complete fault tolerance workflow
 
-All GPU errors are simulated by injecting appropriate XID codes to syslog.
-NVSentinel detects the XID and handles cordon/drain/restart/uncordon automatically.
+Accepts ANY XID error code for testing flexibility.
+Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+- Devastating: 79, 74, 48, 94, 95, 119, 120, 140
+- Memory: 31, 32, 43, 63, 64
+- PCIe: 38, 39, 42
+- Thermal: 60, 61, 62
+- Power: 54, 56, 57
+- Graphics: 13, 45, 69
+
+Unknown XIDs use generic error message format.
+NVSentinel detects XIDs and handles actions based on its own rules.
+See gpu_xid_injector.py for complete XID descriptions.
 """
 
 import logging
@@ -61,7 +71,12 @@ class XIDInjectRequest(BaseModel):
 
 
 class GPUFaultInjector:
-    """GPU fault injection operations with DCGM integration. XID 79 via nsenter+kmsg."""
+    """
+    GPU fault injection operations with DCGM integration.
+    
+    Supports ANY XID injection via nsenter+kmsg (pre-defined messages: see gpu_xid_injector.XID_MESSAGES).
+    Accepts any XID value (1-1000) for comprehensive fault tolerance testing.
+    """
 
     def __init__(self):
         self.active_faults: dict[str, dict[str, Any]] = {}
@@ -151,26 +166,51 @@ async def health_check():
 
 @app.post("/inject-xid")
 async def inject_xid(request: XIDInjectRequest):
-    """Inject XID 79 error via nsenter+kmsg (triggers NVSentinel detection)"""
+    """
+    Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection).
+    
+    Accepts any XID error code (1-1000) for maximum testing flexibility.
+    
+    Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+    
+    Devastating (always FAIL):
+    - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors
+    - 119/120: GSP errors | 140: ECC unrecovered
+    
+    Subsystem (may WARN/escalate):
+    - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement)
+    - PCIe: 38, 39, 42 (bus, fabric, replay rate)
+    - Thermal: 60, 61, 62 (temperature limits)
+    - Power: 54, 56, 57 (power/clock state)
+    - Graphics: 13, 45, 69 (SM exceptions)
+    
+    Unknown XIDs use generic error message - NVSentinel will parse and handle
+    based on its own XID database.
+    """
     logger.info(
         f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}"
     )
 
-    # Only XID 79 is supported via nsenter+kmsg
-    if request.xid_type != 79:
+    # Validate XID type is a reasonable integer (basic sanity check)
+    if not isinstance(request.xid_type, int) or request.xid_type < 1 or request.xid_type > 1000:
         raise HTTPException(
             status_code=400,
-            detail=f"XID {request.xid_type} not supported. Only XID 79 is implemented via nsenter+kmsg injection.",
+            detail=(
+                f"Invalid XID type: {request.xid_type}. "
+                f"XID must be an integer between 1-1000. "
+                f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)."
+            ),
         )
 
     if not injector.kernel_xid_available or not injector.kernel_xid_injector:
         raise HTTPException(
             status_code=503,
-            detail="Kernel-level XID injector not available. XID 79 requires privileged access to syslog/kmsg.",
+            detail=f"Kernel-level XID injector not available. XID {request.xid_type} requires privileged access to syslog/kmsg.",
         )
 
-    success, message = injector.kernel_xid_injector.inject_xid_79_gpu_fell_off_bus(
-        gpu_id=request.gpu_id
+    # Use the generic inject_xid method which supports multiple XID types
+    success, message = injector.kernel_xid_injector.inject_xid(
+        xid_type=request.xid_type, gpu_id=request.gpu_id
     )
 
     if not success:
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
index 01ecd829f6..70e8f99444 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
@@ -3,33 +3,156 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 """
-GPU XID 79 Error Injector via nsenter+kmsg.
+GPU XID Error Injector via nsenter+kmsg.
 
-Injects fake XID 79 messages to host's /dev/kmsg to trigger NVSentinel detection.
+Injects fake XID messages to host's /dev/kmsg to trigger NVSentinel detection.
 Uses nsenter to enter host namespaces and write kernel messages that NVSentinel
 syslog-health-monitor can detect naturally.
 
 Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection
+
+Supported XIDs:
+===============
+This injector accepts ANY integer XID error code; the agent's /inject-xid endpoint limits requests to 1-1000.
+
+Pre-defined Messages for All DCGM/NVSentinel Monitored XIDs:
+-------------------------------------------------------------
+Based on DCGM health monitoring subsystems and NVSentinel detection rules.
+
+DEVASTATING XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored):
+- 79:  GPU fell off bus (most critical - node-level action)
+- 74:  NVLink uncorrectable error (multi-GPU communication failure)
+- 48:  Double-bit ECC error (severe memory error)
+- 94:  Contained ECC error (less severe memory error)
+- 95:  Uncontained error (very severe, GPU reset required)
+- 119: GSP RPC Timeout (GPU Service Processor communication)
+- 120: GSP Error (GPU Service Processor internal error)
+- 140: ECC unrecovered error (persistent memory issue)
+
+SUBSYSTEM XIDs (DCGM_HEALTH_RESULT_WARN - may escalate):
+
+Memory Subsystem (DCGM_HEALTH_WATCH_MEM):
+- 31:  MMU Error
+- 32:  PBDMA Error
+- 43:  Reset Channel Verification Error
+- 63:  Pending Page Retirements
+- 64:  Row Remap Failure
+
+PCIe Subsystem (DCGM_HEALTH_WATCH_PCIE):
+- 38:  PCIe Bus Error
+- 39:  PCIe Fabric Error
+- 42:  PCIe Replay Rate exceeded
+
+Thermal Subsystem (DCGM_HEALTH_WATCH_THERMAL):
+- 60:  Clocks Event: Thermal limit exceeded
+- 61:  EDPP Power Brake: Thermal limit
+- 62:  Thermal Violations detected
+
+Power Subsystem (DCGM_HEALTH_WATCH_POWER):
+- 54:  Power state change event
+- 56:  Clock change event
+- 57:  Clocks Event: Power limit exceeded
+
+Graphics/Common XIDs:
+- 13:  Graphics Engine Exception
+- 45:  Preemptive Cleanup (due to previous errors)
+- 69:  Graphics Exception: Class Error
+
+Unknown XIDs:
+-------------
+Any XID not in XID_MESSAGES dict will use a generic error message format.
+NVSentinel will parse and handle based on its own XID database and rules.
+
+Note: XIDs 43, 48, 74, 94, 95 are already supported via CUDA interception
+(cuda_intercept.c LD_PRELOAD). kmsg injection adds complementary syslog-based
+detection path for NVSentinel's syslog-health-monitor.
 """
 
 import logging
 import os
 import subprocess
-from typing import Tuple
+from typing import Dict, Tuple
 
 logger = logging.getLogger(__name__)
 
+# XID error code to descriptive message mapping
+# Based on DCGM XID database and NVSentinel monitoring rules
+# Source: DCGM/modules/health/DcgmHealthWatch.cpp BuildXidMappings()
+XID_MESSAGES: Dict[int, str] = {
+    # Devastating XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored)
+    79: "GPU has fallen off the bus",
+    48: "DBE (Double Bit Error) ECC Error",
+    74: "NVLink: Uncorrectable error",
+    94: "Contained ECC error",
+    95: "Uncontained error - GPU requires reset",
+    119: "GSP RPC Timeout",
+    120: "GSP Error",
+    140: "ECC unrecovered error",
+    
+    # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM)
+    31: "MMU Error",
+    32: "PBDMA Error",
+    43: "Reset Channel Verification Error",
+    63: "Pending Page Retirements",
+    64: "Row Remap Failure",
+    
+    # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE)
+    38: "PCIe Bus Error",
+    39: "PCIe Fabric Error",
+    42: "PCIe Replay Rate exceeded",
+    # 74 already defined above (can be PCIe or NVLink context)
+    
+    # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL)
+    60: "Clocks Event: Thermal limit exceeded",
+    61: "EDPP Power Brake: Thermal limit",
+    62: "Thermal Violations detected",
+    # 63 can be thermal or memory context ("Thermal diode detects short")
+    
+    # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER)
+    54: "Power state change event",
+    56: "Clock change event",
+    57: "Clocks Event: Power limit exceeded",
+    
+    # Common Graphics XIDs (often seen in test environments)
+    13: "Graphics Engine Exception",
+    31: "GPU stopped responding",  # NOTE: duplicate key 31 - this value overrides "MMU Error" defined above
+    45: "Preemptive Cleanup, due to previous errors",
+    69: "Graphics Exception: Class Error",
+}
 
 class GPUXIDInjectorKernel:
-    """XID 79 injector via nsenter+kmsg (triggers NVSentinel detection)"""
+    """
+    XID injector via nsenter+kmsg (triggers NVSentinel detection).
+    
+    Accepts ANY XID error code for maximum flexibility in testing.
+    Pre-defined messages exist for common critical XIDs, but any XID value
+    can be injected - NVSentinel will parse and handle based on its own rules.
+    
+    Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+    
+    Devastating XIDs (always trigger FAIL):
+    - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors
+    - 119/120: GSP errors, 140: ECC unrecovered
+    
+    Subsystem XIDs (trigger WARN, may escalate):
+    - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors
+    - PCIe (38, 39, 42): Bus, fabric, replay rate errors
+    - Thermal (60, 61, 62): Temperature limit violations
+    - Power (54, 56, 57): Power/clock state changes
+    - Graphics (13, 45, 69): SM exceptions, preemptive cleanup
+    
+    Unknown XIDs use a generic error message format.
+    """
 
     def __init__(self):
         self.node_name = os.getenv("NODE_NAME", "unknown")
         self.privileged = self._check_privileged()
 
-        logger.info(f"XID 79 Injector initialized on {self.node_name}")
+        logger.info(f"XID Injector initialized on {self.node_name}")
         logger.info(f"Privileged: {self.privileged}")
+        logger.info(f"Known XIDs with specific messages: {sorted(XID_MESSAGES.keys())}")
         logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow")
+        logger.info("Note: Accepts ANY XID value - unknown XIDs use generic message")
 
     def _check_privileged(self) -> bool:
         """Check if we have privileged access (required for nsenter)"""
@@ -53,31 +176,59 @@ def _normalize_pci_address(self, pci_addr: str) -> str:
             return normalized
         return pci_addr
 
-    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
+    def inject_xid(self, xid_type: int, gpu_id: int = 0) -> Tuple[bool, str]:
         """
-        Inject XID 79 (GPU Fell Off Bus) via nsenter+kmsg.
+        Inject ANY XID error code via nsenter+kmsg.
+
+        This method accepts any integer XID value for maximum testing flexibility.
+        Pre-defined messages exist for well-known XIDs (79, 74, 48, etc.), but
+        any XID can be injected. Unknown XIDs use a generic error message.
 
-        Writes XID message to host's /dev/kmsg → NVSentinel detects → Full FT workflow
+        Args:
+            xid_type: XID error code (any integer, commonly 1-255)
+            gpu_id: GPU device ID (default: 0)
 
-        Returns: (success, message)
+        Returns:
+            Tuple of (success: bool, message: str)
         """
-        logger.info(f"Injecting XID 79 for GPU {gpu_id}")
+        logger.info(f"Injecting XID {xid_type} for GPU {gpu_id}")
 
         if not self.privileged:
             return (
                 False,
-                "XID 79 injection requires privileged mode (nsenter needs root)",
+                f"XID {xid_type} injection requires privileged mode (nsenter needs root)",
             )
 
-        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, 79)
+        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, xid_type)
 
         if success:
-            logger.info(f"XID 79 injected successfully: {msg}")
+            logger.info(f"XID {xid_type} injected successfully: {msg}")
             return True, msg
         else:
-            logger.error(f"XID 79 injection failed: {msg}")
+            logger.error(f"XID {xid_type} injection failed: {msg}")
             return False, msg
 
+    # Convenience methods for specific XIDs (backward compatibility)
+    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 79 (GPU Fell Off Bus) - most critical hardware failure."""
+        return self.inject_xid(79, gpu_id)
+
+    def inject_xid_74_nvlink_error(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 74 (NVLink error) - multi-GPU communication failure."""
+        return self.inject_xid(74, gpu_id)
+
+    def inject_xid_48_ecc_dbe(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 48 (Double-bit ECC error) - severe memory error."""
+        return self.inject_xid(48, gpu_id)
+
+    def inject_xid_94_ecc_contained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 94 (Contained ECC error) - less severe memory error."""
+        return self.inject_xid(94, gpu_id)
+
+    def inject_xid_95_uncontained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 95 (Uncontained error) - very severe, GPU reset required."""
+        return self.inject_xid(95, gpu_id)
+
     def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
         """
         Inject fake XID message to host's /dev/kmsg via nsenter.
@@ -87,6 +238,18 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
 
         Message format: "NVRM: NVRM: Xid (PCI:address): xid, message"
         Duplicate "NVRM:" needed because /dev/kmsg splits on first colon.
+
+        Args:
+            gpu_id: GPU device ID (from nvidia-smi)
+            xid: XID error code (any integer; forwarded unchanged from inject_xid)
+
+        Returns:
+            Tuple of (success: bool, message: str)
+
+        Note: The description text is looked up in XID_MESSAGES; XIDs without an
+        entry fall back to a generic 'Graphics Exception: XID <n> occurred on GPU'
+        message. To give another XID a specific description, add it to
+        XID_MESSAGES - no new public method is required.
         """
         try:
             # Get PCI address for the GPU
@@ -112,11 +275,17 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
             pci_addr_full = pci_result.stdout.strip()
             pci_addr = self._normalize_pci_address(pci_addr_full)
 
-            # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing)
-            xid_message = (
-                f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, GPU has fallen off the bus."
+            # Get appropriate error message for this XID type
+            # If XID is known, use specific message; otherwise use generic format
+            error_msg = XID_MESSAGES.get(
+                xid, f"Graphics Exception: XID {xid} occurred on GPU"
             )
 
+            # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing)
+            # Format matches NVSentinel pattern: NVRM: Xid (PCI:addr): code, description
+            xid_message = f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, {error_msg}"
+            logger.debug(f"Formatted XID message: {xid_message}")
+
             # Write to host's /dev/kmsg via nsenter
             kmsg_message = f"<3>{xid_message}"  # <3> = kernel error priority
             nsenter_cmd = [

From 9af21424219e8da535057bf56eb074e76f0a6450 Mon Sep 17 00:00:00 2001
From: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Wed, 26 Nov 2025 16:30:35 -0500
Subject: [PATCH 5/6] Update
 tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile

Signed-off-by: Harrison Saturley-Hall <harrison.saturley.hall@gmail.com>
---
 .../agents/gpu-fault-injector/Dockerfile                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
index 9245224b2b..78c689fb5a 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
@@ -11,7 +11,7 @@
 FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04
 
 # Install system dependencies (nsenter, nvidia-smi, journalctl)
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     python3 \
     python3-pip \
     curl \

From 97afb3f179d9f9b80d03166abb36587a4b310e52 Mon Sep 17 00:00:00 2001
From: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Wed, 26 Nov 2025 16:32:42 -0500
Subject: [PATCH 6/6] fix: precommit formatting

Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
---
 .../agents/gpu-fault-injector/agent.py         | 18 +++++++++++-------
 .../gpu-fault-injector/gpu_xid_injector.py     | 16 ++++++----------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
index 53643e94d2..98831d3cb6 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
@@ -73,7 +73,7 @@ class XIDInjectRequest(BaseModel):
 class GPUFaultInjector:
     """
     GPU fault injection operations with DCGM integration.
-    
+
     Supports ANY XID injection via nsenter+kmsg (27+ pre-defined messages).
     Accepts any XID value (1-1000) for comprehensive fault tolerance testing.
     """
@@ -168,22 +168,22 @@ async def health_check():
 async def inject_xid(request: XIDInjectRequest):
     """
     Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection).
-    
+
     Accepts any XID error code (1-1000) for maximum testing flexibility.
-    
+
     Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
-    
+
     Devastating (always FAIL):
     - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors
     - 119/120: GSP errors | 140: ECC unrecovered
-    
+
     Subsystem (may WARN/escalate):
     - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement)
     - PCIe: 38, 39, 42 (bus, fabric, replay rate)
     - Thermal: 60, 61, 62 (temperature limits)
     - Power: 54, 56, 57 (power/clock state)
     - Graphics: 13, 45, 69 (SM exceptions)
-    
+
     Unknown XIDs use generic error message - NVSentinel will parse and handle
     based on its own XID database.
     """
@@ -192,7 +192,11 @@ async def inject_xid(request: XIDInjectRequest):
     )
 
     # Validate XID type is a reasonable integer (basic sanity check)
-    if not isinstance(request.xid_type, int) or request.xid_type < 1 or request.xid_type > 1000:
+    if (
+        not isinstance(request.xid_type, int)
+        or request.xid_type < 1
+        or request.xid_type > 1000
+    ):
         raise HTTPException(
             status_code=400,
             detail=(
diff --git a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
index 70e8f99444..26c1d4521e 100644
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
@@ -88,31 +88,26 @@
     119: "GSP RPC Timeout",
     120: "GSP Error",
     140: "ECC unrecovered error",
-    
     # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM)
     31: "MMU Error",
     32: "PBDMA Error",
     43: "Reset Channel Verification Error",
     63: "Pending Page Retirements",
     64: "Row Remap Failure",
-    
     # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE)
     38: "PCIe Bus Error",
     39: "PCIe Fabric Error",
     42: "PCIe Replay Rate exceeded",
     # 74 already defined above (can be PCIe or NVLink context)
-    
     # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL)
     60: "Clocks Event: Thermal limit exceeded",
     61: "EDPP Power Brake: Thermal limit",
     62: "Thermal Violations detected",
     # 63 can be thermal or memory context ("Thermal diode detects short")
-    
     # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER)
     54: "Power state change event",
     56: "Clock change event",
     57: "Clocks Event: Power limit exceeded",
-    
     # Common Graphics XIDs (often seen in test environments)
     13: "Graphics Engine Exception",
     31: "GPU stopped responding",  # Can be both MMU or timeout context
@@ -120,27 +115,28 @@
     69: "Graphics Exception: Class Error",
 }
 
+
 class GPUXIDInjectorKernel:
     """
     XID injector via nsenter+kmsg (triggers NVSentinel detection).
-    
+
     Accepts ANY XID error code for maximum flexibility in testing.
     Pre-defined messages exist for common critical XIDs, but any XID value
     can be injected - NVSentinel will parse and handle based on its own rules.
-    
+
     Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
-    
+
     Devastating XIDs (always trigger FAIL):
     - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors
     - 119/120: GSP errors, 140: ECC unrecovered
-    
+
     Subsystem XIDs (trigger WARN, may escalate):
     - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors
     - PCIe (38, 39, 42): Bus, fabric, replay rate errors
     - Thermal (60, 61, 62, 63): Temperature limit violations
     - Power (54, 56, 57): Power/clock state changes
     - Graphics (13, 45, 69): SM exceptions, preemptive cleanup
-    
+
     Unknown XIDs use a generic error message format.
     """