NVIDIA-NeMo · rapaul-nv · May 8, 2026 · May 8, 2026 · May 8, 2026 · May 11, 2026
diff --git a/deploy/nemotron-customizer/airgap/.gitignore b/deploy/nemotron-customizer/airgap/.gitignore
@@ -0,0 +1,7 @@
+# Generated by airgap runner.
+out/
+airgap-bundle/
+archives/
+__pycache__/
+*.lock.yaml
+*.tar
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution b/deploy/nemotron-customizer/airgap/Dockerfile.execution
@@ -0,0 +1,52 @@
+# Derivative execution image for Nemotron Customizer airgap.
+# Built from the real training/runtime image (BASE_IMAGE); it only adds the small missing wrapper
+# packages from EXECUTION_REQUIREMENTS and the repo overlays listed in REPO_OVERLAYS.
+
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ARG EXECUTION_REQUIREMENTS
+ARG REPO_OVERLAYS
+ARG REPO_OVERLAYS_DIR
+ARG PYTHON_BIN=python
+ARG PIP_NO_DEPS=true
+
+ENV HF_HUB_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=1
+ENV HF_DATASETS_OFFLINE=1
+ENV WANDB_MODE=offline
+
+COPY ${EXECUTION_REQUIREMENTS} /opt/nemotron-airgap/execution-requirements.txt
+COPY ${REPO_OVERLAYS} /opt/nemotron-airgap/repo-overlays.json
+COPY ${REPO_OVERLAYS_DIR}/ /opt/nemotron-airgap/repo-overlays/
+
+# pip runs only when the requirements file is non-empty; --no-cache-dir keeps derivative image layers small.
+RUN if [ -s /opt/nemotron-airgap/execution-requirements.txt ]; then \
+      if [ "${PIP_NO_DEPS}" = "true" ]; then \
+        ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/execution-requirements.txt; \
+      else \
+        ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/execution-requirements.txt; \
+      fi; \
+    fi && \
+    ${PYTHON_BIN} - <<'PY'
+import json
+import pathlib
+import shutil
+
+root = pathlib.Path("/opt/nemotron-airgap/repo-overlays")  # overlay trees staged by the COPY above
+items = json.loads(pathlib.Path("/opt/nemotron-airgap/repo-overlays.json").read_text())
+for item in items:  # each entry reads "repo", "target", and an optional "source"
+    repo = item["repo"]
+    source = item.get("source", repo)  # staged subdirectory; falls back to the "repo" value
+    target = pathlib.Path(item["target"])  # destination path inside the image
+    src = root / source
+    if not src.exists():
+        raise SystemExit(f"missing baked repo overlay: {src}")  # abort the build with a clear message
+    if target.exists() or target.is_symlink():  # is_symlink() also catches dangling links
+        if target.is_dir() and not target.is_symlink():
+            shutil.rmtree(target)  # real directory: remove the whole tree
+        else:
+            target.unlink()  # file or symlink: remove only that entry
+    target.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copytree(src, target)  # target was cleared above, so this installs a fresh copy
+PY
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore
@@ -0,0 +1,14 @@
+**
+
+!deploy
+!deploy/nemotron-customizer
+!deploy/nemotron-customizer/airgap
+!deploy/nemotron-customizer/airgap/out
+!deploy/nemotron-customizer/airgap/out/execution-context
+!deploy/nemotron-customizer/airgap/out/execution-context/**
+!deploy/nemotron-customizer/airgap/out/repo-overlays
+!deploy/nemotron-customizer/airgap/out/repo-overlays/**
+
+**/.git
+**/__pycache__
+**/*.pyc
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher b/deploy/nemotron-customizer/airgap/Dockerfile.launcher
@@ -0,0 +1,30 @@
+# Launcher image for Nemotron Customizer airgap.
+# It contains the repo and a uv-synced environment (.venv, put on PATH below). It does not run training.
+
+ARG BASE_IMAGE=python:3.12-slim
+FROM ${BASE_IMAGE}
+
+ARG UV_VERSION=0.11.1
+
+WORKDIR /workspace/Nemotron
+
+ENV UV_LINK_MODE=copy
+ENV UV_PYTHON_DOWNLOADS=never
+ENV HF_HUB_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=1
+ENV HF_DATASETS_OFFLINE=1
+ENV WANDB_MODE=offline
+ENV PYTHONPATH=/workspace/Nemotron/src
+ENV PATH=/workspace/Nemotron/.venv/bin:$PATH
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN python -m pip install --no-cache-dir "uv==${UV_VERSION}"
+# Copy the build context (as filtered by Dockerfile.launcher.dockerignore) into WORKDIR.
+COPY . .
+# --frozen installs from uv.lock without updating it; --no-dev leaves out development dependencies.
+RUN uv sync --frozen --no-dev
+
+CMD ["bash"]
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore
@@ -0,0 +1,21 @@
+.git
+.venv
+.ruff_cache
+.pytest_cache
+**/__pycache__
+**/*.pyc
+
+/.nemo_run
+/outputs
+/output
+/logs
+/checkpoints
+/wandb
+/data
+/downloads
+
+deploy/nemotron-customizer/airgap/out
+deploy/nemotron-customizer/airgap/airgap-bundle
+deploy/nemotron-customizer/airgap/archives
+deploy/nemotron-customizer/airgap/*.tar
+deploy/nemotron-customizer/airgap/*.lock.yaml
diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md
@@ -0,0 +1,135 @@
+# Nemotron Customizer Airgap
+
+This folder is scoped only to Nemotron Customizer steps under
+`src/nemotron/steps/`.
+
+The flow is intentionally small:
+
+1. Build one **launcher image** containing this repo and an environment synced from `uv.lock`.
+2. Build one or more **execution images** by grouping selected workflow stages by base image.
+3. Save those images as tarballs for the airgapped side.
+4. Keep models, datasets, checkpoints, and customer files on persistent storage.
+
+Edit `airgap.yaml` first:
+
+- `workflow.stages`: the Nemotron Customizer steps the customer wants to run
+- `dependencies`: central step dependency map, for example SFT training needs SFT packing
+- `step_execution_images`: which execution image each step should use
+- `execution_images`: the base image, output tag, and known/import-probed Python requirements
+
+Only steps reached from `workflow.stages` are built. Steps are grouped by
+`base_image + repo_overlays`; each group gets one derivative image with the
+union of its small missing packages. If two selected step families share the
+same base image and repo overlays, the runner emits one combined execution image for
+both.
+
+Run from the repo root:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml
+```
+
+That prints the plan. To actually pull/build/save images on the connected
+machine:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --execute
+```
+
+To run only specific runner stages (build phases such as `validate`, not the `workflow.stages` step list):
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --stage validate \
+  --stage discover-execution-deps
+```
+
+To override the workflow without editing YAML, pass one or more selected
+Nemotron step targets. Dependencies are still expanded from `dependencies`.
+For example, SDG plus SFT also adds `prep/sft_packing` because SFT needs packed
+data:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --target sdg/data_designer:tiny \
+  --target sft/megatron_bridge:tiny
+```
+
+Outputs are written under `deploy/nemotron-customizer/airgap/out/` by default:
+
+- `airgap-manifest.yaml`: what was validated and built
+- `airgap-build-state.yaml`: state of an unfinished `--execute` run, used to resume it
+- `airgap-build-complete.yaml`: final state of an `--execute` run after it succeeds
+- `requirements-<execution-group>.txt`: small missing packages per execution image
+- `repo-overlays-<execution-group>.json`: git auto-mounts discovered from selected step configs
+- `launcher-image.tar`
+- `execution-*.tar`
+- SHA256 checksums for saved image tarballs in `airgap-manifest.yaml`
+
+If an execute run fails midway, leave `airgap-build-state.yaml` in place and rerun
+the same command. Completed expensive actions are reused when their artifacts
+still exist. If you intentionally change the workflow or image plan before
+finishing, move or remove `airgap-build-state.yaml` first; the runner will not
+silently overwrite incomplete state from a different plan.
+
+Runtime dependency probes use Docker volumes named
+`nemotron-airgap-pip-cache-<platform>` to avoid downloading the same wheels on
+every probe loop. To reset them, list the volumes with
+`docker volume ls | grep nemotron-airgap-pip-cache` and remove the relevant
+one with `docker volume rm <volume-name>`.
+
+Large assets are not baked into images. The customer should stage them on
+executor-visible persistent storage and reference them through config overrides
+and `run.env.mounts`.
+
+During dependency discovery, the runner mounts the connected-machine checkout
+into each execution image only to probe imports. The final execution image deliberately
+does not bake this repo; the launcher image and the normal nemo-run/nemo-runspec
+code transport provide the repo to the remote job at submission time.
+
+Repo logistics stay outside `airgap.yaml`. If a selected step config contains
+`${auto_mount:git+...}`, the runner treats it as a connected-machine build input:
+it fetches that pinned repo and bakes it into the derivative execution image at the
+requested target path. Runtime jobs then use the baked image and do not clone
+from GitHub. Site-specific data/model mounts remain in env profiles or step
+overrides.
+
+If the connected machine is not the same architecture as the target cluster,
+set `platform` to the cluster's platform (for example `platform: linux/amd64`)
+on the `launcher_image` or execution image entry in `airgap.yaml`. To minimize
+transfer size for several images that share layers, run
+`docker save -o all-images.tar tag1 tag2 ...` after the runner builds the
+images; a single tar stores each shared layer once instead of once per image.
+
+The Dockerfiles expect the chosen base images to have Python and `pip` available
+for bootstrapping small offline additions. The runtime defaults bake
+`HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, and
+`WANDB_MODE=offline`; customers with an internal mirror can override those at
+submission time through their env profile or `run.env.env_vars`.
+
+For SFT Megatron-Bridge, build with the normal config so the runner can discover
+the pinned Megatron-LM and Megatron-Bridge auto-mounts:
+
+```yaml
+workflow:
+  stages:
+    - sft/megatron_bridge:tiny
+```
+
+When submitting inside the airgap, use the deploy overlay config so those git
+auto-mounts are cleared at runtime while persistent storage mounts from the env
+profile still apply. Use the image printed by the runner under
+`selected execution images`, or read it from `out/airgap-manifest.yaml` under
+`step_execution_images`.
+
+```bash
+uv run nemotron steps run sft/megatron_bridge \
+  -c deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml \
+  -b <your-airgap-profile> \
+  run.env.container_image=<image-printed-for-sft/megatron_bridge>
+```
diff --git a/deploy/nemotron-customizer/airgap/SKILL.md b/deploy/nemotron-customizer/airgap/SKILL.md
@@ -0,0 +1,115 @@
+---
+name: nemotron-customizer-airgap
+description: Prepare, validate, build, and use Nemotron Customizer airgap image bundles for offline clusters. Use when planning airgapped deployments, editing deploy/nemotron-customizer/airgap/airgap.yaml, selecting workflow targets, grouping step execution images, baking repo overlays or wheel additions, resuming airgap runner builds, or submitting `nemotron steps run` jobs inside an airgapped environment.
+---
+
+# Nemotron Customizer Airgap
+
+Use this skill to help an agent produce a connected-machine airgap bundle and
+then submit Nemotron Customizer steps from the airgapped side. Keep it grounded
+in the checked-in runner and manifests; do not invent a parallel packaging flow.
+
+## Read First
+
+- `deploy/nemotron-customizer/airgap/README.md` for the operator flow.
+- `deploy/nemotron-customizer/airgap/airgap.yaml` for the current image map.
+- `deploy/nemotron-customizer/airgap/runner.py` when changing behavior.
+- `tests/deploy/test_airgap_runner.py` before editing runner logic.
+- `deploy/nemotron-customizer/airgap/configs/` for runtime overlay configs.
+
+For selected steps, inspect the catalog through the CLI:
+
+```bash
+uv run nemotron steps show <step_id> --json
+```
+
+## Workflow
+
+1. Establish the side of the workflow:
+   - Connected machine: validate, build, save image tarballs.
+   - Airgapped side: load images, set env profiles, run selected steps.
+
+2. Gather the minimum inputs:
+   - Target steps and config names, for example `sft/megatron_bridge:tiny`.
+   - Target architecture or Docker platform, for example `linux/amd64`.
+   - Available base images and whether the connected machine can pull them.
+   - Airgapped env profile name, mounts, model/data/checkpoint locations.
+   - Whether destructive or expensive actions such as `--execute`, Docker build,
+     Docker volume cleanup, or state-file removal are explicitly allowed.
+
+3. Plan with the runner first:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml
+```
+
+Use `--target <step_id>:<config>` for one-off selections without editing YAML.
+The runner expands dependencies from `dependencies`, validates selected step
+files/configs, groups execution images, and prints selected execution images.
+
+4. Edit `airgap.yaml` only where the runner expects configuration:
+   - `workflow.stages` or CLI `--target` for selected customer steps.
+   - `dependencies` for explicit upstream Nemotron Customizer step outputs.
+   - `step_execution_images` for step-to-image mapping.
+   - `execution_images` for base image, tag, tar, platform, and import probes.
+   - `launcher_image` for the launcher container.
+
+5. Execute only when the user asks for a real build:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --execute
+```
+
+If a build fails midway, keep `airgap-build-state.yaml` and rerun the same
+command. Remove or move that state only when intentionally changing the plan.
+
+6. On the airgapped side, use images from `out/airgap-manifest.yaml` under
+`step_execution_images`. Submit with the plural `nemotron steps` CLI:
+
+```bash
+uv run nemotron steps run <step_id> \
+  -c <config-or-airgap-overlay> \
+  -b <airgap-profile> \
+  run.env.container_image=<image-from-manifest>
+```
+
+For `sft/megatron_bridge`, prefer the airgap overlay configs under
+`deploy/nemotron-customizer/airgap/configs/`; they clear runtime git auto-mounts
+because the runner bakes those repos into the execution image.
+
+## Guardrails
+
+- Keep models, datasets, checkpoints, secrets, and customer files out of images.
+  Put them on persistent storage and reference them through config overrides and
+  `run.env.mounts`.
+- Treat `${auto_mount:git+...}` as a connected-machine build input. The runner
+  bakes pinned repo overlays into execution images so airgapped jobs do not clone
+  from GitHub.
+- Do not add missing packages blindly. Let `discover-execution-deps` and
+  import probes determine small additions; keep heavyweight framework deps in
+  the base image choice.
+- Preserve offline defaults unless the user has an internal mirror:
+  `HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`,
+  and `WANDB_MODE=offline`.
+- Use `nemotron steps ...`; do not reintroduce `nemotron step ...`.
+
+## Validation
+
+After edits to runner logic, YAML structure, or airgap docs, run:
+
+```bash
+uv run pytest tests/deploy/test_airgap_runner.py -q
+```
+
+For CLI-facing examples, also smoke the command shape:
+
+```bash
+uv run nemotron steps --help
+uv run nemotron steps show prep/sft_packing --json
+```
+
+Do not run Docker build/save stages during validation unless the user explicitly
+asked for a real connected-machine bundle build.