Fix large batch NaN bug with broadcast biasAdd kernel

Root cause: GpuBuffer.add was reading batchSize*10 elements from a 10-element bias buffer, causing garbage memory reads and NaN. Changes: - Add bias_add Metal kernel with proper broadcast semantics (gid % stride) - Add type-safe CpuBuffer/GpuBuffer API (no implicit coercions) - Add Float.inf/negInf definitions via IEEE 754 division by zero - Update GpuMNIST to use biasAdd for output layer bias Training now achieves 92.1% accuracy on 10k samples.
lecopivo · alok · Nov 30, 2025 · Nov 30, 2025 · Nov 30, 2025 · Nov 30, 2025
commit d3ea5f322bd34b591ed10deeabc507eca22b9edb
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -65,3 +65,19 @@ SciLean uses dependent types (`Float^[784]`, `Float^[128, 784]`) wrapping comput
 - Make heavy use of metaprogramming for tactics and automation
 - Clear distinction between forward and reverse mode differentiation in naming
 - Add existing imports as comments when disabling them
+
+## TODO (for future sessions)
+- Reenable doc.verso
+
+## Lean 4 Tips
+- **Float infinity**: Lean 4 stdlib doesn't have `Float.inf`. Define as:
+  ```lean
+  def Float.inf : Float := 1.0 / 0.0
+  def Float.negInf : Float := -1.0 / 0.0
+  ```
+  These are proper IEEE 754 infinity values for min/max tracking.
+
+  ---
+
+  Use lean-lsp-mcp hover on nested source code after writing it to ENSURE it is in
+  the right namespace; e.g. `Float.inf` may need to be written as `_root_.Float.inf`.
diff --git a/Metal/kmeans.metal b/Metal/kmeans.metal
@@ -1366,6 +1366,22 @@ kernel void bias_gelu(
     }
 }
 
+// Bias-only epilogue (no activation) - for the output layer before softmax.
+// One thread per element: output[gid] = input[gid] + bias[gid % stride],
+// so the `stride`-element bias repeats across the batch dimension.
+//   n      - number of elements to process; threads with gid >= n do nothing
+//   stride - bias length; used as a modulus, so the caller must pass a non-zero value
+kernel void bias_add(
+    device const float* input [[buffer(0)]],
+    device const float* bias [[buffer(1)]],
+    device float* output [[buffer(2)]],
+    constant uint& n [[buffer(3)]],
+    constant uint& stride [[buffer(4)]],
+    uint gid [[thread_position_in_grid]]
+) {
+    // Threads past the end of the data have nothing to write.
+    if (gid >= n) {
+        return;
+    }
+    output[gid] = input[gid] + bias[gid % stride];
+}
+
 // Fused layer norm: y = (x - mean) / sqrt(var + eps) * gamma + beta
 // This is a simplified version for vectors (no batch dimension)
 kernel void layer_norm(

diff --git a/Metal/metal_backend.mm b/Metal/metal_backend.mm
@@ -1192,6 +1192,70 @@ LEAN_EXPORT lean_obj_res scilean_gpu_bias_gelu_f32(
     }
 }
 
+// Wraps a C string in an IO.Error (userError) result. `IO` results must carry an
+// IO.Error object, not a bare Lean string.
+static lean_obj_res scilean_bias_add_io_error(const char* msg) {
+    return lean_io_result_mk_error(lean_mk_io_user_error(lean_mk_string(msg)));
+}
+
+// Bias add (no activation) on GPU buffers
+// Broadcasts bias across batch dimension: output[i] = input[i] + bias[i % stride]
+// Used for output layer before softmax where we don't want activation
+//
+// Parameters:
+//   X_buf    - borrowed GpuBuffer, must hold at least n floats
+//   bias_buf - borrowed GpuBuffer, must hold at least stride floats
+//   n        - total number of elements to process, in [1, UINT32_MAX]
+//   stride   - bias length (broadcast period), in [1, UINT32_MAX]
+// Returns an IO result wrapping a new GpuBuffer of n floats, or an IO error if
+// Metal is unavailable, an argument is out of range, a buffer is too small, or
+// the output allocation fails.
+// In batch mode the dispatch is queued on the shared encoder; otherwise this
+// call commits and waits for completion.
+LEAN_EXPORT lean_obj_res scilean_gpu_bias_add_f32(
+    b_lean_obj_arg X_buf,
+    b_lean_obj_arg bias_buf,
+    size_t n,
+    size_t stride,
+    lean_obj_arg /* world */
+) {
+    if (!ensure_metal_initialized()) {
+        return scilean_bias_add_io_error("Metal not available");
+    }
+
+    id<MTLBuffer> X = get_mtl_buffer(X_buf);
+    id<MTLBuffer> bias = get_mtl_buffer(bias_buf);
+    if (!X || !bias) {
+        return scilean_bias_add_io_error("Invalid GpuBuffer");
+    }
+
+    // The kernel computes gid % stride and receives both sizes as 32-bit uints,
+    // so zero or oversized values must be rejected before dispatch. A zero n would
+    // also produce a zero-length buffer and a zero-width threadgroup.
+    if (n == 0 || stride == 0) {
+        return scilean_bias_add_io_error("bias_add: n and stride must be non-zero");
+    }
+    if (n > UINT32_MAX || stride > UINT32_MAX) {
+        return scilean_bias_add_io_error("bias_add: n or stride exceeds 32-bit range");
+    }
+
+    // Reject sizes that would make the kernel read past the end of either buffer.
+    size_t output_size = n * sizeof(float);
+    if ([X length] < output_size) {
+        return scilean_bias_add_io_error("bias_add: input buffer holds fewer than n floats");
+    }
+    if ([bias length] < stride * sizeof(float)) {
+        return scilean_bias_add_io_error("bias_add: bias buffer holds fewer than stride floats");
+    }
+
+    @autoreleasepool {
+        id<MTLComputePipelineState> pipeline = get_pipeline(@"bias_add");
+        if (!pipeline) {
+            return scilean_bias_add_io_error("Failed to get bias_add pipeline");
+        }
+
+        // Prefer a pooled buffer; fall back to a fresh shared allocation.
+        id<MTLBuffer> Y = get_pooled_buffer(output_size);
+        if (!Y) {
+            Y = [device newBufferWithLength:output_size options:MTLResourceStorageModeShared];
+        }
+        if (!Y) {
+            return scilean_bias_add_io_error("bias_add: failed to allocate output buffer");
+        }
+
+        uint32_t n32 = (uint32_t)n;
+        uint32_t stride32 = (uint32_t)stride;
+
+        // Use batch encoder if in batch mode
+        bool batched = is_batch_mode();
+        id<MTLCommandBuffer> commandBuffer = batched ? g_batch_command_buffer : [commandQueue commandBuffer];
+        id<MTLComputeCommandEncoder> encoder = batched ? g_batch_encoder : [commandBuffer computeCommandEncoder];
+
+        [encoder setComputePipelineState:pipeline];
+        [encoder setBuffer:X offset:0 atIndex:0];
+        [encoder setBuffer:bias offset:0 atIndex:1];
+        [encoder setBuffer:Y offset:0 atIndex:2];
+        [encoder setBytes:&n32 length:sizeof(n32) atIndex:3];
+        [encoder setBytes:&stride32 length:sizeof(stride32) atIndex:4];
+
+        // One thread per element; n >= 1 here, so the threadgroup width is never zero.
+        MTLSize gridSize = MTLSizeMake(n, 1, 1);
+        NSUInteger tgSize = MIN(pipeline.maxTotalThreadsPerThreadgroup, n);
+        [encoder dispatchThreads:gridSize threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)];
+
+        if (!batched) {
+            [encoder endEncoding];
+            [commandBuffer commit];
+            [commandBuffer waitUntilCompleted];
+        } else {
+            // Record the output with the other pending batch outputs.
+            [g_batch_outputs addObject:Y];
+        }
+
+        lean_obj_res result = wrap_gpu_buffer(Y, output_size);
+        return lean_io_result_mk_ok(result);
+    }
+}
+
 // Average pooling 2D on GPU buffers
 // Supports batching: when in batch mode, queues to shared command buffer
 LEAN_EXPORT lean_obj_res scilean_gpu_avgpool2d_f32(

diff --git a/SciLean/FFI/Metal.lean b/SciLean/FFI/Metal.lean
@@ -10,6 +10,9 @@ matrix (gemv, gemm variants), fill, kmeans.
 Performance on M4: gemmSimd ~10 TFLOP/s, gemmTiled ~6 TFLOP/s at 2048x2048.
 -/
 
+import SciLean.FFI.Float32Array
+import SciLean.Util.Float
+
 namespace SciLean.Metal
 
 /-! ## Core -/
@@ -22,26 +25,7 @@ opaque isAvailable : Unit → Bool
 def withGPU [Inhabited α] (gpuFn cpuFn : Unit → α) : α :=
   if isAvailable () then gpuFn () else cpuFn ()
 
-/-! ## GPU-Resident Buffers
-
-GPU-resident buffers stay on the GPU between operations, eliminating the overhead
-of copying data to/from CPU memory on every operation. This is critical for
-performance in ML workloads where data flows through many operations.
-
-Usage pattern:
-```
--- Upload once
-let weights ← GpuBuffer.fromByteArray weightData
-let input ← GpuBuffer.fromByteArray inputData
-
--- Chain operations on GPU (no copies!)
-let h1 ← GpuBuffer.gemm weights input m k n
-let h2 ← GpuBuffer.relu h1
-
--- Download only final result
-let output ← h2.toByteArray
-```
--/
+/-! ## GPU-Resident Buffers -/
 
 /-! ## Command Buffer Batching
 
@@ -99,11 +83,11 @@ namespace GpuBuffer
 @[extern "scilean_gpu_alloc_f32"]
 opaque alloc (numFloats : USize) : IO GpuBuffer
 
-/-- Upload ByteArray (Float32 data) to GPU -/
+/-- Upload ByteArray (Float32 data) to GPU (low-level, prefer `CpuBuffer.upload` for type safety) -/
 @[extern "scilean_gpu_upload_f32"]
 opaque fromByteArray (data : @& ByteArray) : IO GpuBuffer
 
-/-- Download GPU buffer to ByteArray -/
+/-- Download GPU buffer to ByteArray (low-level, prefer `GpuBuffer.download` for type safety) -/
 @[extern "scilean_gpu_download_f32"]
 opaque toByteArray (buf : @& GpuBuffer) : IO ByteArray
 
@@ -185,6 +169,13 @@ opaque layerNorm (x gamma beta : @& GpuBuffer) (n hiddenSize : USize) : IO GpuBu
 @[extern "scilean_gpu_bias_gelu_f32"]
 opaque biasGelu (x bias : @& GpuBuffer) (n stride : USize) : IO GpuBuffer
 
+/-- Bias add (no activation): y[i] = x[i] + bias[i % stride], i.e. `bias` is broadcast over `x`.
+    For output layer before softmax where we don't want activation.
+    n = total elements, stride = bias size (broadcast across batch).
+    `stride` must be non-zero because the kernel uses it as a modulus.
+    Supports batching. -/
+@[extern "scilean_gpu_bias_add_f32"]
+opaque biasAdd (x bias : @& GpuBuffer) (n stride : USize) : IO GpuBuffer
+
 /-- Average pooling 2D
     Supports batching. -/
 @[extern "scilean_gpu_avgpool2d_f32"]
@@ -286,6 +277,81 @@ opaque colSum (x : @& GpuBuffer) (rows cols : USize) : IO GpuBuffer
 
 end GpuBuffer
 
+/-! ## Type-Safe CPU/GPU Buffer System
+
+Data transfer between CPU and GPU is a major performance bottleneck. This type system
+makes transfers **explicit** at the type level - no implicit coercions allowed!
+
+- `CpuBuffer` - CPU-resident data (wrapper around ByteArray)
+- `GpuBuffer` - GPU-resident data (opaque Metal buffer handle)
+
+To transfer data, you MUST use explicit functions:
+- `CpuBuffer.upload : CpuBuffer → IO GpuBuffer` (CPU → GPU)
+- `GpuBuffer.download : GpuBuffer → IO CpuBuffer` (GPU → CPU)
+
+This prevents accidental data transfers that kill performance. GPU operations only
+accept `GpuBuffer`, CPU operations only accept `CpuBuffer`.
+
+Usage pattern:
+```lean
+-- Load data on CPU
+let cpuWeights : CpuBuffer := ⟨weightData⟩
+let cpuInput : CpuBuffer := ⟨inputData⟩
+
+-- Explicit upload to GPU
+let gpuWeights ← cpuWeights.upload
+let gpuInput ← cpuInput.upload
+
+-- Chain operations on GPU (no copies! type system enforces this)
+let h1 ← GpuBuffer.gemm gpuWeights gpuInput m k n
+let h2 ← GpuBuffer.relu h1
+
+-- Explicit download when needed
+let cpuOutput ← h2.download
+let outputBytes := cpuOutput.data  -- access underlying ByteArray
+```
+-/
+
+/-- Host-side (CPU) buffer: a thin wrapper over a `ByteArray`.
+    There is deliberately no conversion to `GpuBuffer`; call `.upload` to move the data to the GPU. -/
+structure CpuBuffer where
+  /-- Raw bytes backing the buffer (Float32 format) -/
+  data : ByteArray
+  deriving Inhabited
+
+namespace CpuBuffer
+
+/-- Number of bytes held by the buffer -/
+@[inline] def sizeBytes (buf : CpuBuffer) : Nat :=
+  buf.data.size
+
+/-- Number of whole Float32 elements (byte size divided by 4, rounded down) -/
+@[inline] def numFloats (buf : CpuBuffer) : Nat :=
+  buf.sizeBytes / 4
+
+/-- Zero-initialized CPU buffer with `n` Float32 elements -/
+def zeros (n : Nat) : CpuBuffer :=
+  { data := ByteArray.replicateFloat32 n 0.0 }
+
+/-- Send this CPU buffer to the GPU. The transfer is EXPLICIT: it happens only when this is called. -/
+def upload (buf : CpuBuffer) : IO GpuBuffer :=
+  GpuBuffer.fromByteArray <| buf.data
+
+end CpuBuffer
+
+namespace GpuBuffer
+
+/-- Fetch the contents of a GPU buffer back to the CPU. This is an EXPLICIT transfer operation.
+    The bytes obtained from `toByteArray` are wrapped in a type-safe `CpuBuffer`. -/
+def download (buf : GpuBuffer) : IO CpuBuffer :=
+  CpuBuffer.mk <$> toByteArray buf
+
+end GpuBuffer
+
+-- IMPORTANT: No `Coe CpuBuffer GpuBuffer` instance!
+-- IMPORTANT: No `Coe CpuBuffer ByteArray` instance!
+-- IMPORTANT: No `Coe ByteArray CpuBuffer` instance!
+-- All transfers must be explicit.
+
 /-! ## Matrix Operations -/
 
 -- Matrix-vector multiply on GPU: y = A * x. A is m x n, x is n-dim, returns m-dim y

diff --git a/SciLean/Util/Float.lean b/SciLean/Util/Float.lean
@@ -0,0 +1,12 @@
+/-
+Float utilities for SciLean
+
+Lean 4 stdlib doesn't provide Float.inf/negInf, so we define them here
+using IEEE 754 division by zero semantics.
+-/
+
+/-- IEEE 754 positive infinity, obtained as the float division `1.0 / 0.0`.
+    Declared at root level under `Float` (this file opens no namespace). -/
+def Float.inf : Float := 1.0 / 0.0
+
+/-- IEEE 754 negative infinity, obtained as the float division `-1.0 / 0.0`.
+    Declared at root level under `Float` (this file opens no namespace). -/
+def Float.negInf : Float := -1.0 / 0.0