Closed
Changes from 1 commit (of 77 commits in this pull request):
40d0e45
Add Metal GPU backend for accelerated compute
alok Nov 30, 2025
1e6d916
Add reusable benchmark infrastructure
alok Nov 30, 2025
025774f
feat: Add BLAS GEMM integration for Float matrix multiplication
alok Nov 30, 2025
4c6c015
feat: Upgrade to Lean 4.26.0-rc2
alok Nov 30, 2025
936a500
doc: add project philosophy to CLAUDE.md - priorities over proofs
alok Nov 30, 2025
7004850
Fix build errors for Lean v4.26.0-rc2
alok Nov 30, 2025
bcf8c1e
Fix v4.26 compatibility issues in proofs and examples
alok Nov 30, 2025
e1617e4
feat: Add MNIST demo with LeanPlot visualization and GitHub Pages
alok Dec 1, 2025
f76dea6
fix: Update test expected outputs for Lean 4.26 formatting changes
alok Dec 1, 2025
e88f42f
feat: Add Verso documentation site with type-safe neural network exam…
alok Dec 1, 2025
1583d20
Fix compile_inductive panic and upgrade to latest mathlib
alok Dec 2, 2025
baac981
Add explicit LeanBLAS FFI linking for DependentMNIST
alok Dec 2, 2025
11ec2f4
feat: Add NumPy .npy file format support
alok Dec 2, 2025
7483b2c
feat: Add TensorBackend typeclass for device abstraction
alok Dec 2, 2025
690b551
test: Add PyTorch→Lean MNIST verification pipeline
alok Dec 2, 2025
fab24c0
chore: Add test executables and fix DependentMNIST
alok Dec 2, 2025
204f8b9
doc: Add backend architecture notes to CLAUDE.md
alok Dec 2, 2025
29e6d3b
chore: Add Python project config for test scripts
alok Dec 2, 2025
d477ae1
chore: Update .gitignore for Python and test data
alok Dec 2, 2025
cfd2394
chore: Enable doc.verso option in lakefile
alok Dec 2, 2025
0d54ded
fix: Fix Verso doc.verso linting errors
alok Dec 2, 2025
7079fdc
fix: Make simp attributes compatible with doc.verso=true
alok Dec 2, 2025
c11390f
fix: DependentMNIST docstring syntax for doc.verso
alok Dec 2, 2025
a2e8822
feat: Add tinygrad-style ops (mean, argmax, argmin, relu, logSoftmax)
alok Dec 2, 2025
7310ee5
feat: Add LazyTensor compiler module inspired by tinygrad
alok Dec 3, 2025
25199a9
feat: Add movement ops, topological sort, and DataArrayN bridge
alok Dec 3, 2025
8cfb725
feat: Add CUDA backend with JIT compilation architecture
alok Dec 3, 2025
b1d8f49
fix: Clean up slop in LazyTensor and fix file typos
alok Dec 4, 2025
08a4041
fix: Clean up dead code and deprecated files
alok Dec 4, 2025
977e0e0
Add BLASBackend for LazyTensor using LeanBLAS
alok Dec 4, 2025
77013d1
Add LazyTensor interpreter with RTensor runtime type
alok Dec 4, 2025
0e0c95f
fix: Remove unused outSize variable in LazyTensor applyReduce
alok Dec 4, 2025
8e7b533
feat: MPS GEMM integration achieving 12.3 TFLOP/s
alok Dec 5, 2025
94a059f
refactor: AXPY/AXPBY use ByteArray for all params (zero-copy FFI)
alok Dec 5, 2025
1e27e7d
feat: Add Numpy100 exercises and Accelerate GEMM comparison
alok Dec 5, 2025
8a458bb
feat: Add CUDA backend scaffold for cloud GPU testing
alok Dec 6, 2025
1c6d9cd
perf: Add optimized simdgroup GEMM with shared memory prefetch
alok Dec 6, 2025
acda7cf
perf: Add M4-optimized GEMM kernel (experimental)
alok Dec 6, 2025
431549d
perf: Add fused softmax kernels (single memory pass)
alok Dec 6, 2025
f73af79
perf: Optimize Metal GEMM to 2.4+ TFLOP/s, add fused ML ops
alok Dec 6, 2025
d177bfc
perf: Add CPU fallback for small arrays, 25000x faster small ops
alok Dec 6, 2025
b38656c
feat: Add fused ML ops FFI bindings (biasRelu, biasGelu, layerNorm)
alok Dec 6, 2025
8b65913
fix: Remove Intel Mac OpenBLAS path to silence linker warning
alok Dec 6, 2025
b8e27ab
feat: Add fused attention kernels for Metal GPU
alok Dec 6, 2025
60de024
feat: Wire Metal ops into SciLean DataArrayN tensor types
alok Dec 7, 2025
70129c5
feat: Add comprehensive benchmark suite for Metal vs MLX/PyTorch
alok Dec 7, 2025
c6b46e3
feat: Add Conv2D/MaxPool2D/AvgPool2D Metal GPU kernels for CNN inference
alok Dec 11, 2025
9d3c53c
feat: Add optimized Conv2D kernels with 2x speedup for 3x3 convolutions
alok Dec 11, 2025
37e11ee
feat: Add im2col+GEMM Conv2D variant and benchmark comparison
alok Dec 11, 2025
7277bc7
Fix Lean 4.26 deprecations and Metal linking
alok Dec 12, 2025
0b9b2b0
Disable precompile on macOS; add Runge-Kutta steppers; fix deprecations
alok Dec 12, 2025
99cc2e7
Fix lake test on macOS; add einsum notation and smoke tests
alok Dec 14, 2025
3cdedd0
Fix FunctionArgument docs
alok Dec 14, 2025
a004362
Stabilize build + examples (LeanBLAS link, BFGS, WaveEquation)
alok Dec 15, 2025
3f0b1c6
Fix LBFGS line search (remove runtime sorry)
alok Dec 15, 2025
2b4da6b
Add Numpy-style DataArrayN helpers
alok Dec 15, 2025
adffc28
Add DataArrayN.rand and random benchmark
alok Dec 15, 2025
011cb66
Improve DataArray random fill and printing
alok Dec 15, 2025
4656499
Add RandT helper transformer
alok Dec 15, 2025
4a026a5
Use BLAS GEMM for DataArrayN matmul
alok Dec 15, 2025
0cfb4da
Add dtype-parametric C kernel with Lean integration
alok Dec 15, 2025
4289c3e
Add bf16/fp8 (e4m3/e5m2) support to dtype-parametric C kernel
alok Dec 15, 2025
1b6ddd3
Add GPU batching support and fused gemm_bias_relu kernel
alok Dec 16, 2025
59c2407
Fix Metal kernel names and add GPU fused kernel tests
alok Dec 16, 2025
5257bf4
Add batching-aware layer_norm, bias_gelu, avgpool2d to GpuBuffer
alok Dec 17, 2025
468508d
Add GPU batching benchmark
alok Dec 17, 2025
b8eee02
Add flash_attention and flash_attention_causal kernels
alok Dec 17, 2025
e22fd24
Add Metal shader code generator
alok Dec 17, 2025
0eda713
Add batchNorm2d with batching support to GpuBuffer
alok Dec 17, 2025
ca6bd31
Add GPU backward pass kernels for autodiff
alok Dec 17, 2025
91768cc
Optimize Metal GEMM with double-buffered tiling
alok Dec 17, 2025
b7bdf36
Add GPU-accelerated MNIST training example
alok Dec 17, 2025
54abcfe
Add tiled gemmTN and gemmNT kernels
alok Dec 17, 2025
ecf97f6
Add command buffer batching for GPU MNIST training
alok Dec 17, 2025
cb0b45d
Add GPU colSum kernel and optimize training
alok Dec 17, 2025
d3ea5f3
Fix large batch NaN bug with broadcast biasAdd kernel
alok Dec 17, 2025
43b1bc6
Add mini-batch training with GPU buffer slicing
alok Dec 17, 2025
test: Add PyTorch→Lean MNIST verification pipeline
Validates full data interchange:
- test_npy_roundtrip.py: Create test .npy files
- train_mnist_export.py: Train 784→128→10 MLP, export weights
- TestNpyRoundtrip.lean: Verify 1D/2D arrays, matmul, softmax
- VerifyPyTorchMNIST.lean: Load PyTorch weights, verify logits match

All logits match Python to 1e-4 tolerance.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
alok committed Dec 2, 2025
commit 690b55122c7d44b3f6efbc53bc05a6940d8e6ac7
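The pipeline below leans on the NumPy `.npy` container that `Npy.loadFile` parses on the Lean side. For orientation, here is a minimal Python sketch of the v1.0 header layout (magic bytes, version, little-endian uint16 header length, then an ASCII dict literal); the helper name `read_npy_header` is hypothetical, not part of the repo's `Npy` module.

```python
import ast
import struct

def read_npy_header(path):
    """Parse the header of a .npy (format version 1.0) file without NumPy.

    Layout: 6-byte magic b"\\x93NUMPY", one byte each for major/minor
    version, a little-endian uint16 header length, then an ASCII dict
    literal with keys 'descr', 'fortran_order', and 'shape'.
    """
    with open(path, "rb") as f:
        assert f.read(6) == b"\x93NUMPY", "not a .npy file"
        major, minor = f.read(1)[0], f.read(1)[0]
        assert major == 1, "only v1.0 headers handled in this sketch"
        (hlen,) = struct.unpack("<H", f.read(2))
        # Raw array data (dtype 'descr', C or Fortran order per
        # 'fortran_order') follows immediately after the header.
        return ast.literal_eval(f.read(hlen).decode("ascii"))
```

A `descr` of `'<f8'` (little-endian float64) is what the `loadFloat64` calls below expect.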
102 changes: 102 additions & 0 deletions examples/TestNpyRoundtrip.lean
@@ -0,0 +1,102 @@
/-
Test .npy roundtrip: Load arrays from Python, verify values match.
-/
import SciLean
import SciLean.Data.Npy
import SciLean.Data.DataArray.TensorOperations

open SciLean

def almostEq (a b : Float) (tol : Float := 1e-10) : Bool :=
  (a - b).abs < tol

def main : IO Unit := do
  IO.println "=== Testing .npy Roundtrip ==="
  IO.println ""

  -- Test 1: Load 1D array
  IO.println "Test 1: Load 1D array"
  let arr1d ← Npy.loadFile "data/npy_test/test_1d.npy"
  IO.println s!" dtype: {repr arr1d.dtype}, shape: {repr arr1d.shape.dims}"
  let floats1d ← IO.ofExcept arr1d.toFloatArray
  IO.print " values: ["
  for i in [0:floats1d.size] do
    if i > 0 then IO.print ", "
    IO.print s!"{floats1d.uget i.toUSize sorry_proof}"
  IO.println "]"
  let sum1d := (List.range floats1d.size).foldl (fun acc i =>
    acc + floats1d.uget i.toUSize sorry_proof) 0.0
  IO.println s!" sum: {sum1d} (expected: 15.0)"
  if almostEq sum1d 15.0 then
    IO.println " ✓ PASS"
  else
    IO.println " ✗ FAIL"

  -- Test 2: Load 2D array as flat
  IO.println ""
  IO.println "Test 2: Load 2D array (3x4)"
  let arr2d ← Npy.loadFile "data/npy_test/test_2d.npy"
  IO.println s!" dtype: {repr arr2d.dtype}, shape: {repr arr2d.shape.dims}"
  let floats2d ← IO.ofExcept arr2d.toFloatArray
  let sum2d := (List.range floats2d.size).foldl (fun acc i =>
    acc + floats2d.uget i.toUSize sorry_proof) 0.0
  IO.println s!" sum: {sum2d} (expected: 66.0)"
  if almostEq sum2d 66.0 then
    IO.println " ✓ PASS"
  else
    IO.println " ✗ FAIL"

  -- Test 3: Matrix-vector multiply
  IO.println ""
  IO.println "Test 3: Matrix-vector multiply"
  let arrA ← Npy.loadFile "data/npy_test/test_A.npy"
  let arrX ← Npy.loadFile "data/npy_test/test_x.npy"
  let arrY ← Npy.loadFile "data/npy_test/test_y.npy"
  IO.println s!" A shape: {repr arrA.shape.dims}"
  IO.println s!" x shape: {repr arrX.shape.dims}"
  IO.println s!" y shape: {repr arrY.shape.dims}"

  -- Load as DataArrayN and compute
  let A : Float^[3, 2] ← Npy.loadFloat64 "data/npy_test/test_A.npy"
  let x : Float^[2] ← Npy.loadFloat64 "data/npy_test/test_x.npy"
  let expectedY : Float^[3] ← Npy.loadFloat64 "data/npy_test/test_y.npy"

  -- Compute y = A @ x using contractRightAddR
  let computedY : Float^[3] := DataArrayN.contractRightAddR 1.0 A x 0.0 0

  IO.println s!" expected y: [{expectedY[⟨0, sorry_proof⟩]}, {expectedY[⟨1, sorry_proof⟩]}, {expectedY[⟨2, sorry_proof⟩]}]"
  IO.println s!" computed y: [{computedY[⟨0, sorry_proof⟩]}, {computedY[⟨1, sorry_proof⟩]}, {computedY[⟨2, sorry_proof⟩]}]"

  let match0 := almostEq computedY[⟨0, sorry_proof⟩] expectedY[⟨0, sorry_proof⟩]
  let match1 := almostEq computedY[⟨1, sorry_proof⟩] expectedY[⟨1, sorry_proof⟩]
  let match2 := almostEq computedY[⟨2, sorry_proof⟩] expectedY[⟨2, sorry_proof⟩]

  if match0 && match1 && match2 then
    IO.println " ✓ PASS"
  else
    IO.println " ✗ FAIL"

  -- Test 4: Softmax
  IO.println ""
  IO.println "Test 4: Softmax"
  let logits : Float^[4] ← Npy.loadFloat64 "data/npy_test/test_logits.npy"
  let expectedSoftmax : Float^[4] ← Npy.loadFloat64 "data/npy_test/test_softmax.npy"

  -- Compute softmax using DataArrayN.softmax
  let computedSoftmax : Float^[4] := DataArrayN.softmax logits

  IO.println s!" expected: [{expectedSoftmax[⟨0, sorry_proof⟩]}, {expectedSoftmax[⟨1, sorry_proof⟩]}, {expectedSoftmax[⟨2, sorry_proof⟩]}, {expectedSoftmax[⟨3, sorry_proof⟩]}]"
  IO.println s!" computed: [{computedSoftmax[⟨0, sorry_proof⟩]}, {computedSoftmax[⟨1, sorry_proof⟩]}, {computedSoftmax[⟨2, sorry_proof⟩]}, {computedSoftmax[⟨3, sorry_proof⟩]}]"

  let softmaxMatch := almostEq computedSoftmax[⟨0, sorry_proof⟩] expectedSoftmax[⟨0, sorry_proof⟩] 1e-6
    && almostEq computedSoftmax[⟨1, sorry_proof⟩] expectedSoftmax[⟨1, sorry_proof⟩] 1e-6
    && almostEq computedSoftmax[⟨2, sorry_proof⟩] expectedSoftmax[⟨2, sorry_proof⟩] 1e-6
    && almostEq computedSoftmax[⟨3, sorry_proof⟩] expectedSoftmax[⟨3, sorry_proof⟩] 1e-6

  if softmaxMatch then
    IO.println " ✓ PASS"
  else
    IO.println " ✗ FAIL"

  IO.println ""
  IO.println "=== All tests completed ==="
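The expected values hard-coded in Tests 3 and 4 above come from two small computations. As a cross-check, here is a plain-Python sketch (no NumPy) of the same row-major matrix-vector product and max-shifted stable softmax that the Lean code verifies against:

```python
import math

def matvec(A, x):
    """Row-major matrix times vector: y[i] = sum_j A[i][j] * x[j]."""
    return [sum(a * b for a, b in zip(row, x)) for row in A]

def softmax(v):
    """Numerically stable softmax: shift by the max before exponentiating."""
    m = max(v)
    e = [math.exp(t - m) for t in v]
    s = sum(e)
    return [t / s for t in e]

# Test 3's fixture: A is 3x2, x has 2 entries -> y = [5, 11, 17]
y = matvec([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], [1.0, 2.0])

# Test 4's fixture: logits [1, 2, 3, 4]; probabilities sum to 1
p = softmax([1.0, 2.0, 3.0, 4.0])
```

Shifting by the maximum leaves the softmax value unchanged (the factor `exp(-m)` cancels between numerator and denominator) while keeping every exponent non-positive, so no entry can overflow.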
158 changes: 158 additions & 0 deletions examples/VerifyPyTorchMNIST.lean
@@ -0,0 +1,158 @@
/-
Verify PyTorch MNIST model: Load exported weights, run inference, compare to Python.

This tests the full pipeline:
1. Load .npy weights exported from PyTorch
2. Run forward pass in Lean
3. Compare outputs to Python's expected results
-/
import SciLean
import SciLean.Data.Npy
import SciLean.Data.DataArray.TensorOperations

open SciLean

/-! ## Simple MLP Forward Pass

Architecture: 784 → 128 (ReLU) → 10
-/

/-- ReLU activation for FloatArray -/
def reluArr (x : FloatArray) : FloatArray := Id.run do
  let mut y : FloatArray := .emptyWithCapacity x.size
  for i in [0:x.size] do
    let v := x.uget i.toUSize sorry_proof
    y := y.push (if v > 0 then v else 0)
  y

/-- Element-wise addition for FloatArray -/
def addArr (x y : FloatArray) : FloatArray := Id.run do
  let mut z : FloatArray := .emptyWithCapacity x.size
  for i in [0:x.size] do
    z := z.push (x.uget i.toUSize sorry_proof + y.uget i.toUSize sorry_proof)
  z

/-- Single MLP forward pass using DataArrayN: y = W2 @ relu(W1 @ x + b1) + b2 -/
def mlpForward
    (w1 : Float^[128, 784]) (b1 : Float^[128])
    (w2 : Float^[10, 128]) (b2 : Float^[10])
    (x : Float^[784]) : Float^[10] :=
  -- h = W1 @ x + b1
  let h : Float^[128] := DataArrayN.contractRightAddR 1.0 w1 x 0.0 0
  let h := h + b1
  -- h_relu = relu(h)
  let hArr := DataArrayN.toFloatArray h
  let hReluArr := reluArr hArr
  let hRelu : Float^[128] := DataArrayN.fromFloatArray hReluArr
  -- logits = W2 @ h_relu + b2
  let logits : Float^[10] := DataArrayN.contractRightAddR 1.0 w2 hRelu 0.0 0
  logits + b2

/-- Get argmax prediction from FloatArray -/
def argmaxArr (x : FloatArray) : Nat := Id.run do
  if x.size == 0 then return 0
  let mut maxIdx := 0
  let mut maxVal := x.uget 0 sorry_proof
  for i in [1:x.size] do
    let v := x.uget i.toUSize sorry_proof
    if v > maxVal then
      maxIdx := i
      maxVal := v
  maxIdx

/-- Check if two FloatArrays are approximately equal -/
def almostEqArr (x y : FloatArray) (tol : Float := 1e-5) : Bool := Id.run do
  if x.size != y.size then return false
  for i in [0:x.size] do
    let xi := x.uget i.toUSize sorry_proof
    let yi := y.uget i.toUSize sorry_proof
    if (xi - yi).abs > tol then
      return false
  true

def main : IO Unit := do
  IO.println "=== Verifying PyTorch MNIST Model in Lean ==="
  IO.println ""

  -- Load weights
  IO.println "Loading weights..."
  let w1 : Float^[128, 784] ← Npy.loadFloat64 "data/mnist_weights/w1.npy"
  let b1 : Float^[128] ← Npy.loadFloat64 "data/mnist_weights/b1.npy"
  let w2 : Float^[10, 128] ← Npy.loadFloat64 "data/mnist_weights/w2.npy"
  let b2 : Float^[10] ← Npy.loadFloat64 "data/mnist_weights/b2.npy"
  IO.println " w1: 128x784, b1: 128, w2: 10x128, b2: 10"

  -- Load test data
  IO.println ""
  IO.println "Loading test data..."
  let testImages ← Npy.loadFile "data/mnist_weights/test_images.npy"
  let testLogits ← Npy.loadFile "data/mnist_weights/test_logits.npy"
  let testPreds ← Npy.loadFile "data/mnist_weights/test_preds.npy"
  let testLabels ← Npy.loadFile "data/mnist_weights/test_labels.npy"

  let imageFloats ← IO.ofExcept testImages.toFloatArray
  let logitFloats ← IO.ofExcept testLogits.toFloatArray
  let numTests := 10

  IO.println " test_images: 10x784"
  IO.println " test_logits: 10x10"

  -- Run inference and compare
  IO.println ""
  IO.println "Running inference and comparing..."

  let mut correct := 0
  let mut allMatch := true

  for testIdx in [0:numTests] do
    -- Extract single image
    let mut imgData : FloatArray := .emptyWithCapacity 784
    for j in [0:784] do
      imgData := imgData.push (imageFloats.uget (testIdx * 784 + j).toUSize sorry_proof)
    let img : Float^[784] := DataArrayN.fromFloatArray imgData

    -- Run forward pass
    let leanLogits := mlpForward w1 b1 w2 b2 img
    let leanLogitsArr := DataArrayN.toFloatArray leanLogits
    let leanPred := argmaxArr leanLogitsArr

    -- Get Python's expected logits for this sample
    let mut pyLogitsArr : FloatArray := .emptyWithCapacity 10
    for j in [0:10] do
      pyLogitsArr := pyLogitsArr.push (logitFloats.uget (testIdx * 10 + j).toUSize sorry_proof)

    -- Get Python's prediction (from test_preds as int64)
    let pyPredBytes := testPreds.data.copySlice (testPreds.startIndex + testIdx * 8) ByteArray.empty 0 8
    let pyPred := pyPredBytes[0]!.toNat -- low byte of a little-endian int64; valid since digits are 0-9

    -- Get true label (from test_labels as int64)
    let labelBytes := testLabels.data.copySlice (testLabels.startIndex + testIdx * 8) ByteArray.empty 0 8
    let trueLabel := labelBytes[0]!.toNat

    -- Check if logits match
    let logitsMatch := almostEqArr leanLogitsArr pyLogitsArr 1e-4

    -- Check predictions
    let predsMatch := leanPred == pyPred
    if leanPred == trueLabel then correct := correct + 1

    if not logitsMatch || not predsMatch then
      allMatch := false
      IO.println s!" Sample {testIdx}: MISMATCH"
      IO.println s!" Lean pred: {leanPred}, Python pred: {pyPred}, label: {trueLabel}"
      -- Show first few logit values for debugging
      IO.println s!" Lean logits[0..2]: {leanLogitsArr.uget 0 sorry_proof}, {leanLogitsArr.uget 1 sorry_proof}, {leanLogitsArr.uget 2 sorry_proof}"
      IO.println s!" Py logits[0..2]: {pyLogitsArr.uget 0 sorry_proof}, {pyLogitsArr.uget 1 sorry_proof}, {pyLogitsArr.uget 2 sorry_proof}"
    else
      let sym := if leanPred == trueLabel then "✓" else "✗"
      IO.println s!" Sample {testIdx}: pred={leanPred} label={trueLabel} {sym} (logits match)"

  IO.println ""
  IO.println s!"Accuracy: {correct}/{numTests} = {100.0 * correct.toFloat / numTests.toFloat}%"

  if allMatch then
    IO.println ""
    IO.println "=== SUCCESS: All logits match Python! ==="
  else
    IO.println ""
    IO.println "=== WARNING: Some outputs did not match ==="
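For reference, the forward pass and the int64 byte extraction that the Lean file performs can be sketched in plain Python (no NumPy or PyTorch). The helper names here (`matvec`, `mlp_forward`, `read_int64_le`) are hypothetical, introduced only for this sketch:

```python
import struct

def matvec(W, x):
    """Row-major matrix-vector product, the role contractRightAddR plays above."""
    return [sum(w * xi for w, xi in zip(row, x)) for row in W]

def relu(v):
    return [t if t > 0 else 0.0 for t in v]

def mlp_forward(w1, b1, w2, b2, x):
    """y = W2 @ relu(W1 @ x + b1) + b2, mirroring mlpForward."""
    h = [a + b for a, b in zip(matvec(w1, x), b1)]
    logits = matvec(w2, relu(h))
    return [a + b for a, b in zip(logits, b2)]

def read_int64_le(buf, idx):
    """Read element idx of a packed little-endian int64 array, as the Lean
    code does by slicing 8 bytes from test_preds/test_labels."""
    return struct.unpack_from("<q", buf, idx * 8)[0]
```

Taking only the low byte of the int64, as the Lean side does, is safe here because predictions and labels are digits 0-9, far below 256.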
49 changes: 49 additions & 0 deletions scripts/test_npy_roundtrip.py
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
Test .npy roundtrip: Create arrays in Python, save, load in Lean, verify.
"""
import numpy as np
import os

os.makedirs("data/npy_test", exist_ok=True)

# Test 1: Simple 1D array
arr1d = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float64)
np.save("data/npy_test/test_1d.npy", arr1d)
print(f"test_1d.npy: {arr1d}")
print(f" dtype: {arr1d.dtype}, shape: {arr1d.shape}")
print(f" sum: {arr1d.sum()}, mean: {arr1d.mean()}")

# Test 2: 2D matrix (3x4)
arr2d = np.arange(12.0, dtype=np.float64).reshape(3, 4)
np.save("data/npy_test/test_2d.npy", arr2d)
print(f"\ntest_2d.npy:\n{arr2d}")
print(f" dtype: {arr2d.dtype}, shape: {arr2d.shape}")
print(f" sum: {arr2d.sum()}")

# Test 3: Matrix-vector multiply test
# A @ x should give specific result
A = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float64) # 3x2
x = np.array([1.0, 2.0], dtype=np.float64) # 2
y = A @ x # Should be [5.0, 11.0, 17.0]
np.save("data/npy_test/test_A.npy", A)
np.save("data/npy_test/test_x.npy", x)
np.save("data/npy_test/test_y.npy", y)
print("\nMatrix-vector test:")
print(f" A (3x2):\n{A}")
print(f" x (2):\n{x}")
print(f" y = A @ x (3):\n{y}")

# Test 4: Softmax test
logits = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float64)
# Stable softmax
exp_logits = np.exp(logits - logits.max())
softmax_out = exp_logits / exp_logits.sum()
np.save("data/npy_test/test_logits.npy", logits)
np.save("data/npy_test/test_softmax.npy", softmax_out)
print("\nSoftmax test:")
print(f" logits: {logits}")
print(f" softmax: {softmax_out}")
print(f" sum (should be 1.0): {softmax_out.sum()}")

print("\n=== All test files created ===")