perf: Optimize Metal GEMM to 2.4+ TFLOP/s, add fused ML ops

GEMM Optimizations:
- Add gemm_m4_pro: Software-pipelined kernel with register prefetch
- Add gemm_m4_max: Larger 128×64 tiles for better compute density
- gemm_m4_pro requires M, N, K to be multiples of 64; gemm_m4_max additionally requires M to be a multiple of 128

Bug Fixes:
- Fix GPU reduce_sum for large arrays (>1024 elements) via two-pass reduction
- Fix Accelerate GEMM: Replace broken cblas_sgemm with vDSP_mmul
- Link the Accelerate framework in lakefile.lean

Fused Operations:
- Add fused softmax (single memory pass)
- Add bias_relu, bias_gelu kernels
- Add layer_norm kernel

Benchmarks at 2048×2048:
- MPS: 2.53 TFLOP/s
- M4Pro: 2.44 TFLOP/s
- M4: 2.42 TFLOP/s
- Accelerate: 2.19 TFLOP/s (CPU AMX)
lecopivo · alok · Nov 30, 2025 · Nov 30, 2025 · Nov 30, 2025 · Nov 30, 2025
commit f73af791b0eee4e0f3d1cbaf5e3de7e9b7a7612e
diff --git a/Metal/kmeans.metal b/Metal/kmeans.metal
diff --git a/Metal/metal_backend.mm b/Metal/metal_backend.mm
diff --git a/SciLean/FFI/Metal.lean b/SciLean/FFI/Metal.lean
@@ -179,12 +179,24 @@ opaque gemmSimd (m k n : USize) (A B : @& ByteArray) : ByteArray
 @[extern "scilean_metal_gemm_simd_opt_f32"]
 opaque gemmSimdOpt (m k n : USize) (A B : @& ByteArray) : ByteArray
 
--- M4-optimized GEMM: float4 loads, 128×128 tiles, no bounds checks
--- REQUIRES: M, N, K are multiples of 128
--- 16 simdgroups (512 threads), 4×4 accumulator grid per simdgroup
+-- M4-optimized GEMM: float4 loads, 64×64 tiles, no bounds checks
+-- REQUIRES: M, N, K are multiples of 64 (callers must guard, since the kernel performs no bounds checks)
+-- 8 simdgroups (256 threads), 4×2 accumulator grid per simdgroup
 @[extern "scilean_metal_gemm_m4_f32"]
 opaque gemmM4 (m k n : USize) (A B : @& ByteArray) : ByteArray
 
+-- M4-Pro GEMM: double-buffered kernel with software pipelining
+-- REQUIRES: M, N, K are multiples of 64 (not checked by this declaration; callers must guard, e.g. `n % 64 == 0`)
+-- Prefetches the next tile while computing the current one
+@[extern "scilean_metal_gemm_m4_pro_f32"]
+opaque gemmM4Pro (m k n : USize) (A B : @& ByteArray) : ByteArray
+
+-- M4-Max GEMM: larger tiles (128×64) for better compute density
+-- REQUIRES: M is a multiple of 128; N and K are multiples of 64 (not checked by this declaration; callers must guard)
+-- 16 simdgroups (512 threads), maximum occupancy
+@[extern "scilean_metal_gemm_m4_max_f32"]
+opaque gemmM4Max (m k n : USize) (A B : @& ByteArray) : ByteArray
+
 -- MPS matrix multiply on GPU (Float32): Apple's Metal Performance Shaders
 -- This uses Apple's highly optimized GEMM that leverages the Neural Engine and GPU
 @[extern "scilean_metal_gemm_mps_f32"]
@@ -287,24 +299,14 @@ opaque fill (n : USize) (value : Float32) : ByteArray
 
 -- Fused Operations
 
--- Softmax: softmax(x) = exp(x - max(x)) / sum(exp(x - max(x)))
--- Currently implemented using multiple GPU passes.
+-- Fused Softmax: softmax(x) = exp(x - max(x)) / sum(exp(x - max(x)))
+-- Single GPU dispatch with optimized memory access; n is presumably the number of Float32 elements in x (TODO confirm)
+@[extern "scilean_metal_softmax_f32"]
+opaque softmaxFused (n : USize) (x : @& ByteArray) : ByteArray
+
+-- Softmax: wrapper kept under the old name; it now delegates to softmaxFused (the multi-pass version is gone)
 def softmax (sz : USize) (x : ByteArray) : ByteArray :=
-  -- Find max for numerical stability
-  let maxVal := reduceMax sz x
-  -- Create array filled with max value
-  let maxArr := fill sz maxVal
-  -- Subtract max: x - max
-  let shifted := sub sz x maxArr
-  -- Compute exp using exp2: exp(x) = 2^(x * log2(e)), log2(e) ≈ 1.4427
-  let log2e : Float32 := (1.4426950408889634 : Float32)  -- log2(e)
-  let log2eArr := fill sz log2e
-  let scaledShifted := mul sz shifted log2eArr
-  let expVals := exp2 sz scaledShifted
-  -- Sum the exp values
-  let sumVal := reduceSum sz expVals
-  -- Normalize: exp / sum
-  let sumArr := fill sz sumVal
-  div sz expVals sumArr
+  -- Delegate unconditionally to the fused kernel (no availability check is performed here)
+  softmaxFused sz x
 
 end SciLean.Metal.Float32
diff --git a/examples/GEMMComparison.lean b/examples/GEMMComparison.lean
@@ -0,0 +1,75 @@
+import SciLean.FFI.Metal
+import SciLean.FFI.Float32Array
+
+open SciLean
+
+-- Time `numIters` calls of `gemm` on two n×n all-ones matrices and print the
+-- average latency and throughput under the label `name`.
+--
+-- `gemm` is a pure function, so a call whose result is never used may be
+-- dropped by the compiler. Every result (warmup included) is therefore folded
+-- into a byte counter that is checked after timing. The check also flags
+-- kernels that return a wrong-sized (e.g. empty) buffer, for which the
+-- reported throughput would be meaningless.
+def benchGemm (name : String) (gemm : USize → USize → USize → ByteArray → ByteArray → ByteArray)
+    (n : Nat) (numIters : Nat) : IO Unit := do
+  let dim := n.toUSize
+  let amat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let bmat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let warmupIters : Nat := 3
+
+  -- Warmup: accumulate result sizes so the calls stay observable
+  let mut warmupBytes := 0
+  for _ in [:warmupIters] do
+    let r := gemm dim dim dim amat bmat
+    warmupBytes := warmupBytes + r.size
+
+  -- Benchmark
+  let mut timedBytes := 0
+  let start ← IO.monoNanosNow
+  for _ in [:numIters] do
+    let r := gemm dim dim dim amat bmat
+    timedBytes := timedBytes + r.size
+  let finish ← IO.monoNanosNow
+
+  -- Each call should produce an n×n Float32 matrix, i.e. 4 * n * n bytes
+  let expectedBytes := (warmupIters + numIters) * (4 * n * n)
+  if warmupBytes + timedBytes != expectedBytes then
+    IO.eprintln s!"  {name}: WARNING: got {warmupBytes + timedBytes} result bytes, expected {expectedBytes}; timing below is not meaningful"
+
+  let totalNs := finish - start
+  -- Nat division: yields 0 when numIters is 0, which the guard below handles
+  let avgNs := totalNs / numIters
+  let avgMs := avgNs.toFloat / 1_000_000.0
+  let flops := 2.0 * n.toFloat * n.toFloat * n.toFloat
+  -- FLOP per nanosecond is numerically equal to GFLOP/s
+  let gflops := if avgNs > 0 then flops / avgNs.toFloat else 0.0
+  let tflops := gflops / 1000.0
+
+  if tflops >= 1.0 then
+    IO.println s!"  {name}: {avgMs} ms, {tflops} TFLOP/s"
+  else
+    IO.println s!"  {name}: {avgMs} ms, {gflops} GFLOP/s"
+
+-- Benchmark every GEMM implementation at several square sizes.
+-- Kernels with size requirements are only run when `n` satisfies them.
+def main : IO Unit := do
+  IO.println "=== GEMM Kernel Comparison ==="
+  IO.println "Comparing all available GEMM implementations\n"
+
+  for n in [256, 512, 1024, 2048] do
+    -- Fewer iterations for larger matrices to keep total runtime bounded
+    let numIters := if n >= 2048 then 5 else if n >= 1024 then 10 else 20
+    -- `n` is the matrix dimension here, so the iteration count is printed without an `n=` prefix
+    IO.println s!"Matrix size: {n}×{n} ({numIters} iterations)"
+
+    -- Basic naive GEMM
+    benchGemm "Naive      " Metal.Float32.gemm n numIters
+
+    -- Tiled GEMM (32x32 tiles)
+    benchGemm "Tiled      " Metal.Float32.gemmTiled n numIters
+
+    -- Simdgroup GEMM (hardware matrix units)
+    benchGemm "Simd       " Metal.Float32.gemmSimd n numIters
+
+    -- Optimized simdgroup GEMM
+    benchGemm "SimdOpt    " Metal.Float32.gemmSimdOpt n numIters
+
+    -- M4 and M4-Pro kernels both require M, N, K to be multiples of 64
+    if n % 64 == 0 then
+      benchGemm "M4         " Metal.Float32.gemmM4 n numIters
+      benchGemm "M4Pro      " Metal.Float32.gemmM4Pro n numIters
+
+    -- M4-Max: larger tiles; M must be a multiple of 128 (which also covers the 64 requirement on N, K)
+    if n % 128 == 0 then
+      benchGemm "M4Max      " Metal.Float32.gemmM4Max n numIters
+
+    -- MPS (Metal Performance Shaders)
+    benchGemm "MPS        " Metal.Float32.gemmMPS n numIters
+
+    -- Accelerate (CPU AMX)
+    benchGemm "Accelerate " Metal.Float32.gemmAccelerate n numIters
+
+    IO.println ""
+
+  IO.println "Done!"
diff --git a/examples/GEMMCorrectness.lean b/examples/GEMMCorrectness.lean
@@ -0,0 +1,51 @@
+import SciLean.FFI.Metal
+import SciLean.FFI.Float32Array
+
+open SciLean
+
+-- Check a GEMM kernel on all-ones n×n inputs.
+-- For all-ones matrices every entry of C = A * B equals n, so sum(C) = n³.
+-- Both C[0,0] and the total sum are verified (1% relative tolerance each).
+-- A result buffer of the wrong size is reported as a failure before any byte
+-- is read from it.
+def checkGemm (name : String) (gemm : USize → USize → USize → ByteArray → ByteArray → ByteArray) (n : Nat) : IO Unit := do
+  let amat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let bmat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let cmat := gemm n.toUSize n.toUSize n.toUSize amat bmat
+
+  -- An n×n Float32 result occupies 4 * n * n bytes. Bail out if the buffer is
+  -- too short to hold C[0,0] or has an unexpected size.
+  let expectedBytes := 4 * n * n
+  if cmat.size < 4 || cmat.size != expectedBytes then
+    IO.println s!"  {name}: FAILED (result has {cmat.size} bytes, expected {expectedBytes})"
+    return
+
+  -- Decode C[0,0] from the first 4 bytes (little-endian Float32)
+  let b0 := cmat.get! 0
+  let b1 := cmat.get! 1
+  let b2 := cmat.get! 2
+  let b3 := cmat.get! 3
+  let bits : UInt32 := b0.toUInt32 ||| (b1.toUInt32 <<< 8) ||| (b2.toUInt32 <<< 16) ||| (b3.toUInt32 <<< 24)
+  let firstEntry := (Float32.ofBits bits).toFloat
+
+  -- The size check above guarantees n ≥ 1, so both expected values are positive
+  let expectedEntry := n.toFloat
+  let expectedSum := (n * n * n).toFloat
+  let actualSumF32 := Metal.Float32.reduceSum (n * n).toUSize cmat
+  let actualSum := actualSumF32.toFloat
+
+  let entryError := (firstEntry - expectedEntry).abs / expectedEntry
+  let relError := (actualSum - expectedSum).abs / expectedSum
+
+  -- NaN compares false against the tolerance, so a NaN result is reported as FAILED
+  if relError < 0.01 && entryError < 0.01 then
+    IO.println s!"  {name}: CORRECT (sum = {actualSum}, expected = {expectedSum}, C[0,0] = {firstEntry}, C[0,0] bits = {bits})"
+  else
+    IO.println s!"  {name}: FAILED (sum = {actualSum}, expected = {expectedSum}, error = {relError * 100}%, C[0,0] = {firstEntry}, expected C[0,0] = {expectedEntry}, C[0,0] bits = {bits})"
+
+-- Run the all-ones correctness check for each GEMM backend at several sizes.
+-- Kernels with size requirements are skipped (and reported as such) for sizes
+-- they do not support.
+def main : IO Unit := do
+  IO.println "=== GEMM Correctness Check ==="
+  IO.println "Computing C = A * B where A, B are all 1s\n"
+
+  -- First, test that fill works
+  IO.println "Testing fill(4, 1.0)..."
+  let testFill := Metal.Float32.fill 4 (1.0 : Float32)
+  IO.println s!"  Fill result size: {testFill.size} bytes"
+  -- Only dump the leading bytes when they exist, so a failed fill is reported instead of panicking
+  if testFill.size >= 4 then
+    IO.println s!"  Fill bytes: [{testFill.get! 0}, {testFill.get! 1}, {testFill.get! 2}, {testFill.get! 3}]"
+  else
+    IO.println "  Fill bytes: <result too short to display>"
+  IO.println ""
+
+  -- 128 is included so the M4Pro kernel is exercised on more than a single 64×64 tile
+  for n in [4, 8, 64, 128] do
+    IO.println s!"Matrix size: {n}×{n}"
+    -- gemmM4Pro requires M, N, K to be multiples of 64; do not call it otherwise
+    if n % 64 == 0 then
+      checkGemm "M4Pro      " Metal.Float32.gemmM4Pro n
+    else
+      IO.println "  M4Pro      : SKIPPED (requires n to be a multiple of 64)"
+    checkGemm "MPS        " Metal.Float32.gemmMPS n
+    checkGemm "Accelerate " Metal.Float32.gemmAccelerate n
+    IO.println ""
+
+  IO.println "Done!"
diff --git a/examples/GEMMFocus.lean b/examples/GEMMFocus.lean
@@ -0,0 +1,52 @@
+import SciLean.FFI.Metal
+import SciLean.FFI.Float32Array
+
+open SciLean
+
+-- Time `numIters` calls of `gemm` on two n×n all-ones matrices and print the
+-- average latency and throughput under the label `name`.
+--
+-- `gemm` is a pure function, so a call whose result is never used may be
+-- dropped by the compiler. Every result (warmup included) is therefore folded
+-- into a byte counter that is checked after timing. The check also flags
+-- kernels that return a wrong-sized (e.g. empty) buffer, for which the
+-- reported throughput would be meaningless.
+def benchGemm (name : String) (gemm : USize → USize → USize → ByteArray → ByteArray → ByteArray)
+    (n : Nat) (numIters : Nat) : IO Unit := do
+  let dim := n.toUSize
+  let amat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let bmat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+  let warmupIters : Nat := 5
+
+  -- Warmup: accumulate result sizes so the calls stay observable
+  let mut warmupBytes := 0
+  for _ in [:warmupIters] do
+    let r := gemm dim dim dim amat bmat
+    warmupBytes := warmupBytes + r.size
+
+  -- Benchmark
+  let mut timedBytes := 0
+  let start ← IO.monoNanosNow
+  for _ in [:numIters] do
+    let r := gemm dim dim dim amat bmat
+    timedBytes := timedBytes + r.size
+  let finish ← IO.monoNanosNow
+
+  -- Each call should produce an n×n Float32 matrix, i.e. 4 * n * n bytes
+  let expectedBytes := (warmupIters + numIters) * (4 * n * n)
+  if warmupBytes + timedBytes != expectedBytes then
+    IO.eprintln s!"  {name}: WARNING: got {warmupBytes + timedBytes} result bytes, expected {expectedBytes}; timing below is not meaningful"
+
+  let totalNs := finish - start
+  -- Nat division: yields 0 when numIters is 0, which the guard below handles
+  let avgNs := totalNs / numIters
+  let avgMs := avgNs.toFloat / 1_000_000.0
+  let flops := 2.0 * n.toFloat * n.toFloat * n.toFloat
+  -- FLOP per nanosecond is numerically equal to GFLOP/s
+  let gflops := if avgNs > 0 then flops / avgNs.toFloat else 0.0
+  let tflops := gflops / 1000.0
+
+  if tflops >= 1.0 then
+    IO.println s!"  {name}: {avgMs} ms, {tflops} TFLOP/s"
+  else
+    IO.println s!"  {name}: {avgMs} ms, {gflops} GFLOP/s"
+
+-- Benchmark M4Pro against MPS and Accelerate at power-of-two sizes.
+def main : IO Unit := do
+  IO.println "=== Focused GEMM Analysis ==="
+  IO.println "Testing M4Pro, MPS, and Accelerate at various sizes\n"
+
+  -- Powers of two from 2^7 to 2^12
+  for size in ([128, 256, 512, 1024, 2048, 4096] : List Nat) do
+    -- Iteration count shrinks as the matrices grow
+    let reps :=
+      if size < 1024 then 50
+      else if size < 2048 then 20
+      else 10
+    IO.println s!"Matrix size: {size}×{size} ({reps} iterations)"
+
+    -- M4Pro only runs on sizes that are multiples of 64
+    let m4ProSupported := size % 64 == 0
+    if m4ProSupported then
+      benchGemm "M4Pro      " Metal.Float32.gemmM4Pro size reps
+    benchGemm "MPS        " Metal.Float32.gemmMPS size reps
+    benchGemm "Accelerate " Metal.Float32.gemmAccelerate size reps
+    IO.println ""
+
+  IO.println "Done!"
diff --git a/examples/LargeGEMM.lean b/examples/LargeGEMM.lean
@@ -0,0 +1,47 @@
+import SciLean.FFI.Metal
+import SciLean.FFI.Float32Array
+
+open SciLean
+
+-- Benchmark the basic `gemm` kernel at several square sizes and print the
+-- average latency and throughput per size.
+--
+-- `fill` and `gemm` are pure functions, so a warmup call whose result is
+-- ignored may be dropped by the compiler. The warmup loops therefore
+-- accumulate the result sizes and check them afterwards.
+def main : IO Unit := do
+  IO.println "=== Large GEMM Performance Benchmark ==="
+  IO.println "Warming up..."
+
+  -- Extended warmup on 64×64 inputs (4096 elements each)
+  let mut warmupBytes := 0
+  for _ in [:50] do
+    let a := Metal.Float32.fill 4096 (1.0 : Float32)
+    let b := Metal.Float32.fill 4096 (1.0 : Float32)
+    let r := Metal.Float32.gemm 64 64 64 a b
+    warmupBytes := warmupBytes + r.size
+  if warmupBytes == 0 then
+    IO.eprintln "WARNING: warmup GEMM calls returned empty results"
+
+  IO.println "Running benchmarks..."
+
+  for n in [64, 128, 256, 512, 1024, 2048] do
+    let amat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+    let bmat := Metal.Float32.fill (n * n).toUSize (1.0 : Float32)
+
+    -- More iterations for smaller matrices
+    let numIters := if n >= 2048 then 5 else if n >= 1024 then 20 else if n >= 512 then 50 else 100
+
+    -- Warmup for this size; sizes are accumulated so the calls stay observable
+    let mut sizeWarmup := 0
+    for _ in [:5] do
+      let r := Metal.Float32.gemm n.toUSize n.toUSize n.toUSize amat bmat
+      sizeWarmup := sizeWarmup + r.size
+    if sizeWarmup == 0 then
+      IO.eprintln s!"WARNING: gemm({n}×{n}) returned empty results during warmup"
+
+    -- Time entire loop (like OverheadTest that got realistic numbers)
+    let mut sizeAccum := 0
+    let start ← IO.monoNanosNow
+    for _ in [:numIters] do
+      let r := Metal.Float32.gemm n.toUSize n.toUSize n.toUSize amat bmat
+      sizeAccum := sizeAccum + r.size
+    let finish ← IO.monoNanosNow
+
+    let totalNs := finish - start
+    let avgNs := totalNs / numIters
+    let avgUs := avgNs.toFloat / 1000.0
+    let avgMs := avgUs / 1000.0
+    let flops := 2.0 * n.toFloat * n.toFloat * n.toFloat
+    -- FLOP per nanosecond is numerically equal to GFLOP/s
+    let gflops := if avgNs > 0 then flops / avgNs.toFloat else 0.0
+
+    IO.println s!"gemm({n}×{n}): {avgMs} ms, {gflops} GFLOP/s (size={sizeAccum})"
+
+  IO.println ""
+  IO.println "Done!"