From dbf055ed5e1fd4c0307550eb8be3127e32c63a4b Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Wed, 20 Aug 2025 11:27:41 +0100 Subject: [PATCH 1/5] Fixes for DynamicQuantizeMatMul and Attention3D tests Signed-off-by: Jonathan Clohessy --- .../quantization/dynamic_quantize_matmul.cc | 14 +++ .../core/mlas/lib/kleidiai/sgemm_kleidiai.cpp | 87 ++++++++++++------- 2 files changed, 70 insertions(+), 31 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 85a2cbaea0e44..7ffb70137afb6 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -200,6 +200,20 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available); + // Kleidi dynamic path requires strictly positive, finite scales. + // Disable if any invalid scale is detected. + if (can_use_dynamic_quant_mlas_) { + const float* bs_data = b_scale_tensor->Data(); + const size_t bs_size = static_cast(b_scale_tensor->Shape().Size()); + for (size_t i = 0; i < bs_size; ++i) { + const float s = bs_data[i]; + if (!std::isfinite(s) || s <= 0.0f) { + can_use_dynamic_quant_mlas_ = false; + break; + } + } + } + // Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops. // We check that here too before attempting to use them. if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) { diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index caa445b71e2a5..ba1e175f39d0f 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -153,30 +153,41 @@ ArmKleidiAI::MlasGemmBatch( MLAS_THREADPOOL* ThreadPool ) { - if(TransA == CblasTrans) - { - return false; + if (M == 0 || N == 0) { + return true; } - if (TransA == CblasNoTrans && K == 0) { - if (Data->beta != 1.0f) { + + if (Data->alpha == 0.0f) { + if (Data->beta == 0.0f) { + for (size_t i = 0; i < M; ++i) { + std::fill_n(Data->C + i * Data->ldc, N, 0.0f); + } + } else if (Data->beta != 1.0f) { for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { Data->C[i * Data->ldc + j] *= Data->beta; } } } + return true; } - if (Data->beta == 0.0f){ - std::fill_n(Data->C, M * Data->ldc, 0.0f); - } - //Fallback in the case of unsupported cases - if (M == 0 || N == 0 || K == 0 || - TransA != CblasNoTrans || - (TransB != CblasNoTrans && !Data[0].BIsPacked)) - { - return false; + + if (K == 0) { + if (Data->beta == 0.0f) { + for (size_t i = 0; i < M; ++i) { + std::fill_n(Data->C + i * Data->ldc, N, 0.0f); + } + } else if (Data->beta != 1.0f) { + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + Data->C[i * Data->ldc + j] *= Data->beta; + } + } + } + return true; } + if (TransA == CblasNoTrans) { const size_t mr = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); const size_t kr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); @@ -185,11 +196,9 @@ ArmKleidiAI::MlasGemmBatch( auto m_step = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); auto n_step = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); - if (M < m_step || N < n_step) { - if (GetMlasPlatform().MlasGemmBatchOverride != ArmKleidiAI::MlasGemmBatch){ - //Fallback to MLAS - return false; - } + if ((M < m_step && N < n_step) && !Data->BIsPacked) { + // Fallback to MLAS + return false; } std::vector KaiPackedData; @@ -316,7 +325,7 @@ ArmKleidiAI::MlasGemmBatch( float* dst_tile = reinterpret_cast(CTile); // quick copy of data in cases where we are not scaling or accumulating anything - // with bounds checking on tile sizing to ensure the data fits in the memory block + // with bounds checking on tile sizing to ensure the data fits in the memory block bool can_memcpy = ( Data[BIdx].alpha == 1.0f && Data[BIdx].beta == 0.0f && @@ -328,21 +337,37 @@ ArmKleidiAI::MlasGemmBatch( if (can_memcpy) { std::memcpy(dst_tile, temp_tile, TileSizeM * TileSizeN * sizeof(float)); - }else { - // apply alpha scaling and beta to output files - for (size_t i = 0; i < TileSizeM; ++i) { - for (size_t j = 0; j < TileSizeN; ++j) { - const size_t idx = i * TileSizeN + j; - const size_t dst_idx = i * Data[BIdx].ldc + j; - - float ab = temp_tile[idx]; - float c_orig = dst_tile[dst_idx]; + return true; + } - dst_tile[dst_idx] = Data[BIdx].alpha * ab + Data[BIdx].beta * c_orig; + float alpha = Data[BIdx].alpha; + float beta = Data[BIdx].beta; + size_t ldc = Data[BIdx].ldc; + + for (size_t i = 0; i < TileSizeM; ++i) { + for (size_t j = 0; j < TileSizeN; ++j) { + const size_t temp_idx = i * TileSizeN + j; + const size_t dst_idx = i * ldc + j; + + float ab = temp_tile[temp_idx]; + float c_orig = dst_tile[dst_idx]; + + if (alpha == 1.0f && beta == 0.0f) { + dst_tile[dst_idx] = ab; + } else if (alpha == 1.0f) { + dst_tile[dst_idx] = ab + beta * c_orig; + } else if (beta == 0.0f) { + dst_tile[dst_idx] = alpha * ab; + } else { + dst_tile[dst_idx] = alpha * ab + beta * c_orig; } } } + return true; }); + return true; + } + else { + return false; } - return true; } From b59fbefd5485b9d055803cc04d226a262319ab69 Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Thu, 21 Aug 2025 18:48:17 +0100 Subject: [PATCH 2/5] Apply Lintrunner fixes and correct GEMM A pointer arithmetic Signed-off-by: Jonathan Clohessy --- .../cpu/quantization/dynamic_quantize_matmul.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 7ffb70137afb6..c3e0e61dee593 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -206,10 +206,10 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { const float* bs_data = b_scale_tensor->Data(); const size_t bs_size = static_cast(b_scale_tensor->Shape().Size()); for (size_t i = 0; i < bs_size; ++i) { - const float s = bs_data[i]; - if (!std::isfinite(s) || s <= 0.0f) { - can_use_dynamic_quant_mlas_ = false; - break; + const float s = bs_data[i]; + if (!std::isfinite(s) || s <= 0.0f) { + can_use_dynamic_quant_mlas_ = false; + break; } } } @@ -406,8 +406,9 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { std::vector gemm_data_vec(num_gemms); for (size_t gemm_idx = 0; gemm_idx < num_gemms; gemm_idx++) { + auto& params = gemm_data_vec[gemm_idx]; - params.A = reinterpret_cast(a_data + helper.LeftOffsets()[gemm_idx]); + params.A = reinterpret_cast(a_data) + helper.LeftOffsets()[gemm_idx]; params.lda = gemm_shape.K; params.PackedB = packed_b_.get(); params.C = y_data + helper.OutputOffsets()[gemm_idx]; From 02cd2eca50fa5f605a9292557ccb335dd2990591 Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Mon, 25 Aug 2025 17:55:50 +0100 Subject: [PATCH 3/5] Refactor a_data handling to be the same as y_data Signed-off-by: Jonathan Clohessy --- .../quantization/dynamic_quantize_matmul.cc | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index c3e0e61dee593..36a6f70cc69d9 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -203,14 +203,13 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { // Kleidi dynamic path requires strictly positive, finite scales. // Disable if any invalid scale is detected. if (can_use_dynamic_quant_mlas_) { - const float* bs_data = b_scale_tensor->Data(); - const size_t bs_size = static_cast(b_scale_tensor->Shape().Size()); - for (size_t i = 0; i < bs_size; ++i) { - const float s = bs_data[i]; - if (!std::isfinite(s) || s <= 0.0f) { - can_use_dynamic_quant_mlas_ = false; - break; - } + const auto bs = b_scale_tensor->DataAsSpan(); + const bool has_invalid = + std::any_of(bs.begin(), bs.end(), + [](float s) { return !std::isfinite(s) || s <= 0.0f; }); + + if (has_invalid) { + can_use_dynamic_quant_mlas_ = false; } } @@ -393,7 +392,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { if (y->Shape().Size() == 0) return Status::OK(); - auto a_data = static_cast(ctx->Input(IN_A)->DataRaw()); + const float* a_data = ctx->Input(IN_A)->Data(); auto* y_data = y->MutableData(); // batch gemm @@ -406,9 +405,8 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { std::vector gemm_data_vec(num_gemms); for (size_t gemm_idx = 0; gemm_idx < num_gemms; gemm_idx++) { - auto& params = gemm_data_vec[gemm_idx]; - params.A = reinterpret_cast(a_data) + helper.LeftOffsets()[gemm_idx]; + params.A = a_data + helper.LeftOffsets()[gemm_idx]; params.lda = gemm_shape.K; params.PackedB = packed_b_.get(); params.C = y_data + helper.OutputOffsets()[gemm_idx]; From c6ba0105b402e174bcf279d45cf5d7d5cf10b478 Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Tue, 26 Aug 2025 19:01:38 +0100 Subject: [PATCH 4/5] Consolidate conditions and remove extraneous brackets Signed-off-by: Jonathan Clohessy --- .../core/mlas/lib/kleidiai/sgemm_kleidiai.cpp | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index ba1e175f39d0f..7a69eaf0bf7ba 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -157,7 +157,7 @@ ArmKleidiAI::MlasGemmBatch( return true; } - if (Data->alpha == 0.0f) { + if (Data->alpha == 0.0f || K == 0) { if (Data->beta == 0.0f) { for (size_t i = 0; i < M; ++i) { std::fill_n(Data->C + i * Data->ldc, N, 0.0f); @@ -172,22 +172,6 @@ ArmKleidiAI::MlasGemmBatch( return true; } - if (K == 0) { - if (Data->beta == 0.0f) { - for (size_t i = 0; i < M; ++i) { - std::fill_n(Data->C + i * Data->ldc, N, 0.0f); - } - } else if (Data->beta != 1.0f) { - for (size_t i = 0; i < M; ++i) { - for (size_t j = 0; j < N; ++j) { - Data->C[i * Data->ldc + j] *= Data->beta; - } - } - } - return true; - } - - if (TransA == CblasNoTrans) { const size_t mr = kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); const size_t kr = kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); @@ -196,7 +180,7 @@ ArmKleidiAI::MlasGemmBatch( auto m_step = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); auto n_step = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); - if ((M < m_step && N < n_step) && !Data->BIsPacked) { + if (M < m_step && N < n_step && !Data->BIsPacked) { // Fallback to MLAS return false; } From 29671a7376b4436df4e76a1637c338734e088f8a Mon Sep 17 00:00:00 2001 From: Jonathan Clohessy Date: Wed, 27 Aug 2025 16:25:09 +0100 Subject: [PATCH 5/5] Removed boolean return types from within lambda scope Signed-off-by: Jonathan Clohessy --- onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index 7a69eaf0bf7ba..c579ff1542eb9 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -321,7 +321,7 @@ ArmKleidiAI::MlasGemmBatch( if (can_memcpy) { std::memcpy(dst_tile, temp_tile, TileSizeM * TileSizeN * sizeof(float)); - return true; + return; } float alpha = Data[BIdx].alpha; @@ -347,7 +347,7 @@ ArmKleidiAI::MlasGemmBatch( } } } - return true; + return; }); return true; }