
Commit 32ea3f5

derdeljan-msft authored and snnn committed
[CPU] Optimize GQA attention bias application for FP16 (#25871)
### Description

When the GQA op receives an attention bias input in FP16, platforms without native FP16 math support must cast the bias to FP32, which requires a temporary buffer to hold the converted values. The problem was that this temporary buffer was allocated and deallocated inside a loop, once for every processed token. This change refactors the implementation so the allocation happens only once. Phi model throughput increased by 15%.
1 parent 1a743ae commit 32ea3f5
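The diff below boils down to hoisting a scratch allocation out of a per-token loop. The following is a minimal, self-contained sketch of that pattern, not the actual kernel: the names (`HalfToFloat`, `ApplyBiasSlow`, `ApplyBiasFast`) and the `std::vector` scratch are illustrative only; the real code converts with `MlasConvertHalfToFloatBuffer` and allocates through an onnxruntime allocator wrapped in a `BufferUniquePtr`.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using fp16_t = uint16_t;  // bit pattern of an IEEE 754 half; stand-in for MLFloat16

// Minimal half -> float conversion (normal values and zero only; no NaN/Inf/
// denormal handling). Stands in for MlasConvertHalfToFloatBuffer.
inline float HalfToFloat(fp16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1Fu;
  uint32_t mant = h & 0x3FFu;
  uint32_t bits = (h & 0x7FFFu) == 0
                      ? sign  // signed zero
                      : sign | ((exp + 112u) << 23) | (mant << 13);
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Before: the fp32 scratch buffer is allocated and freed once per token.
void ApplyBiasSlow(const fp16_t* bias, float* scores,
                   size_t seq_len, size_t total_len) {
  for (size_t tok = 0; tok < seq_len; ++tok) {
    std::vector<float> scratch(total_len);  // allocation inside the loop
    for (size_t i = 0; i < total_len; ++i)
      scratch[i] = HalfToFloat(bias[tok * total_len + i]);
    for (size_t i = 0; i < total_len; ++i)
      scores[tok * total_len + i] += scratch[i];
  }  // deallocation inside the loop
}

// After: the scratch buffer is hoisted out of the loop and reused.
void ApplyBiasFast(const fp16_t* bias, float* scores,
                   size_t seq_len, size_t total_len) {
  std::vector<float> scratch(total_len);  // one allocation up front
  for (size_t tok = 0; tok < seq_len; ++tok) {
    for (size_t i = 0; i < total_len; ++i)
      scratch[i] = HalfToFloat(bias[tok * total_len + i]);
    for (size_t i = 0; i < total_len; ++i)
      scores[tok * total_len + i] += scratch[i];
  }
}
```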

File tree

1 file changed: +12 −3 lines


onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h

Lines changed: 12 additions & 3 deletions
```diff
@@ -280,6 +280,18 @@ class GQAAttentionBase {
                     output, static_cast<int>(present_buffer_sequence_length), nullptr);
       }

+      // Pre-allocate buffer for attention mask to avoid allocating it for every processed token
+      float* attention_bias_thread_fp32 = nullptr;
+      if (attention_bias_thread != nullptr) {
+        if constexpr (!std::is_same_v<U, T>) {
+          static_assert(std::is_same_v<U, float> && std::is_same_v<T, MLFloat16>);
+
+          size_t bytes = attention_total_seqlen * sizeof(float);
+          attention_bias_thread_fp32 = static_cast<float*>(allocator->Alloc(bytes));
+        }
+      }
+      BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator));
+
       // compute Softmax
       U* output_softmax = output;
       for (size_t seq = 0; seq < sequence_length; seq++) {
@@ -316,9 +328,6 @@
                               static_cast<int>(window_size));
         } else {
           static_assert(std::is_same_v<U, float> && std::is_same_v<T, MLFloat16>);
-          size_t bytes = window_size * sizeof(float);
-          auto attention_bias_thread_fp32 = static_cast<float*>(allocator->Alloc(bytes));
-          BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator));

           MlasConvertHalfToFloatBuffer(attention_bias_thread + start_offset, attention_bias_thread_fp32, window_size);
           ApplyAttentionBias(output_softmax + start_offset, attention_bias_thread_fp32, static_cast<int>(window_size));
```
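As the diff shows, the hoisted allocation is only performed when `U` and `T` differ (FP16 input with float math, as the `static_assert` documents), and ownership of the single scratch buffer is handed to a `BufferUniquePtr` with a `BufferDeleter`, so it is released once when the enclosing scope exits rather than once per token.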
