Fix MoE CPP tests (microsoft#25877)

This change adds skip test for QMoE CPU tests when running on TensorRT or CUDA EP. In the QMoE kernel there was a memory overwrite bug in the accumulate part, updated that and this fixed the python tests back
intel · preetha-intel · Sep 1, 2025 · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025
commit abe485ee02f6432cbc608c3b0f765e86df36e467
diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
@@ -331,7 +331,13 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
         const int64_t token_idx = route_idx / k_;
         const float weight = route_scale[route_idx];
 
-        float* dest = thread_local_outputs + static_cast<size_t>(thread_id) * output_buffer_size + token_idx * hidden_size;
+        const size_t buffer_offset = static_cast<size_t>(token_idx) * static_cast<size_t>(hidden_size);
+        if (buffer_offset + static_cast<size_t>(hidden_size) > output_buffer_size) {
+          // Skip this token to prevent buffer overflow
+          continue;
+        }
+
+        float* dest = thread_local_outputs + static_cast<size_t>(thread_id) * output_buffer_size + buffer_offset;
         const float* src = C2 + i * hidden_size;
         for (int64_t j = 0; j < hidden_size; ++j) {
           dest[j] += weight * (src[j] + (B2_bias ? bias2_float[j] : 0.0f));
@@ -344,8 +350,9 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   auto accumulate = [&](float* buffer) {
     memset(buffer, 0, output_buffer_size * sizeof(float));
     for (int i = 0; i < num_expert_threads; ++i) {
+      const size_t thread_offset = static_cast<size_t>(i) * output_buffer_size;
       for (size_t j = 0; j < output_buffer_size; ++j) {
-        buffer[j] += thread_local_outputs[static_cast<size_t>(i) * output_buffer_size + j];
+        buffer[j] += thread_local_outputs[thread_offset + j];
       }
     }
   };

diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc
@@ -144,6 +144,12 @@ static void RunQMoETest(const std::vector<float>& input, const std::vector<float
   // Test CPU execution provider (always available)
   // Skip CPU test if FC3 weights are provided since CPU doesn't support FC3
   if (fc3_experts_weights.empty()) {
+    // Ensure CPU EP is available before running CPU tests
+    auto cpu_ep = DefaultCpuExecutionProvider();
+    if (!cpu_ep) {
+      return;  // Skip CPU test if CPU EP is not available
+    }
+
     OpTester cpu_tester("QMoE", 1, onnxruntime::kMSDomain);
     cpu_tester.AddAttribute<int64_t>("k", static_cast<int64_t>(top_k));
     cpu_tester.AddAttribute<std::string>("activation_type", activation_type);
@@ -1323,6 +1329,13 @@ TEST(MoETest, QMoETest_Mixtral_Int4) {
 
 // CPU-specific QMoE tests
 TEST(MoETest, QMoETest_CPU_Int4_MLAS) {
+#ifdef USE_MLAS
+  // Skip this test if we're not testing CPU execution provider
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (!cpu_ep) {
+    GTEST_SKIP() << "CPU execution provider not available";
+  }
+
   int num_rows = 2;
   int num_experts = 2;
   int hidden_size = 32;
@@ -1387,9 +1400,19 @@ TEST(MoETest, QMoETest_CPU_Int4_MLAS) {
   std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
   cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
   cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
+#else
+  GTEST_SKIP() << "Skipping CPU QMoE test";
+#endif
 }
 
 TEST(MoETest, QMoETest_CPU_Int8_MLAS) {
+#ifdef USE_MLAS
+  // Skip this test if we're not testing CPU execution provider
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (!cpu_ep) {
+    GTEST_SKIP() << "CPU execution provider not available";
+  }
+
   // Test CPU implementation with 8-bit quantization - CPU ONLY
   int num_rows = 1;
   int num_experts = 2;
@@ -1446,9 +1469,19 @@ TEST(MoETest, QMoETest_CPU_Int8_MLAS) {
   std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
   cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
   cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
+#else
+  GTEST_SKIP() << "Skipping CPU QMoE test";
+#endif
 }
 
 TEST(MoETest, QMoETest_CPU_FC3_Error) {
+#ifdef USE_MLAS
+  // Skip this test if we're not testing CPU execution provider
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (!cpu_ep) {
+    GTEST_SKIP() << "CPU execution provider not available";
+  }
+
   // Test that CPU throws error when FC3 gating is provided - CPU ONLY
   int num_rows = 1;
   int num_experts = 2;
@@ -1506,9 +1539,19 @@ TEST(MoETest, QMoETest_CPU_FC3_Error) {
 
   // Expect this to fail with FC3 not implemented error
   cpu_tester.Run(OpTester::ExpectResult::kExpectFailure, "FC3 gating is not yet implemented", {}, nullptr, &cpu_execution_providers);
+#else
+  GTEST_SKIP() << "Skipping CPU QMoE test";
+#endif
 }
 
 TEST(MoETest, QMoETest_CPU_SwiGLU_Int4) {
+#ifdef USE_MLAS
+  // Skip this test if we're not testing CPU execution provider
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (!cpu_ep) {
+    GTEST_SKIP() << "CPU execution provider not available";
+  }
+
   // Test CPU implementation with 4-bit quantization and SwiGLU activation
   int num_rows = 2;
   int num_experts = 2;
@@ -1573,9 +1616,18 @@ TEST(MoETest, QMoETest_CPU_SwiGLU_Int4) {
   std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
   cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
   cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
+#else
+  GTEST_SKIP() << "Skipping CPU QMoE test";
+#endif
 }
 
 TEST(MoETest, QMoETest_CPU_SwiGLU_Int8) {
+#ifdef USE_MLAS
+  // Skip this test if we're not testing CPU execution provider
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (!cpu_ep) {
+    GTEST_SKIP() << "CPU execution provider not available";
+  }
   // Test CPU implementation with 8-bit quantization and SwiGLU activation
   int num_rows = 1;
   int num_experts = 2;
@@ -1633,6 +1685,9 @@ TEST(MoETest, QMoETest_CPU_SwiGLU_Int8) {
   std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
   cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
   cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
+#else
+  GTEST_SKIP() << "Skipping CPU QMoE test";
+#endif
 }
 
 #endif