Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
e525ea2
[CPU] Optimize GQA attention bias application for FP16 (#25871)
derdeljan-msft Aug 28, 2025
574806b
Fixes for DynamicQuantizeMatMul and Attention3D tests (#25814)
JonathanC-ARM Aug 28, 2025
abe485e
Fix MoE CPP tests (#25877)
apsonawane Aug 28, 2025
179f371
[c++] Eliminate dynamic initialization of static Ort::Global<void>::a…
chwarr Aug 28, 2025
3563f2e
python GPU IO Bindings for NVIDIA (#25776)
ishwar-raut1 Aug 28, 2025
47f355a
[CANN] Add a `enable_cann_subgraph` feature parameter (#25867)
bachelor-dou Aug 28, 2025
1eb18f1
[EP ABI] Add OpAttr_GetTensorAttributeAsOrtValue and replace the exis…
chilo-ms Aug 29, 2025
820554e
Language bindings for model compatibility API (#25878)
adrastogi Aug 29, 2025
4754a1d
[QNN-EP] Introduce Level1 Transformer into qnn.preprocess (#25883)
qti-hungjuiw Aug 29, 2025
3fc9779
[QNN EP] Minor fix weight name missing when not valid QDQ node group …
qti-yuduo Aug 29, 2025
ca77b7e
Add custom ops library_path to EP metadata (#25830)
psakhamoori Aug 29, 2025
7a919c6
[OVEP] OpenVINO EP Features and bug-fixes for ORT-1.23 (#25884)
preetha-intel Aug 29, 2025
c9bdbd7
[java] Auto EP and compile model support (#25131)
Craigacp Aug 29, 2025
d51430c
Add error handling to extract_nuget_files.ps1 (#25866)
Aug 29, 2025
928df7c
[Fix] illegal memory access in GetInputIndices with optional inputs (…
mingyueliuh Aug 29, 2025
69ec7b1
Re-enable cpuinfo for ARM64EC (#25863)
edgchen1 Aug 29, 2025
1aa0fab
Merge branch 'master' into sync_msft_01092025
Jaswanth51 Sep 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix MoE CPP tests (microsoft#25877)
This change skips the QMoE CPU tests when running on the TensorRT or
CUDA EP.
It also fixes a memory overwrite bug in the accumulate part of the QMoE
kernel; with that fix, the Python tests pass again.
  • Loading branch information
apsonawane authored Aug 28, 2025
commit abe485ee02f6432cbc608c3b0f765e86df36e467
11 changes: 9 additions & 2 deletions onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,13 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
const int64_t token_idx = route_idx / k_;
const float weight = route_scale[route_idx];

float* dest = thread_local_outputs + static_cast<size_t>(thread_id) * output_buffer_size + token_idx * hidden_size;
const size_t buffer_offset = static_cast<size_t>(token_idx) * static_cast<size_t>(hidden_size);
if (buffer_offset + static_cast<size_t>(hidden_size) > output_buffer_size) {
// Skip this token to prevent buffer overflow
continue;
}

float* dest = thread_local_outputs + static_cast<size_t>(thread_id) * output_buffer_size + buffer_offset;
const float* src = C2 + i * hidden_size;
for (int64_t j = 0; j < hidden_size; ++j) {
dest[j] += weight * (src[j] + (B2_bias ? bias2_float[j] : 0.0f));
Expand All @@ -344,8 +350,9 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
auto accumulate = [&](float* buffer) {
memset(buffer, 0, output_buffer_size * sizeof(float));
for (int i = 0; i < num_expert_threads; ++i) {
const size_t thread_offset = static_cast<size_t>(i) * output_buffer_size;
for (size_t j = 0; j < output_buffer_size; ++j) {
buffer[j] += thread_local_outputs[static_cast<size_t>(i) * output_buffer_size + j];
buffer[j] += thread_local_outputs[thread_offset + j];
}
}
};
Expand Down
55 changes: 55 additions & 0 deletions onnxruntime/test/contrib_ops/moe_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ static void RunQMoETest(const std::vector<float>& input, const std::vector<float
// Test CPU execution provider (always available)
// Skip CPU test if FC3 weights are provided since CPU doesn't support FC3
if (fc3_experts_weights.empty()) {
// Ensure CPU EP is available before running CPU tests
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
return; // Skip CPU test if CPU EP is not available
}

OpTester cpu_tester("QMoE", 1, onnxruntime::kMSDomain);
cpu_tester.AddAttribute<int64_t>("k", static_cast<int64_t>(top_k));
cpu_tester.AddAttribute<std::string>("activation_type", activation_type);
Expand Down Expand Up @@ -1323,6 +1329,13 @@ TEST(MoETest, QMoETest_Mixtral_Int4) {

// CPU-specific QMoE tests
TEST(MoETest, QMoETest_CPU_Int4_MLAS) {
#ifdef USE_MLAS
// Skip this test if we're not testing CPU execution provider
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
GTEST_SKIP() << "CPU execution provider not available";
}

int num_rows = 2;
int num_experts = 2;
int hidden_size = 32;
Expand Down Expand Up @@ -1387,9 +1400,19 @@ TEST(MoETest, QMoETest_CPU_Int4_MLAS) {
std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
#else
GTEST_SKIP() << "Skipping CPU QMoE test";
#endif
}

TEST(MoETest, QMoETest_CPU_Int8_MLAS) {
#ifdef USE_MLAS
// Skip this test if we're not testing CPU execution provider
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
GTEST_SKIP() << "CPU execution provider not available";
}

// Test CPU implementation with 8-bit quantization - CPU ONLY
int num_rows = 1;
int num_experts = 2;
Expand Down Expand Up @@ -1446,9 +1469,19 @@ TEST(MoETest, QMoETest_CPU_Int8_MLAS) {
std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
#else
GTEST_SKIP() << "Skipping CPU QMoE test";
#endif
}

TEST(MoETest, QMoETest_CPU_FC3_Error) {
#ifdef USE_MLAS
// Skip this test if we're not testing CPU execution provider
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
GTEST_SKIP() << "CPU execution provider not available";
}

// Test that CPU throws error when FC3 gating is provided - CPU ONLY
int num_rows = 1;
int num_experts = 2;
Expand Down Expand Up @@ -1506,9 +1539,19 @@ TEST(MoETest, QMoETest_CPU_FC3_Error) {

// Expect this to fail with FC3 not implemented error
cpu_tester.Run(OpTester::ExpectResult::kExpectFailure, "FC3 gating is not yet implemented", {}, nullptr, &cpu_execution_providers);
#else
GTEST_SKIP() << "Skipping CPU QMoE test";
#endif
}

TEST(MoETest, QMoETest_CPU_SwiGLU_Int4) {
#ifdef USE_MLAS
// Skip this test if we're not testing CPU execution provider
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
GTEST_SKIP() << "CPU execution provider not available";
}

// Test CPU implementation with 4-bit quantization and SwiGLU activation
int num_rows = 2;
int num_experts = 2;
Expand Down Expand Up @@ -1573,9 +1616,18 @@ TEST(MoETest, QMoETest_CPU_SwiGLU_Int4) {
std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
#else
GTEST_SKIP() << "Skipping CPU QMoE test";
#endif
}

TEST(MoETest, QMoETest_CPU_SwiGLU_Int8) {
#ifdef USE_MLAS
// Skip this test if we're not testing CPU execution provider
auto cpu_ep = DefaultCpuExecutionProvider();
if (!cpu_ep) {
GTEST_SKIP() << "CPU execution provider not available";
}
// Test CPU implementation with 8-bit quantization and SwiGLU activation
int num_rows = 1;
int num_experts = 2;
Expand Down Expand Up @@ -1633,6 +1685,9 @@ TEST(MoETest, QMoETest_CPU_SwiGLU_Int8) {
std::vector<std::unique_ptr<IExecutionProvider>> cpu_execution_providers;
cpu_execution_providers.push_back(DefaultCpuExecutionProvider());
cpu_tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &cpu_execution_providers);
#else
GTEST_SKIP() << "Skipping CPU QMoE test";
#endif
}

#endif
Expand Down