From 76d8632700a5a0d564eed532de59b025f9d327c3 Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 30 Oct 2025 08:39:49 +0000
Subject: [PATCH 1/6] Matched after_gather_dim in gather_block_quantized to
 correct CUDA kernel indexing

---
 .../cuda/quantization/gather_block_quantized.cc     | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/onnxruntime/contrib_ops/cuda/quantization/gather_block_quantized.cc b/onnxruntime/contrib_ops/cuda/quantization/gather_block_quantized.cc
index b7a5c5904cf72..9b91215eba91d 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/gather_block_quantized.cc
+++ b/onnxruntime/contrib_ops/cuda/quantization/gather_block_quantized.cc
@@ -119,9 +119,20 @@ Status GatherBlockQuantized<T1, T2, Tind>::ComputeInternal(OpKernelContext* ctx)
     zero_points_ptr = zero_points->Data<T1>();
   }
 
+  // For packed uint8_t with bits < 8,
+  // after_gather_dim has to be adjusted to match
+  // the unpacked output dims for correct kernel indexing
+  int64_t after_gather_dim_unpacked = after_gather_dim;
+  if constexpr (std::is_same_v<T1, uint8_t>) {
+    uint32_t components = 8 / static_cast<int>(bits_);
+    if (components > 1) {
+      after_gather_dim_unpacked *= components;
+    }
+  }
+
   GatherBlockQuantizedParam param;
   param.stream = Stream(ctx);
-  param.after_gather_dim = after_gather_dim;
+  param.after_gather_dim = after_gather_dim_unpacked;
   param.gather_axis_dim = data_shape[gather_axis_];
   param.ind_dim = ind_dim;
   param.bits = bits_;

From 4092a09fa8252e787c142362d7b4ca6c8e71fc4e Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 6 Nov 2025 00:25:41 +0000
Subject: [PATCH 2/6] Added general u8/i4x2 tests from @xiaomsft

---
 .../gather_block_quantized_op_test.cc         | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index 3bf37ea193245..1185a1aadf793 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -665,5 +665,83 @@ TEST(GatherBlockQuantizedOpTest, GatherAxis2) {
 }
 #endif
 
+template <typename T1, typename T2, typename Tind>
+void Test_GatherAxis_WithZeroPoints_NoPading() {
+  std::vector<int> data = {
+      -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5,
+      -4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1,
+      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+      4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+      4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+      -4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1, -4, -3, -2, -1};
+
+  std::vector<int64_t> data_shape = {2, 3, 16};
+  std::vector<int> indices = {1};
+  std::vector<int64_t> indices_shape = {1};
+  std::vector<float> scales = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
+  std::vector<int64_t> scales_shape = {2, 3, 1};
+  std::vector<int> zero_points = {-1, 1, 0, 0, 1, -1};
+  std::vector<float> output = {
+      8, 10, 12, 14, 8, 10, 12, 14, 8, 10, 12, 14, 8, 10, 12, 14,
+      3, 4, 5, 6, 3, 4, 5, 6, 3, 4, 5, 6, 3, 4, 5, 6,
+      -6, -4, -2, 0, -6, -4, -2, 0, -6, -4, -2, 0, -6, -4, -2, 0};
+  std::vector<int64_t> output_shape = {1, 3, 16};
+
+  constexpr int64_t gather_axis = 0;
+  constexpr int64_t quantize_axis = 2;
+  constexpr int64_t block_size = 16;
+  constexpr int64_t bits = 4;
+
+  RunUnpackedData<T1, T2, Tind>(data, data_shape, indices, indices_shape, scales, scales_shape, zero_points,
+                                gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
+}
+
+TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
+  Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, float, int32_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, MLFloat16, int32_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, float, int64_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, MLFloat16, int64_t>();
+}
+
+template <typename T1, typename T2, typename Tind>
+void Test_GatherAxis_NoPading_8bit() {
+  std::vector<int> data = {
+      127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112,
+      127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112,
+      127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112};
+
+  std::vector<int64_t> data_shape = {2, 3, 16};
+  std::vector<int> indices = {0};
+  std::vector<int64_t> indices_shape = {1};
+  std::vector<float> scales = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
+  std::vector<int64_t> scales_shape = {2, 3, 1};
+  std::vector<int> zero_points = {};
+  std::vector<float> output = {
+      255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240,
+      510, 508, 506, 504, 502, 500, 498, 496, 494, 492, 490, 488, 486, 484, 482, 480,
+      128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143};
+  std::vector<int64_t> output_shape = {1, 3, 16};
+
+  constexpr int64_t gather_axis = 0;
+  constexpr int64_t quantize_axis = 2;
+  constexpr int64_t block_size = 16;
+  constexpr int64_t bits = 8;
+
+  RunUnpackedData<T1, T2, Tind>(data, data_shape, indices, indices_shape, scales, scales_shape, zero_points,
+                                gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
+}
+
+TEST(GatherBlockQuantizedOpTest, GatherAxisNoPadingUInt8) {
+  Test_GatherAxis_NoPading_8bit<uint8_t, float, int32_t>();
+  Test_GatherAxis_NoPading_8bit<uint8_t, MLFloat16, int32_t>();
+  Test_GatherAxis_NoPading_8bit<uint8_t, float, int64_t>();
+  Test_GatherAxis_NoPading_8bit<uint8_t, MLFloat16, int64_t>();
+}
+
+
+
 }  // namespace test
 }  // namespace onnxruntime

From e37e5b97983adb6c2e7ce71358258888696f99b3 Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 6 Nov 2025 00:35:48 +0000
Subject: [PATCH 3/6] Added UINT4X2 cases in
 Test_GatherAxis_WithZeroPoints_NoPading

---
 .../test/contrib_ops/gather_block_quantized_op_test.cc      | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index 1185a1aadf793..70cc2d0266d92 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -701,6 +701,10 @@ TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
   Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, MLFloat16, int32_t>();
   Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, float, int64_t>();
   Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, MLFloat16, int64_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, float, int32_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, MLFloat16, int32_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, float, int64_t>();
+  Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, MLFloat16, int64_t>();
 }
 
 template <typename T1, typename T2, typename Tind>
@@ -741,7 +745,5 @@ TEST(GatherBlockQuantizedOpTest, GatherAxisNoPadingUInt8) {
   Test_GatherAxis_NoPading_8bit<uint8_t, MLFloat16, int64_t>();
 }
 
-
-
 }  // namespace test
 }  // namespace onnxruntime

From d9d8a2986f76756f36b071c9e22d8c3c8cb89402 Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 6 Nov 2025 00:59:59 +0000
Subject: [PATCH 4/6] Added  test for uint8_t with two packed u4.

---
 .../gather_block_quantized_op_test.cc         | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index 70cc2d0266d92..03000d699793a 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -707,6 +707,50 @@ TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
   Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, MLFloat16, int64_t>();
 }
 
+template <typename T1, typename T2, typename Tind>
+void Test_GatherAxis_NoPading_4bit() {
+  // BUG: CUDA kernel does NOT subtract default zero point (8 for 4-bit) when no explicit zero_points provided
+  // Row 0 data: [-8,-7,-6,-5,...] repeated
+  // Packed (add offset 8): [0,1,2,3,...] repeated
+  // Row 0 scales: [1.0, 2.0, 1.0] (indices 0,1,2 from [1,2,1,2,1,2])
+  // CUDA output: [0*1, 1*1, 2*1, 3*1, ...] [0*2, 1*2, 2*2, 3*2, ...] [0*1, 1*1, 2*1, 3*1, ...]
+  //            = [0, 1, 2, 3, ...] [0, 2, 4, 6, ...] [0, 1, 2, 3, ...]
+  std::vector<int> data = {
+      -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5,
+      -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5,
+      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+      7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4,
+      7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4};
+
+  std::vector<int64_t> data_shape = {2, 3, 16};
+  std::vector<int> indices = {0};
+  std::vector<int64_t> indices_shape = {1};
+  std::vector<float> scales = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
+  std::vector<int64_t> scales_shape = {2, 3, 1};
+  std::vector<int> zero_points = {};
+  std::vector<float> output = {
+      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+      0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6,
+      8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11};
+  std::vector<int64_t> output_shape = {1, 3, 16};
+
+  constexpr int64_t gather_axis = 0;
+  constexpr int64_t quantize_axis = 2;
+  constexpr int64_t block_size = 16;
+  constexpr int64_t bits = 4;
+
+  RunUnpackedData<T1, T2, Tind>(data, data_shape, indices, indices_shape, scales, scales_shape, zero_points,
+                                gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
+}
+
+TEST(GatherBlockQuantizedOpTest, GatherAxisNoPadingUInt8_4Bits) {
+  Test_GatherAxis_NoPading_4bit<uint8_t, float, int32_t>();
+  Test_GatherAxis_NoPading_4bit<uint8_t, MLFloat16, int32_t>();
+  Test_GatherAxis_NoPading_4bit<uint8_t, float, int64_t>();
+  Test_GatherAxis_NoPading_4bit<uint8_t, MLFloat16, int64_t>();
+}
+
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis_NoPading_8bit() {
   std::vector<int> data = {

From 12a8bb2eb528c194dca357493d3f9f29b9af16bc Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 6 Nov 2025 02:02:35 +0000
Subject: [PATCH 5/6] Added testcases that mimic usecase for shared
 emb_tokens/lm_head & Updated comments

---
 .../gather_block_quantized_op_test.cc         | 78 +++++++++++++++++--
 1 file changed, 72 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index 03000d699793a..a221c3d942abf 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -573,6 +573,78 @@ TEST(GatherBlockQuantizedOpTest, GatherAxis0NoZeroPoints_8Bits) {
 }
 #endif
 
+template <typename T1, typename T2, typename Tind>
+void Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits() {
+  // This test case specific to shared 4bit token_embedding/lm_head use case on CUDA
+  std::vector<int> data = {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7,
+                           0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1};
+  std::vector<int64_t> data_shape = {2, 16};
+  std::vector<int> indices = {1};
+  std::vector<int64_t> indices_shape = {1};
+  std::vector<float> scales = {2.0f, 1.0f};
+  std::vector<int64_t> scales_shape = {2, 1};
+  // Explicit zero points for each row
+  std::vector<int> zero_points = {-2, 1};
+
+  // With explicit zero points:
+  // Unpacked data (row 1): [0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1] ---add offset 8--->
+  // Packed (add offset 8): [8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7]
+  // Gathered scales (row 1): scale = 1.0f, zero_point (row 1): packed: [1] ---add offset 8---> unpacked: [9]
+  // Expected (CUDA doesn't subtract zero point): [8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7]
+  std::vector<float> output = {8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+  std::vector<int64_t> output_shape = {1, 16};
+
+  constexpr int64_t gather_axis = 0;
+  constexpr int64_t quantize_axis = 1;  // Last axis (required for CUDA)
+  constexpr int64_t block_size = 16;
+  constexpr int64_t bits = 4;
+  RunUnpackedData<T1, T2, Tind>(data, data_shape, indices, indices_shape, scales, scales_shape, zero_points,
+                                gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
+}
+
+template <typename T1, typename T2, typename Tind>
+void Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits() {
+  // This test case specific to shared 8bit token_embedding/lm_head use case on CUDA
+  std::vector<int> data = {-128, -127, -126, -125, -124, -123, -122, -121, -120, -119, -118, -117, -116, -115, -114, -113,
+                           0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<int64_t> data_shape = {2, 16};
+  std::vector<int> indices = {1};
+  std::vector<int64_t> indices_shape = {1};
+  std::vector<float> scales = {1.0f, 2.0f};
+  std::vector<int64_t> scales_shape = {2, 1};
+  // Explicit zero points
+  std::vector<int> zero_points = {10, -5};
+
+  // With explicit zero points:
+  // Unpacked data (row 1): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] ---add offset 128--->
+  // Packed (row1): [128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
+  // Zero point unpacked: [-5] ---add offset 128---> packed: [123]
+  // Dequantization: [(128-123)*2, (129-123)*2, ..., (143-123)*2] = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40]
+  std::vector<float> output = {10.f, 12.f, 14.f, 16.f, 18.f, 20.f, 22.f, 24.f, 26.f, 28.f, 30.f, 32.f, 34.f, 36.f, 38.f, 40.f};
+  std::vector<int64_t> output_shape = {1, 16};
+
+  constexpr int64_t gather_axis = 0;
+  constexpr int64_t quantize_axis = 1;
+  constexpr int64_t block_size = 16;
+  constexpr int64_t bits = 8;
+  RunUnpackedData<T1, T2, Tind>(data, data_shape, indices, indices_shape, scales, scales_shape, zero_points,
+                                gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
+}
+
+TEST(GatherBlockQuantizedOpTest, GatherAxis0_QuantizedAxis1_Uint8_4Bits_WithZeroPoints) {
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, float, int32_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, MLFloat16, int32_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, float, int64_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, MLFloat16, int64_t>();
+}
+
+TEST(GatherBlockQuantizedOpTest, GatherAxis0_QuantizedAxis1_Uint8_8Bits_WithZeroPoints) {
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, float, int32_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, MLFloat16, int32_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, float, int64_t>();
+  Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, MLFloat16, int64_t>();
+}
+
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis1_WithZeroPoints() {
   std::vector<int> data = {-8, -7, -6, -5,
@@ -709,12 +781,6 @@ TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
 
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis_NoPading_4bit() {
-  // BUG: CUDA kernel does NOT subtract default zero point (8 for 4-bit) when no explicit zero_points provided
-  // Row 0 data: [-8,-7,-6,-5,...] repeated
-  // Packed (add offset 8): [0,1,2,3,...] repeated
-  // Row 0 scales: [1.0, 2.0, 1.0] (indices 0,1,2 from [1,2,1,2,1,2])
-  // CUDA output: [0*1, 1*1, 2*1, 3*1, ...] [0*2, 1*2, 2*2, 3*2, ...] [0*1, 1*1, 2*1, 3*1, ...]
-  //            = [0, 1, 2, 3, ...] [0, 2, 4, 6, ...] [0, 1, 2, 3, ...]
   std::vector<int> data = {
       -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5,
       -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5, -8, -7, -6, -5,

From 7aa39a2ebd18aa10fa3b16f3d7370ac7ee93dbce Mon Sep 17 00:00:00 2001
From: jixiongdeng <jixiong.deng@gmail.com>
Date: Thu, 6 Nov 2025 03:15:53 +0000
Subject: [PATCH 6/6] Limited testcases to CUDA backend

---
 .../test/contrib_ops/gather_block_quantized_op_test.cc    | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index a221c3d942abf..6fea7a43712c7 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -631,6 +631,7 @@ void Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits() {
                                 gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
 }
 
+#ifdef USE_CUDA
 TEST(GatherBlockQuantizedOpTest, GatherAxis0_QuantizedAxis1_Uint8_4Bits_WithZeroPoints) {
   Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, float, int32_t>();
   Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_4Bits<uint8_t, MLFloat16, int32_t>();
@@ -644,6 +645,7 @@ TEST(GatherBlockQuantizedOpTest, GatherAxis0_QuantizedAxis1_Uint8_8Bits_WithZero
   Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, float, int64_t>();
   Test_GatherAxis0_QuantizedAxis1_WithZeroPoints_8Bits<uint8_t, MLFloat16, int64_t>();
 }
+#endif
 
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis1_WithZeroPoints() {
@@ -768,6 +770,7 @@ void Test_GatherAxis_WithZeroPoints_NoPading() {
                                 gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
 }
 
+#ifdef USE_CUDA
 TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
   Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, float, int32_t>();
   Test_GatherAxis_WithZeroPoints_NoPading<Int4x2, MLFloat16, int32_t>();
@@ -778,6 +781,7 @@ TEST(GatherBlockQuantizedOpTest, GatherAxisWithZeroPointsNoPading) {
   Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, float, int64_t>();
   Test_GatherAxis_WithZeroPoints_NoPading<UInt4x2, MLFloat16, int64_t>();
 }
+#endif
 
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis_NoPading_4bit() {
@@ -810,12 +814,14 @@ void Test_GatherAxis_NoPading_4bit() {
                                 gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
 }
 
+#ifdef USE_CUDA
 TEST(GatherBlockQuantizedOpTest, GatherAxisNoPadingUInt8_4Bits) {
   Test_GatherAxis_NoPading_4bit<uint8_t, float, int32_t>();
   Test_GatherAxis_NoPading_4bit<uint8_t, MLFloat16, int32_t>();
   Test_GatherAxis_NoPading_4bit<uint8_t, float, int64_t>();
   Test_GatherAxis_NoPading_4bit<uint8_t, MLFloat16, int64_t>();
 }
+#endif
 
 template <typename T1, typename T2, typename Tind>
 void Test_GatherAxis_NoPading_8bit() {
@@ -848,12 +854,14 @@ void Test_GatherAxis_NoPading_8bit() {
                                 gather_axis, quantize_axis, block_size, bits, output, output_shape, true);
 }
 
+#ifdef USE_CUDA
 TEST(GatherBlockQuantizedOpTest, GatherAxisNoPadingUInt8) {
   Test_GatherAxis_NoPading_8bit<uint8_t, float, int32_t>();
   Test_GatherAxis_NoPading_8bit<uint8_t, MLFloat16, int32_t>();
   Test_GatherAxis_NoPading_8bit<uint8_t, float, int64_t>();
   Test_GatherAxis_NoPading_8bit<uint8_t, MLFloat16, int64_t>();
 }
+#endif
 
 }  // namespace test
 }  // namespace onnxruntime