49 changes: 24 additions & 25 deletions onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME()
#include "core/common/narrow.h"
#include "core/common/safeint.h"
#include "core/mlas/inc/mlas.h"
@@ -10,6 +11,7 @@
#include "core/util/math_cpuonly.h"
#include "core/util/qmath.h"

#include <cassert>

GitHub Actions / Optional Lint C++ — cpplint (reviewdog), line 14: Found C++ system header after other header. Should be: dynamic_quantize_matmul.h, c system, c++ system, other. [build/include_order] [4]
#include <algorithm>
#include <vector>

@@ -169,43 +171,40 @@
// only pack Matrix B
if (input_idx == GetBIdx()) {
const Tensor* b_zp_constant_tensor{nullptr};
bool b_quantization_is_asymmetric = false;
bool b_quantization_might_be_asymmetric = false;

// zero point tensor could be provided as a direct input to the kernel and not as a constant so this
// test is not sufficient
const OrtValue* b_zp;
if (Info().TryGetConstantInput(IN_B_ZERO_POINT, &b_zp)) {
b_zp_constant_tensor = &b_zp->Get<Tensor>();
}

// MlasDynamicQgemm requires symmetric quantization for B, so no zero point should exist or it should
// have a zero value
if (b_zp_constant_tensor != nullptr) { // Covers the case where tensor is not a constant
const auto& shape = b_zp_constant_tensor->Shape();
const auto* zp_data = static_cast<const uint8_t*>(b_zp_constant_tensor->DataRaw());
size_t zp_size = static_cast<size_t>(shape.Size());
// MlasDynamicQgemm requires symmetric quantization: zp must be scalar 0 or 1D all-zero
if ((shape.NumDimensions() == 0) && (zp_data[0] == 0)) {
b_quantization_is_asymmetric = false;
} else if (shape.NumDimensions() == 1) {
b_quantization_is_asymmetric = false;
for (size_t i = 0; i < zp_size; ++i) {
if (zp_data[i] != 0) {
b_quantization_is_asymmetric = true;
break;
}
}
} else {
// Unsupported higher-rank zp tensor
b_quantization_is_asymmetric = true;
}
// MlasDynamicQgemm requires symmetric quantization for B, so the B zero point value should either be all zeros
// or not provided.
if (b_zp_constant_tensor != nullptr) {
// B zero point is constant. Check if it is all zeros.
assert(b_zp_constant_tensor->IsDataType<uint8_t>() || b_zp_constant_tensor->IsDataType<int8_t>());
const auto* zp_bytes = static_cast<const std::byte*>(b_zp_constant_tensor->DataRaw());
const size_t zp_size_in_bytes = b_zp_constant_tensor->SizeInBytes();
b_quantization_might_be_asymmetric = std::any_of(zp_bytes, zp_bytes + zp_size_in_bytes,
[](std::byte v) { return v != std::byte{0}; });
} else {
// B zero point input is not constant. If it exists, we can't assume symmetric quantization.
const auto input_defs = Info().node().InputDefs();
const bool b_zp_input_exists = input_defs.size() > IN_B_ZERO_POINT && input_defs[IN_B_ZERO_POINT]->Exists();
b_quantization_might_be_asymmetric = b_zp_input_exists;
}

// MlasDynamicQgemm requires scale data to be available at packing stage
const Tensor* b_scale_tensor = nullptr;
const bool b_scale_available = Info().TryGetConstantInput(IN_B_SCALE, &b_scale_tensor);

can_use_dynamic_quant_mlas_ = (!b_quantization_is_asymmetric && b_scale_available);
can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available);

// Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops.
// We check that here too before attempting to use them.
if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) {
can_use_dynamic_quant_mlas_ = false;
}

// Only handle the common case of a 2D weight matrix. Additional matrices
// could be handled by stacking the packed buffers.
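
Review note: the packing path above only needs to know whether the constant B zero point is entirely zero. Below is a minimal standalone sketch of that byte-level check; the helper name IsAllZeroBytes is illustrative, not part of the PR. Scanning raw bytes covers both int8_t and uint8_t zero-point tensors, since a zero element has an all-zero byte pattern in either representation. The PR computes the negation (b_quantization_might_be_asymmetric) with std::any_of, which is equivalent.

#include <algorithm>
#include <cstddef>

// Illustrative helper: true iff every byte in [data, data + size_in_bytes)
// is zero. A scalar or 1-D zero-point tensor passes exactly when all of its
// elements are zero, i.e. when B's quantization is symmetric.
bool IsAllZeroBytes(const std::byte* data, size_t size_in_bytes) {
  return std::all_of(data, data + size_in_bytes,
                     [](std::byte v) { return v == std::byte{0}; });
}
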
182 changes: 114 additions & 68 deletions onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
@@ -82,21 +82,48 @@ static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, con
}
}

struct TestDynamicQuantizeMatMulOptions {
bool is_matrix_b_constant = true;

bool per_column = false;

bool is_scale_constant = false;

bool has_zp = true;
bool is_zp_constant = false;
bool is_zp_zero = false;

bool has_bias = false;
bool is_bias_constant = false;

bool empty_input = false;
};

template <typename T>
void TestDynamicQuantizeMatMul(bool is_matrix_b_constant,
bool per_column = false,
bool has_zp = true,
bool has_bias = false,
bool empty_input = false) {
void TestDynamicQuantizeMatMul(const TestDynamicQuantizeMatMulOptions& opts) {
static_assert(std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>);

SCOPED_TRACE(MakeString(
"b data type:", (std::is_same_v<T, uint8_t> ? "uint8" : "int8"),
", is_matrix_b_constant:", opts.is_matrix_b_constant,
", per_column:", opts.per_column,
", is_scale_constant:", opts.is_scale_constant,
", has_zp:", opts.has_zp,
", is_zp_constant:", opts.is_zp_constant,
", is_zp_zero:", opts.is_zp_zero,
", has_bias:", opts.has_bias,
", is_bias_constant:", opts.is_bias_constant,
", empty_input:", opts.empty_input));

// create rand inputs
RandomValueGenerator random{1668426375};

int64_t M = empty_input ? 1 : 4;
int64_t M = opts.empty_input ? 1 : 4;
int64_t N = 128;
int64_t K = 128;
std::vector<int64_t> A_dims{empty_input ? 0 : M, K};
std::vector<int64_t> A_dims{opts.empty_input ? 0 : M, K};
std::vector<int64_t> B_dims{K, N};
std::vector<int64_t> Y_dims{empty_input ? 0 : M, K};
std::vector<int64_t> Y_dims{opts.empty_input ? 0 : M, K};
std::vector<float> A_data = random.Uniform<float>(A_dims, -1.0f, 1.0f);
std::vector<T> B_data;
std::vector<T> tmp_B_data = random.Uniform<T>(B_dims,
@@ -106,101 +133,120 @@ void TestDynamicQuantizeMatMul(bool is_matrix_b_constant,
return static_cast<T>(v);
});

int64_t b_scale_zp_size = per_column ? B_dims.back() : 1;
int64_t b_scale_zp_size = opts.per_column ? B_dims.back() : 1;
std::vector<float> B_scale = random.Uniform<float>(AsSpan({b_scale_zp_size}), -0.1f, 0.1f);
std::vector<T> B_zero_point(b_scale_zp_size);
std::for_each(B_zero_point.begin(),
B_zero_point.end(),
[&random](T& zp) {
zp = static_cast<T>(random.Uniform<T>(std::array<int64_t, 1>{1},
std::numeric_limits<T>::min(),
std::numeric_limits<T>::max())[0]);
});
if (!opts.is_zp_zero) {
std::for_each(B_zero_point.begin(),
B_zero_point.end(),
[&random](T& zp) {
zp = static_cast<T>(random.Uniform<T>(std::array<int64_t, 1>{1},
std::numeric_limits<T>::min(),
std::numeric_limits<T>::max())[0]);
});
}

std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);

OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
test.AddInput<float>("A", A_dims, A_data);
test.AddInput<T>("B", B_dims, B_data, is_matrix_b_constant);
test.AddInput<float>("b_scale", {b_scale_zp_size}, B_scale);
test.AddInput<T>("B", B_dims, B_data, opts.is_matrix_b_constant);
test.AddInput<float>("b_scale", {b_scale_zp_size}, B_scale, opts.is_scale_constant);

if (has_zp) {
test.AddInput<T>("b_zero_point", {b_scale_zp_size}, B_zero_point);
if (opts.has_zp) {
test.AddInput<T>("b_zero_point", {b_scale_zp_size}, B_zero_point, opts.is_zp_constant);
} else {
test.AddOptionalInputEdge<T>();
}

if (has_bias) {
test.AddInput<float>("bias", {B_dims.back()}, Bias);
if (opts.has_bias) {
test.AddInput<float>("bias", {B_dims.back()}, Bias, opts.is_bias_constant);
} else {
test.AddOptionalInputEdge<float>();
}

std::vector<float> Y_data(M * N);
CalculateDynamicQuantizeMatMul<T>(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data,
per_column, has_zp, has_bias);
opts.per_column, opts.has_zp, opts.has_bias);
test.AddOutput<float>("Y", Y_dims, Y_data);
test.SetOutputRelErr("Y", 0.02f);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
}

template <typename T, bool HasZeroPoint, bool HasBias>
void RunDynamicQuantizeMatMulTest() {
TestDynamicQuantizeMatMul<T>(false, /*is_matrix_b_constant*/
false, /*per_column*/
HasZeroPoint, /*has_zp*/
HasBias /*has_bias*/
);

TestDynamicQuantizeMatMul<T>(true, /*is_matrix_b_constant*/
false, /*per_column*/
HasZeroPoint, /*has_zp*/
HasBias /*has_bias*/
);

TestDynamicQuantizeMatMul<T>(false, /*is_matrix_b_constant*/
true, /*per_column*/
HasZeroPoint, /*has_zp*/
HasBias /*has_bias*/
);

TestDynamicQuantizeMatMul<T>(true, /*is_matrix_b_constant*/
true, /*per_column*/
HasZeroPoint, /*has_zp*/
HasBias /*has_bias*/
);
template <typename T>
void TestDynamicQuantizeMatMul(bool is_matrix_b_constant,
bool per_column = false,
bool has_zp = true,
bool has_bias = false,
bool empty_input = false) {
TestDynamicQuantizeMatMulOptions opts{};
opts.is_matrix_b_constant = is_matrix_b_constant;
opts.per_column = per_column;
opts.has_zp = has_zp;
opts.has_bias = has_bias;
opts.empty_input = empty_input;

TestDynamicQuantizeMatMul<T>(opts);
}

TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) {
RunDynamicQuantizeMatMulTest<int8_t, true, false>();
template <typename T>
void RunDynamicQuantizeMatMulTest() {
for (bool is_matrix_b_constant : {false, true}) {
for (bool per_column : {false, true}) {
for (bool has_zp : {false, true}) {
for (bool has_bias : {false, true}) {
TestDynamicQuantizeMatMul<T>(is_matrix_b_constant,
per_column,
has_zp,
has_bias);
}
}
}
}
}

TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) {
RunDynamicQuantizeMatMulTest<uint8_t, true, false>();
TEST(DynamicQuantizeMatMul, Int8) {
RunDynamicQuantizeMatMulTest<int8_t>();
}

TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) {
RunDynamicQuantizeMatMulTest<int8_t, false, true>();
TEST(DynamicQuantizeMatMul, UInt8) {
RunDynamicQuantizeMatMulTest<uint8_t>();
}

TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) {
RunDynamicQuantizeMatMulTest<uint8_t, false, true>();
}
TEST(DynamicQuantizeMatMul, WithConstantBInputs) {
TestDynamicQuantizeMatMulOptions base_opts{};
base_opts.is_matrix_b_constant = true;
base_opts.is_scale_constant = true;
base_opts.is_zp_constant = true;

TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) {
RunDynamicQuantizeMatMulTest<int8_t, false, false>();
}
{
// no zp
auto opts = base_opts;
opts.has_zp = false;

TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) {
RunDynamicQuantizeMatMulTest<uint8_t, false, false>();
}
TestDynamicQuantizeMatMul<int8_t>(opts);
TestDynamicQuantizeMatMul<uint8_t>(opts);
}

TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) {
RunDynamicQuantizeMatMulTest<int8_t, true, true>();
}
{
// zp that is zero (symmetric quantization)
auto opts = base_opts;
opts.has_zp = true;
opts.is_zp_zero = true;

TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) {
RunDynamicQuantizeMatMulTest<uint8_t, true, true>();
TestDynamicQuantizeMatMul<int8_t>(opts);
TestDynamicQuantizeMatMul<uint8_t>(opts);
}

{
// zp that is non-zero
auto opts = base_opts;
opts.has_zp = true;
opts.is_zp_zero = false;

TestDynamicQuantizeMatMul<int8_t>(opts);
TestDynamicQuantizeMatMul<uint8_t>(opts);
}
}
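
Review note: tying the new tests back to the kernel change above, a condensed sketch of the packing-time decision (simplified from the PR, not the literal code):

// B can take the dynamic-quant MLAS path only when its quantization is
// known to be symmetric (constant, all-zero zero point, or no zero-point
// input at all), its scale is a constant input, and the CPU reports
// Arm SME support (MlasDynamicQGemmBatch() is a no-op without SME).
bool can_use_dynamic_quant_mlas =
    !b_quantization_might_be_asymmetric &&
    b_scale_available &&
    CPUIDInfo::GetCPUIDInfo().HasArm_SME();

The WithConstantBInputs test exercises exactly the configurations this condition distinguishes: no zero point, an all-zero zero point, and a non-zero zero point.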

TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) {