From de18a1bae90f5b08f63abb72c93b0428113e39e2 Mon Sep 17 00:00:00 2001 From: wangw <271502003@qq.com> Date: Sat, 9 Mar 2024 20:04:42 +0800 Subject: [PATCH 1/4] Add topk operator and cpu kernel. --- .../include/kernel/attributes/topk_info.h | 25 ++ src/04kernel/include/kernel/collectors/topk.h | 20 ++ src/04kernel/src/attributes/topk_info.cc | 14 ++ src/04kernel/src/collectors/topk.cc | 30 +++ src/04kernel/src/kernels/topk/cpu_kernel.cc | 61 +++++ src/04kernel/src/kernels/topk/cpu_kernel.hh | 23 ++ src/04kernel/test/kernels/topk/test_cpu.cpp | 238 ++++++++++++++++++ .../include/computation/operators/topk.h | 20 ++ src/05computation/src/operators/topk.cc | 17 ++ src/07onnx/src/operators.cpp | 2 + src/07onnx/src/operators/topk.cc | 55 ++++ src/07onnx/src/operators/topk.hh | 23 ++ 12 files changed, 528 insertions(+) create mode 100644 src/04kernel/include/kernel/attributes/topk_info.h create mode 100644 src/04kernel/include/kernel/collectors/topk.h create mode 100644 src/04kernel/src/attributes/topk_info.cc create mode 100644 src/04kernel/src/collectors/topk.cc create mode 100644 src/04kernel/src/kernels/topk/cpu_kernel.cc create mode 100644 src/04kernel/src/kernels/topk/cpu_kernel.hh create mode 100644 src/04kernel/test/kernels/topk/test_cpu.cpp create mode 100644 src/05computation/include/computation/operators/topk.h create mode 100644 src/05computation/src/operators/topk.cc create mode 100644 src/07onnx/src/operators/topk.cc create mode 100644 src/07onnx/src/operators/topk.hh diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h new file mode 100644 index 00000000..a9210078 --- /dev/null +++ b/src/04kernel/include/kernel/attributes/topk_info.h @@ -0,0 +1,25 @@ +#ifndef KERNEL_TOPK_INFO_H +#define KERNEL_TOPK_INFO_H + +#include "../tensor.h" + +namespace refactor::kernel { + + struct TopKInfo { + + uint8_t topk; + uint8_t axis; + size_t in_stride, in_stride_pre_axis, out_stride_pre_axis; + size_t elem_size, axis_elem_size; + + TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input); + size_t getElementSize() const {return elem_size;} + size_t getAxisElementSize()const { return axis_elem_size;} + size_t getInStride()const{return in_stride;} + size_t getInStridePreAxis()const{return in_stride_pre_axis;} + size_t getOutStridePreAxis()const {return out_stride_pre_axis;} + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_INFO_H diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h new file mode 100644 index 00000000..c4d8490f --- /dev/null +++ b/src/04kernel/include/kernel/collectors/topk.h @@ -0,0 +1,20 @@ +#ifndef KERNEL_TOPK_H +#define KERNEL_TOPK_H + +#include "../collector.h" + +namespace refactor::kernel { + + struct TopKCollector final : public InfoCollector { + uint32_t topk, axis; + + constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept + : InfoCollector(target), topk(topk), axis(axis_) {} + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_H diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc new file mode 100644 index 00000000..12ab16bb --- /dev/null +++ b/src/04kernel/src/attributes/topk_info.cc @@ -0,0 +1,14 @@ +#include "kernel/attributes/topk_info.h" +#include + +namespace refactor::kernel { + +TopKInfo::TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input):topk(topk), + axis(axis), + in_stride(input.strides()[axis]), + in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]), + out_stride_pre_axis(in_stride_pre_axis/input.shape[axis]*topk), + elem_size(input.elementsSize()), + axis_elem_size(input.shape[axis]){} + +} diff --git a/src/04kernel/src/collectors/topk.cc b/src/04kernel/src/collectors/topk.cc new file mode 100644 index 00000000..91a97427 --- /dev/null +++ b/src/04kernel/src/collectors/topk.cc @@ -0,0 +1,30 @@ +#include "kernel/collectors/topk.h" +#include "../kernels/topk/cpu_kernel.hh" +#include "kernel/attributes/topk_info.h" +//#include "../kernels/topk/cuda_kernel.hh" + +namespace refactor::kernel { + + std::vector + TopKCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + TopKInfo info(topk, axis, inputs[0]); + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = TopKCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + //todo :暂时用cpu的实现 + case decltype(_target)::Nvidia: + if (auto ptr = TopKCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.cc b/src/04kernel/src/kernels/topk/cpu_kernel.cc new file mode 100644 index 00000000..7330e44d --- /dev/null +++ b/src/04kernel/src/kernels/topk/cpu_kernel.cc @@ -0,0 +1,61 @@ +#include "cpu_kernel.hh" +#include +#include + +namespace refactor::kernel { + using K = TopKCpu; + + K::TopKCpu(TopKInfo info) noexcept + : Kernel(), info(std::move(info)) {} + + auto K::build(TopKInfo info) noexcept -> KernelBox { + return std::make_unique(std::move(info)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing topk operation on generic cpu"; + } + + auto K::lower(Resources &) const noexcept -> RoutineWorkspace { + using namespace runtime; + return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { + auto src = reinterpret_cast(inputs[0]); + + auto dstVal = reinterpret_cast(outputs[0]);//T + auto dstIndex = reinterpret_cast(outputs[1]); + + + size_t M = info.getElementSize() / info.getAxisElementSize(); + size_t N = info.getAxisElementSize(); + auto inStride1 = info.getInStridePreAxis(); + auto inStride2 = info.getInStride(); + auto outStride1 = info.getOutStridePreAxis(); + auto outStride2 = inStride2; + + for(size_t m = 0; m < M; m ++){ + using PairType = std::pair; + std::list list; + for(size_t n = 0; n < N; n++){ + auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2; + list.push_back({src[srcIdx],n}); + } + list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;}); + + size_t offset = m /inStride2 * outStride1 + m % inStride2; + std::for_each_n(list.begin(), (uint32_t)info.topk, + [&](auto &elem) { + dstVal[offset] = elem.first; + dstIndex[offset] = elem.second; + offset += outStride2; + }); + } + }; + } +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.hh b/src/04kernel/src/kernels/topk/cpu_kernel.hh new file mode 100644 index 00000000..75b2a4ce --- /dev/null +++ b/src/04kernel/src/kernels/topk/cpu_kernel.hh @@ -0,0 +1,23 @@ +#ifndef KERNEL_TOPK_CPU_KERNEL_HH +#define KERNEL_TOPK_CPU_KERNEL_HH + +#include "kernel/attributes/topk_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct TopKCpu final : public Kernel { + TopKInfo info; + explicit TopKCpu(TopKInfo info) noexcept; + + static KernelBox build(TopKInfo info) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; + RoutineWorkspace lower(Resources &) const noexcept final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_CPU_KERNEL_HH diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp new file mode 100644 index 00000000..b0dcaa80 --- /dev/null +++ b/src/04kernel/test/kernels/topk/test_cpu.cpp @@ -0,0 +1,238 @@ +#include "../../../src/kernels/topk/cpu_kernel.hh" +#include +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, TopKCpu) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3}); + + auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + + // check + std::vector expectVal = {3,2,1,7,6,5,11,10,9}; + std::vector expectIdx = {3,2,1,3,2,1,3,2,1}; + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {6,7,4,5,2,3,14,15,12,13,10,11}; + std::vector expectIdx = {3,3,2,2,1,1,3,3,2,2,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} + +TEST(kernel, TopKCpu2) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{2, 4, 2}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{1, 4, 2}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{1, 4, 2}); + + auto kernel = TopKCpu::build(TopKInfo(1,0, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {8,9,10,11,12,13,14,15}; + std::vector expectIdx = {1,1,1,1,1,1,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} + + +TEST(kernel, TopKCpu3) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{1, 3, 2, 2}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{1, 3, 2, 2}); + + auto kernel = TopKCpu::build(TopKInfo(1,0, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {12, 13, 14, 15, 16, 17, 18, 19, 20,21, 22,23}; + std::vector expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} + +TEST(kernel, TopKCpu4) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 2, 2, 2}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 2, 2, 2}); + + auto kernel = TopKCpu::build(TopKInfo(2,1, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {8, 9, 10, 11,4,5,6,7,20,21,22,23,16,17,18,19}; + std::vector expectIdx = {2,2,2,2,1,1,1,1,2,2,2,2,1,1,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} + + +TEST(kernel, TopKCpu5) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 3, 1, 2}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 3, 1, 2}); + + auto kernel = TopKCpu::build(TopKInfo(1,2, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {2,3,6,7,10,11,14,15,18,19,22,23}; + std::vector expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} + +TEST(kernel, TopKCpu6) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2}); + auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 3, 2, 1}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 3, 2, 1}); + + auto kernel = TopKCpu::build(TopKInfo(1,3, *inputTensor)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins(inputTensor->elementsSize()); + std::vector out0(outputTensor0->elementsSize()); + std::vector out1(outputTensor1->elementsSize()); + + std::iota(ins.begin(), ins.end(), 0); + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< expectVal = {1,3,5,7,9,11,13,15,17,19,21,23}; + std::vector expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1}; + + + for(size_t i=0;i< expectVal.size(); ++i){ + EXPECT_EQ(expectVal[i], out0[i]); + EXPECT_EQ(expectIdx[i], out1[i]); + } +} \ No newline at end of file diff --git a/src/05computation/include/computation/operators/topk.h b/src/05computation/include/computation/operators/topk.h new file mode 100644 index 00000000..bdce6d6f --- /dev/null +++ b/src/05computation/include/computation/operators/topk.h @@ -0,0 +1,20 @@ +#ifndef COMPUTATION_TOPK_H +#define COMPUTATION_TOPK_H + +#include "../operator.h" + +namespace refactor::computation { + + struct TopK final : public Operator { + uint32_t topk,axis; + constexpr TopK(uint32_t topk, uint32_t axis) noexcept : topk(topk), axis(axis){} + + static size_t typeId() noexcept; + size_t opTypeId() const noexcept final; + std::string_view name() const noexcept final; + kernel::CollectorBox candidateKernels(Target) const noexcept final; + }; + +}// namespace refactor::computation + +#endif// COMPUTATION_SPLIT_H diff --git a/src/05computation/src/operators/topk.cc b/src/05computation/src/operators/topk.cc new file mode 100644 index 00000000..f25e9a85 --- /dev/null +++ b/src/05computation/src/operators/topk.cc @@ -0,0 +1,17 @@ +#include "computation/operators/topk.h" +#include "kernel/collectors/topk.h" + +namespace refactor::computation { + + size_t TopK::typeId() noexcept { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + size_t TopK::opTypeId() const noexcept { return typeId(); } + std::string_view TopK::name() const noexcept { return "TopK"; } + auto TopK::candidateKernels(Target target) const noexcept -> kernel::CollectorBox { + using Collector_ = kernel::TopKCollector; + return std::make_unique(target, topk, axis); + } + +}// namespace refactor::computation diff --git a/src/07onnx/src/operators.cpp b/src/07onnx/src/operators.cpp index 0981f720..8e50a810 100644 --- a/src/07onnx/src/operators.cpp +++ b/src/07onnx/src/operators.cpp @@ -38,6 +38,7 @@ #include "operators/transpose.hh" #include "operators/unsqueeze.hh" #include "operators/where.hh" +#include "operators/topk.hh" namespace refactor::onnx { @@ -131,6 +132,7 @@ namespace refactor::onnx { REGISTER(Where , Where ); REGISTER(HardSigmoid , HardSigmoid ); REGISTER(Pad , Pad ); + REGISTER(TopK , TopK ); // clang-format on #undef REGISTER } diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc new file mode 100644 index 00000000..8b3b31c2 --- /dev/null +++ b/src/07onnx/src/operators/topk.cc @@ -0,0 +1,55 @@ +#include "common.h" +#include "topk.hh" +#include "computation/operators/topk.h" +#include + +namespace refactor::onnx { + using Op = TopK; + + Op::TopK(Int topk, Int axis):topk(topk), axis(axis){} + + auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> OpBox { + auto axis = attributes["axis"].int_(); + auto topk = attributes["topk"].int_(); + return OpBox(std::make_unique(topk, axis)); + } + + auto Op::typeId() -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto Op::opTypeId() const -> size_t { return typeId(); } + auto Op::opTypeName() const -> std::string_view { return "TopK"; } + + auto Op::infer(TensorRefs inputs, InferOptions const &options) const -> InferResult { + if (inputs.empty() || inputs.size() >= 2) { + return Err(InferError(ERROR_MSG("Input size error"))); + } + auto const &input = inputs[0]; + auto rank = input.rank(); + auto axis_ = axis < 0 ? axis + rank : axis; + if (rank <= axis_) { + return Err(InferError(ERROR_MSG("axis error"))); + } + if (topk < 0 || topk > input.shape[axis_].value()){ + return Err(InferError(ERROR_MSG("topk error"))); + } + + Tensors ans(2, nullptr); + auto dependencies = extractDependency(inputs); + ans[0] = Tensor::share(input.dataType, input.shape, dependencies); + ans[0]->shape[axis_] = DimExpr(topk); + ans[1] = Tensor::share(input.dataType, input.shape, dependencies); + ans[1]->shape[axis_] = DimExpr(topk); + return Ok(Tensors{std::move(ans)}); + } + + auto Op::lower(TensorRefs inputs) const -> computation::OpBox { + using Op_ = computation::TopK; + auto rank = inputs[0].rank(); + auto axis_ = axis < 0 ? axis + rank : axis; + return std::make_unique(topk, axis_); + } + +}// namespace refactor::onnx diff --git a/src/07onnx/src/operators/topk.hh b/src/07onnx/src/operators/topk.hh new file mode 100644 index 00000000..2b86f5bb --- /dev/null +++ b/src/07onnx/src/operators/topk.hh @@ -0,0 +1,23 @@ +#ifndef ONNX_TOPK_HH +#define ONNX_TOPK_HH + +#include "frontend/operator.h" + +namespace refactor::onnx { + using namespace frontend; + + struct TopK final : public Operator { + Int topk, axis; + TopK(Int topk, Int axis); + + static size_t typeId(); + static OpBox build(ModelContext const &, std::string_view, Attributes); + size_t opTypeId() const final; + std::string_view opTypeName() const final; + InferResult infer(TensorRefs, InferOptions const &) const final; + computation::OpBox lower(TensorRefs) const final; + }; + +}// namespace refactor::onnx + +#endif// ONNX_WHERE_HH From 5b33afdb5f6dd08d9d478ab380064750e2f8117e Mon Sep 17 00:00:00 2001 From: wangw <271502003@qq.com> Date: Wed, 20 Mar 2024 21:18:50 +0800 Subject: [PATCH 2/4] add assign_pos\scatter operator and cpu kernel for moe. --- CMakeLists.txt | 1 + .../include/kernel/attributes/moe_info.h | 24 ++++++ src/04kernel/include/kernel/collectors/moe.h | 29 +++++++ src/04kernel/src/attributes/moe_info.cc | 13 +++ src/04kernel/src/collectors/moe.cc | 51 ++++++++++++ src/04kernel/src/kernels/moe/cpu_kernel.cc | 83 +++++++++++++++++++ src/04kernel/src/kernels/moe/cpu_kernel.hh | 35 ++++++++ src/04kernel/test/kernels/moe/test_cpu.cpp | 75 +++++++++++++++++ .../include/computation/operators/moe.h | 37 +++++++++ src/05computation/src/operators/moe.cc | 34 ++++++++ src/07onnx/src/operators/topk.cc | 2 +- src/08-02moe/CMakeLists.txt | 15 ++++ src/08-02moe/include/operators.h | 10 +++ src/08-02moe/src/operators.cpp | 16 ++++ src/08-02moe/src/operators/moe.cc | 78 +++++++++++++++++ src/08-02moe/src/operators/moe.hh | 38 +++++++++ src/08-02moe/test/test_moe.cpp | 25 ++++++ 17 files changed, 565 insertions(+), 1 deletion(-) create mode 100644 src/04kernel/include/kernel/attributes/moe_info.h create mode 100644 src/04kernel/include/kernel/collectors/moe.h create mode 100644 src/04kernel/src/attributes/moe_info.cc create mode 100644 src/04kernel/src/collectors/moe.cc create mode 100644 src/04kernel/src/kernels/moe/cpu_kernel.cc create mode 100644 src/04kernel/src/kernels/moe/cpu_kernel.hh create mode 100644 src/04kernel/test/kernels/moe/test_cpu.cpp create mode 100644 src/05computation/include/computation/operators/moe.h create mode 100644 src/05computation/src/operators/moe.cc create mode 100644 src/08-02moe/CMakeLists.txt create mode 100644 src/08-02moe/include/operators.h create mode 100644 src/08-02moe/src/operators.cpp create mode 100644 src/08-02moe/src/operators/moe.cc create mode 100644 src/08-02moe/src/operators/moe.hh create mode 100644 src/08-02moe/test/test_moe.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 49ddcda6..a0e853b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,4 +106,5 @@ add_subdirectory(src/06frontend) add_subdirectory(src/07onnx) add_subdirectory(src/08communication) add_subdirectory(src/08-01llm) +add_subdirectory(src/08-02moe) add_subdirectory(src/09python_ffi) diff --git a/src/04kernel/include/kernel/attributes/moe_info.h b/src/04kernel/include/kernel/attributes/moe_info.h new file mode 100644 index 00000000..79eaa7b2 --- /dev/null +++ b/src/04kernel/include/kernel/attributes/moe_info.h @@ -0,0 +1,24 @@ +#ifndef KERNEL_MOE_INFO_H +#define KERNEL_MOE_INFO_H + +#include "../tensor.h" + +namespace refactor::kernel { + + struct AssignPosInfo { + uint32_t top, expert_num; + uint32_t elementSize; + + AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate); + }; + + struct ReorderInfo{ + bool scatter; + uint32_t top; + uint32_t blockNum, blockSize; + ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs); + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_INFO_H diff --git a/src/04kernel/include/kernel/collectors/moe.h b/src/04kernel/include/kernel/collectors/moe.h new file mode 100644 index 00000000..457de919 --- /dev/null +++ b/src/04kernel/include/kernel/collectors/moe.h @@ -0,0 +1,29 @@ +#ifndef KERNEL_MOE_H +#define KERNEL_MOE_H + +#include "../collector.h" + +namespace refactor::kernel { + + struct AssignPosCollector final : public InfoCollector { + uint32_t topk,numExperts; + constexpr AssignPosCollector(decltype(_target) target, uint32_t topk, uint32_t numExperts) noexcept + : InfoCollector(target) ,topk(topk), numExperts(numExperts){} + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + + struct ReorderCollector final : public InfoCollector { + bool scatter; + uint32_t topk; + constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept + : InfoCollector(target) ,scatter(scatter), topk(topk){} + + std::vector + filter(TensorRefs inputs, TensorRefs outputs) const final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_H diff --git a/src/04kernel/src/attributes/moe_info.cc b/src/04kernel/src/attributes/moe_info.cc new file mode 100644 index 00000000..f5b0b677 --- /dev/null +++ b/src/04kernel/src/attributes/moe_info.cc @@ -0,0 +1,13 @@ +#include "kernel/attributes/moe_info.h" +#include + +namespace refactor::kernel { + +AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate):\ + top(top), expert_num(expert_num),elementSize(gate.elementsSize()){} + +ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs):\ + scatter(scatter), top(top),blockNum(inputs[1].get().elementsSize()), blockSize(inputs[0].get().strides()[0]){} + + +} diff --git a/src/04kernel/src/collectors/moe.cc b/src/04kernel/src/collectors/moe.cc new file mode 100644 index 00000000..4906499e --- /dev/null +++ b/src/04kernel/src/collectors/moe.cc @@ -0,0 +1,51 @@ +#include "kernel/collectors/moe.h" +#include "../kernels/moe/cpu_kernel.hh" +#include "kernel/attributes/moe_info.h" + +namespace refactor::kernel { + + std::vector + AssignPosCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + AssignPosInfo info(topk, numExperts, inputs[0]); + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = AssignPosCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + //todo :暂时用cpu的实现 + case decltype(_target)::Nvidia: + if (auto ptr = AssignPosCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + + std::vector + ReorderCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + ReorderInfo info(scatter, topk, inputs); + std::vector ans; + switch (_target) { + case decltype(_target)::Cpu: + if (auto ptr = ReorderCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + //todo :暂时用cpu的实现 + case decltype(_target)::Nvidia: + if (auto ptr = ReorderCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } + break; + default: + UNREACHABLEX(void, "Unknown target"); + } + return ans; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.cc b/src/04kernel/src/kernels/moe/cpu_kernel.cc new file mode 100644 index 00000000..215e8765 --- /dev/null +++ b/src/04kernel/src/kernels/moe/cpu_kernel.cc @@ -0,0 +1,83 @@ +#include "cpu_kernel.hh" +#include +#include + +namespace refactor::kernel { + + AssignPosCpu::AssignPosCpu(AssignPosInfo info) noexcept + : Kernel(), info(std::move(info)) {} + + auto AssignPosCpu::build(AssignPosInfo info) noexcept -> KernelBox { + return std::make_unique(std::move(info)); + } + auto AssignPosCpu::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto AssignPosCpu::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto AssignPosCpu::description() const noexcept -> std::string_view { + return "Performing AssignPos operation on generic cpu"; + } + + auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace { + using namespace runtime; + return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { + auto gate = reinterpret_cast(inputs[0]); + + auto expert_cnt = reinterpret_cast(outputs[0]);//T + auto pos = reinterpret_cast(outputs[1]); + std::memset(expert_cnt, 0, info.expert_num); + for (size_t i = 0; i < info.elementSize; i ++){ + ASSERT (gate[i] >= 0 && gate[i] < info.expert_num, "gate exceeds expert idx scope!"); + expert_cnt[gate[i]] ++; + } + std::vector expert_accumlate; + expert_accumlate.assign(info.expert_num, 0); + for (size_t i=0; i KernelBox { + return std::make_unique(std::move(info)); + } + auto ReorderCpu::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto ReorderCpu::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto ReorderCpu::description() const noexcept -> std::string_view { + return "Performing scatter operation on generic cpu"; + } + + auto ReorderCpu::lower(Resources &) const noexcept -> RoutineWorkspace { + using namespace runtime; + return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { + auto input = reinterpret_cast(inputs[0]); + auto pos = reinterpret_cast(inputs[1]); + auto dstVal = reinterpret_cast(outputs[0]);//T + + for(size_t i = 0; i +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, AssignPosCpu) { + // build routine + //auto inputTensor = Tensor::share(DataType::F32, Shape{4, 1024}); + auto gate = Tensor::share(DataType::U32, Shape{8, 2}); + auto expert_cnt = Tensor::share(DataType::U32, Shape{4}); + auto pos = Tensor::share(DataType::U32, Shape{16}); + + auto kernel = AssignPosCpu::build(AssignPosInfo(2,4, *gate)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins = {3,2, 0,1, 2,1, 1,3, 2,0, 1,3, 1,0, 1,2}; + std::vector out0(expert_cnt->elementsSize()); + std::vector out1(pos->elementsSize()); + + // inference + void const *inputs[]{ins.data()}; + void *outputs[]{out0.data(), out1.data()}; + routine(res, nullptr, inputs, outputs); + + // check + std::vector expectExpertCnt = {3,6,4,3}; + std::vector expectPos = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0}; + //std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<> inputTensors{input, pos}; + TensorRefs inputs_; + inputs_.reserve(inputTensors.size()); + std::transform(inputTensors.begin(), inputTensors.end(), + std::back_inserter(inputs_), + [](auto const &it) { return std::cref(*it); }); + + auto kernel = ReorderCpu::build(ReorderInfo(true, top, inputs_)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res).routine; + // put input data + std::vector ins0(input->elementsSize()); + std::iota(ins0.begin(), ins0.end(), 0); + std::vector ins1 = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0}; + std::vector out(input->elementsSize() * top); + + // inference + void const *inputs[]{ins0.data(), ins1.data()}; + void *outputs[]{out.data()}; + routine(res, nullptr, inputs, outputs); + std::for_each(out.begin(), out.end(),[](const float &val){std::cout< size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto AssignPos::opTypeId() const noexcept -> size_t { return typeId(); } + auto AssignPos::name() const noexcept -> std::string_view { return "moe::AssignPos"; } + auto AssignPos::candidateKernels(Target target) const -> kernel::CollectorBox { + using Collector_ = kernel::AssignPosCollector; + return std::make_unique(target, topk, numExperts); + } + auto AssignPos::serialize() const noexcept -> std::string { + return "moe::AssignPos()"; + } + + auto Reorder::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + auto Reorder::opTypeId() const noexcept -> size_t { return typeId(); } + auto Reorder::name() const noexcept -> std::string_view { return "moe::Reorder"; } + auto Reorder::candidateKernels(Target target) const -> kernel::CollectorBox { + using Collector_ = kernel::ReorderCollector; + return std::make_unique(target, scatter, topk); + } + auto Reorder::serialize() const noexcept -> std::string { + return "moe::Reorder()"; + } + +}// namespace refactor::computation diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc index 8b3b31c2..1745f37e 100644 --- a/src/07onnx/src/operators/topk.cc +++ b/src/07onnx/src/operators/topk.cc @@ -6,7 +6,7 @@ namespace refactor::onnx { using Op = TopK; - Op::TopK(Int topk, Int axis):topk(topk), axis(axis){} + Op::TopK(Int topk, Int axis):Operator(), topk(topk), axis(axis){} auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> OpBox { auto axis = attributes["axis"].int_(); diff --git a/src/08-02moe/CMakeLists.txt b/src/08-02moe/CMakeLists.txt new file mode 100644 index 00000000..25b882cc --- /dev/null +++ b/src/08-02moe/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) +project(moe VERSION 0.0.0 LANGUAGES CXX) +message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION}) + +file(GLOB_RECURSE MOE_SRC src/*.cc src/*.cpp) +add_library(moe STATIC ${MOE_SRC}) +target_link_libraries(moe PUBLIC frontend) +target_include_directories(moe PUBLIC include) + +file(GLOB_RECURSE MOE_TEST test/*.cpp) +if(MOE_TEST) + add_executable(moe_test ${MOE_TEST}) + add_test(moe_test moe_test) + target_link_libraries(moe_test moe GTest::gtest_main Backward::Object) +endif() diff --git a/src/08-02moe/include/operators.h b/src/08-02moe/include/operators.h new file mode 100644 index 00000000..b4221025 --- /dev/null +++ b/src/08-02moe/include/operators.h @@ -0,0 +1,10 @@ +#ifndef MOE_OPERATORS_H +#define MOE_OPERATORS_H + +namespace refactor::moe { + + void register_(); + +}// namespace refactor::moe + +#endif// MOE_OPERATORS_H diff --git a/src/08-02moe/src/operators.cpp b/src/08-02moe/src/operators.cpp new file mode 100644 index 00000000..887c70ee --- /dev/null +++ b/src/08-02moe/src/operators.cpp @@ -0,0 +1,16 @@ +#include "operators.h" +#include "operators/moe.hh" + +namespace refactor::moe { + using namespace frontend; + + void register_() { +#define REGISTER(NAME, CLASS) Operator::register_("moe::" #NAME) + // clang-format off + REGISTER(AssignPos , AssignPos ); + REGISTER(Reorder , Reorder ); + // clang-format on +#undef REGISTER + } + +}// namespace refactor::moe diff --git a/src/08-02moe/src/operators/moe.cc b/src/08-02moe/src/operators/moe.cc new file mode 100644 index 00000000..be971795 --- /dev/null +++ b/src/08-02moe/src/operators/moe.cc @@ -0,0 +1,78 @@ +#include "moe.hh" +#include "common.h" +#include "computation/operators/moe.h" + +namespace refactor::moe { + + AssignPos::AssignPos(uint32_t topk, uint32_t numExperts) : Operator() ,topk(topk), numExperts(numExperts){} + + auto AssignPos::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { + auto topk = attributes["topk"].int_(); + auto num_experts = attributes["num_experts"].int_(); + return OpBox(std::make_unique(topk, num_experts)); + } + auto AssignPos::typeId() -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto AssignPos::opTypeId() const -> size_t { return typeId(); } + auto AssignPos::opTypeName() const -> std::string_view { return "moe::AssignPos"; } + + auto AssignPos::infer(TensorRefs inputs, InferOptions const &) const -> InferResult { + EXPECT_SIZE(1) + + auto const &gate = inputs[0]; + + if (gate.dataType != DataType::I16) { + return Err(InferError(ERROR_MSG("Input data type not support"))); + } + + return Ok(Tensors{Tensor::share(gate.dataType, Shape{DimExpr(numExperts)}, extractDependency(inputs)), + Tensor::share(gate.dataType, gate.shape, extractDependency(inputs))}); + } + + auto AssignPos::lower(TensorRefs) const -> computation::OpBox { + using Op_ = computation::AssignPos; + return std::make_unique(topk, numExperts); + } + + Reorder::Reorder(bool scatter, uint32_t topk, uint32_t dim) : Operator() ,scatter(scatter), top(topk), dim(dim){} + + auto Reorder::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { + auto topk = attributes["topk"].int_(); + bool scatter = attributes["scatter"].int_() != 0 ; + bool dim = attributes["dim"].int_(); + return OpBox(std::make_unique(scatter, topk, dim)); + } + auto Reorder::typeId() -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto Reorder::opTypeId() const -> size_t { return typeId(); } + auto Reorder::opTypeName() const -> std::string_view { return "moe::Reorder"; } + + auto Reorder::infer(TensorRefs inputs, InferOptions const &) const -> InferResult { + EXPECT_SIZE(2) + auto const &input = inputs[0]; + auto const &pos = inputs[1]; + if (dim != 0) + return Err(InferError(ERROR_MSG("dim is not right!"))); + if(scatter && input.elementsSize() * top != pos.elementsSize()) + return Err(InferError(ERROR_MSG("Inputs data size are not right!"))); + else if(!scatter && input.elementsSize() != pos.elementsSize()) + return Err(InferError(ERROR_MSG("Inputs data size are not right!"))); + + if (pos.dataType != DataType::I16) { + return Err(InferError(ERROR_MSG("Input data type not support"))); + } + + return Ok(Tensors{Tensor::share(input.dataType, pos.shape, extractDependency(inputs))}); + } + + auto Reorder::lower(TensorRefs) const -> computation::OpBox { + using Op_ = computation::Reorder; + return std::make_unique(scatter, top); + } +}// namespace refactor::llm diff --git a/src/08-02moe/src/operators/moe.hh b/src/08-02moe/src/operators/moe.hh new file mode 100644 index 00000000..96f82709 --- /dev/null +++ b/src/08-02moe/src/operators/moe.hh @@ -0,0 +1,38 @@ +#ifndef MOE_HH +#define MOE_HH + +#include "frontend/operator.h" + +namespace refactor::moe { + using namespace frontend; + + struct AssignPos final : public Operator { + uint32_t topk, numExperts; + explicit AssignPos(uint32_t topk, uint32_t numExperts); + + static OpBox build(ModelContext const &, std::string_view, Attributes); + static size_t typeId(); + + size_t opTypeId() const final; + std::string_view opTypeName() const final; + InferResult infer(TensorRefs, InferOptions const &) const final; + computation::OpBox lower(TensorRefs) const final; + }; + + struct Reorder final : public Operator { + bool scatter; + uint32_t top, dim; + explicit Reorder(bool scatter, uint32_t topk, uint32_t dim); + + static OpBox build(ModelContext const &, std::string_view, Attributes); + static size_t typeId(); + + size_t opTypeId() const final; + std::string_view opTypeName() const final; + InferResult infer(TensorRefs, InferOptions const &) const final; + computation::OpBox lower(TensorRefs) const final; + }; + +}// namespace refactor::llm + +#endif// LLM_RMS_ATTENTION_HH diff --git a/src/08-02moe/test/test_moe.cpp b/src/08-02moe/test/test_moe.cpp new file mode 100644 index 00000000..d4946573 --- /dev/null +++ b/src/08-02moe/test/test_moe.cpp @@ -0,0 +1,25 @@ +#include "../src/operators/moe.hh" +#include "operators.h" +#include + +using namespace refactor; +using namespace moe; + +TEST(infer, AssignPos) { + moe::register_(); + auto edges = Edges{ + + {Tensor::share(DataType::I16, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2 + }; + count_t inputs[]{0}; + auto infered = AssignPos(2,4).infer(TensorRefs(edges, inputs), {true}); + ASSERT_TRUE(infered.isOk()); + auto outputs = std::move(infered.unwrap()); + ASSERT_EQ(outputs.size(), 2); + auto expert_cnt = std::move(outputs[0]); + ASSERT_EQ(expert_cnt->dataType, DataType::F32); + ASSERT_EQ(expert_cnt->shape, (Shape{DimExpr(4)})); + auto pos = std::move(outputs[1]); + ASSERT_EQ(pos->dataType, DataType::I16); + ASSERT_EQ(pos->shape, (Shape{DimExpr(16)})); +} From 2b698b3ce1fa96bd7735dddfdf864ba87c3a9553 Mon Sep 17 00:00:00 2001 From: wangw <271502003@qq.com> Date: Mon, 1 Apr 2024 15:48:13 +0800 Subject: [PATCH 3/4] fix datatype --- .../include/kernel/attributes/moe_info.h | 12 ++++++------ .../include/kernel/attributes/topk_info.h | 6 +++--- src/04kernel/include/kernel/collectors/moe.h | 4 ++-- src/04kernel/include/kernel/collectors/topk.h | 4 ++-- src/04kernel/src/attributes/moe_info.cc | 4 ++-- src/04kernel/src/attributes/topk_info.cc | 2 +- src/04kernel/src/kernels/moe/cpu_kernel.cc | 10 +++++----- src/04kernel/src/kernels/topk/cpu_kernel.cc | 6 +++--- src/04kernel/test/kernels/topk/test_cpu.cpp | 4 ++-- .../include/computation/operators/moe.h | 8 ++++---- .../include/computation/operators/topk.h | 4 ++-- src/07onnx/src/operators/topk.cc | 2 +- src/08-02moe/include/{ => moe}/operators.h | 0 src/08-02moe/src/operators.cpp | 2 +- src/08-02moe/src/operators/moe.cc | 19 ++++++++++++------- src/08-02moe/src/operators/moe.hh | 8 ++++---- src/08-02moe/test/test_moe.cpp | 6 +++--- src/09python_ffi/CMakeLists.txt | 2 +- src/09python_ffi/src/main.cpp | 2 ++ 19 files changed, 56 insertions(+), 49 deletions(-) rename src/08-02moe/include/{ => moe}/operators.h (100%) diff --git a/src/04kernel/include/kernel/attributes/moe_info.h b/src/04kernel/include/kernel/attributes/moe_info.h index 79eaa7b2..3e46b505 100644 --- a/src/04kernel/include/kernel/attributes/moe_info.h +++ b/src/04kernel/include/kernel/attributes/moe_info.h @@ -6,17 +6,17 @@ namespace refactor::kernel { struct AssignPosInfo { - uint32_t top, expert_num; - uint32_t elementSize; + int64_t top, expert_num; + int64_t elementSize; - AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate); + AssignPosInfo(int64_t top, int64_t expert_num, Tensor const &gate); }; struct ReorderInfo{ bool scatter; - uint32_t top; - uint32_t blockNum, blockSize; - ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs); + int64_t top; + int64_t blockNum, blockSize; + ReorderInfo(bool scatter, int64_t top, TensorRefs inputs); }; }// namespace refactor::kernel diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h index a9210078..491810d1 100644 --- a/src/04kernel/include/kernel/attributes/topk_info.h +++ b/src/04kernel/include/kernel/attributes/topk_info.h @@ -7,12 +7,12 @@ namespace refactor::kernel { struct TopKInfo { - uint8_t topk; - uint8_t axis; + int64_t topk; + int64_t axis; size_t in_stride, in_stride_pre_axis, out_stride_pre_axis; size_t elem_size, axis_elem_size; - TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input); + TopKInfo(int64_t topk, int64_t axis, Tensor const &input); size_t getElementSize() const {return elem_size;} size_t getAxisElementSize()const { return axis_elem_size;} size_t getInStride()const{return in_stride;} diff --git a/src/04kernel/include/kernel/collectors/moe.h b/src/04kernel/include/kernel/collectors/moe.h index 457de919..258450dc 100644 --- a/src/04kernel/include/kernel/collectors/moe.h +++ b/src/04kernel/include/kernel/collectors/moe.h @@ -16,8 +16,8 @@ namespace refactor::kernel { struct ReorderCollector final : public InfoCollector { bool scatter; - uint32_t topk; - constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept + int64_t topk; + constexpr ReorderCollector(decltype(_target) target, bool scatter, int64_t topk) noexcept : InfoCollector(target) ,scatter(scatter), topk(topk){} std::vector diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h index c4d8490f..3e0dc288 100644 --- a/src/04kernel/include/kernel/collectors/topk.h +++ b/src/04kernel/include/kernel/collectors/topk.h @@ -6,9 +6,9 @@ namespace refactor::kernel { struct TopKCollector final : public InfoCollector { - uint32_t topk, axis; + int64_t topk, axis; - constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept + constexpr TopKCollector(decltype(_target) target, int64_t topk, int64_t axis_) noexcept : InfoCollector(target), topk(topk), axis(axis_) {} std::vector diff --git a/src/04kernel/src/attributes/moe_info.cc b/src/04kernel/src/attributes/moe_info.cc index f5b0b677..829c54ae 100644 --- a/src/04kernel/src/attributes/moe_info.cc +++ b/src/04kernel/src/attributes/moe_info.cc @@ -3,10 +3,10 @@ namespace refactor::kernel { -AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate):\ +AssignPosInfo::AssignPosInfo(int64_t top, int64_t expert_num, Tensor const &gate):\ top(top), expert_num(expert_num),elementSize(gate.elementsSize()){} -ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs):\ +ReorderInfo::ReorderInfo(bool scatter, int64_t top, TensorRefs inputs):\ scatter(scatter), top(top),blockNum(inputs[1].get().elementsSize()), blockSize(inputs[0].get().strides()[0]){} diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc index 12ab16bb..52032db9 100644 --- a/src/04kernel/src/attributes/topk_info.cc +++ b/src/04kernel/src/attributes/topk_info.cc @@ -3,7 +3,7 @@ namespace refactor::kernel { -TopKInfo::TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input):topk(topk), +TopKInfo::TopKInfo(int64_t topk, int64_t axis, Tensor const &input):topk(topk), axis(axis), in_stride(input.strides()[axis]), in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]), diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.cc b/src/04kernel/src/kernels/moe/cpu_kernel.cc index 215e8765..ef4f77d9 100644 --- a/src/04kernel/src/kernels/moe/cpu_kernel.cc +++ b/src/04kernel/src/kernels/moe/cpu_kernel.cc @@ -25,16 +25,16 @@ namespace refactor::kernel { auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace { using namespace runtime; return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { - auto gate = reinterpret_cast(inputs[0]); + auto gate = reinterpret_cast(inputs[0]); - auto expert_cnt = reinterpret_cast(outputs[0]);//T - auto pos = reinterpret_cast(outputs[1]); + auto expert_cnt = reinterpret_cast(outputs[0]);//T + auto pos = reinterpret_cast(outputs[1]); std::memset(expert_cnt, 0, info.expert_num); for (size_t i = 0; i < info.elementSize; i ++){ ASSERT (gate[i] >= 0 && gate[i] < info.expert_num, "gate exceeds expert idx scope!"); expert_cnt[gate[i]] ++; } - std::vector expert_accumlate; + std::vector expert_accumlate; expert_accumlate.assign(info.expert_num, 0); for (size_t i=0; iinfo](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { auto input = reinterpret_cast(inputs[0]); - auto pos = reinterpret_cast(inputs[1]); + auto pos = reinterpret_cast(inputs[1]); auto dstVal = reinterpret_cast(outputs[0]);//T for(size_t i = 0; i(inputs[0]); auto dstVal = reinterpret_cast(outputs[0]);//T - auto dstIndex = reinterpret_cast(outputs[1]); + auto dstIndex = reinterpret_cast(outputs[1]); size_t M = info.getElementSize() / info.getAxisElementSize(); @@ -40,7 +40,7 @@ namespace refactor::kernel { auto outStride2 = inStride2; for(size_t m = 0; m < M; m ++){ - using PairType = std::pair; + using PairType = std::pair; std::list list; for(size_t n = 0; n < N; n++){ auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2; @@ -49,7 +49,7 @@ namespace refactor::kernel { list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;}); size_t offset = m /inStride2 * outStride1 + m % inStride2; - std::for_each_n(list.begin(), (uint32_t)info.topk, + std::for_each_n(list.begin(), (int64_t)info.topk, [&](auto &elem) { dstVal[offset] = elem.first; dstIndex[offset] = elem.second; diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp index b0dcaa80..cea4e066 100644 --- a/src/04kernel/test/kernels/topk/test_cpu.cpp +++ b/src/04kernel/test/kernels/topk/test_cpu.cpp @@ -9,7 +9,7 @@ TEST(kernel, TopKCpu) { // build routine auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4}); auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3}); - auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3}); + auto outputTensor1 = Tensor::share(DataType::I64, Shape{3, 3}); auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor)); ASSERT_TRUE(kernel); @@ -28,7 +28,7 @@ TEST(kernel, TopKCpu) { // check std::vector expectVal = {3,2,1,7,6,5,11,10,9}; - std::vector expectIdx = {3,2,1,3,2,1,3,2,1}; + std::vector expectIdx = {3,2,1,3,2,1,3,2,1}; std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<shape[axis_] = DimExpr(topk); - ans[1] = Tensor::share(input.dataType, input.shape, dependencies); + ans[1] = Tensor::share(DataType::I64, input.shape, dependencies); ans[1]->shape[axis_] = DimExpr(topk); return Ok(Tensors{std::move(ans)}); } diff --git a/src/08-02moe/include/operators.h b/src/08-02moe/include/moe/operators.h similarity index 100% rename from src/08-02moe/include/operators.h rename to src/08-02moe/include/moe/operators.h diff --git a/src/08-02moe/src/operators.cpp b/src/08-02moe/src/operators.cpp index 887c70ee..5db39632 100644 --- a/src/08-02moe/src/operators.cpp +++ b/src/08-02moe/src/operators.cpp @@ -1,4 +1,4 @@ -#include "operators.h" +#include "moe/operators.h" #include "operators/moe.hh" namespace refactor::moe { diff --git a/src/08-02moe/src/operators/moe.cc b/src/08-02moe/src/operators/moe.cc index be971795..68bfeff5 100644 --- a/src/08-02moe/src/operators/moe.cc +++ b/src/08-02moe/src/operators/moe.cc @@ -4,7 +4,7 @@ namespace refactor::moe { - AssignPos::AssignPos(uint32_t topk, uint32_t numExperts) : Operator() ,topk(topk), numExperts(numExperts){} + AssignPos::AssignPos(Int topk, Int numExperts) : Operator() ,topk(topk), numExperts(numExperts){} auto AssignPos::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { auto topk = attributes["topk"].int_(); @@ -23,8 +23,10 @@ namespace refactor::moe { EXPECT_SIZE(1) auto const &gate = inputs[0]; - - if (gate.dataType != DataType::I16) { + if(topk < 0 || numExperts < 0 || topk > numExperts){ + return Err(InferError(ERROR_MSG("topk or numExperts is error"))); + } + if (gate.dataType != DataType::I64) { return Err(InferError(ERROR_MSG("Input data type not support"))); } @@ -37,7 +39,7 @@ namespace refactor::moe { return std::make_unique(topk, numExperts); } - Reorder::Reorder(bool scatter, uint32_t topk, uint32_t dim) : Operator() ,scatter(scatter), top(topk), dim(dim){} + Reorder::Reorder(bool scatter, Int topk, Int dim) : Operator() ,scatter(scatter), top(topk), dim(dim){} auto Reorder::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { auto topk = attributes["topk"].int_(); @@ -59,12 +61,15 @@ namespace refactor::moe { auto const &pos = inputs[1]; if (dim != 0) return Err(InferError(ERROR_MSG("dim is not right!"))); - if(scatter && input.elementsSize() * top != pos.elementsSize()) + if(top < 0 ){ + return Err(InferError(ERROR_MSG("topkis error"))); + } + if(scatter && input.elementsSize()/input.shape[input.shape.size()-1].value() * top != pos.elementsSize()) return Err(InferError(ERROR_MSG("Inputs data size are not right!"))); - else if(!scatter && input.elementsSize() != pos.elementsSize()) + else if(!scatter && input.elementsSize()/input.shape[input.shape.size()-1].value() != pos.elementsSize()) return Err(InferError(ERROR_MSG("Inputs data size are not right!"))); - if (pos.dataType != DataType::I16) { + if (pos.dataType != DataType::I64) { return Err(InferError(ERROR_MSG("Input data type not support"))); } diff --git a/src/08-02moe/src/operators/moe.hh b/src/08-02moe/src/operators/moe.hh index 96f82709..42501bdb 100644 --- a/src/08-02moe/src/operators/moe.hh +++ b/src/08-02moe/src/operators/moe.hh @@ -7,8 +7,8 @@ namespace refactor::moe { using namespace frontend; struct AssignPos final : public Operator { - uint32_t topk, numExperts; - explicit AssignPos(uint32_t topk, uint32_t numExperts); + Int topk, numExperts; + explicit AssignPos(Int topk, Int numExperts); static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); @@ -21,8 +21,8 @@ namespace refactor::moe { struct Reorder final : public Operator { bool scatter; - uint32_t top, dim; - explicit Reorder(bool scatter, uint32_t topk, uint32_t dim); + Int top, dim; + explicit Reorder(bool scatter, Int topk, Int dim); static OpBox build(ModelContext const &, std::string_view, Attributes); static size_t typeId(); diff --git a/src/08-02moe/test/test_moe.cpp b/src/08-02moe/test/test_moe.cpp index d4946573..c735d803 100644 --- a/src/08-02moe/test/test_moe.cpp +++ b/src/08-02moe/test/test_moe.cpp @@ -1,5 +1,5 @@ #include "../src/operators/moe.hh" -#include "operators.h" +#include "moe/operators.h" #include using namespace refactor; @@ -9,7 +9,7 @@ TEST(infer, AssignPos) { moe::register_(); auto edges = Edges{ - {Tensor::share(DataType::I16, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2 + {Tensor::share(DataType::I64, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2 }; count_t inputs[]{0}; auto infered = AssignPos(2,4).infer(TensorRefs(edges, inputs), {true}); @@ -20,6 +20,6 @@ TEST(infer, AssignPos) { ASSERT_EQ(expert_cnt->dataType, DataType::F32); ASSERT_EQ(expert_cnt->shape, (Shape{DimExpr(4)})); auto pos = std::move(outputs[1]); - ASSERT_EQ(pos->dataType, DataType::I16); + ASSERT_EQ(pos->dataType, DataType::I64); ASSERT_EQ(pos->shape, (Shape{DimExpr(16)})); } diff --git a/src/09python_ffi/CMakeLists.txt b/src/09python_ffi/CMakeLists.txt index ccce34d3..50fc535a 100644 --- a/src/09python_ffi/CMakeLists.txt +++ b/src/09python_ffi/CMakeLists.txt @@ -7,7 +7,7 @@ add_subdirectory(pybind11) file(GLOB_RECURSE PYFFI_SRC src/*.cc src/*.cpp) pybind11_add_module(python_ffi SHARED ${PYFFI_SRC}) -target_link_libraries(python_ffi PRIVATE onnx llm communication) +target_link_libraries(python_ffi PRIVATE onnx llm communication moe) target_include_directories(python_ffi PRIVATE include) # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a diff --git a/src/09python_ffi/src/main.cpp b/src/09python_ffi/src/main.cpp index 48a4ea6f..d2d2c8ce 100644 --- a/src/09python_ffi/src/main.cpp +++ b/src/09python_ffi/src/main.cpp @@ -3,6 +3,7 @@ #include "import.h" #include "llm/operators.h" #include "onnx/operators.h" +#include "moe/operators.h" #include // keep this line to convert stl types namespace py = pybind11; @@ -17,6 +18,7 @@ namespace refactor::python_ffi { onnx::register_(); llm::register_(); communication::register_(); + moe::register_(); // clang-format off From cf4e92c540aff02bc7f03d3b11d41d5ea345f7ae Mon Sep 17 00:00:00 2001 From: wangw <271502003@qq.com> Date: Mon, 6 May 2024 10:23:43 +0800 Subject: [PATCH 4/4] add topk cuda kernel --- .../cuda/include/kernel/cuda/topk.cuh | 19 ++++ src/04kernel/cuda/src/topk.cu | 103 ++++++++++++++++++ .../include/kernel/attributes/topk_info.h | 21 ++-- src/04kernel/include/kernel/collectors/topk.h | 4 +- src/04kernel/src/attributes/topk_info.cc | 16 +-- src/04kernel/src/kernels/topk/cpu_kernel.cc | 30 ++--- src/04kernel/src/kernels/topk/cuda_kernel.cc | 57 ++++++++++ src/04kernel/src/kernels/topk/cuda_kernel.hh | 26 +++++ src/04kernel/test/kernels/topk/test_cpu.cpp | 4 +- src/04kernel/test/kernels/topk/test_cuda.cpp | 68 ++++++++++++ .../include/computation/operators/topk.h | 4 +- src/07onnx/src/operators/topk.cc | 2 +- 12 files changed, 314 insertions(+), 40 deletions(-) create mode 100644 src/04kernel/cuda/include/kernel/cuda/topk.cuh create mode 100644 src/04kernel/cuda/src/topk.cu create mode 100644 src/04kernel/src/kernels/topk/cuda_kernel.cc create mode 100644 src/04kernel/src/kernels/topk/cuda_kernel.hh create mode 100644 src/04kernel/test/kernels/topk/test_cuda.cpp diff --git a/src/04kernel/cuda/include/kernel/cuda/topk.cuh b/src/04kernel/cuda/include/kernel/cuda/topk.cuh new file mode 100644 index 00000000..b06cfc00 --- /dev/null +++ b/src/04kernel/cuda/include/kernel/cuda/topk.cuh @@ -0,0 +1,19 @@ +#ifndef KERNEL_CUDA_TOPK_CUH +#define KERNEL_CUDA_TOPK_CUH + +#include "threads_distributer.cuh" + +namespace refactor::kernel::cuda { + + void launchTopK( + KernelLaunchParameters const ¶ms, + float const *data, float *dstVal, unsigned int *dstIdx, + unsigned int topk, + unsigned int stride_axis, + unsigned int stride_in_pre, + unsigned int stride_out_pre, + unsigned int size_axis); + +}// namespace refactor::kernel::cuda + +#endif// KERNEL_CUDA_TOPK_CUH diff --git a/src/04kernel/cuda/src/topk.cu b/src/04kernel/cuda/src/topk.cu new file mode 100644 index 00000000..6b247ead --- /dev/null +++ b/src/04kernel/cuda/src/topk.cu @@ -0,0 +1,103 @@ +#include "kernel/cuda/topk.cuh" +#include "macro.cuh" +#include +#include +#include + +namespace refactor::kernel::cuda { + +using PairType = thrust::pair; + +struct ComparePair { + __host__ __device__ + bool operator()(const PairType& a, const PairType& b) const { + return a.first > b.first; + } +}; + +/* + __device__ + void process_element(unsigned int n, float *__restrict__ dstVal, + uint32_t *__restrict__ dstIdx, + PairType *list, + uint32_t stride_axis, + uint32_t init_offset){ + for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, + step = blockDim.x * gridDim.x; + tid < n; + tid += step) { + uint32_t offset = init_offset + stride_axis * tid; + dstVal[offset] = list[tid].first; + dstIdx[offset] = list[tid].second; + } + } +*/ + + + + __global__ static void TopKKernel( + unsigned long long n, + float const *__restrict__ data, + float *__restrict__ dstVal, + uint32_t *__restrict__ dstIdx, + uint32_t topk, + uint32_t stride_axis, + uint32_t stride_in_pre, + uint32_t stride_out_pre, + unsigned int size) { + for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, + step = blockDim.x * gridDim.x; + tid < n; + tid += step) { + PairType *list = new PairType[size]; + + for(uint32_t i = 0; i < size; i++){ + uint32_t srcIdx = tid /stride_axis * stride_in_pre + tid % stride_axis + i * stride_axis; + + list[i] = PairType(data[srcIdx], i); + } + // thrust没有partial_sort算法,可尝试优化:分成size/topk组,每组取一个最大值 + thrust::sort(thrust::device, list, list + size, ComparePair()); + + + uint32_t init_offset = tid /stride_axis * stride_out_pre + tid % stride_axis; + for (uint32_t i = 0; i < topk; i++) + { + uint32_t offset = init_offset + stride_axis * i; + dstVal[offset] = list[i].first; + dstIdx[offset] = list[i].second; + } + + delete[] list; + } + } + + + + void launchTopK( + KernelLaunchParameters const ¶ms, + float const *data, float *dstVal, uint32_t *dstIdx, + uint32_t topk, + uint32_t stride_axis, + uint32_t stride_in_pre, + uint32_t stride_out_pre, + unsigned int size_axis) { + + TopKKernel<<< + params.gridSize, + params.blockSize, + 0, + reinterpret_cast(params.stream)>>>( + params.n, + (data), + (dstVal), + (dstIdx), + topk, + stride_axis, + stride_in_pre, + stride_out_pre, + size_axis); + + } + +}// namespace refactor::kernel::cuda diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h index 491810d1..5cfc5ee6 100644 --- a/src/04kernel/include/kernel/attributes/topk_info.h +++ b/src/04kernel/include/kernel/attributes/topk_info.h @@ -6,18 +6,17 @@ namespace refactor::kernel { struct TopKInfo { + struct Stride{ + dim_t axis, in_pre, out_pre; + }; + struct Size{ + dim_t axis, except_axis; + }; + uint32_t topk; + Stride stride; + Size size; - int64_t topk; - int64_t axis; - size_t in_stride, in_stride_pre_axis, out_stride_pre_axis; - size_t elem_size, axis_elem_size; - - TopKInfo(int64_t topk, int64_t axis, Tensor const &input); - size_t getElementSize() const {return elem_size;} - size_t getAxisElementSize()const { return axis_elem_size;} - size_t getInStride()const{return in_stride;} - size_t getInStridePreAxis()const{return in_stride_pre_axis;} - size_t getOutStridePreAxis()const {return out_stride_pre_axis;} + TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input); }; }// namespace refactor::kernel diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h index 3e0dc288..c4d8490f 100644 --- a/src/04kernel/include/kernel/collectors/topk.h +++ b/src/04kernel/include/kernel/collectors/topk.h @@ -6,9 +6,9 @@ namespace refactor::kernel { struct TopKCollector final : public InfoCollector { - int64_t topk, axis; + uint32_t topk, axis; - constexpr TopKCollector(decltype(_target) target, int64_t topk, int64_t axis_) noexcept + constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept : InfoCollector(target), topk(topk), axis(axis_) {} std::vector diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc index 52032db9..532f385d 100644 --- a/src/04kernel/src/attributes/topk_info.cc +++ b/src/04kernel/src/attributes/topk_info.cc @@ -3,12 +3,14 @@ namespace refactor::kernel { -TopKInfo::TopKInfo(int64_t topk, int64_t axis, Tensor const &input):topk(topk), - axis(axis), - in_stride(input.strides()[axis]), - in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]), - out_stride_pre_axis(in_stride_pre_axis/input.shape[axis]*topk), - elem_size(input.elementsSize()), - axis_elem_size(input.shape[axis]){} +TopKInfo::TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input){ + this->topk =topk; + auto tmpStride = axis == 0 ? 0 : input.strides()[axis - 1]; + this->stride = {input.strides()[axis],\ + tmpStride,\ + tmpStride/input.shape[axis]*topk}; + this->size = {input.shape[axis], \ + input.elementsSize()/input.shape[axis]}; +} } diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.cc b/src/04kernel/src/kernels/topk/cpu_kernel.cc index 06e1683a..e695e3f7 100644 --- a/src/04kernel/src/kernels/topk/cpu_kernel.cc +++ b/src/04kernel/src/kernels/topk/cpu_kernel.cc @@ -1,6 +1,6 @@ #include "cpu_kernel.hh" #include -#include +#include namespace refactor::kernel { using K = TopKCpu; @@ -29,31 +29,31 @@ namespace refactor::kernel { auto src = reinterpret_cast(inputs[0]); auto dstVal = reinterpret_cast(outputs[0]);//T - auto dstIndex = reinterpret_cast(outputs[1]); + auto dstIndex = reinterpret_cast(outputs[1]); - size_t M = info.getElementSize() / info.getAxisElementSize(); - size_t N = info.getAxisElementSize(); - auto inStride1 = info.getInStridePreAxis(); - auto inStride2 = info.getInStride(); - auto outStride1 = info.getOutStridePreAxis(); - auto outStride2 = inStride2; + size_t M = info.size.except_axis; + size_t N = info.size.axis; for(size_t m = 0; m < M; m ++){ - using PairType = std::pair; - std::list list; + using PairType = std::pair; + std::vector list; for(size_t n = 0; n < N; n++){ - auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2; + auto srcIdx = m /info.stride.axis * info.stride.in_pre + m % info.stride.axis + n * info.stride.axis; list.push_back({src[srcIdx],n}); } - list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;}); + //list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;}); + std::partial_sort(list.begin(), \ + list.begin() + info.topk, \ + list.end(), \ + [](const PairType &a, const PairType &b)->bool{return a.first > b.first;}); - size_t offset = m /inStride2 * outStride1 + m % inStride2; - std::for_each_n(list.begin(), (int64_t)info.topk, + size_t offset = m /info.stride.axis * info.stride.out_pre + m % info.stride.axis; + std::for_each_n(list.begin(), (uint32_t)info.topk, [&](auto &elem) { dstVal[offset] = elem.first; dstIndex[offset] = elem.second; - offset += outStride2; + offset += info.stride.axis; }); } }; diff --git a/src/04kernel/src/kernels/topk/cuda_kernel.cc b/src/04kernel/src/kernels/topk/cuda_kernel.cc new file mode 100644 index 00000000..acfa4733 --- /dev/null +++ b/src/04kernel/src/kernels/topk/cuda_kernel.cc @@ -0,0 +1,57 @@ +#include "cuda_kernel.hh" + +#ifdef USE_CUDA +#include "kernel/cuda/threads_distributer.cuh" +#include "kernel/cuda/topk.cuh" +#include +#include +#include +#include +#endif + +namespace refactor::kernel { + using K = TopKCuda; + + K::TopKCuda(TopKInfo info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(TopKInfo info) noexcept -> KernelBox { +#ifndef USE_CUDA + return nullptr; +#endif + + return std::make_unique(std::move(info)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing concat operation using CUDA"; + } + +#ifdef USE_CUDA + auto K::lower(Resources &) const noexcept -> RoutineWorkspace { + //return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs){ + + //} + return [info = this->info, params = cuda::ThreadsDistributer()(info.size.except_axis)] + (Resources &, void *workspace, void const *const *inputs, void *const *outputs) { + cuda::launchTopK( + params, + reinterpret_cast(inputs[0]), + reinterpret_cast(outputs[0]), + reinterpret_cast(outputs[1]), + info.topk, + info.stride.axis, + info.stride.in_pre, + info.stride.out_pre, + info.size.axis); + }; + } +#endif +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/topk/cuda_kernel.hh b/src/04kernel/src/kernels/topk/cuda_kernel.hh new file mode 100644 index 00000000..069bbd44 --- /dev/null +++ b/src/04kernel/src/kernels/topk/cuda_kernel.hh @@ -0,0 +1,26 @@ +#ifndef KERNEL_TOPK_CUDA_KERNEL_HH +#define KERNEL_TOPK_CUDA_KERNEL_HH + +#include "kernel/attributes/topk_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct TopKCuda final : public Kernel { + TopKInfo info; + + explicit TopKCuda(TopKInfo) noexcept; + + static KernelBox build(TopKInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_CUDA + RoutineWorkspace lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_TOPK_CUDA_KERNEL_HH diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp index cea4e066..b0dcaa80 100644 --- a/src/04kernel/test/kernels/topk/test_cpu.cpp +++ b/src/04kernel/test/kernels/topk/test_cpu.cpp @@ -9,7 +9,7 @@ TEST(kernel, TopKCpu) { // build routine auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4}); auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3}); - auto outputTensor1 = Tensor::share(DataType::I64, Shape{3, 3}); + auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3}); auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor)); ASSERT_TRUE(kernel); @@ -28,7 +28,7 @@ TEST(kernel, TopKCpu) { // check std::vector expectVal = {3,2,1,7,6,5,11,10,9}; - std::vector expectIdx = {3,2,1,3,2,1,3,2,1}; + std::vector expectIdx = {3,2,1,3,2,1,3,2,1}; std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout< +#include + +using namespace refactor; +using namespace kernel; +using namespace hardware; + +TEST(kernel, TopKCuda) { + // build routine + auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4}); + std::vector> outputTensors{ + Tensor::share(DataType::F32, Shape{3, 3}), + Tensor::share(DataType::U32, Shape{3, 3})}; + + auto kCpu = TopKCpu::build(TopKInfo(3,1, *inputTensor)); + auto kCuda = TopKCuda::build(TopKInfo(3,1, *inputTensor)); + ASSERT_TRUE(kCpu); + ASSERT_TRUE(kCuda); + auto res = runtime::Resources(); + auto rCpu = kCpu->lower(res).routine; + auto rCuda = kCuda->lower(res).routine; + + // device malloc + auto &dev = *device::init(Device::Type::Nvidia, 0, ""); + Arc + gpuIn = dev.malloc(inputTensor->bytesSize()), + gpuOuts[]{ + dev.malloc(outputTensors[0]->bytesSize()), + dev.malloc(outputTensors[1]->bytesSize()), + }; + // put input data + std::vector data(inputTensor->elementsSize()); + + std::vector outCpu1(outputTensors[0]->elementsSize()); + std::vector outCpu2(outputTensors[1]->elementsSize()); + + + std::vector out1(outputTensors[0]->elementsSize()); + std::vector out2(outputTensors[1]->elementsSize()); + + std::iota(data.begin(), data.end(), 0); + gpuIn->copyFromHost(data.data(), inputTensor->bytesSize()); + // inference + { + void const *inputs[]{*gpuIn}; + void *outputs[]{*gpuOuts[0], *gpuOuts[1]}; + rCuda(res, nullptr, inputs, outputs); + } + { + void const *inputs[]{data.data()}; + void *outputs[]{outCpu1.data(), outCpu2.data()}; + rCpu(res, nullptr, inputs, outputs); + } + // check + + gpuOuts[0]->copyToHost(out1.data(), outputTensors[0]->bytesSize()); + EXPECT_EQ(out1, outCpu1); + gpuOuts[1]->copyToHost(out2.data(), outputTensors[1]->bytesSize()); + EXPECT_EQ(out2, outCpu2); + +} + +#endif diff --git a/src/05computation/include/computation/operators/topk.h b/src/05computation/include/computation/operators/topk.h index 8ecbdfed..d5c401f4 100644 --- a/src/05computation/include/computation/operators/topk.h +++ b/src/05computation/include/computation/operators/topk.h @@ -6,8 +6,8 @@ namespace refactor::computation { struct TopK final : public Operator { - int64_t topk, axis; - constexpr TopK(int64_t topk, int64_t axis) noexcept : topk(topk), axis(axis){} + uint32_t topk, axis; + constexpr TopK(uint32_t topk, uint32_t axis) noexcept : topk(topk), axis(axis){} static size_t typeId() noexcept; size_t opTypeId() const noexcept final; diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc index 98653472..c1e908e6 100644 --- a/src/07onnx/src/operators/topk.cc +++ b/src/07onnx/src/operators/topk.cc @@ -40,7 +40,7 @@ namespace refactor::onnx { auto dependencies = extractDependency(inputs); ans[0] = Tensor::share(input.dataType, input.shape, dependencies); ans[0]->shape[axis_] = DimExpr(topk); - ans[1] = Tensor::share(DataType::I64, input.shape, dependencies); + ans[1] = Tensor::share(DataType::U32, input.shape, dependencies); ans[1]->shape[axis_] = DimExpr(topk); return Ok(Tensors{std::move(ans)}); }