From de18a1bae90f5b08f63abb72c93b0428113e39e2 Mon Sep 17 00:00:00 2001
From: wangw <271502003@qq.com>
Date: Sat, 9 Mar 2024 20:04:42 +0800
Subject: [PATCH 1/4] Add topk operator and cpu kernel.

---
 .../include/kernel/attributes/topk_info.h     |  25 ++
 src/04kernel/include/kernel/collectors/topk.h |  20 ++
 src/04kernel/src/attributes/topk_info.cc      |  14 ++
 src/04kernel/src/collectors/topk.cc           |  30 +++
 src/04kernel/src/kernels/topk/cpu_kernel.cc   |  61 +++++
 src/04kernel/src/kernels/topk/cpu_kernel.hh   |  23 ++
 src/04kernel/test/kernels/topk/test_cpu.cpp   | 238 ++++++++++++++++++
 .../include/computation/operators/topk.h      |  20 ++
 src/05computation/src/operators/topk.cc       |  17 ++
 src/07onnx/src/operators.cpp                  |   2 +
 src/07onnx/src/operators/topk.cc              |  55 ++++
 src/07onnx/src/operators/topk.hh              |  23 ++
 12 files changed, 528 insertions(+)
 create mode 100644 src/04kernel/include/kernel/attributes/topk_info.h
 create mode 100644 src/04kernel/include/kernel/collectors/topk.h
 create mode 100644 src/04kernel/src/attributes/topk_info.cc
 create mode 100644 src/04kernel/src/collectors/topk.cc
 create mode 100644 src/04kernel/src/kernels/topk/cpu_kernel.cc
 create mode 100644 src/04kernel/src/kernels/topk/cpu_kernel.hh
 create mode 100644 src/04kernel/test/kernels/topk/test_cpu.cpp
 create mode 100644 src/05computation/include/computation/operators/topk.h
 create mode 100644 src/05computation/src/operators/topk.cc
 create mode 100644 src/07onnx/src/operators/topk.cc
 create mode 100644 src/07onnx/src/operators/topk.hh
diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h
new file mode 100644
index 00000000..a9210078
--- /dev/null
+++ b/src/04kernel/include/kernel/attributes/topk_info.h
@@ -0,0 +1,25 @@
+#ifndef KERNEL_TOPK_INFO_H
+#define KERNEL_TOPK_INFO_H
+
+#include "../tensor.h"
+
+namespace refactor::kernel {
+
+    struct TopKInfo {
+        
+        uint8_t topk;
+        uint8_t axis;
+        size_t in_stride, in_stride_pre_axis, out_stride_pre_axis;
+        size_t elem_size, axis_elem_size;
+        
+        TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input);
+        size_t getElementSize() const {return  elem_size;}
+        size_t getAxisElementSize()const { return axis_elem_size;}
+        size_t getInStride()const{return in_stride;}
+        size_t getInStridePreAxis()const{return in_stride_pre_axis;}
+        size_t getOutStridePreAxis()const {return out_stride_pre_axis;}
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_INFO_H
diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h
new file mode 100644
index 00000000..c4d8490f
--- /dev/null
+++ b/src/04kernel/include/kernel/collectors/topk.h
@@ -0,0 +1,20 @@
+#ifndef KERNEL_TOPK_H
+#define KERNEL_TOPK_H
+
+#include "../collector.h"
+
+namespace refactor::kernel {
+
+    struct TopKCollector final : public InfoCollector {
+        uint32_t topk, axis;
+
+        constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept
+            : InfoCollector(target), topk(topk), axis(axis_) {}
+
+        std::vector<KernelBox>
+        filter(TensorRefs inputs, TensorRefs outputs) const final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_H
diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc
new file mode 100644
index 00000000..12ab16bb
--- /dev/null
+++ b/src/04kernel/src/attributes/topk_info.cc
@@ -0,0 +1,14 @@
+#include "kernel/attributes/topk_info.h"
+#include <numeric>
+
+namespace refactor::kernel {
+
+TopKInfo::TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input):topk(topk), 
+            axis(axis),
+            in_stride(input.strides()[axis]),
+            in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]),
+            out_stride_pre_axis(in_stride_pre_axis/input.shape[axis]*topk),
+            elem_size(input.elementsSize()),
+            axis_elem_size(input.shape[axis]){}        
+
+}
diff --git a/src/04kernel/src/collectors/topk.cc b/src/04kernel/src/collectors/topk.cc
new file mode 100644
index 00000000..91a97427
--- /dev/null
+++ b/src/04kernel/src/collectors/topk.cc
@@ -0,0 +1,30 @@
+#include "kernel/collectors/topk.h"
+#include "../kernels/topk/cpu_kernel.hh"
+#include "kernel/attributes/topk_info.h"
+//#include "../kernels/topk/cuda_kernel.hh"
+
+namespace refactor::kernel {
+
+    std::vector<KernelBox>
+    TopKCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
+        TopKInfo info(topk, axis, inputs[0]);
+        std::vector<KernelBox> ans;
+        switch (_target) {
+            case decltype(_target)::Cpu:
+                if (auto ptr = TopKCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            //todo ：暂时用cpu的实现
+            case decltype(_target)::Nvidia:
+                if (auto ptr = TopKCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            default:
+                UNREACHABLEX(void, "Unknown target");
+        }
+        return ans;
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.cc b/src/04kernel/src/kernels/topk/cpu_kernel.cc
new file mode 100644
index 00000000..7330e44d
--- /dev/null
+++ b/src/04kernel/src/kernels/topk/cpu_kernel.cc
@@ -0,0 +1,61 @@
+#include "cpu_kernel.hh"
+#include <execution>
+#include <list>
+
+namespace refactor::kernel {
+    using K = TopKCpu;
+
+    K::TopKCpu(TopKInfo info) noexcept
+        : Kernel(), info(std::move(info)) {}
+
+    auto K::build(TopKInfo info) noexcept -> KernelBox {
+        return std::make_unique<K>(std::move(info));
+    }
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing topk operation on generic cpu";
+    }
+
+    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
+        using namespace runtime;
+        return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto src = reinterpret_cast<float const *>(inputs[0]);
+            
+            auto dstVal = reinterpret_cast<float*>(outputs[0]);//T
+            auto dstIndex = reinterpret_cast<uint32_t*>(outputs[1]);
+                       
+            
+            size_t M = info.getElementSize() / info.getAxisElementSize();
+            size_t N = info.getAxisElementSize();
+            auto inStride1 = info.getInStridePreAxis();
+            auto inStride2 = info.getInStride();
+            auto outStride1 = info.getOutStridePreAxis();
+            auto outStride2 = inStride2;
+
+            for(size_t m = 0; m < M; m ++){
+                using PairType = std::pair<float, uint8_t>;
+                std::list<PairType> list;
+                for(size_t n = 0; n < N; n++){                    
+                    auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2;                    
+                    list.push_back({src[srcIdx],n});                    
+                }
+                list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
+                
+                size_t offset = m /inStride2 * outStride1 + m % inStride2;
+                std::for_each_n(list.begin(), (uint32_t)info.topk,
+                            [&](auto &elem) {                                
+                                dstVal[offset] = elem.first;
+                                dstIndex[offset] = elem.second;
+                                offset += outStride2;
+                            });   
+            }
+        };
+    }
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.hh b/src/04kernel/src/kernels/topk/cpu_kernel.hh
new file mode 100644
index 00000000..75b2a4ce
--- /dev/null
+++ b/src/04kernel/src/kernels/topk/cpu_kernel.hh
@@ -0,0 +1,23 @@
+#ifndef KERNEL_TOPK_CPU_KERNEL_HH
+#define KERNEL_TOPK_CPU_KERNEL_HH
+
+#include "kernel/attributes/topk_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct TopKCpu final : public Kernel {
+        TopKInfo info;
+        explicit TopKCpu(TopKInfo info) noexcept;
+
+        static KernelBox build(TopKInfo info) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_CPU_KERNEL_HH
diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp
new file mode 100644
index 00000000..b0dcaa80
--- /dev/null
+++ b/src/04kernel/test/kernels/topk/test_cpu.cpp
@@ -0,0 +1,238 @@
+#include "../../../src/kernels/topk/cpu_kernel.hh"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+
+TEST(kernel, TopKCpu) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3});
+
+    auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+  
+    // check
+    std::vector<float> expectVal = {3,2,1,7,6,5,11,10,9};
+    std::vector<uint32_t> expectIdx = {3,2,1,3,2,1,3,2,1};
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+TEST(kernel, TopKCpu1) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 4, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 3, 2});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 3, 2});
+
+    auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {6,7,4,5,2,3,14,15,12,13,10,11};
+    std::vector<uint32_t> expectIdx = {3,3,2,2,1,1,3,3,2,2,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+TEST(kernel, TopKCpu2) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 4, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{1, 4, 2});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{1, 4, 2});
+
+    auto kernel = TopKCpu::build(TopKInfo(1,0, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {8,9,10,11,12,13,14,15};
+    std::vector<uint32_t> expectIdx = {1,1,1,1,1,1,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+
+TEST(kernel, TopKCpu3) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{1, 3, 2, 2});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{1, 3, 2, 2});
+
+    auto kernel = TopKCpu::build(TopKInfo(1,0, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {12, 13, 14, 15, 16, 17, 18,  19, 20,21, 22,23};
+    std::vector<uint32_t> expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+TEST(kernel, TopKCpu4) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 2, 2, 2});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 2, 2, 2});
+
+    auto kernel = TopKCpu::build(TopKInfo(2,1, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {8, 9, 10, 11,4,5,6,7,20,21,22,23,16,17,18,19};
+    std::vector<uint32_t> expectIdx = {2,2,2,2,1,1,1,1,2,2,2,2,1,1,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+
+TEST(kernel, TopKCpu5) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 3, 1, 2});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 3, 1, 2});
+
+    auto kernel = TopKCpu::build(TopKInfo(1,2, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {2,3,6,7,10,11,14,15,18,19,22,23};
+    std::vector<uint32_t> expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
+
+TEST(kernel, TopKCpu6) {
+    // build routine    
+    auto inputTensor = Tensor::share(DataType::F32, Shape{2, 3, 2, 2});
+    auto outputTensor0 = Tensor::share(DataType::F32, Shape{2, 3, 2, 1});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{2, 3, 2, 1});
+
+    auto kernel = TopKCpu::build(TopKInfo(1,3, *inputTensor));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float> ins(inputTensor->elementsSize());
+    std::vector<float>  out0(outputTensor0->elementsSize());
+    std::vector<uint32_t> out1(outputTensor1->elementsSize());
+
+    std::iota(ins.begin(), ins.end(), 0);
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    // check
+    std::vector<float> expectVal = {1,3,5,7,9,11,13,15,17,19,21,23};
+    std::vector<uint32_t> expectIdx = {1,1,1,1,1,1,1,1,1,1,1,1};
+    
+
+    for(size_t i=0;i< expectVal.size(); ++i){
+        EXPECT_EQ(expectVal[i], out0[i]);
+        EXPECT_EQ(expectIdx[i], out1[i]);
+    }
+}
\ No newline at end of file
diff --git a/src/05computation/include/computation/operators/topk.h b/src/05computation/include/computation/operators/topk.h
new file mode 100644
index 00000000..bdce6d6f
--- /dev/null
+++ b/src/05computation/include/computation/operators/topk.h
@@ -0,0 +1,20 @@
+#ifndef COMPUTATION_TOPK_H
+#define COMPUTATION_TOPK_H
+
+#include "../operator.h"
+
+namespace refactor::computation {
+
+    struct TopK final : public Operator {
+        uint32_t topk,axis;
+        constexpr TopK(uint32_t topk, uint32_t axis) noexcept : topk(topk), axis(axis){}
+
+        static size_t typeId() noexcept;
+        size_t opTypeId() const noexcept final;
+        std::string_view name() const noexcept final;
+        kernel::CollectorBox candidateKernels(Target) const noexcept final;
+    };
+
+}// namespace refactor::computation
+
+#endif// COMPUTATION_SPLIT_H
diff --git a/src/05computation/src/operators/topk.cc b/src/05computation/src/operators/topk.cc
new file mode 100644
index 00000000..f25e9a85
--- /dev/null
+++ b/src/05computation/src/operators/topk.cc
@@ -0,0 +1,17 @@
+#include "computation/operators/topk.h"
+#include "kernel/collectors/topk.h"
+
+namespace refactor::computation {
+
+    size_t TopK::typeId() noexcept {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+    size_t TopK::opTypeId() const noexcept { return typeId(); }
+    std::string_view TopK::name() const noexcept { return "TopK"; }
+    auto TopK::candidateKernels(Target target) const noexcept -> kernel::CollectorBox {
+        using Collector_ = kernel::TopKCollector;
+        return std::make_unique<Collector_>(target, topk, axis);
+    }
+
+}// namespace refactor::computation
diff --git a/src/07onnx/src/operators.cpp b/src/07onnx/src/operators.cpp
index 0981f720..8e50a810 100644
--- a/src/07onnx/src/operators.cpp
+++ b/src/07onnx/src/operators.cpp
@@ -38,6 +38,7 @@
 #include "operators/transpose.hh"
 #include "operators/unsqueeze.hh"
 #include "operators/where.hh"
+#include "operators/topk.hh"
 
 namespace refactor::onnx {
 
@@ -131,6 +132,7 @@ namespace refactor::onnx {
         REGISTER(Where                , Where                );
         REGISTER(HardSigmoid          , HardSigmoid          );
         REGISTER(Pad                  , Pad                  );
+        REGISTER(TopK                 , TopK                 );
         // clang-format on
 #undef REGISTER
     }
diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc
new file mode 100644
index 00000000..8b3b31c2
--- /dev/null
+++ b/src/07onnx/src/operators/topk.cc
@@ -0,0 +1,55 @@
+#include "common.h"
+#include "topk.hh"
+#include "computation/operators/topk.h"
+#include <execution>
+
+namespace refactor::onnx {
+    using Op = TopK;
+
+    Op::TopK(Int topk, Int axis):topk(topk), axis(axis){}
+
+    auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> OpBox {
+        auto axis = attributes["axis"].int_();
+        auto topk = attributes["topk"].int_();
+        return OpBox(std::make_unique<Op>(topk, axis));
+    }
+
+    auto Op::typeId() -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto Op::opTypeId() const -> size_t { return typeId(); }
+    auto Op::opTypeName() const -> std::string_view { return "TopK"; }
+
+    auto Op::infer(TensorRefs inputs, InferOptions const &options) const -> InferResult {
+            if (inputs.empty() || inputs.size() >= 2) {
+            return Err(InferError(ERROR_MSG("Input size error")));
+        }
+        auto const &input = inputs[0];
+        auto rank = input.rank();
+        auto axis_ = axis < 0 ? axis + rank : axis;
+        if (rank <= axis_) {
+            return Err(InferError(ERROR_MSG("axis error")));
+        }
+        if (topk < 0 || topk > input.shape[axis_].value()){
+            return Err(InferError(ERROR_MSG("topk error")));
+        }
+
+        Tensors ans(2, nullptr);
+        auto dependencies = extractDependency(inputs);
+        ans[0] = Tensor::share(input.dataType, input.shape, dependencies);
+        ans[0]->shape[axis_] = DimExpr(topk);
+        ans[1] = Tensor::share(input.dataType, input.shape, dependencies);            
+        ans[1]->shape[axis_] = DimExpr(topk);  
+        return Ok(Tensors{std::move(ans)});
+    }
+
+    auto Op::lower(TensorRefs inputs) const -> computation::OpBox {
+        using Op_ = computation::TopK;
+        auto rank = inputs[0].rank();
+        auto axis_ = axis < 0 ? axis + rank : axis;
+        return std::make_unique<Op_>(topk, axis_);
+    }
+
+}// namespace refactor::onnx
diff --git a/src/07onnx/src/operators/topk.hh b/src/07onnx/src/operators/topk.hh
new file mode 100644
index 00000000..2b86f5bb
--- /dev/null
+++ b/src/07onnx/src/operators/topk.hh
@@ -0,0 +1,23 @@
+#ifndef ONNX_TOPK_HH
+#define ONNX_TOPK_HH
+
+#include "frontend/operator.h"
+
+namespace refactor::onnx {
+    using namespace frontend;
+
+    struct TopK final : public Operator {
+        Int topk, axis;
+        TopK(Int topk, Int axis);
+
+        static size_t typeId();
+        static OpBox build(ModelContext const &, std::string_view, Attributes);
+        size_t opTypeId() const final;
+        std::string_view opTypeName() const final;
+        InferResult infer(TensorRefs, InferOptions const &) const final;
+        computation::OpBox lower(TensorRefs) const final;
+    };
+
+}// namespace refactor::onnx
+
+#endif// ONNX_WHERE_HH

From 5b33afdb5f6dd08d9d478ab380064750e2f8117e Mon Sep 17 00:00:00 2001
From: wangw <271502003@qq.com>
Date: Wed, 20 Mar 2024 21:18:50 +0800
Subject: [PATCH 2/4] add assign_pos\scatter operator and cpu kernel for moe.

---
 CMakeLists.txt                                |  1 +
 .../include/kernel/attributes/moe_info.h      | 24 ++++++
 src/04kernel/include/kernel/collectors/moe.h  | 29 +++++++
 src/04kernel/src/attributes/moe_info.cc       | 13 +++
 src/04kernel/src/collectors/moe.cc            | 51 ++++++++++++
 src/04kernel/src/kernels/moe/cpu_kernel.cc    | 83 +++++++++++++++++++
 src/04kernel/src/kernels/moe/cpu_kernel.hh    | 35 ++++++++
 src/04kernel/test/kernels/moe/test_cpu.cpp    | 75 +++++++++++++++++
 .../include/computation/operators/moe.h       | 37 +++++++++
 src/05computation/src/operators/moe.cc        | 34 ++++++++
 src/07onnx/src/operators/topk.cc              |  2 +-
 src/08-02moe/CMakeLists.txt                   | 15 ++++
 src/08-02moe/include/operators.h              | 10 +++
 src/08-02moe/src/operators.cpp                | 16 ++++
 src/08-02moe/src/operators/moe.cc             | 78 +++++++++++++++++
 src/08-02moe/src/operators/moe.hh             | 38 +++++++++
 src/08-02moe/test/test_moe.cpp                | 25 ++++++
 17 files changed, 565 insertions(+), 1 deletion(-)
 create mode 100644 src/04kernel/include/kernel/attributes/moe_info.h
 create mode 100644 src/04kernel/include/kernel/collectors/moe.h
 create mode 100644 src/04kernel/src/attributes/moe_info.cc
 create mode 100644 src/04kernel/src/collectors/moe.cc
 create mode 100644 src/04kernel/src/kernels/moe/cpu_kernel.cc
 create mode 100644 src/04kernel/src/kernels/moe/cpu_kernel.hh
 create mode 100644 src/04kernel/test/kernels/moe/test_cpu.cpp
 create mode 100644 src/05computation/include/computation/operators/moe.h
 create mode 100644 src/05computation/src/operators/moe.cc
 create mode 100644 src/08-02moe/CMakeLists.txt
 create mode 100644 src/08-02moe/include/operators.h
 create mode 100644 src/08-02moe/src/operators.cpp
 create mode 100644 src/08-02moe/src/operators/moe.cc
 create mode 100644 src/08-02moe/src/operators/moe.hh
 create mode 100644 src/08-02moe/test/test_moe.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49ddcda6..a0e853b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,4 +106,5 @@ add_subdirectory(src/06frontend)
 add_subdirectory(src/07onnx)
 add_subdirectory(src/08communication)
 add_subdirectory(src/08-01llm)
+add_subdirectory(src/08-02moe)
 add_subdirectory(src/09python_ffi)
diff --git a/src/04kernel/include/kernel/attributes/moe_info.h b/src/04kernel/include/kernel/attributes/moe_info.h
new file mode 100644
index 00000000..79eaa7b2
--- /dev/null
+++ b/src/04kernel/include/kernel/attributes/moe_info.h
@@ -0,0 +1,24 @@
+#ifndef KERNEL_MOE_INFO_H
+#define KERNEL_MOE_INFO_H
+
+#include "../tensor.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosInfo {
+        uint32_t top, expert_num;
+        uint32_t elementSize;
+        
+        AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate);        
+    };
+
+    struct ReorderInfo{
+        bool scatter;  
+        uint32_t top;
+        uint32_t blockNum, blockSize;
+        ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs);
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_INFO_H
diff --git a/src/04kernel/include/kernel/collectors/moe.h b/src/04kernel/include/kernel/collectors/moe.h
new file mode 100644
index 00000000..457de919
--- /dev/null
+++ b/src/04kernel/include/kernel/collectors/moe.h
@@ -0,0 +1,29 @@
+#ifndef KERNEL_MOE_H
+#define KERNEL_MOE_H
+
+#include "../collector.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosCollector final : public InfoCollector {
+        uint32_t topk,numExperts;
+        constexpr AssignPosCollector(decltype(_target) target, uint32_t topk, uint32_t numExperts) noexcept
+            : InfoCollector(target) ,topk(topk), numExperts(numExperts){}
+
+        std::vector<KernelBox>
+        filter(TensorRefs inputs, TensorRefs outputs) const final;
+    };
+
+    struct ReorderCollector final : public InfoCollector {
+        bool scatter;
+        uint32_t topk;
+        constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept
+            : InfoCollector(target) ,scatter(scatter), topk(topk){}
+
+        std::vector<KernelBox>
+        filter(TensorRefs inputs, TensorRefs outputs) const final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_H
diff --git a/src/04kernel/src/attributes/moe_info.cc b/src/04kernel/src/attributes/moe_info.cc
new file mode 100644
index 00000000..f5b0b677
--- /dev/null
+++ b/src/04kernel/src/attributes/moe_info.cc
@@ -0,0 +1,13 @@
+#include "kernel/attributes/moe_info.h"
+#include <numeric>
+
+namespace refactor::kernel {
+
+AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate):\
+    top(top), expert_num(expert_num),elementSize(gate.elementsSize()){}      
+
+ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs):\
+    scatter(scatter), top(top),blockNum(inputs[1].get().elementsSize()), blockSize(inputs[0].get().strides()[0]){}  
+
+
+}
diff --git a/src/04kernel/src/collectors/moe.cc b/src/04kernel/src/collectors/moe.cc
new file mode 100644
index 00000000..4906499e
--- /dev/null
+++ b/src/04kernel/src/collectors/moe.cc
@@ -0,0 +1,51 @@
+#include "kernel/collectors/moe.h"
+#include "../kernels/moe/cpu_kernel.hh"
+#include "kernel/attributes/moe_info.h"
+
+namespace refactor::kernel {
+
+    std::vector<KernelBox>
+    AssignPosCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
+        AssignPosInfo info(topk, numExperts, inputs[0]);
+        std::vector<KernelBox> ans;
+        switch (_target) {
+            case decltype(_target)::Cpu:
+                if (auto ptr = AssignPosCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            //todo ：暂时用cpu的实现
+            case decltype(_target)::Nvidia:
+                if (auto ptr = AssignPosCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            default:
+                UNREACHABLEX(void, "Unknown target");
+        }
+        return ans;
+    }
+
+    std::vector<KernelBox>
+    ReorderCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
+        ReorderInfo info(scatter, topk, inputs);
+        std::vector<KernelBox> ans;
+        switch (_target) {
+            case decltype(_target)::Cpu:
+                if (auto ptr = ReorderCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            //todo ：暂时用cpu的实现
+            case decltype(_target)::Nvidia:
+                if (auto ptr = ReorderCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            default:
+                UNREACHABLEX(void, "Unknown target");
+        }
+        return ans;
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.cc b/src/04kernel/src/kernels/moe/cpu_kernel.cc
new file mode 100644
index 00000000..215e8765
--- /dev/null
+++ b/src/04kernel/src/kernels/moe/cpu_kernel.cc
@@ -0,0 +1,83 @@
+#include "cpu_kernel.hh"
+#include <execution>
+#include <list>
+
+namespace refactor::kernel {
+
+    AssignPosCpu::AssignPosCpu(AssignPosInfo info) noexcept
+        : Kernel(), info(std::move(info)) {}
+
+    auto AssignPosCpu::build(AssignPosInfo info) noexcept -> KernelBox {
+        return std::make_unique<AssignPosCpu>(std::move(info));
+    }
+    auto AssignPosCpu::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto AssignPosCpu::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto AssignPosCpu::description() const noexcept -> std::string_view {
+        return "Performing AssignPos operation on generic cpu";
+    }
+
+    auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace {
+        using namespace runtime;
+        return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto gate = reinterpret_cast<uint8_t const *>(inputs[0]);
+             
+            auto expert_cnt = reinterpret_cast<uint8_t*>(outputs[0]);//T
+            auto pos = reinterpret_cast<uint8_t*>(outputs[1]);
+            std::memset(expert_cnt, 0, info.expert_num);
+            for (size_t i = 0; i < info.elementSize; i ++){
+                ASSERT (gate[i] >= 0 && gate[i] < info.expert_num, "gate exceeds expert idx scope!");
+                expert_cnt[gate[i]] ++;
+            }
+            std::vector<uint8_t> expert_accumlate;
+            expert_accumlate.assign(info.expert_num, 0);
+            for (size_t i=0; i<expert_accumlate.size(); ++i){
+                expert_accumlate[i] = (i==0) ? expert_cnt[i] : (expert_accumlate[i-1] + expert_cnt[i]);
+            }
+
+            for (size_t i=0; i< info.elementSize; ++i){
+                pos[--expert_accumlate[gate[i]]] = i;
+            }          
+        };
+    }
+
+
+    ReorderCpu::ReorderCpu(ReorderInfo info) noexcept
+        : Kernel(), info(std::move(info)) {}
+
+    auto ReorderCpu::build(ReorderInfo info) noexcept -> KernelBox {
+        return std::make_unique<ReorderCpu>(std::move(info));
+    }
+    auto ReorderCpu::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto ReorderCpu::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto ReorderCpu::description() const noexcept -> std::string_view {
+        return "Performing scatter operation on generic cpu";
+    }
+
+    auto ReorderCpu::lower(Resources &) const noexcept -> RoutineWorkspace {
+        using namespace runtime;
+        return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto input = reinterpret_cast<float const *>(inputs[0]);
+            auto pos = reinterpret_cast<uint32_t const *>(inputs[1]);
+            auto dstVal = reinterpret_cast<float*>(outputs[0]);//T
+                             
+            for(size_t i = 0; i<info.blockNum; i++){
+                if (info.scatter)
+                    std::copy_n(input + (pos[i]/info.top) * info.blockSize, info.blockSize, dstVal + i*info.blockSize);
+                else 
+                    std::copy_n(input + i*info.blockSize, info.blockSize, dstVal + pos[i] * info.blockSize);
+            }            
+        };
+    }
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.hh b/src/04kernel/src/kernels/moe/cpu_kernel.hh
new file mode 100644
index 00000000..2401f079
--- /dev/null
+++ b/src/04kernel/src/kernels/moe/cpu_kernel.hh
@@ -0,0 +1,35 @@
+#ifndef KERNEL_MOE_CPU_KERNEL_HH
+#define KERNEL_MOE_CPU_KERNEL_HH
+
+#include "kernel/attributes/moe_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosCpu final : public Kernel {
+        AssignPosInfo info;
+        explicit AssignPosCpu(AssignPosInfo info) noexcept;
+
+        static KernelBox build(AssignPosInfo info) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final;
+    };
+
+    struct ReorderCpu final : public Kernel {
+        ReorderInfo info;
+        explicit ReorderCpu(ReorderInfo info) noexcept;
+
+        static KernelBox build(ReorderInfo info) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_SPLIT_CPU_KERNEL_HH
diff --git a/src/04kernel/test/kernels/moe/test_cpu.cpp b/src/04kernel/test/kernels/moe/test_cpu.cpp
new file mode 100644
index 00000000..2574d8b7
--- /dev/null
+++ b/src/04kernel/test/kernels/moe/test_cpu.cpp
@@ -0,0 +1,75 @@
+#include "../../../src/kernels/moe/cpu_kernel.hh"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+
+TEST(kernel, AssignPosCpu) {
+    // build routine    
+    //auto inputTensor = Tensor::share(DataType::F32, Shape{4, 1024});
+    auto gate = Tensor::share(DataType::U32, Shape{8, 2});
+    auto expert_cnt = Tensor::share(DataType::U32, Shape{4});
+    auto pos = Tensor::share(DataType::U32, Shape{16});
+
+    auto kernel = AssignPosCpu::build(AssignPosInfo(2,4, *gate));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<uint8_t> ins = {3,2, 0,1, 2,1, 1,3, 2,0, 1,3, 1,0, 1,2};
+    std::vector<uint8_t>  out0(expert_cnt->elementsSize());
+    std::vector<uint8_t> out1(pos->elementsSize());
+
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    
+  
+    // check
+    std::vector<uint32_t> expectExpertCnt = {3,6,4,3};
+    std::vector<uint32_t> expectPos = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0};
+    //std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    for(size_t i=0;i< expectPos.size(); ++i){
+        EXPECT_EQ(expectPos[i], out1[i]);
+    }
+    for(size_t i=0;i< expectExpertCnt.size(); ++i){
+        EXPECT_EQ(expectExpertCnt[i], out0[i]);
+    }
+}
+
+TEST(kernel, ReorderScatterCpu) {
+    // build routine    
+    const int seq = 8, hid = 4, top = 2;
+    auto input = Tensor::share(DataType::U32, Shape{seq, hid});
+    auto pos = Tensor::share(DataType::U32, Shape{seq * top});
+    std::vector<Arc<Tensor>> inputTensors{input, pos};
+    TensorRefs inputs_;
+    inputs_.reserve(inputTensors.size());
+    std::transform(inputTensors.begin(), inputTensors.end(),
+                   std::back_inserter(inputs_),
+                   [](auto const &it) { return std::cref(*it); });
+
+    auto kernel = ReorderCpu::build(ReorderInfo(true, top, inputs_));
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float>  ins0(input->elementsSize());
+    std::iota(ins0.begin(), ins0.end(), 0);
+    std::vector<uint32_t> ins1 = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0};
+    std::vector<float>  out(input->elementsSize() * top);
+
+    // inference
+    void const *inputs[]{ins0.data(), ins1.data()};
+    void *outputs[]{out.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out.begin(), out.end(),[](const float &val){std::cout<<val<<" ";});
+    // check    
+    for(size_t i=0;i< seq; ++i){
+        int row = ins1[i]/top;
+        for(size_t j = 0; j<hid; j++)
+            EXPECT_EQ(ins0[row *hid + j], out[i*hid + j]);
+    }
+}
diff --git a/src/05computation/include/computation/operators/moe.h b/src/05computation/include/computation/operators/moe.h
new file mode 100644
index 00000000..9c062122
--- /dev/null
+++ b/src/05computation/include/computation/operators/moe.h
@@ -0,0 +1,37 @@
+#ifndef COMPUTATION_MOE_H
+#define COMPUTATION_MOE_H
+
+#include "../operator.h"
+
+namespace refactor::computation {
+
+    struct AssignPos final : public Operator {
+        uint32_t topk,numExperts;
+
+        constexpr explicit AssignPos(uint32_t topk, uint32_t numExperts) noexcept : Operator(), 
+            topk(topk), numExperts(numExperts){}
+
+        static size_t typeId() noexcept;
+        size_t opTypeId() const noexcept final;
+        std::string_view name() const noexcept final;
+        kernel::CollectorBox candidateKernels(Target) const final;
+        std::string serialize() const noexcept final;
+    };
+
+     struct Reorder final : public Operator {
+        bool scatter;
+        uint32_t topk;
+
+        constexpr explicit Reorder(bool scatter, uint32_t topk) noexcept : Operator(), 
+            scatter(scatter), topk(topk){}
+
+        static size_t typeId() noexcept;
+        size_t opTypeId() const noexcept final;
+        std::string_view name() const noexcept final;
+        kernel::CollectorBox candidateKernels(Target) const final;
+        std::string serialize() const noexcept final;
+    };
+
+}// namespace refactor::computation
+
+#endif// COMPUTATION_RMS_NORMALIZATION_H
diff --git a/src/05computation/src/operators/moe.cc b/src/05computation/src/operators/moe.cc
new file mode 100644
index 00000000..a62d4abf
--- /dev/null
+++ b/src/05computation/src/operators/moe.cc
@@ -0,0 +1,34 @@
+#include "computation/operators/moe.h"
+#include "kernel/collectors/moe.h"
+
+namespace refactor::computation {
+
+    auto AssignPos::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+    auto AssignPos::opTypeId() const noexcept -> size_t { return typeId(); }
+    auto AssignPos::name() const noexcept -> std::string_view { return "moe::AssignPos"; }
+    auto AssignPos::candidateKernels(Target target) const -> kernel::CollectorBox {
+        using Collector_ = kernel::AssignPosCollector;
+        return std::make_unique<Collector_>(target, topk, numExperts);
+    }
+    auto AssignPos::serialize() const noexcept -> std::string {
+        return "moe::AssignPos()";
+    }
+
+    auto Reorder::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+    auto Reorder::opTypeId() const noexcept -> size_t { return typeId(); }
+    auto Reorder::name() const noexcept -> std::string_view { return "moe::Reorder"; }
+    auto Reorder::candidateKernels(Target target) const -> kernel::CollectorBox {
+        using Collector_ = kernel::ReorderCollector;
+        return std::make_unique<Collector_>(target, scatter, topk);
+    }
+    auto Reorder::serialize() const noexcept -> std::string {
+        return "moe::Reorder()";
+    }
+
+}// namespace refactor::computation
diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc
index 8b3b31c2..1745f37e 100644
--- a/src/07onnx/src/operators/topk.cc
+++ b/src/07onnx/src/operators/topk.cc
@@ -6,7 +6,7 @@
 namespace refactor::onnx {
     using Op = TopK;
 
-    Op::TopK(Int topk, Int axis):topk(topk), axis(axis){}
+    Op::TopK(Int topk, Int axis):Operator(), topk(topk), axis(axis){}
 
     auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> OpBox {
         auto axis = attributes["axis"].int_();
diff --git a/src/08-02moe/CMakeLists.txt b/src/08-02moe/CMakeLists.txt
new file mode 100644
index 00000000..25b882cc
--- /dev/null
+++ b/src/08-02moe/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+project(moe VERSION 0.0.0 LANGUAGES CXX)
+message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})
+
+file(GLOB_RECURSE MOE_SRC src/*.cc src/*.cpp)
+add_library(moe STATIC ${MOE_SRC})
+target_link_libraries(moe PUBLIC frontend)
+target_include_directories(moe PUBLIC include)
+
+file(GLOB_RECURSE MOE_TEST test/*.cpp)
+if(MOE_TEST)
+    add_executable(moe_test ${MOE_TEST})
+    add_test(moe_test moe_test)
+    target_link_libraries(moe_test moe GTest::gtest_main Backward::Object)
+endif()
diff --git a/src/08-02moe/include/operators.h b/src/08-02moe/include/operators.h
new file mode 100644
index 00000000..b4221025
--- /dev/null
+++ b/src/08-02moe/include/operators.h
@@ -0,0 +1,10 @@
+#ifndef MOE_OPERATORS_H
+#define MOE_OPERATORS_H
+
+namespace refactor::moe {
+
+    void register_();
+
+}// namespace refactor::moe
+
+#endif// MOE_OPERATORS_H
diff --git a/src/08-02moe/src/operators.cpp b/src/08-02moe/src/operators.cpp
new file mode 100644
index 00000000..887c70ee
--- /dev/null
+++ b/src/08-02moe/src/operators.cpp
@@ -0,0 +1,16 @@
+#include "operators.h"
+#include "operators/moe.hh"
+
+namespace refactor::moe {
+    using namespace frontend;
+
+    void register_() {
+#define REGISTER(NAME, CLASS) Operator::register_<CLASS>("moe::" #NAME)
+        // clang-format off
+        REGISTER(AssignPos       , AssignPos       );
+        REGISTER(Reorder         , Reorder        );
+        // clang-format on
+#undef REGISTER
+    }
+
+}// namespace refactor::moe
diff --git a/src/08-02moe/src/operators/moe.cc b/src/08-02moe/src/operators/moe.cc
new file mode 100644
index 00000000..be971795
--- /dev/null
+++ b/src/08-02moe/src/operators/moe.cc
@@ -0,0 +1,78 @@
+#include "moe.hh"
+#include "common.h"
+#include "computation/operators/moe.h"
+
+namespace refactor::moe {
+
+    AssignPos::AssignPos(uint32_t topk, uint32_t numExperts) : Operator() ,topk(topk), numExperts(numExperts){}
+
+    auto AssignPos::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { 
+        auto topk = attributes["topk"].int_();
+        auto num_experts = attributes["num_experts"].int_();
+        return OpBox(std::make_unique<AssignPos>(topk, num_experts));
+    }
+    auto AssignPos::typeId() -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto AssignPos::opTypeId() const -> size_t { return typeId(); }
+    auto AssignPos::opTypeName() const -> std::string_view { return "moe::AssignPos"; }
+
+    auto AssignPos::infer(TensorRefs inputs, InferOptions const &) const -> InferResult {
+        EXPECT_SIZE(1)
+
+        auto const &gate = inputs[0];
+        
+        if (gate.dataType != DataType::I16) {
+            return Err(InferError(ERROR_MSG("Input data type not support")));
+        }
+        
+        return Ok(Tensors{Tensor::share(gate.dataType, Shape{DimExpr(numExperts)}, extractDependency(inputs)),
+            Tensor::share(gate.dataType, gate.shape, extractDependency(inputs))});
+    }
+
+    auto AssignPos::lower(TensorRefs) const -> computation::OpBox {
+        using Op_ = computation::AssignPos;
+        return std::make_unique<Op_>(topk, numExperts);
+    }
+
+    Reorder::Reorder(bool scatter, uint32_t topk, uint32_t dim) : Operator() ,scatter(scatter), top(topk), dim(dim){}
+
+    auto Reorder::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { 
+        auto topk = attributes["topk"].int_();
+        bool scatter = attributes["scatter"].int_() != 0 ;
+        bool dim = attributes["dim"].int_();
+        return OpBox(std::make_unique<Reorder>(scatter, topk, dim));
+    }
+    auto Reorder::typeId() -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto Reorder::opTypeId() const -> size_t { return typeId(); }
+    auto Reorder::opTypeName() const -> std::string_view { return "moe::Reorder"; }
+
+    auto Reorder::infer(TensorRefs inputs, InferOptions const &) const -> InferResult {
+        EXPECT_SIZE(2)
+        auto const &input = inputs[0];
+        auto const &pos = inputs[1];
+        if (dim != 0)
+            return Err(InferError(ERROR_MSG("dim is not right!")));
+        if(scatter && input.elementsSize() * top != pos.elementsSize())
+            return Err(InferError(ERROR_MSG("Inputs data size are not right!")));
+        else if(!scatter && input.elementsSize()  != pos.elementsSize())
+            return Err(InferError(ERROR_MSG("Inputs data size are not right!")));
+
+        if (pos.dataType != DataType::I16) {
+            return Err(InferError(ERROR_MSG("Input data type not support")));
+        }
+        
+        return Ok(Tensors{Tensor::share(input.dataType, pos.shape, extractDependency(inputs))});
+    }
+
+    auto Reorder::lower(TensorRefs) const -> computation::OpBox {
+        using Op_ = computation::Reorder;
+        return std::make_unique<Op_>(scatter, top);
+    }
+}// namespace refactor::llm
diff --git a/src/08-02moe/src/operators/moe.hh b/src/08-02moe/src/operators/moe.hh
new file mode 100644
index 00000000..96f82709
--- /dev/null
+++ b/src/08-02moe/src/operators/moe.hh
@@ -0,0 +1,38 @@
+#ifndef MOE_HH
+#define MOE_HH
+
+#include "frontend/operator.h"
+
+namespace refactor::moe {
+    using namespace frontend;
+
+    struct AssignPos final : public Operator {
+        uint32_t topk, numExperts;
+        explicit AssignPos(uint32_t topk, uint32_t numExperts);
+
+        static OpBox build(ModelContext const &, std::string_view, Attributes);
+        static size_t typeId();
+
+        size_t opTypeId() const final;
+        std::string_view opTypeName() const final;
+        InferResult infer(TensorRefs, InferOptions const &) const final;
+        computation::OpBox lower(TensorRefs) const final;
+    };
+
+    struct Reorder final : public Operator {
+        bool scatter;  
+        uint32_t top, dim;
+        explicit Reorder(bool scatter, uint32_t topk, uint32_t dim);
+
+        static OpBox build(ModelContext const &, std::string_view, Attributes);
+        static size_t typeId();
+
+        size_t opTypeId() const final;
+        std::string_view opTypeName() const final;
+        InferResult infer(TensorRefs, InferOptions const &) const final;
+        computation::OpBox lower(TensorRefs) const final;
+    };
+
+}// namespace refactor::llm
+
+#endif// LLM_RMS_ATTENTION_HH
diff --git a/src/08-02moe/test/test_moe.cpp b/src/08-02moe/test/test_moe.cpp
new file mode 100644
index 00000000..d4946573
--- /dev/null
+++ b/src/08-02moe/test/test_moe.cpp
@@ -0,0 +1,25 @@
+#include "../src/operators/moe.hh"
+#include "operators.h"
+#include <gtest/gtest.h>
+
+using namespace refactor;
+using namespace moe;
+
+TEST(infer, AssignPos) {
+    moe::register_();
+    auto edges = Edges{
+        
+        {Tensor::share(DataType::I16, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2
+    };
+    count_t inputs[]{0};
+    auto infered = AssignPos(2,4).infer(TensorRefs(edges, inputs), {true});
+    ASSERT_TRUE(infered.isOk());
+    auto outputs = std::move(infered.unwrap());
+    ASSERT_EQ(outputs.size(), 2);
+    auto expert_cnt = std::move(outputs[0]);
+    ASSERT_EQ(expert_cnt->dataType, DataType::F32);
+    ASSERT_EQ(expert_cnt->shape, (Shape{DimExpr(4)}));
+    auto pos = std::move(outputs[1]);
+    ASSERT_EQ(pos->dataType, DataType::I16);
+    ASSERT_EQ(pos->shape, (Shape{DimExpr(16)}));
+}

From 2b698b3ce1fa96bd7735dddfdf864ba87c3a9553 Mon Sep 17 00:00:00 2001
From: wangw <271502003@qq.com>
Date: Mon, 1 Apr 2024 15:48:13 +0800
Subject: [PATCH 3/4] fix datatype

---
 .../include/kernel/attributes/moe_info.h      | 12 ++++++------
 .../include/kernel/attributes/topk_info.h     |  6 +++---
 src/04kernel/include/kernel/collectors/moe.h  |  4 ++--
 src/04kernel/include/kernel/collectors/topk.h |  4 ++--
 src/04kernel/src/attributes/moe_info.cc       |  4 ++--
 src/04kernel/src/attributes/topk_info.cc      |  2 +-
 src/04kernel/src/kernels/moe/cpu_kernel.cc    | 10 +++++-----
 src/04kernel/src/kernels/topk/cpu_kernel.cc   |  6 +++---
 src/04kernel/test/kernels/topk/test_cpu.cpp   |  4 ++--
 .../include/computation/operators/moe.h       |  8 ++++----
 .../include/computation/operators/topk.h      |  4 ++--
 src/07onnx/src/operators/topk.cc              |  2 +-
 src/08-02moe/include/{ => moe}/operators.h    |  0
 src/08-02moe/src/operators.cpp                |  2 +-
 src/08-02moe/src/operators/moe.cc             | 19 ++++++++++++-------
 src/08-02moe/src/operators/moe.hh             |  8 ++++----
 src/08-02moe/test/test_moe.cpp                |  6 +++---
 src/09python_ffi/CMakeLists.txt               |  2 +-
 src/09python_ffi/src/main.cpp                 |  2 ++
 19 files changed, 56 insertions(+), 49 deletions(-)
 rename src/08-02moe/include/{ => moe}/operators.h (100%)

diff --git a/src/04kernel/include/kernel/attributes/moe_info.h b/src/04kernel/include/kernel/attributes/moe_info.h
index 79eaa7b2..3e46b505 100644
--- a/src/04kernel/include/kernel/attributes/moe_info.h
+++ b/src/04kernel/include/kernel/attributes/moe_info.h
@@ -6,17 +6,17 @@
 namespace refactor::kernel {
 
     struct AssignPosInfo {
-        uint32_t top, expert_num;
-        uint32_t elementSize;
+        int64_t top, expert_num;
+        int64_t elementSize;
         
-        AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate);        
+        AssignPosInfo(int64_t top, int64_t expert_num, Tensor const &gate);        
     };
 
     struct ReorderInfo{
         bool scatter;  
-        uint32_t top;
-        uint32_t blockNum, blockSize;
-        ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs);
+        int64_t top;
+        int64_t blockNum, blockSize;
+        ReorderInfo(bool scatter, int64_t top, TensorRefs inputs);
     };
 
 }// namespace refactor::kernel
diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h
index a9210078..491810d1 100644
--- a/src/04kernel/include/kernel/attributes/topk_info.h
+++ b/src/04kernel/include/kernel/attributes/topk_info.h
@@ -7,12 +7,12 @@ namespace refactor::kernel {
 
     struct TopKInfo {
         
-        uint8_t topk;
-        uint8_t axis;
+        int64_t topk;
+        int64_t axis;
         size_t in_stride, in_stride_pre_axis, out_stride_pre_axis;
         size_t elem_size, axis_elem_size;
         
-        TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input);
+        TopKInfo(int64_t topk, int64_t axis, Tensor const &input);
         size_t getElementSize() const {return  elem_size;}
         size_t getAxisElementSize()const { return axis_elem_size;}
         size_t getInStride()const{return in_stride;}
diff --git a/src/04kernel/include/kernel/collectors/moe.h b/src/04kernel/include/kernel/collectors/moe.h
index 457de919..258450dc 100644
--- a/src/04kernel/include/kernel/collectors/moe.h
+++ b/src/04kernel/include/kernel/collectors/moe.h
@@ -16,8 +16,8 @@ namespace refactor::kernel {
 
     struct ReorderCollector final : public InfoCollector {
         bool scatter;
-        uint32_t topk;
-        constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept
+        int64_t topk;
+        constexpr ReorderCollector(decltype(_target) target, bool scatter, int64_t topk) noexcept
             : InfoCollector(target) ,scatter(scatter), topk(topk){}
 
         std::vector<KernelBox>
diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h
index c4d8490f..3e0dc288 100644
--- a/src/04kernel/include/kernel/collectors/topk.h
+++ b/src/04kernel/include/kernel/collectors/topk.h
@@ -6,9 +6,9 @@
 namespace refactor::kernel {
 
     struct TopKCollector final : public InfoCollector {
-        uint32_t topk, axis;
+        int64_t topk, axis;
 
-        constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept
+        constexpr TopKCollector(decltype(_target) target, int64_t topk, int64_t axis_) noexcept
             : InfoCollector(target), topk(topk), axis(axis_) {}
 
         std::vector<KernelBox>
diff --git a/src/04kernel/src/attributes/moe_info.cc b/src/04kernel/src/attributes/moe_info.cc
index f5b0b677..829c54ae 100644
--- a/src/04kernel/src/attributes/moe_info.cc
+++ b/src/04kernel/src/attributes/moe_info.cc
@@ -3,10 +3,10 @@
 
 namespace refactor::kernel {
 
-AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate):\
+AssignPosInfo::AssignPosInfo(int64_t top, int64_t expert_num, Tensor const &gate):\
     top(top), expert_num(expert_num),elementSize(gate.elementsSize()){}      
 
-ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs):\
+ReorderInfo::ReorderInfo(bool scatter, int64_t top, TensorRefs inputs):\
     scatter(scatter), top(top),blockNum(inputs[1].get().elementsSize()), blockSize(inputs[0].get().strides()[0]){}  
 
 
diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc
index 12ab16bb..52032db9 100644
--- a/src/04kernel/src/attributes/topk_info.cc
+++ b/src/04kernel/src/attributes/topk_info.cc
@@ -3,7 +3,7 @@
 
 namespace refactor::kernel {
 
-TopKInfo::TopKInfo(uint8_t topk, uint8_t axis, Tensor const &input):topk(topk), 
+TopKInfo::TopKInfo(int64_t topk, int64_t axis, Tensor const &input):topk(topk), 
             axis(axis),
             in_stride(input.strides()[axis]),
             in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]),
diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.cc b/src/04kernel/src/kernels/moe/cpu_kernel.cc
index 215e8765..ef4f77d9 100644
--- a/src/04kernel/src/kernels/moe/cpu_kernel.cc
+++ b/src/04kernel/src/kernels/moe/cpu_kernel.cc
@@ -25,16 +25,16 @@ namespace refactor::kernel {
     auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace {
         using namespace runtime;
         return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
-            auto gate = reinterpret_cast<uint8_t const *>(inputs[0]);
+            auto gate = reinterpret_cast<int64_t const *>(inputs[0]);
              
-            auto expert_cnt = reinterpret_cast<uint8_t*>(outputs[0]);//T
-            auto pos = reinterpret_cast<uint8_t*>(outputs[1]);
+            auto expert_cnt = reinterpret_cast<int64_t*>(outputs[0]);//T
+            auto pos = reinterpret_cast<int64_t*>(outputs[1]);
             std::memset(expert_cnt, 0, info.expert_num);
             for (size_t i = 0; i < info.elementSize; i ++){
                 ASSERT (gate[i] >= 0 && gate[i] < info.expert_num, "gate exceeds expert idx scope!");
                 expert_cnt[gate[i]] ++;
             }
-            std::vector<uint8_t> expert_accumlate;
+            std::vector<int64_t> expert_accumlate;
             expert_accumlate.assign(info.expert_num, 0);
             for (size_t i=0; i<expert_accumlate.size(); ++i){
                 expert_accumlate[i] = (i==0) ? expert_cnt[i] : (expert_accumlate[i-1] + expert_cnt[i]);
@@ -69,7 +69,7 @@ namespace refactor::kernel {
         using namespace runtime;
         return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
             auto input = reinterpret_cast<float const *>(inputs[0]);
-            auto pos = reinterpret_cast<uint32_t const *>(inputs[1]);
+            auto pos = reinterpret_cast<int64_t const *>(inputs[1]);
             auto dstVal = reinterpret_cast<float*>(outputs[0]);//T
                              
             for(size_t i = 0; i<info.blockNum; i++){
diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.cc b/src/04kernel/src/kernels/topk/cpu_kernel.cc
index 7330e44d..06e1683a 100644
--- a/src/04kernel/src/kernels/topk/cpu_kernel.cc
+++ b/src/04kernel/src/kernels/topk/cpu_kernel.cc
@@ -29,7 +29,7 @@ namespace refactor::kernel {
             auto src = reinterpret_cast<float const *>(inputs[0]);
             
             auto dstVal = reinterpret_cast<float*>(outputs[0]);//T
-            auto dstIndex = reinterpret_cast<uint32_t*>(outputs[1]);
+            auto dstIndex = reinterpret_cast<int64_t*>(outputs[1]);
                        
             
             size_t M = info.getElementSize() / info.getAxisElementSize();
@@ -40,7 +40,7 @@ namespace refactor::kernel {
             auto outStride2 = inStride2;
 
             for(size_t m = 0; m < M; m ++){
-                using PairType = std::pair<float, uint8_t>;
+                using PairType = std::pair<float, int64_t>;
                 std::list<PairType> list;
                 for(size_t n = 0; n < N; n++){                    
                     auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2;                    
@@ -49,7 +49,7 @@ namespace refactor::kernel {
                 list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
                 
                 size_t offset = m /inStride2 * outStride1 + m % inStride2;
-                std::for_each_n(list.begin(), (uint32_t)info.topk,
+                std::for_each_n(list.begin(), (int64_t)info.topk,
                             [&](auto &elem) {                                
                                 dstVal[offset] = elem.first;
                                 dstIndex[offset] = elem.second;
diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp
index b0dcaa80..cea4e066 100644
--- a/src/04kernel/test/kernels/topk/test_cpu.cpp
+++ b/src/04kernel/test/kernels/topk/test_cpu.cpp
@@ -9,7 +9,7 @@ TEST(kernel, TopKCpu) {
     // build routine    
     auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4});
     auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3});
-    auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3});
+    auto outputTensor1 = Tensor::share(DataType::I64, Shape{3, 3});
 
     auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor));
     ASSERT_TRUE(kernel);
@@ -28,7 +28,7 @@ TEST(kernel, TopKCpu) {
   
     // check
     std::vector<float> expectVal = {3,2,1,7,6,5,11,10,9};
-    std::vector<uint32_t> expectIdx = {3,2,1,3,2,1,3,2,1};
+    std::vector<int64_t> expectIdx = {3,2,1,3,2,1,3,2,1};
     std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
 
     for(size_t i=0;i< expectVal.size(); ++i){
diff --git a/src/05computation/include/computation/operators/moe.h b/src/05computation/include/computation/operators/moe.h
index 9c062122..775c9fd7 100644
--- a/src/05computation/include/computation/operators/moe.h
+++ b/src/05computation/include/computation/operators/moe.h
@@ -6,9 +6,9 @@
 namespace refactor::computation {
 
     struct AssignPos final : public Operator {
-        uint32_t topk,numExperts;
+        int64_t topk,numExperts;
 
-        constexpr explicit AssignPos(uint32_t topk, uint32_t numExperts) noexcept : Operator(), 
+        constexpr explicit AssignPos(int64_t topk, int64_t numExperts) noexcept : Operator(), 
             topk(topk), numExperts(numExperts){}
 
         static size_t typeId() noexcept;
@@ -20,9 +20,9 @@ namespace refactor::computation {
 
      struct Reorder final : public Operator {
         bool scatter;
-        uint32_t topk;
+        int64_t topk;
 
-        constexpr explicit Reorder(bool scatter, uint32_t topk) noexcept : Operator(), 
+        constexpr explicit Reorder(bool scatter, int64_t topk) noexcept : Operator(), 
             scatter(scatter), topk(topk){}
 
         static size_t typeId() noexcept;
diff --git a/src/05computation/include/computation/operators/topk.h b/src/05computation/include/computation/operators/topk.h
index bdce6d6f..8ecbdfed 100644
--- a/src/05computation/include/computation/operators/topk.h
+++ b/src/05computation/include/computation/operators/topk.h
@@ -6,8 +6,8 @@
 namespace refactor::computation {
 
     struct TopK final : public Operator {
-        uint32_t topk,axis;
-        constexpr TopK(uint32_t topk, uint32_t axis) noexcept : topk(topk), axis(axis){}
+        int64_t topk, axis;
+        constexpr TopK(int64_t topk, int64_t axis) noexcept : topk(topk), axis(axis){}
 
         static size_t typeId() noexcept;
         size_t opTypeId() const noexcept final;
diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc
index 1745f37e..98653472 100644
--- a/src/07onnx/src/operators/topk.cc
+++ b/src/07onnx/src/operators/topk.cc
@@ -40,7 +40,7 @@ namespace refactor::onnx {
         auto dependencies = extractDependency(inputs);
         ans[0] = Tensor::share(input.dataType, input.shape, dependencies);
         ans[0]->shape[axis_] = DimExpr(topk);
-        ans[1] = Tensor::share(input.dataType, input.shape, dependencies);            
+        ans[1] = Tensor::share(DataType::I64, input.shape, dependencies);            
         ans[1]->shape[axis_] = DimExpr(topk);  
         return Ok(Tensors{std::move(ans)});
     }
diff --git a/src/08-02moe/include/operators.h b/src/08-02moe/include/moe/operators.h
similarity index 100%
rename from src/08-02moe/include/operators.h
rename to src/08-02moe/include/moe/operators.h
diff --git a/src/08-02moe/src/operators.cpp b/src/08-02moe/src/operators.cpp
index 887c70ee..5db39632 100644
--- a/src/08-02moe/src/operators.cpp
+++ b/src/08-02moe/src/operators.cpp
@@ -1,4 +1,4 @@
-#include "operators.h"
+#include "moe/operators.h"
 #include "operators/moe.hh"
 
 namespace refactor::moe {
diff --git a/src/08-02moe/src/operators/moe.cc b/src/08-02moe/src/operators/moe.cc
index be971795..68bfeff5 100644
--- a/src/08-02moe/src/operators/moe.cc
+++ b/src/08-02moe/src/operators/moe.cc
@@ -4,7 +4,7 @@
 
 namespace refactor::moe {
 
-    AssignPos::AssignPos(uint32_t topk, uint32_t numExperts) : Operator() ,topk(topk), numExperts(numExperts){}
+    AssignPos::AssignPos(Int topk, Int numExperts) : Operator() ,topk(topk), numExperts(numExperts){}
 
     auto AssignPos::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { 
         auto topk = attributes["topk"].int_();
@@ -23,8 +23,10 @@ namespace refactor::moe {
         EXPECT_SIZE(1)
 
         auto const &gate = inputs[0];
-        
-        if (gate.dataType != DataType::I16) {
+        if(topk < 0 || numExperts < 0 || topk > numExperts){
+            return Err(InferError(ERROR_MSG("topk or numExperts is error")));
+        }
+        if (gate.dataType != DataType::I64) {
             return Err(InferError(ERROR_MSG("Input data type not support")));
         }
         
@@ -37,7 +39,7 @@ namespace refactor::moe {
         return std::make_unique<Op_>(topk, numExperts);
     }
 
-    Reorder::Reorder(bool scatter, uint32_t topk, uint32_t dim) : Operator() ,scatter(scatter), top(topk), dim(dim){}
+    Reorder::Reorder(bool scatter, Int topk, Int dim) : Operator() ,scatter(scatter), top(topk), dim(dim){}
 
     auto Reorder::build(ModelContext const &, std::string_view, Attributes attributes) -> OpBox { 
         auto topk = attributes["topk"].int_();
@@ -59,12 +61,15 @@ namespace refactor::moe {
         auto const &pos = inputs[1];
         if (dim != 0)
             return Err(InferError(ERROR_MSG("dim is not right!")));
-        if(scatter && input.elementsSize() * top != pos.elementsSize())
+        if(top < 0 ){
+            return Err(InferError(ERROR_MSG("topkis error")));
+        }
+        if(scatter && input.elementsSize()/input.shape[input.shape.size()-1].value() * top != pos.elementsSize())
             return Err(InferError(ERROR_MSG("Inputs data size are not right!")));
-        else if(!scatter && input.elementsSize()  != pos.elementsSize())
+        else if(!scatter &&  input.elementsSize()/input.shape[input.shape.size()-1].value()   != pos.elementsSize())
             return Err(InferError(ERROR_MSG("Inputs data size are not right!")));
 
-        if (pos.dataType != DataType::I16) {
+        if (pos.dataType != DataType::I64) {
             return Err(InferError(ERROR_MSG("Input data type not support")));
         }
         
diff --git a/src/08-02moe/src/operators/moe.hh b/src/08-02moe/src/operators/moe.hh
index 96f82709..42501bdb 100644
--- a/src/08-02moe/src/operators/moe.hh
+++ b/src/08-02moe/src/operators/moe.hh
@@ -7,8 +7,8 @@ namespace refactor::moe {
     using namespace frontend;
 
     struct AssignPos final : public Operator {
-        uint32_t topk, numExperts;
-        explicit AssignPos(uint32_t topk, uint32_t numExperts);
+        Int topk, numExperts;
+        explicit AssignPos(Int topk, Int numExperts);
 
         static OpBox build(ModelContext const &, std::string_view, Attributes);
         static size_t typeId();
@@ -21,8 +21,8 @@ namespace refactor::moe {
 
     struct Reorder final : public Operator {
         bool scatter;  
-        uint32_t top, dim;
-        explicit Reorder(bool scatter, uint32_t topk, uint32_t dim);
+        Int top, dim;
+        explicit Reorder(bool scatter, Int topk, Int dim);
 
         static OpBox build(ModelContext const &, std::string_view, Attributes);
         static size_t typeId();
diff --git a/src/08-02moe/test/test_moe.cpp b/src/08-02moe/test/test_moe.cpp
index d4946573..c735d803 100644
--- a/src/08-02moe/test/test_moe.cpp
+++ b/src/08-02moe/test/test_moe.cpp
@@ -1,5 +1,5 @@
 #include "../src/operators/moe.hh"
-#include "operators.h"
+#include "moe/operators.h"
 #include <gtest/gtest.h>
 
 using namespace refactor;
@@ -9,7 +9,7 @@ TEST(infer, AssignPos) {
     moe::register_();
     auto edges = Edges{
         
-        {Tensor::share(DataType::I16, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2
+        {Tensor::share(DataType::I64, Shape{DimExpr(8), DimExpr(2)}, {}), ""},//gate 8*2
     };
     count_t inputs[]{0};
     auto infered = AssignPos(2,4).infer(TensorRefs(edges, inputs), {true});
@@ -20,6 +20,6 @@ TEST(infer, AssignPos) {
     ASSERT_EQ(expert_cnt->dataType, DataType::F32);
     ASSERT_EQ(expert_cnt->shape, (Shape{DimExpr(4)}));
     auto pos = std::move(outputs[1]);
-    ASSERT_EQ(pos->dataType, DataType::I16);
+    ASSERT_EQ(pos->dataType, DataType::I64);
     ASSERT_EQ(pos->shape, (Shape{DimExpr(16)}));
 }
diff --git a/src/09python_ffi/CMakeLists.txt b/src/09python_ffi/CMakeLists.txt
index ccce34d3..50fc535a 100644
--- a/src/09python_ffi/CMakeLists.txt
+++ b/src/09python_ffi/CMakeLists.txt
@@ -7,7 +7,7 @@ add_subdirectory(pybind11)
 
 file(GLOB_RECURSE PYFFI_SRC src/*.cc src/*.cpp)
 pybind11_add_module(python_ffi SHARED ${PYFFI_SRC})
-target_link_libraries(python_ffi PRIVATE onnx llm communication)
+target_link_libraries(python_ffi PRIVATE onnx llm communication moe)
 target_include_directories(python_ffi PRIVATE include)
 
 # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
diff --git a/src/09python_ffi/src/main.cpp b/src/09python_ffi/src/main.cpp
index 48a4ea6f..d2d2c8ce 100644
--- a/src/09python_ffi/src/main.cpp
+++ b/src/09python_ffi/src/main.cpp
@@ -3,6 +3,7 @@
 #include "import.h"
 #include "llm/operators.h"
 #include "onnx/operators.h"
+#include "moe/operators.h"
 #include <pybind11/stl.h>// keep this line to convert stl types
 
 namespace py = pybind11;
@@ -17,6 +18,7 @@ namespace refactor::python_ffi {
         onnx::register_();
         llm::register_();
         communication::register_();
+        moe::register_();
 
         // clang-format off
 

From cf4e92c540aff02bc7f03d3b11d41d5ea345f7ae Mon Sep 17 00:00:00 2001
From: wangw <271502003@qq.com>
Date: Mon, 6 May 2024 10:23:43 +0800
Subject: [PATCH 4/4] add topk cuda kernel

---
 .../cuda/include/kernel/cuda/topk.cuh         |  19 ++++
 src/04kernel/cuda/src/topk.cu                 | 103 ++++++++++++++++++
 .../include/kernel/attributes/topk_info.h     |  21 ++--
 src/04kernel/include/kernel/collectors/topk.h |   4 +-
 src/04kernel/src/attributes/topk_info.cc      |  16 +--
 src/04kernel/src/kernels/topk/cpu_kernel.cc   |  30 ++---
 src/04kernel/src/kernels/topk/cuda_kernel.cc  |  57 ++++++++++
 src/04kernel/src/kernels/topk/cuda_kernel.hh  |  26 +++++
 src/04kernel/test/kernels/topk/test_cpu.cpp   |   4 +-
 src/04kernel/test/kernels/topk/test_cuda.cpp  |  68 ++++++++++++
 .../include/computation/operators/topk.h      |   4 +-
 src/07onnx/src/operators/topk.cc              |   2 +-
 12 files changed, 314 insertions(+), 40 deletions(-)
 create mode 100644 src/04kernel/cuda/include/kernel/cuda/topk.cuh
 create mode 100644 src/04kernel/cuda/src/topk.cu
 create mode 100644 src/04kernel/src/kernels/topk/cuda_kernel.cc
 create mode 100644 src/04kernel/src/kernels/topk/cuda_kernel.hh
 create mode 100644 src/04kernel/test/kernels/topk/test_cuda.cpp

diff --git a/src/04kernel/cuda/include/kernel/cuda/topk.cuh b/src/04kernel/cuda/include/kernel/cuda/topk.cuh
new file mode 100644
index 00000000..b06cfc00
--- /dev/null
+++ b/src/04kernel/cuda/include/kernel/cuda/topk.cuh
@@ -0,0 +1,19 @@
+#ifndef KERNEL_CUDA_TOPK_CUH
+#define KERNEL_CUDA_TOPK_CUH
+
+#include "threads_distributer.cuh"
+
+namespace refactor::kernel::cuda {
+
+    void launchTopK(
+        KernelLaunchParameters const &params,
+        float const *data,  float *dstVal, unsigned int  *dstIdx,       
+        unsigned int topk,
+        unsigned int stride_axis,
+        unsigned int stride_in_pre,
+        unsigned int stride_out_pre,
+        unsigned int size_axis);
+
+}// namespace refactor::kernel::cuda
+
+#endif// KERNEL_CUDA_TOPK_CUH
diff --git a/src/04kernel/cuda/src/topk.cu b/src/04kernel/cuda/src/topk.cu
new file mode 100644
index 00000000..6b247ead
--- /dev/null
+++ b/src/04kernel/cuda/src/topk.cu
@@ -0,0 +1,103 @@
+#include "kernel/cuda/topk.cuh"
+#include "macro.cuh"
+#include <cstdint>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+
+namespace refactor::kernel::cuda {
+
+using PairType = thrust::pair<float, uint32_t>;
+
+struct ComparePair {
+    __host__ __device__
+    bool operator()(const PairType& a, const PairType& b) const {
+        return a.first > b.first;
+    }
+};
+
+/*
+     __device__ 
+    void process_element(unsigned int n, float  *__restrict__ dstVal,
+        uint32_t  *__restrict__ dstIdx,
+        PairType *list,
+        uint32_t stride_axis,
+        uint32_t init_offset){
+        for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
+                step = blockDim.x * gridDim.x;
+            tid < n;
+            tid += step) {  
+            uint32_t offset = init_offset + stride_axis * tid;
+            dstVal[offset] = list[tid].first;
+            dstIdx[offset] = list[tid].second;
+         }
+    }
+*/
+
+
+
+    __global__ static void TopKKernel(
+        unsigned long long n,
+        float const *__restrict__ data,
+        float  *__restrict__ dstVal,
+        uint32_t  *__restrict__ dstIdx,
+        uint32_t topk,
+        uint32_t stride_axis,
+        uint32_t stride_in_pre,
+        uint32_t stride_out_pre,
+        unsigned int size) {
+        for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
+                  step = blockDim.x * gridDim.x;
+             tid < n;
+             tid += step) {  
+            PairType *list = new PairType[size];
+            
+            for(uint32_t i = 0; i < size; i++){                    
+                uint32_t srcIdx = tid /stride_axis * stride_in_pre + tid % stride_axis + i * stride_axis;                    
+                
+                list[i] = PairType(data[srcIdx], i);
+            }
+            // thrust没有partial_sort算法，可尝试优化：分成size/topk组，每组取一个最大值
+            thrust::sort(thrust::device, list, list + size, ComparePair());
+            
+            
+            uint32_t init_offset = tid /stride_axis * stride_out_pre + tid % stride_axis;
+            for (uint32_t i = 0; i < topk; i++)
+            {
+                uint32_t offset = init_offset + stride_axis * i;
+                dstVal[offset] = list[i].first;
+                dstIdx[offset] = list[i].second;
+            }
+            
+            delete[] list;
+        }
+    }
+
+
+
+    void launchTopK(
+        KernelLaunchParameters const &params,
+        float const *data,  float *dstVal, uint32_t *dstIdx,       
+        uint32_t topk,
+        uint32_t stride_axis,
+        uint32_t stride_in_pre,
+        uint32_t stride_out_pre,
+        unsigned int size_axis) {    
+
+            TopKKernel<<<
+                params.gridSize,
+                params.blockSize,
+                0,
+                reinterpret_cast<cudaStream_t>(params.stream)>>>(
+                params.n,
+                (data),
+                (dstVal),
+                (dstIdx),
+                topk,
+                stride_axis,
+                stride_in_pre,
+                stride_out_pre,
+                size_axis);
+        
+    }
+
+}// namespace refactor::kernel::cuda
diff --git a/src/04kernel/include/kernel/attributes/topk_info.h b/src/04kernel/include/kernel/attributes/topk_info.h
index 491810d1..5cfc5ee6 100644
--- a/src/04kernel/include/kernel/attributes/topk_info.h
+++ b/src/04kernel/include/kernel/attributes/topk_info.h
@@ -6,18 +6,17 @@
 namespace refactor::kernel {
 
     struct TopKInfo {
+        struct Stride{
+            dim_t axis,  in_pre, out_pre;
+         };
+         struct Size{
+            dim_t axis, except_axis;
+         };
+        uint32_t topk;
+        Stride stride;
+        Size size;
         
-        int64_t topk;
-        int64_t axis;
-        size_t in_stride, in_stride_pre_axis, out_stride_pre_axis;
-        size_t elem_size, axis_elem_size;
-        
-        TopKInfo(int64_t topk, int64_t axis, Tensor const &input);
-        size_t getElementSize() const {return  elem_size;}
-        size_t getAxisElementSize()const { return axis_elem_size;}
-        size_t getInStride()const{return in_stride;}
-        size_t getInStridePreAxis()const{return in_stride_pre_axis;}
-        size_t getOutStridePreAxis()const {return out_stride_pre_axis;}
+        TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input);     
     };
 
 }// namespace refactor::kernel
diff --git a/src/04kernel/include/kernel/collectors/topk.h b/src/04kernel/include/kernel/collectors/topk.h
index 3e0dc288..c4d8490f 100644
--- a/src/04kernel/include/kernel/collectors/topk.h
+++ b/src/04kernel/include/kernel/collectors/topk.h
@@ -6,9 +6,9 @@
 namespace refactor::kernel {
 
     struct TopKCollector final : public InfoCollector {
-        int64_t topk, axis;
+        uint32_t topk, axis;
 
-        constexpr TopKCollector(decltype(_target) target, int64_t topk, int64_t axis_) noexcept
+        constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept
             : InfoCollector(target), topk(topk), axis(axis_) {}
 
         std::vector<KernelBox>
diff --git a/src/04kernel/src/attributes/topk_info.cc b/src/04kernel/src/attributes/topk_info.cc
index 52032db9..532f385d 100644
--- a/src/04kernel/src/attributes/topk_info.cc
+++ b/src/04kernel/src/attributes/topk_info.cc
@@ -3,12 +3,14 @@
 
 namespace refactor::kernel {
 
-TopKInfo::TopKInfo(int64_t topk, int64_t axis, Tensor const &input):topk(topk), 
-            axis(axis),
-            in_stride(input.strides()[axis]),
-            in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]),
-            out_stride_pre_axis(in_stride_pre_axis/input.shape[axis]*topk),
-            elem_size(input.elementsSize()),
-            axis_elem_size(input.shape[axis]){}        
+TopKInfo::TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input){
+    this->topk =topk;
+    auto tmpStride =  axis == 0 ? 0 : input.strides()[axis - 1];
+    this->stride = {input.strides()[axis],\
+                tmpStride,\
+                tmpStride/input.shape[axis]*topk};
+    this->size = {input.shape[axis], \
+                input.elementsSize()/input.shape[axis]};
+}        
 
 }
diff --git a/src/04kernel/src/kernels/topk/cpu_kernel.cc b/src/04kernel/src/kernels/topk/cpu_kernel.cc
index 06e1683a..e695e3f7 100644
--- a/src/04kernel/src/kernels/topk/cpu_kernel.cc
+++ b/src/04kernel/src/kernels/topk/cpu_kernel.cc
@@ -1,6 +1,6 @@
 #include "cpu_kernel.hh"
 #include <execution>
-#include <list>
+#include <vector>
 
 namespace refactor::kernel {
     using K = TopKCpu;
@@ -29,31 +29,31 @@ namespace refactor::kernel {
             auto src = reinterpret_cast<float const *>(inputs[0]);
             
             auto dstVal = reinterpret_cast<float*>(outputs[0]);//T
-            auto dstIndex = reinterpret_cast<int64_t*>(outputs[1]);
+            auto dstIndex = reinterpret_cast<uint32_t*>(outputs[1]);
                        
             
-            size_t M = info.getElementSize() / info.getAxisElementSize();
-            size_t N = info.getAxisElementSize();
-            auto inStride1 = info.getInStridePreAxis();
-            auto inStride2 = info.getInStride();
-            auto outStride1 = info.getOutStridePreAxis();
-            auto outStride2 = inStride2;
+            size_t M = info.size.except_axis; 
+            size_t N = info.size.axis;
 
             for(size_t m = 0; m < M; m ++){
-                using PairType = std::pair<float, int64_t>;
-                std::list<PairType> list;
+                using PairType = std::pair<float, uint32_t>;
+                std::vector<PairType> list;
                 for(size_t n = 0; n < N; n++){                    
-                    auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2;                    
+                    auto srcIdx = m /info.stride.axis * info.stride.in_pre + m % info.stride.axis + n * info.stride.axis;                    
                     list.push_back({src[srcIdx],n});                    
                 }
-                list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
+                //list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
+                std::partial_sort(list.begin(), \
+                    list.begin() + info.topk, \
+                    list.end(), \
+                    [](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
                 
-                size_t offset = m /inStride2 * outStride1 + m % inStride2;
-                std::for_each_n(list.begin(), (int64_t)info.topk,
+                size_t offset = m /info.stride.axis * info.stride.out_pre + m % info.stride.axis;
+                std::for_each_n(list.begin(), (uint32_t)info.topk,
                             [&](auto &elem) {                                
                                 dstVal[offset] = elem.first;
                                 dstIndex[offset] = elem.second;
-                                offset += outStride2;
+                                offset += info.stride.axis;
                             });   
             }
         };
diff --git a/src/04kernel/src/kernels/topk/cuda_kernel.cc b/src/04kernel/src/kernels/topk/cuda_kernel.cc
new file mode 100644
index 00000000..acfa4733
--- /dev/null
+++ b/src/04kernel/src/kernels/topk/cuda_kernel.cc
@@ -0,0 +1,57 @@
+#include "cuda_kernel.hh"
+
+#ifdef USE_CUDA
+#include "kernel/cuda/threads_distributer.cuh"
+#include "kernel/cuda/topk.cuh"
+#include <cuda_runtime.h>
+#include <sstream>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#endif
+
+namespace refactor::kernel {
+    using K = TopKCuda;
+
+    K::TopKCuda(TopKInfo info_) noexcept
+        : Kernel(), info(std::move(info_)) {}
+
+    auto K::build(TopKInfo info) noexcept -> KernelBox {
+#ifndef USE_CUDA
+        return nullptr;
+#endif
+
+        return std::make_unique<K>(std::move(info));
+    }
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing concat operation using CUDA";
+    }
+
+#ifdef USE_CUDA
+    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
+        //return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs){
+
+        //}
+        return [info = this->info, params = cuda::ThreadsDistributer()(info.size.except_axis)]
+            (Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            cuda::launchTopK(
+                params,
+                reinterpret_cast<float const *>(inputs[0]),
+                reinterpret_cast<float *>(outputs[0]),
+                reinterpret_cast<uint32_t *>(outputs[1]),
+                info.topk,
+                info.stride.axis,
+                info.stride.in_pre,
+                info.stride.out_pre, 
+                info.size.axis);
+        };
+    }
+#endif
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/topk/cuda_kernel.hh b/src/04kernel/src/kernels/topk/cuda_kernel.hh
new file mode 100644
index 00000000..069bbd44
--- /dev/null
+++ b/src/04kernel/src/kernels/topk/cuda_kernel.hh
@@ -0,0 +1,26 @@
+#ifndef KERNEL_TOPK_CUDA_KERNEL_HH
+#define KERNEL_TOPK_CUDA_KERNEL_HH
+
+#include "kernel/attributes/topk_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct TopKCuda final : public Kernel {
+        TopKInfo info;
+
+        explicit TopKCuda(TopKInfo) noexcept;
+
+        static KernelBox build(TopKInfo) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+#ifdef USE_CUDA
+        RoutineWorkspace lower(Resources &) const noexcept final;
+#endif
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_TOPK_CUDA_KERNEL_HH
diff --git a/src/04kernel/test/kernels/topk/test_cpu.cpp b/src/04kernel/test/kernels/topk/test_cpu.cpp
index cea4e066..b0dcaa80 100644
--- a/src/04kernel/test/kernels/topk/test_cpu.cpp
+++ b/src/04kernel/test/kernels/topk/test_cpu.cpp
@@ -9,7 +9,7 @@ TEST(kernel, TopKCpu) {
     // build routine    
     auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4});
     auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3});
-    auto outputTensor1 = Tensor::share(DataType::I64, Shape{3, 3});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3});
 
     auto kernel = TopKCpu::build(TopKInfo(3,1, *inputTensor));
     ASSERT_TRUE(kernel);
@@ -28,7 +28,7 @@ TEST(kernel, TopKCpu) {
   
     // check
     std::vector<float> expectVal = {3,2,1,7,6,5,11,10,9};
-    std::vector<int64_t> expectIdx = {3,2,1,3,2,1,3,2,1};
+    std::vector<uint32_t> expectIdx = {3,2,1,3,2,1,3,2,1};
     std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
 
     for(size_t i=0;i< expectVal.size(); ++i){
diff --git a/src/04kernel/test/kernels/topk/test_cuda.cpp b/src/04kernel/test/kernels/topk/test_cuda.cpp
new file mode 100644
index 00000000..d5c252bd
--- /dev/null
+++ b/src/04kernel/test/kernels/topk/test_cuda.cpp
@@ -0,0 +1,68 @@
+#ifdef USE_CUDA
+
+#include "../../../src/kernels/topk/cpu_kernel.hh"
+#include "../../../src/kernels/topk/cuda_kernel.hh"
+#include "hardware/device_manager.h"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+using namespace hardware;
+
+TEST(kernel, TopKCuda) {
+    // build routine 
+    auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4});
+    std::vector<Arc<Tensor>> outputTensors{
+            Tensor::share(DataType::F32, Shape{3, 3}),
+            Tensor::share(DataType::U32, Shape{3, 3})};
+
+    auto kCpu = TopKCpu::build(TopKInfo(3,1, *inputTensor));
+    auto kCuda = TopKCuda::build(TopKInfo(3,1, *inputTensor));
+    ASSERT_TRUE(kCpu);
+    ASSERT_TRUE(kCuda);
+    auto res = runtime::Resources();
+    auto rCpu = kCpu->lower(res).routine;
+    auto rCuda = kCuda->lower(res).routine;
+
+    // device malloc
+    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
+    Arc<Device::Blob>
+        gpuIn = dev.malloc(inputTensor->bytesSize()),
+        gpuOuts[]{
+            dev.malloc(outputTensors[0]->bytesSize()),
+            dev.malloc(outputTensors[1]->bytesSize()),
+        };
+    // put input data
+    std::vector<float>  data(inputTensor->elementsSize());
+
+    std::vector<float>    outCpu1(outputTensors[0]->elementsSize());
+    std::vector<uint32_t> outCpu2(outputTensors[1]->elementsSize());
+
+        
+    std::vector<float> out1(outputTensors[0]->elementsSize());
+    std::vector<uint32_t> out2(outputTensors[1]->elementsSize());
+
+    std::iota(data.begin(), data.end(), 0);
+    gpuIn->copyFromHost(data.data(), inputTensor->bytesSize());
+    // inference
+    {
+        void const *inputs[]{*gpuIn};
+        void *outputs[]{*gpuOuts[0], *gpuOuts[1]};
+        rCuda(res, nullptr, inputs, outputs);
+    }
+    {
+        void const *inputs[]{data.data()};
+        void *outputs[]{outCpu1.data(), outCpu2.data()};
+        rCpu(res, nullptr, inputs, outputs);
+    }
+    // check
+    
+    gpuOuts[0]->copyToHost(out1.data(), outputTensors[0]->bytesSize());
+    EXPECT_EQ(out1, outCpu1);
+    gpuOuts[1]->copyToHost(out2.data(), outputTensors[1]->bytesSize());
+    EXPECT_EQ(out2, outCpu2);
+    
+}
+
+#endif
diff --git a/src/05computation/include/computation/operators/topk.h b/src/05computation/include/computation/operators/topk.h
index 8ecbdfed..d5c401f4 100644
--- a/src/05computation/include/computation/operators/topk.h
+++ b/src/05computation/include/computation/operators/topk.h
@@ -6,8 +6,8 @@
 namespace refactor::computation {
 
     struct TopK final : public Operator {
-        int64_t topk, axis;
-        constexpr TopK(int64_t topk, int64_t axis) noexcept : topk(topk), axis(axis){}
+        uint32_t topk, axis;
+        constexpr TopK(uint32_t topk, uint32_t axis) noexcept : topk(topk), axis(axis){}
 
         static size_t typeId() noexcept;
         size_t opTypeId() const noexcept final;
diff --git a/src/07onnx/src/operators/topk.cc b/src/07onnx/src/operators/topk.cc
index 98653472..c1e908e6 100644
--- a/src/07onnx/src/operators/topk.cc
+++ b/src/07onnx/src/operators/topk.cc
@@ -40,7 +40,7 @@ namespace refactor::onnx {
         auto dependencies = extractDependency(inputs);
         ans[0] = Tensor::share(input.dataType, input.shape, dependencies);
         ans[0]->shape[axis_] = DimExpr(topk);
-        ans[1] = Tensor::share(DataType::I64, input.shape, dependencies);            
+        ans[1] = Tensor::share(DataType::U32, input.shape, dependencies);            
         ans[1]->shape[axis_] = DimExpr(topk);  
         return Ok(Tensors{std::move(ans)});
     }