fix: get the branch running after cleanup, add HardSwish
Chamberlain0w0 committed Jan 31, 2024
commit b82c861c3cf08a9eb72e93f4c77ac72f237c5bb3
18 changes: 11 additions & 7 deletions src/04kernel/src/kernels/gather/cnnl_kernel.cc
@@ -16,13 +16,16 @@ namespace refactor::kernel {
 #ifndef USE_BANG
         return nullptr;
 #endif
 
+        auto indicesDim = std::vector<int>(index.shape.begin(), index.shape.end());
+        if (indicesDim.size() == 0) {
+            indicesDim.push_back(1);
+        }
         return std::make_unique<K>(decltype(info){
             input.dataType,
-            index.dataType,
-            axis,
+            DataType::I32,
+            axis ? axis : 0,
             std::vector<int>(input.shape.begin(), input.shape.end()),
-            std::vector<int>(index.shape.begin(), index.shape.end()),
+            std::move(indicesDim),
             std::vector<int>(output.shape.begin(), output.shape.end()),
         });
     }
@@ -60,8 +63,9 @@ namespace refactor::kernel {
         CNNL_ASSERT(cnnlSetTensorDescriptor(
             d->inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
             info.inDim.size(), info.inDim.data()));
+        // cnnlGatherV2 does not support int64 indices
         CNNL_ASSERT(cnnlSetTensorDescriptor(
-            d->indexDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.indexDataType),
+            d->indexDesc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_INT32,
             info.indexDim.size(), info.indexDim.data()));
         CNNL_ASSERT(cnnlSetTensorDescriptor(
             d->outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
@@ -71,15 +75,15 @@ namespace refactor::kernel {
 
         res.fetchOrStore<CnnlContext>();
         auto routine = [d = std::move(d),
-                       shape = std::vector<int>(info.inDim.begin(), info.inDim.end()),
+                        shape = std::vector<int>(info.inDim.begin(), info.inDim.end()),
                         workspaceSize,
                         dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
             res.fetchOrStore<CnnlContext>()->copyFromCPU(workspace, shape.data(), workspaceSize);
             CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore<CnnlContext>()->handle, dim,
                                      d->inDesc, inputs[0], reinterpret_cast<const int *>(workspace),
                                      d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
                                      d->outDesc, outputs[0]));
-       };
+        };
 
         return {std::move(routine), workspaceSize};
     }
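Note on the change above: cnnlGatherV2 only accepts int32 indices (hence the hard-coded CNNL_DTYPE_INT32 and DataType::I32), and the descriptor API needs at least one dimension, so a scalar (rank-0) index tensor is padded to shape {1}. A minimal host-side sketch of the narrowing an int64 index tensor would need before it reaches this kernel; narrowIndices is a hypothetical helper, not part of this commit, and it assumes every index value fits in int32:

    #include <cstdint>
    #include <vector>

    // Narrow int64 (ONNX-style) indices to the int32 layout cnnlGatherV2 expects.
    // Assumes all values fit in int32; real code should validate that.
    std::vector<int32_t> narrowIndices(std::vector<int64_t> const &idx) {
        std::vector<int32_t> out;
        out.reserve(idx.size());
        for (auto i : idx) {
            out.push_back(static_cast<int32_t>(i));
        }
        return out;
    }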
22 changes: 9 additions & 13 deletions src/04kernel/src/kernels/simple_binary/binary_cnnl.cc
@@ -26,10 +26,7 @@ namespace refactor::kernel {
             // !a.dataType.isFloat() ||
             !ARTHIMETIC.contains(op) ||
             // At least one of a,b should have the same shape as c
-            (a.shape != c.shape && b.shape != c.shape)
-            // Sub only supports brocasting b
-            // (a.shape != c.shape && op == Op::Sub)
-        ) {
+            (a.shape != c.shape && b.shape != c.shape)) {
             return nullptr;
         }
 
@@ -124,9 +121,9 @@ namespace refactor::kernel {
         auto handle = res.fetchOrStore<CnnlContext>()->handle;
         size_t workspaceSize;
         CNNL_ASSERT(cnnlGetBinaryWorkspaceSize(handle, d->aDesc,
-                                              d->bDesc, d->cDesc,
-                                              &workspaceSize));
+                                               d->bDesc, d->cDesc,
+                                               &workspaceSize));
 
 
         res.fetchOrStore<CnnlContext>();
         auto routine = [d = std::move(d),
@@ -147,11 +144,11 @@ namespace refactor::kernel {
                 beta = d->f32
                            ? factor<fp32_t>(0)
                            : factor<fp64_t>(0);
-               CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc,
-                                        &alphaA, d->aDesc, a,
-                                        &alphaB, d->bDesc, b,
-                                        workspace, workspaceSize,
-                                        &beta, d->cDesc, c));
+                CNNL_ASSERT(cnnlOpTensor(handle, d->opDesc,
+                                         &alphaA, d->aDesc, a,
+                                         &alphaB, d->bDesc, b,
+                                         workspace, workspaceSize,
+                                         &beta, d->cDesc, c));
             } else if (op == SimpleBinaryType::Div) {
                 CNNL_ASSERT(cnnlDiv_v2(handle,
                                        CNNL_COMPUTATION_HIGH_PRECISION,
@@ -179,7 +176,6 @@ namespace refactor::kernel {
                                        d->cDesc, c,
                                        workspace, workspaceSize));
             }
-
         };
 
         return {std::move(routine), workspaceSize};
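Note on the alphaA/alphaB/beta factors above: cnnlOpTensor combines its operands roughly as c = op(alphaA * a, alphaB * b) + beta * c, which is how Sub can reuse the Add path with alphaB = -1. A scalar model of that contract, as a sketch of the semantics rather than the library call:

    // Scalar model of the OpTensor Add path:
    // c = alphaA * a + alphaB * b + beta * c_prev.
    // Sub(a, b) is then alphaA = 1, alphaB = -1, beta = 0.
    float opTensorAddModel(float a, float b, float cPrev,
                           float alphaA, float alphaB, float beta) {
        return alphaA * a + alphaB * b + beta * cPrev;
    }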
21 changes: 14 additions & 7 deletions src/04kernel/src/kernels/simple_unary/cnnl_activation_kernel.cc
@@ -17,7 +17,7 @@ namespace refactor::kernel {
         : Kernel(), type(type_), dataType(dataType_), size(size_) {}
 
     auto K::build(Op op, Tensor const &a) noexcept -> KernelBox {
-        static const std::unordered_set<Op> ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh};
+        static const std::unordered_set<Op> ARTHIMETIC{Op::Sigmoid, Op::Relu, Op::Tanh, Op::HardSwish};
 
 #ifndef USE_BANG
         return nullptr;
@@ -64,20 +64,27 @@ namespace refactor::kernel {
         auto d = std::make_shared<Descriptors>();
 
         // clang-format off
-        auto mode = type == Ty::Relu    ? CNNL_ACTIVATION_RELU
-                  : type == Ty::Sigmoid ? CNNL_ACTIVATION_SIGMOID
-                  : type == Ty::Tanh    ? CNNL_ACTIVATION_TANH
+        auto mode = type == Ty::Relu      ? CNNL_ACTIVATION_RELU
+                  : type == Ty::Sigmoid   ? CNNL_ACTIVATION_SIGMOID
+                  : type == Ty::Tanh      ? CNNL_ACTIVATION_TANH
+                  : type == Ty::HardSwish ? CNNL_ACTIVATION_HARDSWISH
                   : UNREACHABLEX(cnnlActivationMode_t, "");
+        float coef = 0.0;
+        float slicedDim = 0.0;
+        float gamma = 0.0;
+        float scale = 0.0;
         // clang-format on
 
         setCnnlTensor(d->tensor, dataType, slice(&size, 1));
-        CNNL_ASSERT(cnnlSetActivationDescriptor_v2(d->activation, mode, CNNL_ACTIVATION_HIGH_PRECISION,
-                                                   CNNL_NOT_PROPAGATE_NAN, 0.0));
+        CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, mode,
+                                                   CNNL_ACTIVATION_HIGH_PRECISION,
+                                                   CNNL_NOT_PROPAGATE_NAN, coef,
+                                                   slicedDim, gamma, scale, true));
 
         res.fetchOrStore<CnnlContext>();
         return [d = std::move(d)]//
             (Resources & res, void *, void const *const *inputs, void *const *outputs) {
-            float alpha = 1, beta = 0;
+            float alpha = 1.f, beta = 0.f;
             CNNL_ASSERT(cnnlActivationForward(
                 res.fetchOrStore<CnnlContext>()->handle,
                 d->activation,
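HardSwish computes x * max(0, min(1, x/6 + 1/2)); CNNL exposes it as one more activation mode rather than a standalone op, which is why the whole change is an extra ternary branch plus a switch to the _v5 descriptor setter that takes the additional coef/slicedDim/gamma/scale parameters. A trimmed standalone sketch of the descriptor setup, mirroring the kernel's call with the same zeroed parameters; makeHardSwishDescriptor is hypothetical and error handling is elided:

    #include <cnnl.h>

    // Build an activation descriptor configured for HardSwish,
    // passing zeros for the extra parameters exactly as the kernel does.
    cnnlActivationDescriptor_t makeHardSwishDescriptor() {
        cnnlActivationDescriptor_t activation;
        cnnlCreateActivationDescriptor(&activation);
        float coef = 0.f, slicedDim = 0.f, gamma = 0.f, scale = 0.f;
        cnnlSetActivationDescriptor_v5(activation, CNNL_ACTIVATION_HARDSWISH,
                                       CNNL_ACTIVATION_HIGH_PRECISION,
                                       CNNL_NOT_PROPAGATE_NAN, coef,
                                       slicedDim, gamma, scale, true);
        return activation;
    }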
1 change: 1 addition & 0 deletions src/04kernel/test/kernels/simple_unary/test_cnnl.cpp
@@ -60,6 +60,7 @@ TEST(kernel, ActivationCnnl) {
     testOp(SimpleUnaryType::Relu);
     testOp(SimpleUnaryType::Sigmoid);
     testOp(SimpleUnaryType::Tanh);
+    testOp(SimpleUnaryType::HardSwish);
 }
 
 
4 changes: 2 additions & 2 deletions src/04kernel/test/kernels/transpose/test_cnnl.cpp
@@ -14,8 +14,8 @@ using namespace hardware;
 TEST(kernel, TransposeCnnl) {
     // build routine
     auto dataTensor = Tensor::share(DataType::F32, Shape{1, 3, 2, 5});
-    auto info = TransposeInfo(dataTensor->shape, Permutation{2, 3, 0, 1});
-    auto kCpu = TransposeCpu::build(dataTensor->dataType, info);
+    auto info = TransposeInfo(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1});
+    auto kCpu = TransposeCpu::build(info);
     auto kernel = TransposeCnnl::build(dataTensor->dataType, dataTensor->shape, Permutation{2, 3, 0, 1});
     ASSERT_TRUE(kCpu && kernel);
     auto res = runtime::Resources();