Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat: 接入CNNL,并添加unary/binary/softmax/batchnorm/reduce/transpose/pooli…
…ng算子
  • Loading branch information
Chamberlain0w0 authored and YdrMaster committed Jan 31, 2024
commit c291c79c53cd1420f8acae993620bdac549cb033
1 change: 1 addition & 0 deletions src/02hardware/src/device_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "hardware/devices/cpu.h"
#include "hardware/devices/mlu.h"
#include "hardware/devices/nvidia.h"
#include "hardware/devices/mlu.h"

namespace refactor::hardware::device {

Expand Down
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/batch_normalization.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "kernel/collectors/batch_normalization.h"
#include "../kernels/batch_normalization/cpu_kernel.hh"
#include "../kernels/batch_normalization/cudnn_kernel.hh"
#include "../kernels/batch_normalization/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -20,6 +21,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(BatchNormalizationCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(BatchNormalizationCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/pool.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "kernel/collectors/pool.h"
#include "../kernels/pool/cudnn_kernel.hh"
#include "../kernels/pool/cnnl_kernel.hh"

namespace refactor::kernel {

Expand Down Expand Up @@ -29,6 +30,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = PoolCnnl::build(type, ceil, kernelShape, attributes, x, y); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/reduce.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "kernel/collectors/reduce.h"
#include "../kernels/reduce/cpu_kernel.hh"
#include "../kernels/reduce/cudnn_kernel.hh"
#include "../kernels/reduce/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -27,6 +28,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(ReduceCudnn)
break;
case decltype(_target)::Mlu:
REGISTER(ReduceCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/simple_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "../kernels/simple_binary/binary_cudnn.hh"
#include "../kernels/simple_binary/cpu_kernel.hh"
#include "../kernels/simple_binary/cuda_kernel.hh"
#include "../kernels/simple_binary/binary_cnnl.hh"

namespace refactor::kernel {

Expand Down Expand Up @@ -50,6 +51,9 @@ namespace refactor::kernel {
REGISTER_BROCAST(BinaryCudnn)
REGISTER(BinaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER_BROCAST(BinaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/simple_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include "../kernels/simple_unary/cpu_kernel.hh"
#include "../kernels/simple_unary/cuda_kernel.hh"
#include "../kernels/simple_unary/cudnn_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_activation_kernel.hh"
#include "../kernels/simple_unary/cnnl_simple_unary_kernel.hh"
#include "common.h"

namespace refactor::kernel {
Expand Down Expand Up @@ -55,6 +57,10 @@ namespace refactor::kernel {
REGISTER(ActivationCudnn)
REGISTER(SimpleUnaryCuda)
break;
case decltype(_target)::Mlu:
REGISTER(ActivationCnnl)
REGISTER(SimpleUnaryCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
7 changes: 7 additions & 0 deletions src/04kernel/src/collectors/softmax.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "kernel/collectors/softmax.h"
#include "../kernels/softmax/cnnl_kernel.hh"
#include "../kernels/softmax/cpu_kernel.hh"
#include "../kernels/softmax/cuda_kernel.hh"
#include "../kernels/softmax/cudnn_kernel.hh"
Expand Down Expand Up @@ -28,6 +29,12 @@ namespace refactor::kernel {
}
break;
}
case decltype(_target)::Mlu: {
if (auto ptr = SoftmaxCnnl::build(cnnl::SoftmaxAlgo::ACCURATE, info); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
}
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/transpose.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "kernel/collectors/transpose.h"
#include "../kernels/transpose/cpu_kernel.hh"
#include "../kernels/transpose/cuda_kernel.hh"
#include "../kernels/transpose/cnnl_kernel.hh"

namespace refactor::kernel {

Expand All @@ -25,6 +26,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = TransposeCnnl::build(data.dataType, data.shape, perm); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
158 changes: 158 additions & 0 deletions src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#include "cnnl_kernel.hh"

#ifdef USE_BANG
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#include <cnnl.h>
#endif

namespace refactor::kernel {
    using K = BatchNormalizationCnnl;
    using DT = DataType;

    K::BatchNormalizationCnnl(decltype(info) info_) noexcept
        : info(info_) {}

    /// Builds the kernel when the inputs fit what
    /// `cnnlBatchNormalizationForwardInference` supports; returns nullptr otherwise.
    auto K::build(float epsilon, TensorRefs inputs) noexcept -> KernelBox {
#ifndef USE_BANG
        return nullptr;
#endif

        auto const &x = inputs[0].get();
        auto const &scale = inputs[1].get();
        auto const &mean = inputs[3].get();

        // Only rank-4 (NCHW) inputs are handled; `lower` transposes to NHWC internally.
        if (x.rank() != 4) {
            return nullptr;
        }

        // see "Supported Configurations for `cnnlBatchNormalizationForwardInference`"
        if (scale.dataType != mean.dataType) {
            return nullptr;
        }
        if (x.dataType == DT::F64) {
            if (scale.dataType != DT::F64) {
                return nullptr;
            }
        } else {
            if (scale.dataType != DT::F32) {
                return nullptr;
            }
        }
        return std::make_unique<K>(decltype(info){
            epsilon,
            x.dataType,
            scale.dataType,
            x.layout,
            {
                static_cast<int>(x.shape[0]),
                static_cast<int>(x.shape[1]),
                static_cast<int>(x.shape[2]),
                static_cast<int>(x.shape[3]),
            }});
    }
    auto K::typeId() noexcept -> size_t {
        // Address of a function-local static serves as a process-unique id.
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
    auto K::description() const noexcept -> std::string_view {
        return "Performing batch normalization for non-training-mode using CNNL";
    }

#ifdef USE_BANG

    auto K::lower(Resources &res) const -> RoutineWorkspace {
        using namespace cnnl;
        using namespace runtime;
        using DT = DataType;

        // RAII for closure: descriptors live as long as the routine lambda.
        struct Descriptors {
            cnnlTensorDescriptor_t inDesc, inDescTrans, p;
            cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW;
            bool f32;

            explicit Descriptors(decltype(f32) f32_)
                : inDesc(nullptr), inDescTrans(nullptr), p(nullptr),
                  NCHW2NHWC(nullptr), NHWC2NCHW(nullptr), f32(f32_) {
                CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc));
                CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDescTrans));
                CNNL_ASSERT(cnnlCreateTensorDescriptor(&p));
                CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NCHW2NHWC));
                CNNL_ASSERT(cnnlCreateTransposeDescriptor(&NHWC2NCHW));
            }
            ~Descriptors() noexcept(false) {
                CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDesc));
                CNNL_ASSERT(cnnlDestroyTensorDescriptor(inDescTrans));
                CNNL_ASSERT(cnnlDestroyTensorDescriptor(p));
                CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NCHW2NHWC));
                CNNL_ASSERT(cnnlDestroyTransposeDescriptor(NHWC2NCHW));
            }

            Descriptors(const Descriptors &) = delete;
            Descriptors(Descriptors &&) = delete;
        };
        auto d = std::make_shared<Descriptors>(info.dtX != DT::F64);
        int dimNCHW[4] = {info.dimAx[0], info.dimAx[1], info.dimAx[2], info.dimAx[3]};
        int dimNHWC[4] = {info.dimAx[0], info.dimAx[2], info.dimAx[3], info.dimAx[1]};
        int dimParam[]{info.dimAx[1]};
        setCnnlTensor(d->inDesc, info.dtX, slice(dimNCHW, 4));
        CNNL_ASSERT(cnnlSetTensorDescriptor(d->inDescTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dtX), 4, dimNHWC));
        CNNL_ASSERT(cnnlSetTensorDescriptor(d->p, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dtP), 1, dimParam));
        int permute[4] = {0, 2, 3, 1};
        int permuteOut[4] = {0, 3, 1, 2};
        CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NCHW2NHWC, 4, permute));
        CNNL_ASSERT(cnnlSetTransposeDescriptor(d->NHWC2NCHW, 4, permuteOut));

        auto handle = res.fetchOrStore<CnnlContext>()->handle;
        // FIX: bytes per tensor = element count * element size. The original used
        // `sizeof(info.dtX)` — the size of the DataType object itself — which
        // under-allocates the transpose buffers.
        auto xTransSize = cnnlGetTensorElementNum(d->inDescTrans) * info.dtX.size();
        size_t workspaceSize;
        CNNL_ASSERT(cnnlGetTransposeWorkspaceSize(handle, d->inDesc, d->NCHW2NHWC, &workspaceSize));
        // Workspace layout: [ xTrans | yTrans | transpose scratch ].
        // FIX: the original reserved room for only ONE transposed tensor, so the
        // scratch cursor pointed past the end of the allocation.
        size_t totalWorkspaceSize = 2 * xTransSize + workspaceSize;

        auto routine = [d = std::move(d),
                        epsilon = info.epsilon,
                        xTransSize, workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
            // fetch cnnl handle from resources
            auto handle = res.fetchOrStore<CnnlContext>()->handle;

            // name inputs and outputs
            auto x = inputs[0],
                 scale = inputs[1],
                 bias = inputs[2],
                 mean = inputs[3],
                 var = inputs[4];
            auto y = outputs[0];

            // FIX: arithmetic on `void *` is non-standard C++; index via a byte pointer.
            auto base = reinterpret_cast<uint8_t *>(workspace);
            void *xTrans = base;
            void *yTrans = base + xTransSize;
            void *cursor = base + 2 * xTransSize;

            // transpose NCHW input to NHWC
            CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x,
                                         d->inDescTrans, xTrans, cursor, workspaceSize));

            // alpha/beta must be typed to match what CNNL reads (float unless F64).
            // FIX: the original ternary promoted both branches to double, so a
            // double was passed where a 4-byte float was read in the f32 case.
            union Scalar {
                float f32;
                double f64;
            } a, b;
            if (d->f32) {
                a.f32 = 1;
                b.f32 = 0;
            } else {
                a.f64 = 1;
                b.f64 = 0;
            }
            CNNL_ASSERT(cnnlBatchNormForwardInference(
                handle, &a, &b,
                d->inDescTrans, xTrans, d->p, scale, bias, mean, var,
                epsilon, d->inDescTrans, yTrans));

            // transpose NHWC result back to NCHW
            CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans,
                                         d->inDesc, y, cursor, workspaceSize));

            // Block until the queue drains so outputs are visible to the caller.
            BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
        };

        return {std::move(routine), totalWorkspaceSize};
    }

#endif

}// namespace refactor::kernel
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/batch_normalization/cnnl_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH
#define KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH

#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {
    /// @brief Batch normalization (inference mode) backed by
    ///        `cnnlBatchNormalizationForwardInference`.
    /// NOTE(review): the CNNL API itself handles 4D and 5D tensors, but the
    /// companion `build` in the .cc accepts only rank-4 inputs — confirm intent.
    struct BatchNormalizationCnnl final : public Kernel {
        // Captured at build time, consumed by `lower`.
        struct {
            float epsilon;     // stabilizer added to the variance
            DataType dtX, dtP; // data type of x/y and of the parameter tensors
            LayoutType layout; // layout of x as recorded by the graph
            int dimAx[4];// dimA for x
        } info;

        explicit BatchNormalizationCnnl(decltype(info)) noexcept;

        // Returns nullptr when the inputs are not supported (see .cc for checks).
        static KernelBox build(float, TensorRefs) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
#ifdef USE_BANG
        // Compiled only for BANG builds; yields the routine plus workspace size.
        RoutineWorkspace lower(Resources &) const final;
#endif
    };

}// namespace refactor::kernel

#endif// KERNEL_BATCH_NORMALIZATION_CNNL_KERNEL_HH
Loading