Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
feat: 添加hardsigmoid/pad/min/max算子,修复conv/transpose等出现的小错误
  • Loading branch information
Chamberlain0w0 committed Feb 2, 2024
commit 626ae8fe5aece306ea5a7c2f0a5d54d3ad22c92e
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/hard_sigmoid.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "kernel/collectors/hard_sigmoid.h"
#include "../kernels/hard_sigmoid/cnnl_kernel.hh"
#include "../kernels/hard_sigmoid/cpu_kernel.hh"
#include "../kernels/hard_sigmoid/cuda_kernel.hh"

Expand All @@ -20,6 +21,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = HardSigmoidCnnl::build(alpha, beta, a); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
7 changes: 6 additions & 1 deletion src/04kernel/src/collectors/pad.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "kernel/collectors/pad.h"
#include "../kernels/pad/cnnl_kernel.hh"
#include "../kernels/pad/cpu_kernel.hh"
#include "../kernels/pad/cuda_kernel.hh"

Expand All @@ -22,11 +23,15 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = PadCnnl::build(dims, input.get().dataType, mode, const_value); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}

}// namespace refactor::kernel

4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/select.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "kernel/collectors/select.h"
#include "../kernels/select/cpu_kernel.hh"
#include "../kernels/select/cuda_kernel.hh"
#include "../kernels/select/cnnl_kernel.hh"

namespace refactor::kernel {

Expand Down Expand Up @@ -35,6 +36,9 @@ namespace refactor::kernel {
case decltype(_target)::Nvidia:
REGISTER(SelectCuda)
break;
case decltype(_target)::Mlu:
REGISTER(SelectCnnl)
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
19 changes: 14 additions & 5 deletions src/04kernel/src/kernels/cast/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ namespace refactor::kernel {
struct Descriptors {
cnnlTensorDescriptor_t inDesc, outDesc;
cnnlCastDataType_t cast;
bool needCast;

Descriptors() : inDesc(nullptr), outDesc(nullptr) {
Descriptors(bool need) : inDesc(nullptr), outDesc(nullptr),
needCast(need) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&inDesc));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&outDesc));
}
Expand All @@ -56,15 +58,22 @@ namespace refactor::kernel {
CNNL_ASSERT(cnnlDestroyTensorDescriptor(outDesc));
}
};
auto d = std::make_shared<Descriptors>();
d->cast = castType(from, to);
auto d = std::make_shared<Descriptors>(from != to);
if (d->needCast) {
d->cast = castType(from, to);
}
setCnnlTensor(d->inDesc, from, slice(shape.data(), shape.size()));
setCnnlTensor(d->outDesc, to, slice(shape.data(), shape.size()));

res.fetchOrStore<CnnlContext>();
return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore<CnnlContext>()->handle,
d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0]));
if (d->needCast) {
CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore<CnnlContext>()->handle,
d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0]));
} else {
CNNL_ASSERT(cnnlCopy(res.fetchOrStore<CnnlContext>()->handle,
d->inDesc, inputs[0], d->outDesc, outputs[0]));
}
};
}

Expand Down
67 changes: 24 additions & 43 deletions src/04kernel/src/kernels/conv/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,10 @@ namespace refactor::kernel {
return nullptr;
}

std::optional<ExpandInfoCnnl> biasExpand = std::nullopt;
int biasSize_ = 0;
if (b) {
ASSERT(b->get().shape[0] == y.shape[1], "");
std::vector<dim_t> input(y.rank(), 1);
input[1] = y.shape[1];
biasExpand.emplace(ExpandInfoCnnl(
b->get().dataType,
slice(input.data(), input.size()),
slice(y.shape.data(), y.rank())));
biasSize_ = b->get().shape[0];
}

// group is not supported
Expand Down Expand Up @@ -70,7 +65,7 @@ namespace refactor::kernel {
{d[0], d[1]},
{p[0], p[1], p[2], p[3]},
{s[0], s[1]},
std::move(biasExpand),
biasSize_,
});
}

Expand All @@ -92,22 +87,18 @@ namespace refactor::kernel {

// RAII for closure
struct Descriptors {
cnnlTensorDescriptor_t x, y, w;
cnnlTensorDescriptor_t x, y, w, b;
cnnlTensorDescriptor_t xTrans, yTrans, wTrans;
cnnlTransposeDescriptor_t NCHW2NHWC, NHWC2NCHW;
cnnlConvolutionDescriptor_t conv;
cnnlConvolutionForwardAlgo_t algo;
// std::optional<ExtraPadding> extraPadding;
std::optional<Routine> biasExpand;
bool f32;

Descriptors(decltype(f32) f32_)
:// extraPadding(std::nullopt),
biasExpand(std::nullopt),
f32(f32_) {
bool bias;

Descriptors(decltype(bias) bias_) : bias(bias_) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&x));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&y));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&w));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&b));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&xTrans));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&yTrans));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&wTrans));
Expand All @@ -119,6 +110,7 @@ namespace refactor::kernel {
CNNL_ASSERT(cnnlDestroyTensorDescriptor(x));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(y));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(w));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(b));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(xTrans));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(yTrans));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(wTrans));
Expand All @@ -130,11 +122,8 @@ namespace refactor::kernel {
Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>(info.dt != DataType::F64);
// d->extraPadding = ExtraPadding::build(info.dt, info.xShape, info.pad);
if (info.biasExpand) {
d->biasExpand = ExpandCnnl(*info.biasExpand).lower(res).routine;
}
auto d = std::make_shared<Descriptors>(info.biasSize > 0);

int xs[]{
info.xShape[0],
info.xShape[1],
Expand All @@ -154,10 +143,15 @@ namespace refactor::kernel {
setCnnlTensor(d->x, info.dt, slice(xs, 4));
setCnnlTensor(d->y, info.dt, slice(info.yShape, 4));
setCnnlTensor(d->w, info.dt, slice(info.wShape, 4));

CNNL_ASSERT(cnnlSetTensorDescriptor(d->xTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, xsNHWC.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->yTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, ysNHWC.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->wTrans, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, wsNHWC.data()));

if (d->bias) {
int biasDim[] = {1, 1, 1, info.biasSize};
CNNL_ASSERT(cnnlSetTensorDescriptor(d->b, CNNL_LAYOUT_NHWC, cnnlDataTypeConvert(info.dt), 4, biasDim));
}

auto xTransSize = cnnlGetTensorElementNum(d->xTrans) * info.dt.size();
auto yTransSize = cnnlGetTensorElementNum(d->yTrans) * info.dt.size();
auto wTransSize = cnnlGetTensorElementNum(d->wTrans) * info.dt.size();
Expand Down Expand Up @@ -188,10 +182,6 @@ namespace refactor::kernel {
handle, d->xTrans, d->wTrans, d->yTrans, NULL,
d->conv, d->algo, &convWorkspaceSize));

// if (d->extraPadding) {
// workspaceSize = hardware::alignBytes(workspaceSize, 256);
// }

size_t workspaceSize = xTransSize + yTransSize + wTransSize + std::max({xWorkspaceSize, wWorkspaceSize, yWorkspaceSize, convWorkspaceSize});

res.fetchOrStore<CnnlContext>();
Expand All @@ -201,12 +191,6 @@ namespace refactor::kernel {
auto handle = res.fetchOrStore<CnnlContext>()->handle;
void const *x = inputs[0], *w = inputs[1];
void *y = outputs[0];
// if (auto f = d->extraPadding; f) {
// x = (*f)(x, reinterpret_cast<uint8_t *>(workspace) + workspaceSize);
// }
// if (auto f = d->biasExpand; f) {
// (*f)(res, workspace, inputs + 2, outputs);
// }

void *xTrans = workspace;
void *wTrans = reinterpret_cast<uint8_t *>(xTrans) + xTransSize;
Expand All @@ -218,19 +202,16 @@ namespace refactor::kernel {
d->xTrans, xTrans, opWorkspace, xWorkspaceSize));
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->w, w,
d->wTrans, wTrans, opWorkspace, wWorkspaceSize));

// build alpha/beta for double
auto a = d->f32 ? factor<fp32_t>(1) : factor<fp64_t>(1),
b = d->f32
? factor<fp32_t>(d->biasExpand ? 1 : 0)
: factor<fp64_t>(d->biasExpand ? 1 : 0);

auto bDesc = (d->bias) ? d->b : NULL;
auto bData = (d->bias) ? inputs[2] : NULL;
CNNL_ASSERT(cnnlConvolutionForward(
handle,
d->conv, d->algo, &a,
d->conv, d->algo, NULL,
d->xTrans, xTrans, d->wTrans, wTrans,
NULL, NULL, opWorkspace, convWorkspaceSize,
&b, d->yTrans, yTrans));
bDesc, bData, opWorkspace, convWorkspaceSize,
NULL, d->yTrans, yTrans));

// transpose NHWC intermediates to NCHW
CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->yTrans, yTrans,
d->y, y, opWorkspace, yWorkspaceSize));
Expand Down
2 changes: 1 addition & 1 deletion src/04kernel/src/kernels/conv/cnnl_kernel.hh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace refactor::kernel {
dilation[2],
pad[4],
stride[2];
std::optional<ExpandInfoCnnl> biasExpand;
int biasSize;
} info;

explicit ConvCnnl(decltype(info)) noexcept;
Expand Down
81 changes: 81 additions & 0 deletions src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include "cnnl_kernel.hh"
#include "kernel/collectors/hard_sigmoid.h"
#include <unordered_set>

#ifdef USE_BANG
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#include <cnnl.h>
#endif

namespace refactor::kernel {
using K = HardSigmoidCnnl;
using DT = DataType;

// Stores the hard-sigmoid coefficients, the element data type and the
// element count verbatim; no validation is performed here.
K::HardSigmoidCnnl(float alpha_,
                   float beta_,
                   DT dataType_,
                   int size_) noexcept
    : Kernel(),
      alpha(alpha_),
      beta(beta_),
      dataType(dataType_),
      size(size_) {
}

// Factory for the CNNL hard-sigmoid kernel.
//
// Without USE_BANG the kernel is unavailable and nullptr is returned.
// Otherwise the kernel is built from the two coefficients plus the data
// type and element count of tensor `a`.
auto K::build(float alpha_, float beta_, Tensor const &a) noexcept -> KernelBox {
#ifndef USE_BANG
    // Built without BANG support: report "no kernel available".
    return nullptr;
#endif

    auto const elementCount = a.elementsSize();
    return std::make_unique<K>(alpha_, beta_, a.dataType, elementCount);
}
// Returns the address of a function-local static byte, reinterpreted as
// size_t. The same value is returned by every call.
auto K::typeId() noexcept -> size_t {
    static uint8_t anchor = 1;
    auto const address = reinterpret_cast<size_t>(&anchor);
    return address;
}

// Per-instance accessor that forwards to the static typeId().
auto K::kernelTypeId() const noexcept -> size_t {
    return K::typeId();
}
// Human-readable one-line description of this kernel.
auto K::description() const noexcept -> std::string_view {
    constexpr std::string_view summary{"Performing hardsigmoid using CNNL"};
    return summary;
}

#ifdef USE_BANG

// Lowers this kernel into a routine that applies CNNL's hard-sigmoid
// activation (CNNL_ACTIVATION_HARDSIGMOID) from inputs[0] to outputs[0].
//
// The CNNL descriptors are created and configured once here and are then
// shared with the returned closure through a std::shared_ptr.
auto HardSigmoidCnnl::lower(Resources &res) const -> RoutineWorkspace {
using namespace cnnl;
using namespace runtime;

// RAII for closure
// Holds one activation descriptor and one tensor descriptor: both are
// created in the constructor and destroyed in the destructor. Copy and move
// are deleted, so the handles have a single owner.
struct Descriptors {
cnnlActivationDescriptor_t activation;
cnnlTensorDescriptor_t tensor;

Descriptors() : activation(nullptr), tensor(nullptr) {
CNNL_ASSERT(cnnlCreateActivationDescriptor(&activation));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&tensor));
}
// NOTE(review): declared noexcept(false), presumably because CNNL_ASSERT
// may throw when a destroy call fails — confirm against the macro.
~Descriptors() noexcept(false) {
CNNL_ASSERT(cnnlDestroyActivationDescriptor(activation));
CNNL_ASSERT(cnnlDestroyTensorDescriptor(tensor));
}

Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>();

// The tensor descriptor is configured with a single-entry shape {size},
// i.e. the data is described as one flat dimension.
setCnnlTensor(d->tensor, dataType, slice(&size, 1));
// `alpha` and `beta` here are this kernel's members.
// NOTE(review): they are presumably consumed as the gamma/scale
// coefficients of CNNL's hard-sigmoid, and the two 0.0 arguments are
// presumably unused for this mode — confirm against the CNNL API reference.
CNNL_ASSERT(cnnlSetActivationDescriptor_v5(d->activation, CNNL_ACTIVATION_HARDSIGMOID,
CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, 0.0,
0.0, alpha, beta, true));

// The result is discarded; presumably this ensures a CnnlContext is already
// stored in `res` before the routine first runs — TODO confirm.
res.fetchOrStore<CnnlContext>();
// The closure captures only `d`. The workspace pointer argument is unnamed
// and unused.
return [d = std::move(d)]//
(Resources & res, void *, void const *const *inputs, void *const *outputs) {
// Local factors passed to cnnlActivationForward. These shadow the names of
// the kernel's alpha/beta members but are unrelated to them (the members
// are not captured by this closure).
float alpha = 1.f, beta = 0.f;
// The same tensor descriptor is used for both the input and the output.
CNNL_ASSERT(cnnlActivationForward(
res.fetchOrStore<CnnlContext>()->handle,
d->activation,
&alpha, d->tensor, inputs[0],
&beta, d->tensor, outputs[0]));
};
}

#endif

}// namespace refactor::kernel
27 changes: 27 additions & 0 deletions src/04kernel/src/kernels/hard_sigmoid/cnnl_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH
#define KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH

#include "kernel/collectors/simple_unary.h"

namespace refactor::kernel {

// Kernel that performs hard-sigmoid through CNNL.
// `lower` is only declared when the project is built with USE_BANG.
struct HardSigmoidCnnl final : public Kernel {
    // Coefficients forwarded to the CNNL activation descriptor.
    float alpha;
    float beta;
    // Element type of the tensor the kernel was built for.
    DataType dataType;
    // Element count of that tensor; used as a single flat dimension.
    int size;

    // Stores all four values verbatim.
    HardSigmoidCnnl(float alpha_, float beta_, DataType dataType_, int size_) noexcept;

    // Builds a kernel from the coefficients and tensor `a`; returns nullptr
    // when built without USE_BANG.
    static KernelBox build(float alpha_, float beta_, Tensor const &a) noexcept;
    // Identifier shared by every instance of this kernel type.
    static size_t typeId() noexcept;

    // Forwards to typeId().
    size_t kernelTypeId() const noexcept final;
    // Short human-readable description of the kernel.
    std::string_view description() const noexcept final;
#ifdef USE_BANG
    // Creates the CNNL descriptors and returns the executable routine.
    RoutineWorkspace lower(Resources &res) const final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_HARD_SIGMOID_CNNL_KERNEL_HH
Loading