feat: run llama/gpt2 end-to-end on MLU with correct results
Chamberlain0w0 authored and YdrMaster committed Jan 31, 2024
commit 8bde8c1f30b8c0a1c71ccdbd12b7228bc09abd7d
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/global_pool.cc
@@ -1,5 +1,6 @@
#include "kernel/collectors/global_pool.h"
#include "../kernels/pool/cudnn_kernel.hh"
#include "../kernels/pool/cnnl_kernel.hh"

namespace refactor::kernel {

@@ -28,6 +29,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = PoolCnnl::build(type, false, kernelShape, attributes, x, y); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
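The new `Mlu` case follows the same collector pattern as the CUDA path above: a backend's `build` returns null when it cannot serve the request, and only non-null kernels are collected. A stripped-down sketch of that dispatch (`buildCuda`/`buildMlu` are hypothetical stand-ins for `PoolCudnn::build`/`PoolCnnl::build`):

#include <memory>
#include <utility>
#include <vector>

struct Kernel {};
using KernelPtr = std::unique_ptr<Kernel>;

KernelPtr buildCuda() { return nullptr; }                  // stub: backend rejects
KernelPtr buildMlu() { return std::make_unique<Kernel>(); }// stub: backend accepts

enum class Target { Cpu, Nvidia, Mlu };

std::vector<KernelPtr> collect(Target target) {
    std::vector<KernelPtr> ans;
    switch (target) {
        case Target::Nvidia:
            if (auto ptr = buildCuda(); ptr) { ans.emplace_back(std::move(ptr)); }
            break;
        case Target::Mlu:
            if (auto ptr = buildMlu(); ptr) { ans.emplace_back(std::move(ptr)); }
            break;
        default:
            break;// unknown target: collect nothing
    }
    return ans;
}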
14 changes: 8 additions & 6 deletions src/04kernel/src/kernels/gather/cnnl_kernel.cc
@@ -4,6 +4,7 @@
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#endif
#include <iostream>

namespace refactor::kernel {
using K = GatherCnnl;
@@ -15,11 +16,11 @@
#ifndef USE_BANG
return nullptr;
#endif

return std::make_unique<K>(decltype(info){
input.dataType,
DataType::I32,
axis,
axis ? axis : 0,
std::vector<int>(input.shape.begin(), input.shape.end()),
std::vector<int>(index.shape.begin(), index.shape.end()),
std::vector<int>(output.shape.begin(), output.shape.end()),
@@ -70,15 +71,16 @@

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d),
shape = info.inDim.data(), workspaceSize,
shape = std::vector<int>(info.inDim.begin(), info.inDim.end()),
workspaceSize,
dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV));
res.fetchOrStore<CnnlContext>()->copyFromCPU(workspace, shape.data(), workspaceSize);
CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore<CnnlContext>()->handle, dim,
d->inDesc, inputs[0], reinterpret_cast<const int *>(workspace),
d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
d->outDesc, outputs[0]));
BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
};
};

return {std::move(routine), workspaceSize};
}
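The key change in this routine: it used to capture only `info.inDim.data()`, a raw pointer into a vector owned by `info`, which can be destroyed before the routine runs. Capturing the shape as a `std::vector<int>` by value keeps the buffer alive inside the closure. A minimal sketch of the hazard in plain C++ (no CNNL types; all names are illustrative):

#include <cstdio>
#include <functional>
#include <vector>

// Returns a routine holding a pointer into a vector that dies with this
// function; the pointer dangles by the time the routine runs.
std::function<void()> makeRoutineDangling() {
    std::vector<int> shape{32, 16};
    int const *p = shape.data();
    return [p] { std::printf("%d\n", *p); };// undefined behavior
}

// Moving the vector into the closure keeps its buffer alive as long as the
// routine itself, which is what the gather routine above now does.
std::function<void()> makeRoutineOwned() {
    std::vector<int> shape{32, 16};
    return [shape = std::move(shape)] { std::printf("%d\n", shape[0]); };
}

int main() {
    makeRoutineOwned()();// prints 32
    return 0;
}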
11 changes: 6 additions & 5 deletions src/04kernel/src/kernels/reduce/cnnl_kernel.cc
@@ -71,14 +71,15 @@ namespace refactor::kernel {

std::vector<int>
dimsI(shape.begin(), shape.end()),
dimsO(shape.begin(), shape.end());
dimsO(shape.begin(), shape.end()),
indices(axes.begin(), axes.end());
for (auto axis : axes) {
dimsO[axis] = 1;
}
// setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size()));
// setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data()));

// clang-format off
auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG
Expand All @@ -91,12 +92,12 @@ namespace refactor::kernel {
: UNREACHABLEX(cnnlReduceOp_t, "");
// clang-format on
CNNL_ASSERT(cnnlSetReduceDescriptor_v2(
d->reduce, (int *) (axes.data()), axes.size(), reduceOp,
d->reduce, indices.data(), indices.size(), reduceOp,
cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64),
CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0));

auto handler = res.fetchOrStore<CnnlContext>()->handle;
size_t idxWorkspaceSize = axes.size() * sizeof(int);
size_t idxWorkspaceSize = indices.size() * sizeof(int);
// idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256);
size_t workspaceSize;
// get workspace
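Replacing the `(int *) axes.data()` cast with a real `std::vector<int> indices` matters because `axes` is stored with the kernel's own dimension type, which is not guaranteed to be `int`; the cast only worked when the two types happened to share a layout. A sketch of the conversion, assuming the stored element type is `uint32_t`:

#include <cstdint>
#include <vector>

// Element-wise conversion hands cnnlSetReduceDescriptor_v2 a buffer whose
// static type really is int, instead of reinterpreting unsigned storage.
std::vector<int> toIntAxes(std::vector<std::uint32_t> const &axes) {
    return std::vector<int>(axes.begin(), axes.end());
}
// toIntAxes({1, 2}) yields {1, 2} as plain ints.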
9 changes: 6 additions & 3 deletions src/04kernel/src/kernels/softmax/cnnl_kernel.cc
@@ -59,9 +59,11 @@
static_cast<cnnlSoftmaxAlgorithm_t>(algo),
dataType != DataType::F64);
int dims[]{pre, mid, post};
cnnlSoftmaxMode_t mode = (post == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION
: (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION
: CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
// cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION
// : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION
// : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
// FIXME(bolun): CNNL Softmax mode
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;

// cnnlSoftmaxForward_v2 is applied to a 3D input tensor only
CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims));
@@ -78,6 +80,7 @@
CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&a, d->t, inputs[0],
&b, d->t, outputs[0]));
res.fetchOrStore<CnnlContext>()->queueSync();
};
}

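The mode selection commented out above (and pinned to `CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION` pending the FIXME) works on a 3-D view of the input: dimensions before the softmax axis collapse into `pre`, the axis itself is `mid`, and the rest collapse into `post`. A sketch of that factorization (names are illustrative, not from the kernel):

#include <functional>
#include <numeric>
#include <vector>

struct Dims3 { int pre, mid, post; };

// Collapse an N-D shape into the (pre, mid, post) triple that the kernel
// passes to cnnlSetTensorDescriptor as the 3-D shape for cnnlSoftmaxForward_v2.
Dims3 factorize(std::vector<int> const &shape, int axis) {
    auto product = [](auto first, auto last) {
        return std::accumulate(first, last, 1, std::multiplies<int>{});
    };
    return {product(shape.begin(), shape.begin() + axis),
            shape[axis],
            product(shape.begin() + axis + 1, shape.end())};
}
// factorize({2, 3, 4, 5}, 2) == {6, 4, 5}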
82 changes: 39 additions & 43 deletions src/04kernel/src/kernels/where/cnnl_kernel.cc
@@ -16,13 +16,24 @@ namespace refactor::kernel {
#ifndef USE_BANG
return nullptr;
#endif
return std::make_unique<K>(decltype(info) {
inputs[1].get().dataType,
inputs[0].get().shape,
inputs[1].get().shape,
inputs[2].get().shape,
outputs[0].get().shape,
});
std::vector<int> cDim(inputs[0].get().shape.begin(), inputs[0].get().shape.end()),
xDim(inputs[1].get().shape.begin(), inputs[1].get().shape.end()),
yDim(inputs[2].get().shape.begin(), inputs[2].get().shape.end()),
ansDim(outputs[0].get().shape.begin(), outputs[0].get().shape.end());
if (ansDim.size() == 0) {
ansDim.push_back(1);
}
if (xDim.size() == 0) {
xDim.push_back(1);
}
if (yDim.size() == 0) {
yDim.push_back(1);
}
if (cDim.size() == 0) {
cDim.push_back(1);
}
return std::make_unique<K>(decltype(info){
inputs[1].get().dataType, cDim, xDim, yDim, ansDim});
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
@@ -44,11 +55,10 @@

struct Descriptors {
cnnlTensorDescriptor_t cond, x, y, ans;
bool f32;

explicit Descriptors(decltype(f32) f32_)
explicit Descriptors()
: cond(nullptr), x(nullptr), y(nullptr),
ans(nullptr), f32(f32_) {
ans(nullptr) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&x));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&y));
@@ -64,49 +74,35 @@
Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>(info.dataType != DT::F64);

std::vector<int> cDim(info.condDim.begin(), info.condDim.end()),
xDim(info.thenDim.begin(), info.thenDim.end()),
yDim(info.elseDim.begin(), info.elseDim.end()),
ansDim(info.outputDim.begin(), info.outputDim.end());

auto rightAlign = [](std::vector<int> &dim, uint32_t targetLength) {
if (dim.size() < targetLength) {
dim.insert(dim.begin(), targetLength - dim.size(), 1);
}
};
if (ansDim.size() == 0) {
ansDim.push_back(1);
}
rightAlign(cDim, ansDim.size());
rightAlign(xDim, ansDim.size());
rightAlign(yDim, ansDim.size());

CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data()));
auto d = std::make_shared<Descriptors>();

CNNL_ASSERT(cnnlSetTensorDescriptor(
d->cond, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(DT::Bool),
info.condDim.size(), info.condDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.thenDim.size(), info.thenDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.elseDim.size(), info.elseDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->ans, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.outputDim.size(), info.outputDim.data()));

auto handle = res.fetchOrStore<CnnlContext>()->handle;
size_t workspaceSize;
CNNL_ASSERT(cnnlGetSelectV2WorkspaceSize(handle, d->cond, d->x, d->y, &workspaceSize));

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// fetch cnnl handle from resources
auto handle = res.fetchOrStore<CnnlContext>()->handle;
auto cond = inputs[0],
x = inputs[1],
y = inputs[2];
auto ans = outputs[0];

CNNL_ASSERT(cnnlSelectV2(
handle, d->cond, cond, d->x, x,
d->y, y, workspace, workspaceSize,
d->ans, ans));
res.fetchOrStore<CnnlContext>()->handle,
d->cond, inputs[0], d->x, inputs[1],
d->y, inputs[2], workspace, workspaceSize,
d->ans, outputs[0]));

cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue);
res.fetchOrStore<CnnlContext>()->queueSync();
};

return {std::move(routine), workspaceSize};
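The rank-0 handling added to `build` (pushing a 1 into any empty shape) keeps `cnnlSetTensorDescriptor` in `lower` from receiving a zero-length dimension list when `where` runs on scalars. The normalization in isolation:

#include <vector>

// A scalar tensor has an empty shape; promote it to {1} so every CNNL
// descriptor gets ndim >= 1. Applied uniformly to cDim, xDim, yDim, ansDim.
void promoteScalar(std::vector<int> &dim) {
    if (dim.empty()) {
        dim.push_back(1);
    }
}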
4 changes: 1 addition & 3 deletions src/04kernel/src/kernels/where/cnnl_kernel.hh
@@ -7,12 +7,10 @@

namespace refactor::kernel {

using Shape = absl::InlinedVector<dim_t, 4>;

struct WhereCnnl final : public Kernel {
struct {
DataType dataType;
Shape condDim, thenDim, elseDim, outputDim;
std::vector<int> condDim, thenDim, elseDim, outputDim;
} info;

WhereCnnl(decltype(info)) noexcept;
9 changes: 9 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.cc
@@ -30,6 +30,15 @@ namespace refactor::kernel::cnnl {
return "CnnlContext";
}

void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) {
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), size,
CNRT_MEM_TRANS_DIR_HOST2DEV));
}

void CnnlContext::queueSync() {
BANG_ASSERT(cnrtQueueSync(queue));
}

}// namespace refactor::kernel::cnnl

#endif
2 changes: 2 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.hh
@@ -22,6 +22,8 @@ namespace refactor::kernel::cnnl {
size_t resourceTypeId() const noexcept final;
std::string_view description() const noexcept final;

void copyFromCPU(void *dst, const void *src, size_t size);
void queueSync();
};

}// namespace refactor::kernel::cnnl
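A hypothetical call site for the two new helpers, mirroring what the gather and where routines above now do (this is a fragment, not a complete function: `res`, `workspace`, `shape`, and `workspaceSize` come from the surrounding kernel routine):

// Inside a lowered kernel routine:
auto ctx = res.fetchOrStore<CnnlContext>();
ctx->copyFromCPU(workspace, shape.data(), workspaceSize);// blocking H2D copy
// ... enqueue CNNL ops on ctx->handle ...
ctx->queueSync();// block until the CNRT queue has drained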
27 changes: 27 additions & 0 deletions src/04kernel/src/utilities/bang/cnrt_functions.cc
@@ -0,0 +1,27 @@
#ifdef USE_BANG
#include "cnrt_functions.h"
#include "cnnl_functions.h"
#include <cnrt.h>
#include <cstdio>

namespace refactor::kernel::cnnl {

int currentDevice() {
int device;
BANG_ASSERT(cnrtGetDevice(&device));
return device;
}

void sync() {
BANG_ASSERT(cnrtSyncDevice());
}

void copyOut(void *dst, const void *src, size_t size) {
sync();
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), size,
CNRT_MEM_TRANS_DIR_DEV2HOST));
}

}// namespace refactor::kernel::cnnl

#endif
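`copyOut` calls `sync()` before the device-to-host `cnrtMemcpy`, so kernels still in flight finish writing before the host reads the buffer. A hypothetical readback helper built on these wrappers (the float element type and count are illustrative):

#ifdef USE_BANG
#include "cnrt_functions.h"

#include <cstddef>
#include <vector>

// Copy `count` floats from device memory into a host vector; the
// sync-then-copy ordering comes from copyOut itself.
std::vector<float> readback(void const *device, std::size_t count) {
    std::vector<float> host(count);
    refactor::kernel::cnnl::copyOut(host.data(), device, count * sizeof(float));
    return host;
}
#endif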
16 changes: 16 additions & 0 deletions src/04kernel/src/utilities/bang/cnrt_functions.h
@@ -0,0 +1,16 @@
#ifndef KERNEL_CNRT_FUNCTIONS_H
#define KERNEL_CNRT_FUNCTIONS_H

#include "common.h"

namespace refactor::kernel::cnnl {

int currentDevice();

void sync();

void copyOut(void *dst, const void *src, size_t size);

}// namespace refactor::kernel::cnnl

#endif// KERNEL_CNRT_FUNCTIONS_H
45 changes: 45 additions & 0 deletions src/04kernel/test/kernels/gather/test_gather_cnnl.cpp
@@ -94,6 +94,51 @@ TEST(kernel, GatherCnnl) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}
}

// Case axis = 0, indexType = int64
{
// Create Tensor and build kernels
auto data = Tensor::share(DataType::F32, Shape{32, 16}, LayoutType::NCHW);
auto indices = Tensor::share(DataType::I64, Shape{1, 4}, LayoutType::NCHW);
auto output = Tensor::share(DataType::F32, Shape{1, 4, 16}, LayoutType::NCHW);
GatherInfo info(0, *data, *indices);
auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output);
auto cpuKernel = GatherCpu::build(info);
ASSERT_TRUE(cnnlKernel && cpuKernel);
auto res = runtime::Resources();
auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res);
auto cpuRoutine = cpuKernel->lower(res).routine;
// Init inputs and outputs
std::vector<float> a;
for (auto i = 0; i < data->elementsSize(); i++) {
a.push_back(i + 0.1f);
}
std::vector<int64_t> b(indices->elementsSize(), 0);
std::vector<float> c(output->elementsSize());
auto workspace = dev.malloc(workspaceSize),
aMLU = dev.malloc(data->bytesSize()),
bMLU = dev.malloc(indices->bytesSize()),
cMLU = dev.malloc(output->bytesSize());
aMLU->copyFromHost(a.data(), data->bytesSize());
bMLU->copyFromHost(b.data(), indices->bytesSize());
// Compute
{
void const *inputs[]{*aMLU, *bMLU};
void *outputs[]{*cMLU};
cnnlRoutine(res, *workspace, inputs, outputs);
}
{
void const *inputs[]{a.data(), b.data()};
void *outputs[]{c.data()};
cpuRoutine(res, nullptr, inputs, outputs);
}
// Compare
std::vector<float> result(output->elementsSize());
cMLU->copyToHost(result.data(), output->bytesSize());
for (auto i : range0_(c.size())) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}
}
}

#endif
4 changes: 4 additions & 0 deletions src/09python_ffi/CMakeLists.txt
@@ -10,6 +10,10 @@ pybind11_add_module(python_ffi SHARED ${PYFFI_SRC})
target_link_libraries(python_ffi PRIVATE onnx llm communication)
target_include_directories(python_ffi PRIVATE include)

if(USE_BANG)
target_include_directories(python_ffi PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../04kernel/src/utilities/bang)
endif()

# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
# define (VERSION_INFO) here.
# target_compile_definitions(python_ffi